In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import eli5
from eli5.sklearn import PermutationImportance

In [2]:
df = pd.read_csv("input/heart.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Attributes
1. Age
2. Sex
  * 0 = female
  * 1 = make
3. Chest pain type (cp)
  * 1 = typical angina
  * 2 = atypical angina
  * 3 = non-anginal pain
  * 4 = asymptomatic
4. Resting blood presure (trestbps)
5. Serum cholestoral in mg/dl (chol)
6. Fasting blood sugar > 120 mg/dl (fbs)
7. Resting electrocardiographic results (restecg)
  * Values 0-2
8. Maximum heart rate achieved (thalach)
9. Exercise induced angina (exang)
  * 1 = yes; 0 = no
10. ST depression induced by exercise relative to rest (oldpeak)
11. Slope of the peak exercise ST segment (slope)
12. Number of major vessels colored by flourosopy
  * 0-3
13. thal
  * 3 = normal
  * 6 = fixed defect
  * 7 = reversable defect
14. target (Heart disease)
  * 0 = no
  * 1 = yes

In [4]:
df.sample(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
108,50,0,1,120,244,0,1,162,0,1.1,2,0,2,1
213,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0


In [5]:
df['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

# Exploratory Data Analysis:


In [7]:
df.dtypes.value_counts()

int64      13
float64     1
dtype: int64

#### Since some of the attributes have multiple categories, we will use *one-hot encoding* to get rid of any unwanted bias that could arise from arbitrary numbering of categories

In [8]:
df = pd.get_dummies(df)
print("Shape: ", df.shape)

Shape:  (303, 14)


#### Now we will create our training and testing sets from the dataframe

In [9]:
train, test = train_test_split(df, test_size=0.30, random_state=42)

In [10]:
print("Train Shape: ", train.shape)
print("Test Shape: ", test.shape)

Train Shape:  (212, 14)
Test Shape:  (91, 14)


#### We've dealt with categorical variables, so we can start looking for correlations between each attribute and the target
We'll start by looking at the Pearson correlation coefficients to give us an idea of possible relationships

In [15]:
correlations = train.corr()['target'].sort_values()
print('Correlations (Most negative to most positive): \n', correlations)

Correlations (Most negative to most positive): 
 exang      -0.464248
oldpeak    -0.433180
ca         -0.426944
thal       -0.390069
sex        -0.298758
age        -0.224143
chol       -0.053322
trestbps   -0.053108
fbs         0.038448
restecg     0.083150
slope       0.344297
thalach     0.403860
cp          0.433499
target      1.000000
Name: target, dtype: float64


---

# Random Forest
#### We'll start off with a random forest classifier, which is an ensemble algorithm that works through bootstrap aggregating, or bagging.

In [58]:
X_train = train.drop('target', 1)
X_test = test.drop('target', 1)
y_train = train['target']
y_test = test['target']

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [59]:
pred = model.predict(X_test)

In [60]:
accuracy = metrics.accuracy_score(y_test, pred)
print('Accuracy: ', accuracy)

Accuracy:  0.7912087912087912


In [66]:
perm = PermutationImportance(model, random_state=1).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Weight,Feature
0.0725  ± 0.0298,ca
0.0440  ± 0.0520,cp
0.0418  ± 0.0215,sex
0.0396  ± 0.0513,oldpeak
0.0242  ± 0.0292,thalach
0.0220  ± 0.0393,age
0.0110  ± 0.0241,restecg
-0.0000  ± 0.0139,fbs
-0.0000  ± 0.0311,exang
-0.0044  ± 0.0357,chol
