### Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import cufflinks as cf
cf.go_offline()

%matplotlib inline

### Dataset information

In [None]:
# Read the heart-disease CSV from the relative Kaggle input path and preview
# the first rows.
dataset = pd.read_csv('../input/heart-disease-uci/heart.csv')
dataset.head()

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# Summary statistics of the columns.
dataset.describe()

#### Description of various column fields in the data set

* age: The person's age in years
* sex: The person's sex (1 = male, 0 = female)
* cp: The chest pain experienced (Value 0: absent, Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain)
* trestbps: The person's resting blood pressure (mm Hg on admission to the hospital)
* chol: The person's cholesterol measurement in mg/dl
* fbs: The person's fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
* restecg: Resting electrocardiographic measurement (0 = normal, 1 = having ST-T wave abnormality, 2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
* thalach: The person's maximum heart rate achieved
* exang: Exercise induced angina (1 = yes; 0 = no)
* oldpeak: ST depression induced by exercise relative to rest ('ST' relates to positions on the ECG plot.)
* slope: the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping)
* ca: The number of major vessels (0-3)
* thal: A blood disorder called thalassemia (3 = normal; 6 = fixed defect; 7 = reversable defect)
* target: Heart disease (0 = no, 1 = yes)

### Exploring Target and Features

##### Target 

In [None]:
# Class balance of the label: first as a table of counts, then as a bar chart.
display(dataset['target'].value_counts())
print("\n")
# Name the column by keyword. Newer seaborn releases treat the first
# positional argument of countplot as `data`, not as the x vector.
sns.countplot(x='target', data=dataset)

##### Visualizing data using t-SNE

In [None]:
from sklearn.manifold import TSNE

# Embed only the clinical measurements. The label is excluded so it cannot
# shape the layout that is later coloured by it, and the X/Y columns written
# at the end of this cell are excluded so a re-run does not feed a previous
# embedding back in.
tsne_input = dataset.drop(columns=['target', 'X', 'Y'], errors='ignore')

# Fixed seed: t-SNE is stochastic, and the observations drawn from the plot
# should be reproducible after a kernel restart.
tsne = TSNE(learning_rate=100, random_state=10)

tsne_features = tsne.fit_transform(tsne_input)

# Store the two embedding coordinates next to the original columns so the
# plotting cells can colour the same points by any feature.
dataset['X'] = tsne_features[:, 0]
dataset['Y'] = tsne_features[:, 1]

In [None]:
# Colour the same t-SNE embedding by the target and by seven features, one
# panel each, on a 4 x 2 grid.

# Set the font sizes before any panel is drawn so every legend picks them up.
# Previously the first five panels were drawn before the first plt.rc call and
# got the default legend size on a fresh kernel. These settings are global and
# persist for later figures, as before.
plt.rc('figure', titlesize=40)
plt.rc('legend', fontsize=20)

# (column to colour by, panel title, palette name or None for seaborn's default)
tsne_panels = [
    ('target', 'Target', None),
    ('cp', 'Chest Pain', 'viridis'),
    ('restecg', 'Rest ECG', 'viridis'),
    ('thalach', 'Max Heart Rate', 'hsv'),
    ('chol', 'Cholestrol', 'hsv'),
    ('oldpeak', 'Oldpeak', 'twilight'),
    ('fbs', 'Fasting Blood Sugar', None),
    ('trestbps', 'Resting Blood Pressure', 'hsv'),
]

fig = plt.figure(figsize=(25, 30))

for position, (column, title, palette) in enumerate(tsne_panels, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.scatterplot(data=dataset, x='X', y='Y', hue=column,
                    palette=palette, ax=ax, s=120)
    ax.set_title(title, fontsize=30)
    # Tick values are removed on both embedding axes.
    ax.set_xticks([])
    ax.set_yticks([])

#### Observations from the t-SNE plot

* Chest pain and ECG reports are the key factors in identifying whether a person is suffering from a heart disease or not.

* Patients with heart rate around 100 seem to be healthy 
* Patients with high blood sugar are at risk

Changing column names to improve understanding

In [None]:
# Map the terse column codes to descriptive names. Renaming by key (instead of
# assigning a positional list) keeps each label attached to the right column
# whatever the column order, and does not require the t-SNE columns X and Y to
# be present. Columns not listed here (age, sex, target, X, Y) keep their names.
dataset = dataset.rename(columns={
    'cp': 'chest_pain',
    'trestbps': 'rest_blood_pressure',
    'chol': 'cholesterol',
    'fbs': 'fasting_blood_sugar',
    'restecg': 'rest_ecg',
    'thalach': 'max_heart_rate',
    'exang': 'exercise_induced_angina',
    'oldpeak': 'st_depression',
    'slope': 'st_slope',
    'ca': 'num_major_vessels',
    'thal': 'thalassemia',
})

Replacing the codes of the categorical variables with their descriptive labels in order to improve readability of the visualizations

In [None]:
# Replace integer codes with readable labels for plotting.
#
# The original cell used chained indexing (dataset[col][mask] = value), which
# assigns through a temporary object: pandas warns about it and, with
# copy-on-write enabled, the assignment never reaches `dataset`. Replacing each
# column as a whole avoids that.
#
# Codes that are not listed for a column are left unchanged, exactly as before.
# NOTE(review): chest_pain 0 and any st_slope / thalassemia codes outside 1-3
# therefore stay numeric -- confirm the code tables against the data.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    'chest_pain': {1: 'typical angina',
                   2: 'atypical angina',
                   3: 'non-anginal pain'},
    'fasting_blood_sugar': {0: 'low', 1: 'high'},
    'rest_ecg': {0: 'normal',
                 1: 'ST-T wave abnormality',
                 2: 'left ventricular hypertrophy'},
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thalassemia': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

for column, labels in category_labels.items():
    dataset[column] = dataset[column].replace(labels)

In [None]:
# Preview the first rows of the current frame.
dataset.head()

##### Patient's Medical Report vs Age (Categorized by Target variable)

In [None]:
# Age against four categorical findings, each point coloured by the target,
# drawn on a 2 x 2 grid.
fig = plt.figure(figsize=(25, 15))

# (subplot position, column on the x axis, panel title)
swarm_panels = [
    (221, 'rest_ecg', 'Rest ECG'),
    (222, 'chest_pain', 'Chest Pain'),
    (223, 'thalassemia', 'Thal'),
    (224, 'num_major_vessels', 'No.of major Vessels'),
]

for position, column, title in swarm_panels:
    panel = fig.add_subplot(position)
    sns.swarmplot(data=dataset, x=column, y='age', hue='target', ax=panel, s=10)
    panel.set_title(title, fontsize=30)

In [None]:
# One-hot encode the relabelled categorical columns; drop_first=True drops the
# first level of each encoded column.
dataset = pd.get_dummies(dataset, drop_first=True)

dataset.head()

##### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Label vector, and feature matrix without the label and the two t-SNE
# plotting coordinates.
y = dataset['target']
X = dataset.drop(columns=['target', 'X', 'Y'])

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

In [None]:
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, square
        Confusion matrix, indexed as ``cm[i, j]`` (row ``i``, column ``j``).
        The y axis is labelled 'True label' and the x axis 'Predicted label'.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum before printing and plotting.
        A row that sums to zero is shown as all zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
        The matrix is printed and drawn as a side effect.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; a class with no true
        # samples would otherwise yield NaN cells and a runtime warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

--------------------






#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Seed the forest: without it the selected hyper-parameters, every metric and
# every explanation plot below change from one kernel restart to the next.
rf = RandomForestClassifier(random_state=10)

# Exhaustive grid: 6 * 4 * 4 * 5 = 480 candidates, each evaluated with 10-fold
# cross-validation.
params = {
    'max_depth': [1, 2, 3, 4, 5, 6],
    # Floats are read by scikit-learn as fractions (of samples / of features).
    'min_samples_leaf': [0.01, 0.02, 0.04, 0.06],
    'max_features': [0.1, 0.2, 0.4, 0.8],
    'n_estimators': [100, 150, 200, 250, 300],
}

rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the grid search on the training split.
rf_cv.fit(X_train, y_train)

# Hard class predictions, and column 1 of predict_proba, for the test split.
# NOTE(review): column 1 is presumably the positive class (target == 1) -- confirm via rf_cv.classes_.
y_pred = rf_cv.predict(X_test)
y_pred_proba = rf_cv.predict_proba(X_test)[:,1]

In [None]:
# Keep the winning model for the evaluation and explanation cells below.
rf_best_est = rf_cv.best_estimator_

# Show the selected hyper-parameters.
rf_cv.best_params_

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points from the predicted probabilities; fpr and tpr are reused by the
# auc() cell below.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Set the base font size before the figure is created so all of its text uses
# it. This is a global setting and persists for later figures, as before.
plt.rcParams['font.size'] = 12

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed reference diagonal, drawn in axes coordinates.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
# The title previously said "diabetes classifier"; this model predicts heart disease.
ax.set_title('ROC curve for heart disease classifier')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.grid(True)

In [None]:
# Area under the ROC curve computed from the points plotted above.
auc(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

print("ROC_AUC score")
# ROC AUC is a ranking metric and must be computed from scores. Passing the
# hard 0/1 predictions (as before) collapses the curve to a single operating
# point and under-reports the AUC relative to the ROC curve plotted above.
print(roc_auc_score(y_test, y_pred_proba))

# Accuracy of the tuned forest on the data it was fitted on versus held-out data.
print("\nTraining Score")
print(rf_best_est.score(X_train, y_train))

print("\nTesting Score")
print(rf_best_est.score(X_test, y_test))

print("\n")
print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))



In [None]:
# Two decimals are enough for the printed, row-normalised matrix.
np.set_printoptions(precision=2)

class_names = ['-VE', '+VE']
cnf_matrix = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    normalize=True,
    title='Normalized confusion matrix',
)

-----------------------------------------------------------------


#### Important features

1. Using Random Forest feature importance

In [None]:
# Pair every feature name with the importance assigned by the tuned forest.
# `selected_features` is reused by the tree-export cell below.
selected_features = X.columns.to_list()
feature_importance = pd.DataFrame({
    "Feature Label": selected_features,
    "Feature Importance": rf_best_est.feature_importances_,
})

# Most important feature first, so it is drawn at the top of the chart.
feature_importance = feature_importance.sort_values(by="Feature Importance", ascending=False)

# Set graph style (global: affects every later seaborn/matplotlib figure).
sns.set(font_scale = 1.75)
sns.set_style({"axes.facecolor": "1.0", "axes.edgecolor": "0.85", "grid.color": "0.85",
               "grid.linestyle": "-", 'axes.labelcolor': '0.4', "xtick.color": "0.4",
               'ytick.color': '0.4'})

# Set figure size and create barplot.
f, ax = plt.subplots(figsize=(12, 9))
# One colour per bar, dark to light. The reversed colormap ('_r') sized to the
# number of features replaces a `reversed(...)` iterator over a hard-coded 15
# colours, which is not a valid palette object and did not match the number of
# bars.
bar_colors = sns.color_palette('YlOrRd_r', n_colors=len(feature_importance))
sns.barplot(x = "Feature Importance", y = "Feature Label",
            palette = bar_colors, data = feature_importance)

# Draw a bold vertical baseline at x = 0.
ax.axvline(x = 0, color = 'black', linewidth = 4, alpha = .7)

# Turn frame off
ax.set_frame_on(False)

# Tight layout
plt.tight_layout()

# Save Figure
# NOTE(review): dpi=1080 on a 12 x 9 inch figure produces a very large image --
# confirm this resolution is intended.
plt.savefig("feature_importance.png", dpi = 1080)

2. Using Permutation importance

In [None]:
import eli5 
from eli5.sklearn import PermutationImportance

# Fit eli5's PermutationImportance wrapper around the tuned forest on the test
# split (seeded), then render the resulting weights.
perm = PermutationImportance(rf_best_est, random_state=105).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X.columns.to_list())

In [None]:
from sklearn.tree import export_graphviz

# Pick one tree of the tuned forest to visualise (index 18 is hard-coded).
estimator = rf_best_est.estimators_[18]
export_graphviz(estimator, out_file='tree.dot',
                feature_names = selected_features,
                class_names = ['no disease','disease'],
                rounded = True, proportion = False,
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz).
# check=True raises if `dot` exits with an error, instead of silently
# continuing and displaying a missing or stale tree.png below.
from subprocess import run
run(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'], check=True)

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

In [None]:
from pdpbox import pdp, get_dataset, info_plots

# Feature names and the test-split frame shared by all partial-dependence and
# SHAP cells below.
feature_names = X.columns.tolist()
X_test_df = pd.DataFrame(data=X_test, columns=feature_names)

# Partial dependence of the model output on the number of major vessels.
feat_name = 'num_major_vessels'
pdp_dist = pdp.pdp_isolate(
    model=rf_best_est,
    dataset=X_test_df,
    model_features=feature_names,
    feature=feat_name,
)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

The higher the number of major blood vessels, the lower the probability of heart attack.

In [None]:
# Partial dependence of the model output on age.
feat_name = 'age'
pdp_dist = pdp.pdp_isolate(
    model=rf_best_est,
    dataset=X_test_df,
    model_features=feature_names,
    feature=feat_name,
)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

With an increase in age the risk seems to decrease, which is not really true.

In [None]:
# Partial dependence of the model output on cholesterol.
feat_name = 'cholesterol'
pdp_dist = pdp.pdp_isolate(
    model=rf_best_est,
    dataset=X_test_df,
    model_features=feature_names,
    feature=feat_name,
)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

Decrease in value of cholesterol decreases the risk of heart disease

In [None]:
# Partial dependence of the model output on the maximum heart rate.
feat_name = 'max_heart_rate'
pdp_dist = pdp.pdp_isolate(
    model=rf_best_est,
    dataset=X_test_df,
    model_features=feature_names,
    feature=feat_name,
)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

In [None]:
# Partial dependence of the model output on ST depression.
feat_name = 'st_depression'
pdp_dist = pdp.pdp_isolate(
    model=rf_best_est,
    dataset=X_test_df,
    model_features=feature_names,
    feature=feat_name,
)

pdp.pdp_plot(pdp_dist, feat_name)
plt.show()

In [None]:
import shap 

# Explain the tuned forest on the test split. `explainer` and `shap_values`
# are reused by later cells.
explainer = shap.TreeExplainer(rf_best_est)
shap_values = explainer.shap_values(X_test_df)

# NOTE(review): indexing [1] assumes shap_values is a per-class sequence with
# the positive class at index 1 -- confirm for the installed shap version.
shap.summary_plot(shap_values[1], X_test_df, plot_type="bar")

In [None]:
# Same SHAP values as above, with the default summary plot type instead of bars.
shap.summary_plot(shap_values[1], X_test_df)

As clearly seen in the above shap summary,
* A low value (blue, on the right) of thalassemia_reversable defect leads to a higher probability of heart disease
* Same is for number of blood vessels, exercise and others.
* The opposite is for exercise induced angina

Let's see how different variables affect prediction for individual patients 

In [None]:
def plot_shap(model, patient):
    """Build a SHAP force plot for one patient's prediction.

    A new TreeExplainer is created for `model` on every call. The plot uses the
    expected value and the SHAP values at index 1, together with the patient's
    own feature values.

    Parameters
    ----------
    model : fitted tree-based model accepted by shap.TreeExplainer
    patient : feature values of a single patient

    Returns
    -------
    The object returned by shap.force_plot.
    """
    # Load the JavaScript the interactive plot needs in the notebook.
    shap.initjs()

    tree_explainer = shap.TreeExplainer(model)
    patient_shap = tree_explainer.shap_values(patient)
    base_value = tree_explainer.expected_value[1]
    return shap.force_plot(base_value, patient_shap[1], patient)

In [None]:
# Explain the prediction for the test patient at position 1; values are cast
# to float before being passed to the explainer.
patient = X_test_df.iloc[1,:].astype(float)
plot_shap(rf_best_est, patient)

For the above patient, the predicted score is 0.73, compared to the baseline value of 0.5484.

As we can see the values in red have caused the prediction to go high, we can compare the same from the shap summary plot above

In [None]:
# Same explanation for the test patient at position 2.
patient = X_test_df.iloc[2,:].astype(float)
plot_shap(rf_best_est, patient)

In [None]:
# Same explanation for the test patient at position 5.
patient = X_test_df.iloc[5,:].astype(float)
plot_shap(rf_best_est, patient)

In [None]:
# SHAP dependence plot for num_major_vessels, with st_depression as the
# interaction feature.
shap.dependence_plot('num_major_vessels', shap_values[1], X_test_df, interaction_index="st_depression")

It is now clear that a low number of vessels causes high risk. There is a small impact of st_depression, as seen from the colors.

Disease(Red) vs No disease(Blue) for 50 patients

We can hover on below data to get information for each patient

In [None]:
# Stacked force plot for the first 50 test patients.
# The SHAP values and the displayed feature values must come from the same
# rows. Previously the values were computed on X_train[:50] but shown next to
# the features of X_test_df[:50], so every hover label described a different
# patient than the one being explained.
first_50_patients = X_test_df.iloc[:50]
# Separate name so the test-split `shap_values` used by earlier cells is not overwritten.
shap_values_50 = explainer.shap_values(first_50_patients)
shap.force_plot(explainer.expected_value[1], shap_values_50[1], first_50_patients)