In [None]:
# https://www.kaggle.com/ronitf/heart-disease-uci

# Helpful Resource
# https://machinelearningmastery.com/overfitting-machine-learning-models/

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
from sklearn import tree
from sklearn.metrics import precision_recall_curve , plot_precision_recall_curve , accuracy_score , precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz


# age
# sex
# chest pain type (4 values)
# resting blood pressure
# serum cholestoral in mg/dl
# fasting blood sugar > 120 mg/dl
# resting electrocardiographic results (values 0,1,2)
# thalach = maximum heart rate achieved
# exercise induced angina
# oldpeak = ST depression induced by exercise relative to rest
# the slope of the peak exercise ST segment
# ca = number of major vessels (0-3) colored by fluoroscopy (i.e. major vessels that can be seen)
# thal: (thalassemia) 3 = normal; 6 = fixed defect; 7 = reversible defect


### A look at the data

In [None]:
# Read the heart-disease CSV into `data` and preview the first 20 rows.
heart_csv_path = "../input/heart-disease-uci/heart.csv"
data = pd.read_csv(heart_csv_path)
data.head(20)

### Correlation Matrix

In [None]:
# Pairwise correlation between the feature columns only (the "target" label is excluded).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
corr = data.drop(columns="target").corr()
# Mask the upper triangle so each feature pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(20, 15))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw explicitly on the axes created above rather than relying on the current-axes state.
_ = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Hold out 20% of the rows for testing.
# `columns=` replaces the positional axis argument (rejected by pandas 2.0), and a fixed
# random_state makes the split -- and every score derived from it -- reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"), data.target, test_size=.2, random_state=42)

## Comparisons between different models

### ***Using a Decision Tree Model***

In [None]:
# Fit a single entropy-based decision tree on the training split and draw it.
plt.figure(figsize=(50,50))
model = DecisionTreeClassifier(criterion='entropy')
_ = model.fit(X_train, y_train)
# Label the nodes with the columns the tree was actually trained on; `data.columns`
# also contains the "target" label, which is not a feature.
_ = tree.plot_tree(model, feature_names=list(X_train.columns))

#### Cross Validation

In [None]:
# k-fold cross-validation of the decision tree.
# cross_val_score refits a fresh clone of `model` on each fold, so it is given the
# training split; the held-out test split stays untouched for the final evaluation
# (and is too small to give stable 10-fold estimates).
for n_folds in (10, 5):
  print('%d fold' % n_folds)
  for metric_name in ("accuracy", "recall"):
    scores = cross_val_score(model, X_train, y_train, cv=n_folds, scoring=metric_name)
    print("%0.3f %s with a standard deviation of %0.3f" % (scores.mean(), metric_name, scores.std()))
  if n_folds == 10:
    # Blank separator between the 10-fold and 5-fold reports.
    print('\n')

#### Confusion Matrix

In [None]:
# Score the fitted tree on the held-out split: print accuracy, then precision,
# and draw the confusion matrix.
# NOTE(review): plot_confusion_matrix appears to have been removed from recent
# scikit-learn releases -- confirm against the pinned version.
y_pred_single_decision_tree = model.predict(X_test)
for metric in (accuracy_score, precision_score):
  print(metric(y_test, y_pred_single_decision_tree))
_ = plot_confusion_matrix(model, X_test, y_test)

#### Confusion Matrix Reports

In [None]:
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
# Draw the precision-recall curve, then print the per-class text report.
_ = plot_precision_recall_curve(model, X_test, y_test)
decision_tree_report = classification_report(y_test, y_pred_single_decision_tree)
print(decision_tree_report)

#### Will it overfit?

In [None]:
# Sweep max_depth from 1 to 30 and compare train vs. test accuracy at each depth.
# `model` is rebound on every iteration, so after this cell it is the depth-30 tree.
values = list(range(1, 31))
train_scores, test_scores = [], []
for depth in values:
  model = DecisionTreeClassifier(criterion='entropy', max_depth=depth)
  model.fit(X_train, y_train)
  train_acc = accuracy_score(y_train, model.predict(X_train))
  test_acc = accuracy_score(y_test, model.predict(X_test))
  train_scores.append(train_acc)
  test_scores.append(test_acc)
  print('>%d, train: %.3f, test: %.3f' % (depth, train_acc, test_acc))

# One line per split, accuracy against depth.
for series, label in ((train_scores, 'Train'), (test_scores, 'Test')):
  plt.plot(values, series, '-o', label=label)
plt.legend()
plt.show()

#### Explanation

In [None]:
# SHAP explanation of the current `model` on the test split.
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test, y_test)
# Three summary plots: the full result, then the entries at index 0 and index 1
# (presumably one entry per class -- depends on the installed shap version).
for values_to_plot in (shap_values, shap_values[0], shap_values[1]):
  shap.summary_plot(values_to_plot, X_test)

#### Dependence Plot for Predicted Heart Disease

In [None]:
# SHAP values of the test split from the current `explainer`; the dependence-plot
# cells that follow index this result with [1] and [0].
shap_values_ind = explainer.shap_values(X_test)

In [None]:
# One dependence plot per feature, using the SHAP values stored at index 1.
for feature_name in X_train.columns:
  shap.dependence_plot(feature_name, shap_values_ind[1], X_test)

#### Dependence Plot for predicted No Heart Disease

In [None]:
# One dependence plot per feature, using the SHAP values stored at index 0.
for feature_name in X_train.columns:
  shap.dependence_plot(feature_name, shap_values_ind[0], X_test)

#### *Conclusion*

---


For a Decision Tree Model, the number of major vessels colored by fluoroscopy (i.e. the 'ca' feature), chest pain type (i.e. the 'cp' feature) and thalassemia (i.e. the 'thal' feature) tend to have the greatest impact on predicting whether a patient is diagnosed with heart disease or not. The accuracy of a Decision Tree is not the best with either number of folds (5 or 10).

### ***Using a Random Forest Model***

In [None]:
# 500-tree, entropy-based random forest fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(n_estimators=500, criterion='entropy').fit(X_train, y_train)

#### Cross Validation

In [None]:
# k-fold cross-validation of the random forest.
# cross_val_score refits a fresh clone of `model` on each fold, so it is given the
# training split; the held-out test split stays untouched for the final evaluation
# (and is too small to give stable 10-fold estimates).
for n_folds in (10, 5):
  print('%d fold' % n_folds)
  for metric_name in ("accuracy", "recall"):
    scores = cross_val_score(model, X_train, y_train, cv=n_folds, scoring=metric_name)
    print("%0.3f %s with a standard deviation of %0.3f" % (scores.mean(), metric_name, scores.std()))
  if n_folds == 10:
    # Blank separator between the 10-fold and 5-fold reports.
    print('\n')

#### Confusion Matrix

In [None]:
# Score the fitted forest on the held-out split: print accuracy, then precision,
# and draw the confusion matrix. `cm` keeps the raw matrix values.
y_pred_random_forest_tree = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_random_forest_tree)
for metric in (accuracy_score, precision_score):
  print(metric(y_test, y_pred_random_forest_tree))
_ = plot_confusion_matrix(model, X_test, y_test)

#### Confusion Matrix Reports

In [None]:
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
# Draw the precision-recall curve, then print the per-class text report.
_ = plot_precision_recall_curve(model, X_test, y_test)
random_forest_report = classification_report(y_test, y_pred_random_forest_tree)
print(random_forest_report)

#### Explanation

In [None]:
# SHAP explanation of the current `model` on the test split.
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test, y_test)
# Three summary plots: the full result, then the entries at index 0 and index 1
# (presumably one entry per class -- depends on the installed shap version).
for values_to_plot in (shap_values, shap_values[0], shap_values[1]):
  shap.summary_plot(values_to_plot, X_test)

#### Dependence Plot for Predicted Heart Disease

In [None]:
# SHAP values of the test split from the current `explainer`; the dependence-plot
# cells that follow index this result with [1] and [0].
shap_values_ind = explainer.shap_values(X_test)

In [None]:
# One dependence plot per feature, using the SHAP values stored at index 1.
for feature_name in X_train.columns:
  shap.dependence_plot(feature_name, shap_values_ind[1], X_test)

#### Dependence Plot for predicted No Heart Disease

In [None]:
# One dependence plot per feature, using the SHAP values stored at index 0.
for feature_name in X_train.columns:
  shap.dependence_plot(feature_name, shap_values_ind[0], X_test)

#### *Conclusion* 

---

The most impactful features using a Random Forest Model are (like a Decision Tree Model), thal, ca, and cp. However, the recall, accuracy, and cross validation scores of a Random Forest Model tend to be moderately better than a Decision Tree Model.


### ***Using a Logistic Regression Model***

In [None]:
# For every column except 'target' and 'sex', fit a one-feature logistic regression
# and overlay its predicted probability on the raw 0/1 labels.
# Index.drop takes (labels, errors); the stray positional `1` that used to be passed
# here landed in `errors`, so it is removed.
for col in data.columns.drop(['target', 'sex']):
  X = data[col].values.reshape(-1,1)   # single feature as an (n, 1) matrix
  Y = data['target'].values.reshape(-1,1)
  model = LogisticRegression(max_iter=1000).fit(X, np.ravel(Y.astype(float)))
  x_pred_proba = model.predict_proba(X)
  plt.xlabel(col)
  plt.ylabel('target')
  plt.scatter(X , Y)
  # Column 0 of predict_proba is the probability of class 0, i.e. NOT having heart disease.
  plt.scatter(X , x_pred_proba[:,0])
  plt.show()

#### Cross Validation

In [None]:
# Logistic regression on all features; max_iter is set to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# k-fold cross-validation of the logistic regression.
# cross_val_score refits a fresh clone of `model` on each fold, so it is given the
# training split; the held-out test split stays untouched for the final evaluation
# (and is too small to give stable 10-fold estimates).
for n_folds in (10, 5):
  print('%d fold' % n_folds)
  for metric_name in ("accuracy", "recall"):
    scores = cross_val_score(model, X_train, y_train, cv=n_folds, scoring=metric_name)
    print("%0.3f %s with a standard deviation of %0.3f" % (scores.mean(), metric_name, scores.std()))
  if n_folds == 10:
    # Blank separator between the 10-fold and 5-fold reports.
    print('\n')

#### Confusion Matrix

In [None]:
# Score the fitted logistic regression on the held-out split: print accuracy, then
# precision, and draw the confusion matrix. `cm` keeps the raw matrix values.
y_pred_log_regression = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred_log_regression)
for metric in (accuracy_score, precision_score):
  print(metric(y_test, y_pred_log_regression))
_ = plot_confusion_matrix(model, X_test, y_test)

#### Confusion Matrix Reports

In [None]:
# Draw the precision-recall curve, then print the per-class text report.
_ = plot_precision_recall_curve(model, X_test, y_test)
log_regression_report = classification_report(y_test, y_pred_log_regression)
print(log_regression_report)

#### Explanation

In [None]:
# SHAP explanation of the logistic regression model.
shap.initjs()
# Refit on the training split so the explainer sees a model trained on all features.
model.fit(X_train , y_train)
# The explainer wraps model.predict (the predicted label, not a probability) and is
# given X_test as its data argument; the same X_test rows are then explained.
explainer = shap.KernelExplainer(model.predict , X_test)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

#### Dependence Plot for Predicted No Heart Disease

In [None]:
# Dependence plots for the "no heart disease" outcome of the logistic regression.
# `shap_values` (from the KernelExplainer over model.predict) explains the predicted
# label, where 1 = heart disease. The complementary outcome is 1 - prediction, whose
# SHAP values are the negation. The previously used `shap_values_ind` was left over
# from the Random Forest section and did not describe this model at all.
for col in X_train.columns:
  shap.dependence_plot(col, -shap_values, X_test)

#### Dependence Plot for Predicted Heart Disease

In [None]:
# Dependence plots for the "heart disease" outcome of the logistic regression.
# `shap_values` (from the KernelExplainer over model.predict) explains the predicted
# label, where 1 = heart disease. The previously used `shap_values_ind` was left over
# from the Random Forest section and did not describe this model at all.
for col in X_train.columns:
  shap.dependence_plot(col, shap_values, X_test)

#### *Conclusion* 

---



## Final predictions and conclusions across all models

In [None]:
# Fresh 80/20 split for the side-by-side prediction tables below.
# `columns=` replaces the positional axis argument (rejected by pandas 2.0), and a fixed
# random_state keeps the tables identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"), data.target, test_size=.2, random_state=0)
# Use the fully qualified option name: the bare "max_rows" pattern is ambiguous on
# newer pandas (it also matches the Styler option) and raises OptionError.
pd.set_option("display.max_rows", None)

### Decision Tree

In [None]:
# Fit an entropy-based tree on the current training split and keep its test-set
# predictions as an (n, 1) column array.
decision_tree_model = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)
decision_tree_y_predictions = decision_tree_model.predict(X_test).reshape(-1, 1)

In [None]:
# Test features alongside the true label and the decision tree's prediction.
df = X_test.copy(deep=True)
df['actual'] = y_test
df['predicted'] = decision_tree_y_predictions.astype(int)
df

### Random Forest

In [None]:
# Fit a 500-tree entropy forest on the current training split.
random_forest_model = RandomForestClassifier(n_estimators=500, criterion='entropy').fit(X_train, y_train)
# The name is then rebound to the test-set predictions as an (n, 1) column array;
# from here on `random_forest_model` holds predictions, not the estimator.
random_forest_model = random_forest_model.predict(X_test).reshape(-1, 1)

In [None]:
# Test features alongside the true label and the random forest's prediction
# (`random_forest_model` holds the prediction array at this point).
df = X_test.copy(deep=True)
df['actual'] = y_test
df['predicted'] = random_forest_model.astype(int)
df

### Logistic Regression

In [None]:
# Fit the logistic regression once on the current training split; the original
# fitted the same estimator twice on identical data, which only doubled the work.
logistic_regression_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# The name is then rebound to the test-set predictions as an (n, 1) column array;
# from here on `logistic_regression_model` holds predictions, not the estimator.
logistic_regression_model = logistic_regression_model.predict(X_test).reshape(-1, 1)

In [None]:
# Test features alongside the true label and the logistic regression's prediction
# (`logistic_regression_model` holds the prediction array at this point).
df = X_test.copy(deep=True)
df['actual'] = y_test
df['predicted'] = logistic_regression_model.astype(int)
df

### Conclusion

---

Random forests and logistic regression consistently outperform Decision Trees in terms of correct predictions. Given that the dataset is so small, the Decision Tree model did not tend to overfit on this dataset (due to the lack of noise), but the analysis was done anyway to showcase the plots and values for training and testing.

Across all three algorithms, the most impactful features tend to be the same, with 'ca' (i.e. fluoroscopy), 'thal' (i.e. thalassemia) and 'cp' (i.e. chest pain) consistently ranking among the top 3 most impactful features.

In a realistic setting, a Random Forest seems most appropriate due to its slightly better and more consistent precision, recall and accuracy values, as well as substantially faster explanation runtimes.

The purpose of the notebook is just to compare multiple algorithms in terms of performance and predictions; in a production setting I would have skipped the single Decision Tree model entirely and opted for a Random Forest (a random forest is really just multiple decision trees with their predictions averaged).

In any case, I (wrongly) thought cholesterol and age would be the two most significant factors in properly classifying heart disease. However, given the other features, I was clearly wrong in my original hypothesis, so I did learn something.