In [None]:
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Checking data information

In [None]:
# Both CSV files are read from the same input directory.
DATA_DIR = '../input/heart-attack-analysis-prediction-dataset'
heart_data = pd.read_csv(DATA_DIR + '/heart.csv')
o2_data    = pd.read_csv(DATA_DIR + '/o2Saturation.csv')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly: wrapping it in print() would append a stray "None" line.
print('HEART DATA')
heart_data.info()
print('SATURATION DATA')
o2_data.info()

# Checking proportion of heart attack likeliness on data (0 = less chance; 1 = more chance)

In [None]:
# Tally how many rows fall into each value of the target column, then
# display the tally as the cell output.
heart_counter = Counter(heart_data['output'])
heart_counter

# Checking correlation between variables

## Variable information
### The original feature description can be found [here](http://rstudio-pubs-static.s3.amazonaws.com/24341_184a58191486470cab97acdbbfe78ed5.html)
- age -> age in years
- sex -> sex (1 = male; 0 = female)
- cp -> chest pain type (0 = typical angina; 1 = atypical angina; 2 = non-anginal pain; 3 = asymptomatic)
- trestbps -> resting blood pressure (in mm Hg on admission to the hospital)
- chol -> serum cholesterol in mg/dl
- fbs -> fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
- restecg -> resting electrocardiographic results (0 = normal; 1 = having ST-T; 2 = hypertrophy)
- thalach -> maximum heart rate achieved
- exng -> exercise induced angina (1 = yes; 0 = no)
- oldpeak -> ST depression induced by exercise relative to rest
- slope -> the slope of the peak exercise ST segment (0 = upsloping; 1 = flat; 2 = downsloping)
- caa -> number of major vessels (0-3) colored by fluoroscopy
- thal -> Thallium stress test result (3 = normal; 6 = fixed defect; 7 = reversible defect) - **In this data set, the values are 0, 1, 2, and 3. I don't know how to interpret them**
- num -> the predicted attribute - diagnosis of heart disease (angiographic disease status) (Value 0 = < 50% diameter narrowing; Value 1 = > 50% diameter narrowing)

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap on an
# explicitly created axes.
corr_fig, corr_ax = plt.subplots(figsize=(18, 12))
corr_ax.set_title('Correlation of variables')
sns.heatmap(heart_data.corr(), annot=True, ax=corr_ax)
plt.show()

# Checking distribution of:
- cp (chest pain type)
- restecg (resting electrocard. results)
- thalachh (max heart rate achieved)
- slp (slope of the peak exercise ST segment)

In [None]:
# One entry per panel of the 2x2 grid: (column, subplot position, title).
dist_panels = [
    ("cp",       221, 'HT distibution over chest pain type'),
    ("restecg",  222, 'HT distibution over resting eletrocard. results'),
    ("thalachh", 223, 'HT distibution over max. heart rate achieved'),
    ("slp",      224, 'HT distibution over slope of peak exercise ST segment'),
]

# Rows split by the target column: 0 is drawn in green, 1 in red (heart disease).
unlikely_rows = heart_data[heart_data['output'] == 0]
likely_rows = heart_data[heart_data['output'] == 1]

plt.figure(figsize=(18, 12))
for dist_column, dist_position, dist_title in dist_panels:
    plt.subplot(dist_position)
    sns.distplot(unlikely_rows[dist_column], color='green', label='Unlikely HT')
    sns.distplot(likely_rows[dist_column], color='red', label='Likely HT')
    plt.title(dist_title, fontsize=15)
    plt.legend()
plt.tight_layout()
plt.show()

# Interpreting the distribution graphs:
- For **chest pain type**, type 1 (atypical angina) and 2 (non-anginal pain) are most likely related to risk of heart attack. Type 0 (typical angina) is likely related to normal condition, thus indicates lower risk of heart attack. Type 3 (asymptomatic) is unclear, but tends to indicate risk of heart attack in this data set;
- For **resting electrocardiographic results**, type 1 (having ST-T wave changes) is likely related to risk of heart attack. These wave changes possibly signal [unsatisfactory blood pressure control](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5380254/). Type 0 (normal) and 2 (hypertrophy) are likely related to lower risk of heart attack. However, left ventricular hypertrophy ([diagnosed on ECG](https://www.sciencedirect.com/science/article/abs/pii/0002870368900355)) is related to [cardiovascular morbidity](https://www.sciencedirect.com/science/article/abs/pii/S000287030125488X);
- For **maximum heart rate achieved**, it is possible to observe that higher heart rate values are related to higher risk of heart attack (see further information [here](https://pubmed.ncbi.nlm.nih.gov/19615487/#:~:text=It%20has%20been%20shown%20that,pressure%20by%2010%20mm%20Hg.));
- For **slope of peak exercise ST-segment**, type 2 (downsloping) is likely related to risk of heart attack. ST segment depression during exercise is possibly linked to coronary artery diseases and/or anatomically or functionally severe coronary obstruction (see [here](https://www.jacc.org/doi/abs/10.1016/0735-1097(95)00085-I)). Type 0 (upsloping) might be related to an [ischemic response to exercise](https://pubmed.ncbi.nlm.nih.gov/11868055/), which means shortage of oxygen caused by restriction in blood supply to tissues. Although it [might be normal to see mild upsloping ST-segment during heavy exercise](https://journals.sagepub.com/doi/abs/10.1177/2047487312444370), one must carefully analyse the context in order to yield prognostic value from it. Type 1 (flat) is related to a [normal, isoelectric, measure](https://www.sciencedirect.com/science/article/pii/B9780323401692000032). In the context of this data set, it is possible to observe that type 0 ST-segment is linked to lower risk of heart attack;

### Let's continue to the first model (baseline) which will be a simple linear regression

In [None]:
# The first 13 columns are the features; the last column is the target.
column_names = heart_data.columns[:13].tolist()
X = heart_data[column_names].to_numpy(copy=True)
y = heart_data.iloc[:, -1].to_numpy(copy=True)

# Hold out 25% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=80
)

# Report how many examples of each class ended up in each split.
y_train_count, y_test_count = Counter(y_train), Counter(y_test)
print('Output count for train set is {} and for test set is {}'.format(y_train_count, y_test_count))

In [None]:
# Baseline: ordinary least squares fitted directly on the 0/1 labels; the
# continuous prediction is thresholded afterwards to obtain a class.
linear_reg = LinearRegression()
bl_model   = linear_reg.fit(X_train, y_train)
bl_predict = bl_model.predict(X_test)

# A linear regression output is unbounded, so rounding it can yield labels
# such as -1 or 2, which breaks the binary metrics below. Cutting at 0.5
# always produces a 0/1 label.
bl_labels  = np.where(bl_predict > 0.5, 1, 0)
bl_acc     = accuracy_score(y_test, bl_labels)
bl_matrix  = confusion_matrix(y_test, bl_labels)

# Alternative decision threshold: half of the F1 score obtained at 0.5.
# NOTE(review): this F1 score is computed on the test split, so the threshold
# is tuned on the same data it is evaluated on -- consider deriving it from
# the training split instead.
score_f1      = f1_score(y_test, bl_labels)
optim_predict = np.where(bl_predict > (score_f1/2), 1, 0)
optim_matrix  = confusion_matrix(y_test, optim_predict)
optim_acc     = accuracy_score(y_test, optim_predict)

# Built-in round() works whether the metric is a NumPy scalar or a plain
# Python float (the latter has no .round() method).
print('Accuracy (standard threshold): {}'.format(round(bl_acc, 2)))
print('Accuracy optimized by the F1 score: {}'.format(round(optim_acc, 2)))
print(classification_report(y_test, bl_labels))

# Side-by-side confusion matrices; fmt='d' prints the integer counts as-is.
cm_fig, (ax_standard, ax_optim) = plt.subplots(1, 2, figsize=(12, 6))
ax_standard.set_title('Confusion matrix for standard threshold', fontsize=15)
sns.heatmap(bl_matrix, annot=True, fmt='d', cmap='crest', ax=ax_standard)

ax_optim.set_title('Confusion matrix optimized by F1 score', fontsize=15)
sns.heatmap(optim_matrix, annot=True, fmt='d', cmap='crest', ax=ax_optim)
plt.tight_layout()
plt.show()

# Using SHAP values (Linear Explainer) to check avg impact of variables on model output

In [None]:
# Build the linear explainer for the baseline model, using an Impute masker
# constructed from the training rows.
lr_masker = shap.maskers.Impute(data=X_train)
lr_explainer = shap.LinearExplainer(bl_model, masker=lr_masker)

# Despite its name, `explainer_lr` holds the SHAP values computed on the
# training rows, not the explainer object.
explainer_lr = lr_explainer.shap_values(X_train)

plt.figure(dpi=100)
shap.summary_plot(explainer_lr, X_train, plot_type="violin", feature_names=column_names)
plt.show()

# Let's see what a random forest classifier can show us

In [None]:
# Let's not bother tuning hyperparameters for now.
# random_state fixes the forest's internal randomness so that the reported
# scores are reproducible across runs (same seed as the train/test split).
rfc_std = RandomForestClassifier(max_features=2, random_state=80)
rfc_std.fit(X_train, y_train)
rfc_std_pred = rfc_std.predict(X_test)

rfc_acc     = accuracy_score(y_test, rfc_std_pred)
rfc_matrix  = confusion_matrix(y_test, rfc_std_pred)

# Built-in round() works whether the metric is a NumPy scalar or a plain
# Python float (the latter has no .round() method).
print('Accuracy: {}'.format(round(rfc_acc, 2)))
# The classifier already predicts class labels, so no rounding is needed.
print(classification_report(y_test, rfc_std_pred))

# fmt='d' prints the integer counts of the confusion matrix as-is.
sns.heatmap(rfc_matrix, annot=True, fmt='d', cmap='crest')
plt.title('Confusion matrix for standard threshold', fontsize=15)
plt.show()

# Using SHAP values (Tree Explainer) to check avg impact of variables on model output

In [None]:
# SHAP values of the random forest on the training rows.
# NOTE(review): no `masker=` is passed here. TreeExplainer's documented way
# of supplying background data is the `data` argument; a `masker` keyword is
# presumably ignored by older shap releases and rejected by newer ones --
# confirm against the installed version.
rfc_explainer = shap.TreeExplainer(rfc_std)
explainer_rfc = rfc_explainer.shap_values(X_train)

# Depending on the shap release, a classifier yields either a list with one
# (n_samples, n_features) array per class, or a single array with the class
# on the last axis. Normalise to "one array per class" so that indexing with
# 0 / 1 below always selects a class and never a sample.
if isinstance(explainer_rfc, list):
    shap_by_class = explainer_rfc
else:
    shap_by_class = list(np.moveaxis(np.asarray(explainer_rfc), -1, 0))

plt.figure(dpi=120)
for class_idx, class_position in ((0, 121), (1, 122)):
    plt.subplot(class_position)
    # show=False keeps the figure open so both panels share one figure.
    shap.summary_plot(shap_by_class[class_idx], X_train, feature_names=column_names, show=False, plot_type='violin')
    plt.title('SHAP values for class {}'.format(class_idx), fontsize=10)
plt.tight_layout()
plt.show()

# Final take for the time being
### The variables *cp, thall, exng, thalachh* are impacting model output in both linear regression and random forest classifier. In the latter model (random forest), *caa* and *oldpeak* are showing up as variables with high impact on model output as well. These are my considerations:
- **Chest pain type** (cp):
  - High feature values (the cp types) are positively correlated to impact on model output (pulling the output towards a higher number and thus meaning higher chance of heart attack) and vice-versa;
- **Thalium stress test result** (thall):
  - Disclaimer -> The values for thall were not documented in the present data set. I was able to find the original data set, but the categories were different;
  - High feature values are negatively correlated to impact on model output, which might be interpreted as high thall values being related to lower chance of heart attack and vice versa;
- **Exercise induced angina** (exng):
  - Positive results for exng (feeling of pressure in the chest, jaw or arm caused by exercise) are negatively correlated to impact on model output, thus it is related to lower risk of heart attack (and vice-versa), which might seem absurd;
- **Maximum heart rate achieved** (thalach):
  - High feature values (maximum heart rate) are positively correlated to impact on model output and thus related to higher risk of heart attack (and vice-versa);
- **Number of major vessels colored by fluoroscopy** (caa):
  - High feature values (number of vessels colored by fluoroscopy) are negatively correlated to impact on model output and thus related to lower risk of heart attack, which might also seem absurd;
- **ST depression induced by exercise relative to rest** (oldpeak):
  - High feature values are negatively correlated to impact on model output, thus it is related to lower risk of heart attack (third WTF moment thus far);

## As can be seen above, there were SHAP results for 3 variables that confused me as a biologist:
### **Exercise induced angina - 0 meaning no angina (discomfort felt in the chest caused by myocardial ischemia) and 1 meaning angina:**
#### The SHAP values showed that the presence of exercise induced angina was related to lower risk of heart attack. Although angina pectoris may be a signal of coronary heart disease, positive values in exercise induced angina can be seen as normal depending on the adopted grading system. The Canadian Cardiovascular Society grading system described in this [letter](https://www.ahajournals.org/doi/pdf/10.1161/circ.54.3.947585) showed that type 1 angina, which is angina when doing "strenuous or rapid or prolonged exertion at work or recreation" can be normal. This might be the case here, but I would love other opinions on the matter;
### **Number of major vessels colored by fluoroscopy - 0 to 5. This [technique](https://www.ajronline.org/doi/abs/10.2214/ajr.135.6.1131) signals regions where certain vascular diseases are found, such as stenoses, obstruction, emboli, ulcerative plaques, and aneurysms, among others:**
#### The SHAP values showed that the higher numbers of colored major vessels are related to lower risk of heart attack. I can't seem to find reasons for this in the literature;
### **ST depression induced by exercise relative to rest - 0 to 6.2. Although I was not able to find further information for this variable in the present data set, I believe this interval is the ratio of ST depression measure (in millimeters) while exercising and while resting, so high values correspond to higher ST depression in exercise compared to resting and vice-versa (see further information [here](https://www.sciencedirect.com/science/article/abs/pii/000287039190279Q), and [here](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1768611/#:~:text=Exercise%20induced%20ST%20segment%20depression,diagnosis%20of%20obstructive%20coronary%20atherosclerosis.&text=It%20has%20also%20been%20associated,coronary%20artery%20disease%20(CAD).):**
#### The SHAP values showed that higher values are related to lower risk of heart attack. Once again, I was not able to find reasons for this observation in the literature.

In [None]:
# One entry per panel: (column, subplot position, title).
followup_panels = [
    ("caa",     121, 'HT distibution over number of major vessels colored'),
    ("oldpeak", 122, 'HT distibution over ST depression ratio'),
]

# Rows split by the target column: 0 is drawn in green, 1 in red.
rows_output_0 = heart_data[heart_data['output'] == 0]
rows_output_1 = heart_data[heart_data['output'] == 1]

plt.figure(figsize=(18, 6))
for followup_column, followup_position, followup_title in followup_panels:
    plt.subplot(followup_position)
    sns.distplot(rows_output_0[followup_column], color='green', label='Unlikely HT')
    sns.distplot(rows_output_1[followup_column], color='red', label='Likely HT')
    plt.title(followup_title, fontsize=15)
    # The label= arguments above are only rendered once a legend is drawn.
    plt.legend()
plt.show()

In [None]:
# Binarise `caa`: 1 when at least one major vessel is colored, 0 otherwise.
colored_vessels_raw = heart_data.caa.values
has_colored_vessel = np.where(colored_vessels_raw > 0, 1, 0)

# (value, count) pairs ordered from most to least frequent.
colored_vessels = Counter(has_colored_vessel).most_common()
print(colored_vessels)

# Count the "at least one vessel" rows directly. Reading the second entry of
# most_common() would report the wrong class whenever that class is the
# majority, and would raise IndexError when only one class is present.
n_with_colored_vessel = int(np.count_nonzero(has_colored_vessel))
print('In this data set, the frequency of cases with at least one major vessel colored is {}%.'.format(round(100/colored_vessels_raw.shape[0]*n_with_colored_vessel,2)))

## As can be seen in the above graphs, these two variables have a higher density closer to 0, which might be seen as a biased data distribution:
### Higher risk of heart attack for individuals with 0 major vessels colored -> in a broader context, individuals with vascular diseases might account for less than 5% of the population (see this [study](https://www.jacc.org/doi/full/10.1016/j.jacc.2013.01.054)). With that in mind, one can see how this low prevalence can affect a small data set such as this one we are using. This dataset does not follow the real-world distribution of this variable, thus it might be safe to assume the reason why higher values for caa negatively correlate to higher chance of heart attack is because the physiological significance of the variable is lost due to bias;
### Higher risk of heart attack for individuals with 1-1 ratio of ST depression measures -> I am kind of tired to go after good citations today (might do it later this week), but I believe the value of 0 for ST depression induced by exercise relative to rest might speak to the idea that individuals that have ST depression while resting tend not to be competent at physical exercises and tend to maintain the ST depression if exercising. This might explain why we have a high density of examples falling close to 0. Lower values for this variable might be related to higher risk of heart attack because of that.