## Imports

In [None]:
#Library imports
import numpy as np
import pandas as pd
#Ploting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
#Model
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#Metrics
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.metrics import f1_score, make_scorer, roc_curve, auc
from scipy.stats import uniform
from scipy.stats import randint
#dataset
import os
# Print the full path of every file Kaggle mounted under the input directory,
# so the dataset path used by the loading cell can be confirmed.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_files:
    print(path)

In [None]:
# Load the heart-attack dataset into `df`, the frame every later cell reads.
HEART_CSV_PATH = "/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv"
df = pd.read_csv(HEART_CSV_PATH)

## **Preprocessing the data**

In [None]:
# Missing-data check: describe() only hints at gaps through its "count" row,
# so count the NaN entries of every column explicitly first.
print(df.isna().sum())

# Summary statistics of each numeric column (this is the cell's rich output).
df.describe()

In [None]:
# Preview the first five rows to see the column names and how values are encoded.
df.head(n=5)

## Features Distribution

In [None]:
# Distribution of every feature, drawn on a single 4x3 grid.
#
# The figure and its axes are created once and each plot is sent to a named
# Axes through `ax=`. This keeps `fig` bound to the Figure (it used to be
# overwritten by every subplot) and removes the dependency on pyplot's
# implicit "current axes" state.
fig, axes = plt.subplots(4, 3, figsize=(25, 25))
(ax_age, ax_sex, ax_cp,
 ax_bp, ax_chol, ax_fbs,
 ax_ecg, ax_hr, ax_exng,
 ax_oldpeak, ax_slope, ax_vessels) = axes.flat

# Age
sns.kdeplot(df['age'], label=' Age', ax=ax_age)
ax_age.legend()

# Sex (the notebook maps 0 -> Women, 1 -> Men)
sex_counts = df['sex'].replace([0, 1], ['Women', 'Men']).value_counts()
sex_counts.plot(kind='barh', label=' Entries', ax=ax_sex)
ax_sex.legend()

# Chest pain type
cp_counts = (
    df['cp']
    .replace([0, 1, 2, 3], ['Typical', 'Atypical', 'Non-anginal', 'Asymptomatic'])
    .value_counts()
)
cp_counts.plot(kind='barh', label=' Chest Pain entries', ax=ax_cp)
ax_cp.legend()

# Resting blood pressure
sns.histplot(x=df['trtbps'], kde=True, label=' Blood pressure', ax=ax_bp)
ax_bp.legend()

# Cholesterol
sns.histplot(x=df['chol'], kde=True, label='Cholestoral', ax=ax_chol)
ax_chol.legend()

# Fasting blood sugar
sns.countplot(x=df['fbs'].replace([0, 1], ['No', 'Yes']), ax=ax_fbs)
ax_fbs.set_title('Is blood sugar > 120 mg/dl?')

# Resting electrocardiographic results
sns.countplot(
    y=df['restecg'].replace(
        [0, 1, 2],
        ['Normal', 'ST-T wave abnormality', 'left ventricular hypertrophy'],
    ),
    ax=ax_ecg,
)
ax_ecg.set_title('Resting eletrocardiographic results')

# Maximum heart rate achieved
sns.histplot(x=df['thalachh'], kde=True, ax=ax_hr)
ax_hr.set_title('Maximum heart rate achieved')

# Exercise-induced angina
sns.countplot(x=df['exng'].replace([0, 1], ['No', 'Yes']), ax=ax_exng)
ax_exng.set_title('Does angina occur on exercise?')

# Old peak (an ST depression is a finding on an electrocardiogram)
sns.kdeplot(x=df['oldpeak'], label='St depression', ax=ax_oldpeak)
ax_oldpeak.set_title('ST depression induced by exercise relative to rest')

# Slope of the ST segment at peak exercise.
# The category labels previously read 'Unsloping' / 'Donwsloping'.
sns.countplot(
    x=df['slp'].replace([0, 1, 2], ['Upsloping', 'Flat', 'Downsloping']),
    ax=ax_slope,
)
ax_slope.set_title('At the peak of an exercise, how does the ST segment slope?')

# Number of major vessels
sns.countplot(x=df['caa'], ax=ax_vessels)
ax_vessels.set_title('Number of major vessels')

# Remove the top/right spines of every panel in one call.
sns.despine(fig=fig)

# TODO: I don't get why in the original paper there are only 3 vessels and here are 5, but let's continue.
# TODO: The thall column is also strange in the dataset (3 entries, and here are 4), so it is not plotted.

From here, some valuable information can be observed. 

- In this dataset, it's more probable to find men, between 50 to 65 years old. 

- Most of the recorded chest pain is typical angina or non-anginal pain, and for most patients angina does not occur during exercise. 

- The typical resting blood pressure is between 120 and 140 mm Hg, and for most patients the fasting blood sugar is not above 120 mg/dl. 

- The cholesterol levels are usually between 200 and 300. 

- While resting, the electrocardiographic results are mostly normal or show an ST-T wave abnormality. The ST depression induced by exercise (relative to rest) is normally between 0 and 2.

- The maximum heart rate is mostly concentrated between 140 and 180.

Let's try looking for some correlations.


### Pearson's correlation heatmap

In [None]:
# Pairwise Pearson correlation of all columns, drawn as an annotated heatmap.
pearson_corr = df.corr()
plt.figure(figsize=(12, 12))
sns.heatmap(pearson_corr, annot=True, cmap="Blues", vmin=-1)

The output column refers to the chance of heart attack. Looking at the last line (or column) in the heatmap, the darker a cell is, the more positive the correlation with the output; the lightest cells are the most negative ones. Of course, correlation is not causality, but it's at least good to know that:
- In the dataset, there is an observed slightly positive correlation between chest pain and maximum heart rate to a greater risk of heart attack.(If that wasn't obvious)

- In contrast, there is a slightly negative correlation between age and the output (in this dataset, older patients are labelled as at risk less often), and between sex and the output (since sex is coded as 1 for men, men are labelled as at risk less often than women here).

### Comparison in a single feature

In [None]:
# Compare the age distribution of the two outcome groups.
# Each series holds the ages of the rows with that `output` value; fillna(0.0)
# would replace a missing age with 0 before the cast to float.
atk = df.loc[df['output'].eq(1), 'age'].fillna(0.0).astype(float)
atk_no = df.loc[df['output'].eq(0), 'age'].fillna(0.0).astype(float)

# Overlay both groups: histogram plus a fitted normal curve for each.
fi = ff.create_distplot(
    [atk, atk_no],
    ['Risk cases', 'No risk cases'],
    bin_size=0.65,
    curve_type='normal',
    colors=['crimson', 'black'],
)
fi.update_layout(title="Heart Attack distibution over age", xaxis_title="Age")
fi.show()

It seems that the dataset contains more heart attack risk cases at younger ages. Also, the risk of heart attack starts to decline around the 50-year mark. If we were to apply a hypothesis test here, we would be able to make some inferences, but I'll leave that for later.
Let's apply the same idea to some other cases.

In [None]:
# Same comparison as for age, this time on the maximum heart rate achieved.
# fillna(0.0) would replace a missing reading with 0 before the cast to float.
atk = df.loc[df['output'].eq(1), 'thalachh'].fillna(0.0).astype(float)
atk_no = df.loc[df['output'].eq(0), 'thalachh'].fillna(0.0).astype(float)

# Overlay both groups: histogram plus a fitted normal curve for each.
fi = ff.create_distplot(
    [atk, atk_no],
    ['Risk cases', 'No risk cases'],
    bin_size=0.65,
    curve_type='normal',
    colors=['crimson', 'black'],
)
fi.update_layout(
    title="Heart Attack distibution over maximum heart rate",
    xaxis_title="Heart rate",
)
fi.show()

It seems that the risk of heart attack increases just as the maximum heart rate increases, which makes some logical sense.

### **Processing** 

There are not many values to allocate between train and test in the dataset. In this case I will use the stratified K-fold cross-validation method, in order to preserve information.

In [None]:
# Separate the target column from the feature columns, both as NumPy arrays.
df_out = df['output'].to_numpy()
df_feats = df.drop(columns='output').to_numpy()

# Stratified 10-fold splitter (no shuffling requested, so folds follow row
# order); the same object is handed to the hyper-parameter search later on.
cv = StratifiedKFold(n_splits=10)

# Only the LAST of the ten folds is kept as the train/test split used by the
# cells below; the other nine folds are generated and then discarded.
folds = list(cv.split(df_feats, df_out))
train_index, test_index = folds[-1]
xtrain, ytrain = df_feats[train_index], df_out[train_index]
xtest, ytest = df_feats[test_index], df_out[test_index]


# K nearest neighbor

In [None]:
# Baseline model: a 5-nearest-neighbours classifier trained on the kept fold.
knn = KNeighborsClassifier(n_neighbors=5).fit(xtrain, ytrain)
knnpred = knn.predict(xtest)

# Score the predictions on the held-out part of the fold.
knn_accuracy = accuracy_score(ytest, knnpred)
knn_f1 = f1_score(ytest, knnpred)

print(f'The acccuracy of the KNN is {knn_accuracy:.4}')
print(f'The F1 score of the KNN is {knn_f1:.4}')
print(f'There are {len(ytest)} values to test.')
print('Its confusion matrix is:')

In [None]:
# Confusion matrix of the KNN baseline on the held-out fold.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
plot_confusion_matrix(
    knn,
    xtest,
    ytest,
    cmap='Blues',
    colorbar=False,
)

I don't plan on getting the KNN to actually perform better here, I just wanted to see how well it would perform in a "simplistic way". That said, the model is actually performing pretty well considering that it was very simple to apply, but it can do better.

## Random Forest with search on hyper parameters

In [None]:
# Base estimator for the hyper-parameter search. It is seeded so that the
# cross-validated scores (and therefore the selected parameters) are the same
# on every Restart & Run All; an unseeded forest gives different scores per run.
forest = RandomForestClassifier(random_state=15)

# The search ranks candidates by F1, which balances precision and recall.
f1 = make_scorer(f1_score)

# Search space sampled by RandomizedSearchCV:
#   n_estimators - integer drawn from [100, 200) (scipy's randint excludes the upper bound)
#   bootstrap    - whether each tree is grown on a bootstrap sample
#   criterion    - split-quality measure
params = {
    'n_estimators': randint(100, 200),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# Randomized hyper-parameter search: 5 sampled candidates, each scored by F1
# over the stratified 10-fold splitter defined earlier.
search = RandomizedSearchCV(
    estimator=forest,
    param_distributions=params,
    n_iter=5,
    scoring=f1,
    cv=cv,
    random_state=15,
)
# Fitted on the full feature matrix; the splitter handles the train/validation folds.
search.fit(df_feats, df_out)

In [None]:
# Summarise the outcome of the search, one fact per line.
search_report = [
    f'Best score: {search.best_score_}',
    f'Best paramns: {search.best_params_}',
    f'Best estimator: {search.best_estimator_}',
    f'Number of splits:{search.n_splits_}',
]
print('\n'.join(search_report))

# **Metrics**

In [None]:
# Build the final forest from the parameters the search actually selected,
# instead of hand-copying them from an earlier run (hand-copied values go
# stale as soon as the search result changes).
forest = RandomForestClassifier(**search.best_params_, random_state=15)

# Train on the kept fold and score on its held-out part.
# NOTE(review): the search above was fitted on every row, including the rows
# in xtest, so these scores are likely optimistic.
forest.fit(xtrain, ytrain)
forestpred = forest.predict(xtest)
print(f"The model's accuracy score is {accuracy_score(ytest, forestpred):.4}")
print(f"The model's f1 score is {f1_score(ytest, forestpred):.4}")
print(f'There are {len(ytest)} values to test.')

In [None]:
# Confusion matrix of the random forest on the held-out fold.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
plot_confusion_matrix(
    forest,
    xtest,
    ytest,
    cmap='Blues',
    colorbar=False,
)

The model has a better accuracy and F1 score. By its confusion matrix, it looks like the model has better classification power over the true positive risk, but it still has some trouble classifying, as seen by the false positives and negatives. Looking at the AUC, we can see if the model is doing the best it can.

In [None]:
# ROC curve and AUC of the random forest on the held-out fold.
#
# roc_curve needs a continuous score per sample to sweep the decision
# threshold. Feeding it the hard 0/1 predictions collapses the curve to a
# single operating point, so the predicted probability of the positive class
# is used instead.
# Column 1 of predict_proba corresponds to forest.classes_[1]; this assumes
# the target is coded 0/1 so that column is the "risk" class.
forest_scores = forest.predict_proba(xtest)[:, 1]
fpr, tpr, threshold = roc_curve(ytest, forest_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6.5,6.5))
plt.title('Receiver Operating Characteristic',fontsize=20)
plt.plot(fpr, tpr, 'b', label = f'AUC = {roc_auc:0.2f}', lw=2)
plt.legend(loc = 'lower right', fontsize=15)
# Diagonal reference line: the performance of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize=15)
plt.xlabel('False Positive Rate', fontsize=15)
plt.show()