In [None]:
## imports

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from hyperopt import Trials, fmin, tpe, hp, STATUS_OK,STATUS_FAIL
from scipy.stats import shapiro


In [None]:
# Read the heart-attack CSV from the Kaggle input mount and preview its first rows.
csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Descriptive statistics for the columns of the raw frame.
data.describe()

In [None]:
# Human-readable copy of the raw frame, used only for plotting; `data` itself
# keeps the numeric codes for modelling.
data1 = data.copy()

# Binary codes follow the UCI heart-disease encoding this Kaggle file derives from:
#   sex  : 1 = male, 0 = female
#   exng : 1 = exercise-induced angina, 0 = no exercise-induced angina
#   fbs  : 1 = fasting blood sugar > 120 mg/dl, 0 = otherwise
# The previous mapping had the `sex` and `exng` labels swapped, so every plot
# built on them was labelled the wrong way round.
# NOTE(review): markdown observations written against the old labels should be
# re-read after this fix.
data1['sex'] = np.where(data1['sex'] == 1, "Male", "Female")
data1['exng'] = np.where(data1['exng'] == 1, "Exercise induced", "Not Exercise induced")
data1['fbs'] = np.where(data1['fbs'] == 1, "High sugar", "Low sugar")

# Resting ECG has three coded levels; any value not listed is left unchanged.
data1['restecg'] = data1['restecg'].replace({0: 'normal',
                                             1: 'ST-T wave abnormality',
                                             2: 'left ventricular hypertrophy'})

# Target: 1 = higher chance of heart attack, anything else = low chance.
data1['output'] = np.where(data1['output'] == 1,
                           "Higher chance of heart attack",
                           "Low chance of heart attack")

In [None]:
## continuous columns
# Histograms of the four continuous features, stacked by outcome label.
cont_cols = [['age', 'trtbps'],
             ['chol', 'thalachh']]
x_labels = [['Age', 'Resting Blood Pressure'],
            ['Cholestrol levels', 'Maximum heart rate']]

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(25, 15))

# Walk the 2x2 grid row by row, pairing each axes with its column and label.
for axes_row, col_row, label_row in zip(ax, cont_cols, x_labels):
    for panel, column, label in zip(axes_row, col_row, label_row):
        ax_sub = sns.histplot(data=data1, x=column, hue='output',
                              multiple='stack', alpha=0.5, ax=panel)
        ax_sub.set(xlabel=label, title='Distribution of ' + label)

### Observations 
* Age, cholesterol and maximum heart rate all look approximately normally distributed.
* Further down we will check this formally with a normality test and its p-value.

In [None]:
###cat plots
# Count plots of the categorical features, split by outcome label.
cat_cols = [['sex', 'exng', 'cp'],
            ['caa', 'fbs', 'restecg']]

x_labels = [['Sex of the patient', 'Angina', 'Chest Pain Type'],
            ['Number of major vessels', 'Fasting Blood Sugar', 'Resting ECG']]

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(25, 15))

# Walk the 2x3 grid row by row, pairing each axes with its column and label.
for axes_row, col_row, label_row in zip(ax, cat_cols, x_labels):
    for panel, column, label in zip(axes_row, col_row, label_row):
        ax_sub = sns.countplot(data=data1, x=column, hue='output', ax=panel)
        ax_sub.set(xlabel=label, title='Count by ' + label)

The above plot displays count of all the categorical values grouped by output. From the plots - 
* Exercise induced angina leads to a higher chance of a heart attack.
* There is a higher risk of a heart attack when you have non-anginal pain.

Of course, these observations are limited to the sample we have at hand.

In [None]:
## continuous columns
# Box plots of the four continuous features, one box per outcome label.
# `x_labels` is redefined here because the categorical-plot cell overwrote it.
cont_cols = [['age', 'trtbps'],
             ['chol', 'thalachh']]
x_labels = [['Age', 'Resting Blood Pressure'],
            ['Cholestrol levels', 'Maximum heart rate']]

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(25, 15))

for axes_row, col_row, label_row in zip(ax, cont_cols, x_labels):
    for panel, column, label in zip(axes_row, col_row, label_row):
        ax_sub = sns.boxplot(x='output', y=column, data=data1, ax=panel)
        # The feature is plotted on the y axis; the x axis holds the outcome
        # categories, so the feature name belongs on ylabel, not xlabel.
        ax_sub.set(xlabel='Outcome', ylabel=label, title='Boxplot of ' + label)

### Observations - 
* We can clearly see some outliers in the blood pressure and maximum heart rate columns.


### Factors leading to higher chance of heart attack - 
* 40 to 58 aged people.

* Exercise induced and Non anginal chest pain

* B.P between 120 and 140

* Cholesterol level between 200 and ~250

* Having ST-T wave abnormality

* If maximum heart rate is above 150

* 0 major vessels


In [None]:
# Annotated heatmap of the pairwise correlations between the columns of the raw frame.
# Use the explicit figure/axes interface so `ax` really is an Axes (it used to
# be bound to the Figure) and the heatmap is drawn on a known target.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data=data.corr(), annot=True, ax=ax)

### Observations - 
* CP column, i.e. chest pain has the highest correlation with the output column.
* Same is with maximum heart rate achieved.

These observations are unsurprising because both features are physical symptoms. It would have been more interesting if we could also look at inferred indicators, or at symptoms that are not physical.

## Feature Engineering

### Removing outliers - 
* Outliers are removed using the interquartile range (IQR) rule.
* I am using the Shapiro–Wilk test to calculate the p-value.

In [None]:
## outlier removal
# Tukey fence multiplier: rows further than this many IQRs outside the
# quartiles of a column are dropped.
iqr_multiplier = 1.5

# Columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the filters of the previous columns.
for col in [item for sublist in cont_cols for item in sublist]:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)

    # Shapiro-Wilk p-value before filtering this column.
    p_v = shapiro(data[col]).pvalue
    print("P-Value before removal of outlier from " + col + ": " + str(p_v))

    iqr = q3 - q1
    iqr_left = q1 - iqr * iqr_multiplier
    iqr_right = q3 + iqr * iqr_multiplier
    print(iqr_left, iqr_right)

    # `between` is inclusive on both ends, matching the original >= / <= test.
    # `.copy()` detaches the result from its parent frame so later column
    # edits on `data` do not raise SettingWithCopyWarning.
    data = data[data[col].between(iqr_left, iqr_right)].copy()

    # Shapiro-Wilk p-value after filtering this column.
    p_v = shapiro(data[col]).pvalue
    print("P-Value after removal of outlier from " + col + ": " + str(p_v))

    print("\n\n\n")

#### Observations - 
* The null hypothesis of the Shapiro–Wilk test is that the data are normally distributed. If p < 0.05, we reject the null hypothesis.
* For the cholesterol column the p-value after removal of outliers is 0.178, so we cannot reject normality; the column is consistent with a normal distribution.
* I am surprised that none of the other continuous columns pass the test. From the histogram plots above, they seemed normal.


Okay, let's move ahead to machine learning model.

In [None]:
# Split the label from the features. A non-mutating drop is used instead of
# `inplace=True`: `data` is a filtered slice at this point, and mutating it in
# place can raise SettingWithCopyWarning.
target = data['output']
data = data.drop(columns='output')

In [None]:
# Preview the feature frame now that the 'output' column has been removed.
data.head()

#### Feature Scaling

In [None]:
# Flatten the nested grid of continuous column names into one flat list.
cont_cols_list = []
for name_row in cont_cols:
    cont_cols_list.extend(name_row)

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.8, random_state=42)

# The frames returned by the split are selections from `data`; take explicit
# copies so the column assignments below write to independent frames and do
# not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no test-set information leaks into the scaling.
sc = StandardScaler()
X_train[cont_cols_list] = sc.fit_transform(X_train[cont_cols_list])
X_test[cont_cols_list] = sc.transform(X_test[cont_cols_list])

#### Modelling
I am using a library called hyperopt to pick the hyperparameters and even the model type for this classification problem. Hyperopt uses the Tree-structured Parzen Estimator (TPE) algorithm to come up with the next set of parameters to evaluate. Three things are required -
* Objective function - trains the model and returns 1 - **cross_val_score**
 * `fmin` in hyperopt minimizes the objective function.
 

* Search Space - [This](https://github.com/hyperopt/hyperopt/wiki/FMin) tutorial lists down all different kinds of methods you can pass for search spaces.

* Trials object - The object used to store all the results.



In [None]:
def objective_function(params):
    """Hyperopt objective: build the classifier described by ``params`` and score it.

    Parameters
    ----------
    params : dict
        One sample from the search space. ``params['type']`` selects the model
        (``'dtree'``, ``'svm'`` or ``'random_forest'``); the remaining keys are
        that model's hyperparameters.

    Returns
    -------
    dict
        On success: ``{'loss', 'status': STATUS_OK, 'clf'}`` where ``loss`` is
        ``1 - mean 5-fold cross-validation score`` on ``(X_train, y_train)``
        and ``clf`` is the (unfitted) estimator that was scored.
        On failure: ``{'loss': 1, 'status': STATUS_FAIL, 'clf': None, 'error'}``
        where ``error`` is the repr of the exception that was raised.
    """
    try:
        model_type = params['type']

        if model_type == 'dtree':
            clf = DecisionTreeClassifier(criterion=params['criterion'],
                                         max_depth=params['max_depth'],
                                         # sampled as a float; sklearn needs an int count
                                         min_samples_split=int(params['min_samples_split']))
        elif model_type == 'svm':
            kernel = params['kernel']
            if kernel['ktype'] == 'linear':
                clf = SVC(C=params['C'], kernel='linear')
            else:
                # NOTE: sklearn only uses `degree` for the 'poly' kernel, so it
                # has no effect on this RBF model; kept for compatibility.
                clf = SVC(C=params['C'], kernel='rbf', degree=kernel['degree'])
        elif model_type == 'random_forest':
            clf = RandomForestClassifier(criterion=params['criterion'],
                                         max_depth=params['max_depth'],
                                         min_samples_split=int(params['min_samples_split']),
                                         # Previously sampled but never used. Falls back to
                                         # sklearn's default of 100 when the key is absent.
                                         n_estimators=int(params.get('n_estimators', 100)))
        else:
            # Fail explicitly rather than through an unbound `clf` below.
            raise ValueError("Unknown classifier type: " + repr(model_type))

        loss = 1 - cross_val_score(clf, X_train, y_train, cv=5).mean()
        return {'loss': loss, 'status': STATUS_OK, 'clf': clf}
    except Exception as exc:
        # Best-effort: a bad parameter combination must not abort the whole
        # search, but keep the reason so failed trials can be diagnosed.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
        return {'loss': 1, 'status': STATUS_FAIL, 'clf': None, 'error': repr(exc)}


A search space consists of nested function expressions, including stochastic expressions. The stochastic expressions are the hyperparameters. Sampling from this nested stochastic program defines the random search algorithm. The hyperparameter optimization algorithms work by replacing normal "sampling" logic with adaptive exploration strategies, which make no attempt to actually sample from the distributions specified in the search space.

In [None]:
# Conditional search space: the top-level choice picks the model family, and
# each branch carries only that family's hyperparameters. Every hp.* label
# must be unique across the whole space.
search_space = hp.choice('classifier_type', [
    {
        'type': 'random_forest',
        'criterion': hp.choice('rftree_criterion', ['gini', 'entropy']),
        'max_depth': hp.choice('rftree_max_depth', [3, 4, 5, 6, 7]),
        # hp.uniform yields floats; these must be cast to int before being
        # passed to the estimator.
        'min_samples_split': hp.uniform('rftree_min_samples_split', 3, 20),
        'n_estimators': hp.uniform('rftree_n_estimators', 25, 100),
    },
    {
        'type': 'svm',
        'C': hp.choice('svm_C', [1, 2, 3, 4, 5, 6, 7, 8, 9]),
        'kernel': hp.choice('svm_kernel', [
            {'ktype': 'linear'},
            # NOTE: sklearn's SVC only uses `degree` with the 'poly' kernel.
            # The key is kept because the objective function reads
            # params['kernel']['degree'] for non-linear kernels.
            {'ktype': 'RBF', 'degree': hp.choice('rbf_degree', [2, 3, 4, 5, 6, 7])},
        ]),
    },
    {
        'type': 'dtree',
        'criterion': hp.choice('dtree_criterion', ['gini', 'entropy']),
        'max_depth': hp.choice('dtree_max_depth', [3, 4, 5, 6, 7]),
        'min_samples_split': hp.uniform('dtree_min_samples_split', 3, 20),
        # No 'n_estimators' here: a single decision tree has no such
        # parameter, so sampling it only added a dead search dimension.
    },
])

In [None]:

# Run the TPE search for 150 evaluations; each evaluation's result dict is
# recorded in `trials`.
trials = Trials()
best = fmin(
    fn=objective_function,
    space=search_space,
    algo=tpe.suggest,
    max_evals=150,
    trials=trials,
)

In [None]:
# Take the estimator stored with the trial hyperopt reports as best, fit it on
# the whole training split and predict the held-out test split.
best_result = trials.best_trial['result']
best_clf = best_result['clf']
best_clf.fit(X_train, y_train)
preds = best_clf.predict(X_test)

Plotting trial results

In [None]:
# One point per trial: trial index on x, recorded loss on y.
y = [trial_result['loss'] for trial_result in trials.results]
x = list(range(len(y)))

plt.figure(figsize=(20, 10))
ax = sns.lineplot(x=x, y=y)
ax.set(xlabel='Iteration Number', ylabel='1 - cross_val_score', title='Score vs iteration')

We can see that the minimum is reached at around the 60th iteration.

#### Accuracy report

In [None]:
# Text summary of the test-set predictions against the true labels.
report_text = classification_report(y_test, preds)
print(report_text)

In [None]:
# Confusion matrix as an annotated heatmap: actual labels on rows, predicted labels on columns.
confusion_counts = pd.crosstab(y_test, preds, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(confusion_counts, annot=True)


* The purpose of this notebook is not to produce the best possible model. The notebook documents my thought process during the full investigative journey through this dataset. Of course, adding more models, adding more hyperparameters to the hyperopt search space and increasing the number of trials could also lead to better performance.
* Since the dataset is very small, even a small change in the confusion matrix counts can noticeably change the accuracy numbers.

**If you liked the notebook, please don't forget to upvote.**