In [None]:
# Author: Pierre Jeanne
# Date Created:  16 May 2021

# Heart Attack Analysis & Prediction Dataset
## A dataset for heart attack classification

<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ7ZWEo96vWDovxH-6QR0jFPj1Fi1zgkBQsmA&usqp=CAU" width="500px">

### Goal:
The goal is to help predict possible diameter narrowing that can lead to heart failure. The target variable is `num`:
- 0 = less chance of heart attack
- 1 = more chance of heart attack

### Warning:
In the discussion, it is said that the dataset presented on Kaggle is a poor copy of the original dataset, and that the target values were swapped. So, here we will work with the original dataset:
https://archive.ics.uci.edu/ml/datasets/heart+disease

[1: Exploratory Data Analysis](#1)
- [1.1: EDA with dataprep](#1.1)
- [1.2: Report summary main points](#1.2)

[2: Target variable](#2)

[3: Machine learning](#3)
- [3.1 Preprocessing](#3.1)
    - [3.1.1: Split the data](#3.1)
    - [3.1.2: Scale the data](#3.1.2)
    - [3.1.3: Classification with logistic regression](#3.1.3)
    - [3.1.4: Classification with k-Nearest Neighbors](#3.1.4)
    - [3.1.5: Classification with Support Vector Machines](#3.1.5)
    - [3.1.6: Classification with random Forest classifier](#3.1.6)
    
- [3.2: Conclusion](#3.2)

In [None]:
import numpy as np
import pandas as pd
import pandas_profiling as pp
# display progress bar during iteration
from tqdm.notebook import tqdm

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import learning_curve
from sklearn.metrics import plot_confusion_matrix

# stat on data
from scipy import stats
from scipy.stats import norm, skew

# slip the data
from sklearn.model_selection import train_test_split
# scale the data
from sklearn.preprocessing import MinMaxScaler
# cross validation
from sklearn.model_selection import cross_val_score
# classification model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
# hyperparameter tunning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# model evaluation
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
pip install dataprep

In [None]:
# LIBRARY FOR EDA
from dataprep.eda import *
from dataprep.eda import plot
from dataprep.eda import plot_correlation
from dataprep.eda import plot_missing

## Downloading data

In [None]:
# Column names for the headerless UCI "processed.cleveland.data" file, in file order.
header_list = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num",
]
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# The file has no header row, so the names are supplied explicitly.
df_heart = pd.read_csv(url, names=header_list, header=None)
df_heart.head(3)

**About this dataset**

- `Age` : Age of the patient
- `Sex` : Sex of the patient
- `cp` : chest pain type
    - Value 1: typical angina
    - Value 2: atypical angina
    - Value 3: non-anginal pain
    - Value 4: asymptomatic

- `trestbps` : resting blood pressure (in mm Hg)
- `chol` : cholesterol in mg/dl fetched via BMI sensor
- `fbs` : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- `restecg` : resting electrocardiographic results
    - Value 0: normal
    - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- `thalach` : maximum heart rate achieved
- `exang`: exercise induced angina (1 = yes; 0 = no)
- `oldpeak`: ST depression induced by exercise relative to rest
- `slope`: the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
- `ca`: number of major vessels (0-3)
- `thal`: 3 = normal; 6 = fixed defect; 7 = reversible defect
- `num` : 0= less chance of heart attack 1= more chance of heart attack

## <a id="1"></a>
<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>1: Exploratory Data Analysis</center></h3>


## <a id="1.1"></a>
**1.1: EDA with dataprep** 

In [None]:
# Generate the pandas-profiling report for the raw data frame.
# `pp` is the pandas_profiling alias already bound in the imports cell at the top,
# so it is not imported a second time here.
pp.ProfileReport(df_heart)

## <a id="1.2"></a>
**1.2: Report summary main points**
- there are 5 numeric and 9 categorical variables
- the categorical variables have float and object types
- `ca` and `thal` have 4 and 2 '?' instead of missing value, respectively
- the target variable has 5 different values: 0, 1,2,3 and 4

In the original study, the presence of heart disease in the patient is given by an integer ranging from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1, 2, 3, 4) from absence (value 0). So, we will replace all values greater than 1 with 1 to obtain the desired output:
- 0 = less chance of heart attack
- 1 = more chance of heart attack

In [None]:
# 'ca' and 'thal' mark missing values with the string '?'; drop every row containing one.
print("number of row before dropping nan values: {}".format(df_heart.shape[0]))
has_placeholder = (df_heart['ca'] == '?') | (df_heart['thal'] == '?')
# .copy() detaches the result from the unfiltered frame, so the column assignments
# made in later cells act on an independent object and do not raise
# SettingWithCopyWarning.
df_heart = df_heart.loc[~has_placeholder].copy()
print("number of row after dropping nan values: {}".format(df_heart.shape[0]))

In [None]:
# Collapse the disease grades 1-4 into a single positive class: 0 stays 0 and
# every higher grade becomes 1. The result is stored as the strings '0'/'1',
# and the column is replaced through assign() so that no string values are
# written in place into an integer column of a filtered frame.
binary_target = df_heart['num'].astype(int).clip(upper=1).astype(str)
df_heart = df_heart.assign(num=binary_target)

In [None]:
# Inspect the dtype pandas currently holds for each column, before the
# categorical columns are cast.
df_heart.dtypes

In [None]:
# Columns that hold category codes rather than measurements.
categorical_variables = ['sex','cp','fbs','restecg','exang','slope','ca','thal','num']

# Cast them all in one astype call. It returns a new frame, so nothing is
# assigned column-by-column into a frame that was produced by filtering.
df_heart = df_heart.astype({col: 'category' for col in categorical_variables})

In [None]:
# Display the dtypes again to confirm the categorical columns were cast.
df_heart.dtypes

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each variable, so the binary target `num` ends up as the single indicator
# column `num_1` used by the plots and models below.
df_heart = pd.get_dummies(df_heart,drop_first = True)

In [None]:
# Correlation overview of the encoded frame (plot_correlation comes from dataprep.eda).
plot_correlation(df_heart)

In [None]:
# Scatter plots of pairs of continuous variables, coloured by the target column num_1.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
fig.subplots_adjust(hspace=0.3)
ax0, ax1, ax2, ax3 = axes.flatten()

# (x, y) column pair drawn in each panel, listed in the same order as the axes.
panel_columns = (
    ('thalach', 'age'),
    ('thalach', 'oldpeak'),
    ('trestbps', 'age'),
    ('chol', 'age'),
)
for panel, (x_col, y_col) in zip((ax0, ax1, ax2, ax3), panel_columns):
    sns.scatterplot(data=df_heart, x=x_col, y=y_col, hue='num_1', ax=panel)

plt.show()

## <a id="2"></a>
<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>2: Target variable</center></h3>

In [None]:
# Number of patients in each target class. Summing a boolean mask counts the
# matching rows of the target column itself, instead of taking the first entry
# of a per-column count() by position.
majority_class = (df_heart['num_1'] == 0).sum()
minority_class = (df_heart['num_1'] == 1).sum()

# Report both class sizes.
print('low risk of heart failure (num_1 = 0): {}'.format(majority_class))
print('high risk of heart failure (num_1 = 1) : {}'.format(minority_class))

# Bar chart of the class distribution.
sns.countplot(x="num_1", data=df_heart)
plt.show()

The target data is imbalanced 

The challenge of working with imbalanced datasets is that most machine learning techniques will ignore, and in turn have poor performance on, the minority class, although typically it is performance on the minority class that is most important.

One approach to addressing imbalanced datasets is to use **SMOTE**.

SMOTE is an oversampling method. It works by creating synthetic samples from the minority class instead of creating copies. The algorithm selects two or more similar instances (using a distance measure) and perturbs an instance one attribute at a time by a random amount within the difference to the neighboring instances. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or **SMOTE** for short.

In [None]:
from imblearn.over_sampling import SMOTE

# Features are every column except the binary target; the target is a flat 1-D array.
X = df_heart.drop('num_1', axis=1)
y = df_heart['num_1'].values.ravel()

# Oversample the minority class with SMOTE. The fixed random_state makes the
# synthetic rows identical on every run, so downstream results are reproducible.
# NOTE(review): these resampled arrays cover the whole dataset; if a model is
# evaluated on a hold-out split taken from them, synthetic rows interpolated
# from held-out patients leak into the training data.
X_resampled, y_resampled = SMOTE(random_state=0).fit_resample(X, y)

# Guarantee a DataFrame with the original column names, whatever container
# fit_resample returned.
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)

In [None]:
# Wrap the resampled target in a frame so it can be counted and plotted.
df_y_resampled = pd.DataFrame(y_resampled, columns=['num_1'])

# Class sizes after oversampling, counted with a boolean mask on the target
# column rather than by positional indexing into a per-column count().
majority_class = (df_y_resampled['num_1'] == 0).sum()
minority_class = (df_y_resampled['num_1'] == 1).sum()

# Same labels as the count shown before resampling, so the two outputs can be
# compared directly.
print('low risk of heart failure (num_1 = 0): {}'.format(majority_class))
print('high risk of heart failure (num_1 = 1) : {}'.format(minority_class))

# Bar chart of the balanced class distribution.
sns.countplot(x="num_1", data=df_y_resampled)
plt.show()

## <a id="3"></a>
<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>3- Machine learning</center></h3>

## <a id="3.1"></a>
**3.1: Preprocessing**

**3.1.1: Split the data**

In [None]:
# Hold out the test set from the ORIGINAL (un-resampled) data first, stratified
# so both partitions keep the same class ratio. Splitting data that was already
# oversampled would put synthetic rows interpolated from test patients into the
# training set and inflate every test score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=0, stratify=y)

# Balance the training partition only; the test partition keeps real patients
# and the real class ratio. The fixed random_state keeps the run reproducible.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)
X_train = pd.DataFrame(X_train, columns=X.columns)

<a id="3.1.2"></a>
**3.1.2: Scale the data**

Here, we normalize the continuous variables only, leaving the dummy variables alone. We also use the min-max scaler to give those continuous variables the same minimum of zero, max of one, range of 1. 

In [None]:
# Continuous columns to rescale; the one-hot dummy columns are left untouched.
numeric_col = ['age','trestbps','chol','thalach','oldpeak']
scaler = MinMaxScaler()

# Learn each column's min/max from the training partition only ...
scaler.fit(X_train[numeric_col])
# ... then apply that same mapping to both partitions.
X_train.loc[:, numeric_col] = scaler.transform(X_train[numeric_col])
X_test.loc[:, numeric_col] = scaler.transform(X_test[numeric_col])

## <a id="3.1.3"></a>
**3.1.3: Classification with logistic regression**

In [None]:
# Collects one metrics dict per evaluated model; plot_result_cv appends to it.
list_scores = []


def plot_result_cv(list_score_train, list_score_test, model, pred, name):
    """Print, plot and record the evaluation of one fitted classifier.

    Left panel: the per-fold cross-validation scores of both partitions.
    Right panel: the confusion matrix of ``model`` on the test partition.
    Accuracy, recall, precision and F1 of ``pred`` are printed and appended to
    the module-level ``list_scores`` list.

    Parameters
    ----------
    list_score_train : sequence of float
        Per-fold cross-validation scores obtained on the training partition.
    list_score_test : sequence of float
        Per-fold cross-validation scores obtained on the test partition.
    model : fitted estimator
        Classifier passed to ``plot_confusion_matrix``.
    pred : array-like
        Predictions of ``model`` for ``X_test``.
    name : str
        Label stored under 'Model Name' in the recorded metrics.

    Notes
    -----
    Reads the module-level ``X_test`` and ``y_test`` and mutates
    ``list_scores``. Returns None.
    """
    # '.2f' prints two decimals; a bare '2f' would only set a minimum width.
    print('mean scores on training set: {:.2f}, and testing set: {:.2f}'.format(
        np.mean(list_score_train), np.mean(list_score_test)))

    fig = plt.figure(figsize=(15, 5))
    fig.subplots_adjust(hspace=0.4, wspace=0.3)

    # Left panel: draw on the Axes object directly rather than through the
    # pyplot "current axes" state.
    ax0 = fig.add_subplot(1, 2, 1)
    ax0.plot(list_score_train, 'go-', label='CV score on training set')
    ax0.plot(list_score_test, 'ro-', label='CV score on testing set')
    ax0.set_xlabel('nb of fold cross-validation')
    ax0.set_ylabel('score')
    ax0.legend()

    # Right panel: confusion matrix on the hold-out partition.
    # NOTE(review): plot_confusion_matrix is deprecated/removed in recent
    # scikit-learn releases (ConfusionMatrixDisplay.from_estimator replaces
    # it) - confirm against the installed version.
    ax1 = fig.add_subplot(1, 2, 2)
    plot_confusion_matrix(model, X_test, y_test, cmap=plt.cm.Blues, ax=ax1)

    accuracy = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    precision = precision_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('accuracy: ', accuracy)
    print('recall: ', recall)
    print('precision: ', precision)
    print('f1: ', f1)

    # Keep the metrics so all models can be compared in one table at the end.
    list_scores.append({'Model Name': name, 'Accuracy': accuracy, 'Recall': recall,
                        'Precision': precision, 'F1': f1})

    plt.show()

In [None]:
clf_lr = LogisticRegression()

# 12-fold cross-validated accuracy, computed independently on each partition
# (training partition first, then test partition).
cv_scores_train, cv_scores_test = [
    cross_val_score(clf_lr, features, labels, cv=12, scoring='accuracy')
    for features, labels in ((X_train, y_train), (X_test, y_test))
]

clf_lr_mean_train = cv_scores_train.mean()
clf_lr_mean_test = cv_scores_test.mean()

# Fit on the whole training partition, then predict the test partition.
pred = clf_lr.fit(X_train, y_train).predict(X_test)

# Plot the fold scores and confusion matrix, and record the metrics.
plot_result_cv(cv_scores_train, cv_scores_test, clf_lr, pred, 'logreg')

<a id="3.1.4"></a>
**3.1.4: Classification with k-Nearest Neighbors**

In [None]:
# Candidate neighbourhood sizes: every integer from 1 to 49.
param_grid = {'n_neighbors': np.arange(1, 50)}

knn = KNeighborsClassifier()
# 7-fold grid search over n_neighbors; fit() returns the search object itself.
knn_cv = GridSearchCV(knn, param_grid, cv=7).fit(X_train, y_train)

# Show the selected value.
knn_cv.best_params_

In [None]:
# k is hard-coded - presumably the value the grid search selected in an
# earlier run; verify against knn_cv.best_params_.
clf_knn = KNeighborsClassifier(n_neighbors=48)

# 12-fold cross-validated accuracy, computed independently on each partition
# (training partition first, then test partition).
cv_scores_train, cv_scores_test = [
    cross_val_score(clf_knn, features, labels, cv=12, scoring='accuracy')
    for features, labels in ((X_train, y_train), (X_test, y_test))
]

clf_knn_mean_train = cv_scores_train.mean()
clf_knn_mean_test = cv_scores_test.mean()

# Fit on the whole training partition, then predict the test partition.
pred = clf_knn.fit(X_train, y_train).predict(X_test)

# Plot the fold scores and confusion matrix, and record the metrics.
plot_result_cv(cv_scores_train, cv_scores_test, clf_knn, pred, 'knn_48')

<a id="3.1.5"></a>
**3.1.5: Classification with Support Vector Machines**

In [None]:
clf_svm = svm.SVC()

# Search space: C in {1, 2, 3, 4}, nine gamma values and four kernel types.
parameters = {
    'C': np.arange(1, 5),
    'gamma': [0.001, 0.005, 0.01, 0.05, 0.09, 0.1, 0.2, 0.5, 1],
    'kernel': ['rbf', 'sigmoid', 'linear', 'poly'],
}

# Exhaustive 7-fold grid search scored by accuracy, with n_jobs=-1.
grid_search = GridSearchCV(clf_svm, parameters, scoring='accuracy',
                           cv=7, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Show the selected combination.
grid_search.best_params_

In [None]:
# Hyperparameters are hard-coded - presumably taken from an earlier grid-search
# run; verify against grid_search.best_params_.
clf_svm = svm.SVC(kernel='sigmoid', C=4, gamma=0.05)

# 12-fold cross-validation with the estimator's default scorer, computed
# independently on each partition (training partition first, then test).
cv_scores_train, cv_scores_test = [
    cross_val_score(clf_svm, features, labels, cv=12)
    for features, labels in ((X_train, y_train), (X_test, y_test))
]

clf_svm_mean_train = cv_scores_train.mean()
clf_svm_mean_test = cv_scores_test.mean()

# Fit on the whole training partition, then predict the test partition.
pred = clf_svm.fit(X_train, y_train).predict(X_test)

# Plot the fold scores and confusion matrix, and record the metrics.
plot_result_cv(cv_scores_train, cv_scores_test, clf_svm, pred, 'svc')


<a id="3.1.6"></a>
**3.1.6: Classification with random Forest classifier**
**random search**

In [None]:
# Candidate values for each hyperparameter sampled by the random search.
# max_features offers 'sqrt' and 'log2': for a classifier 'auto' was only an
# alias of 'sqrt' (a duplicate candidate) and newer scikit-learn releases no
# longer accept it.
parameters = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [600]}

# Base model to tune.
rf = RandomForestClassifier()
# Random search: 300 sampled combinations, each scored with 4-fold
# cross-validation, with n_jobs=-1. random_state fixes which combinations
# are drawn.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = parameters, n_iter = 300, cv = 4, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Best hyperparameter combination found by the fitted random search (rf_random).
rf_random.best_params_

**Grid Search with Cross Validation**
Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search, we can explicitly specify every combination of settings to try. We do this with GridSearchCV, a method that, instead of sampling randomly from a distribution, evaluates all combinations we define. To use Grid Search, we make another grid based on the best values provided by random search:

In [None]:
# Narrow grid built around the values returned by the random search: only the
# leaf size and the split threshold vary, everything else is pinned.
param_grid = {
    'n_estimators': [600],
    'bootstrap': [True],
    'max_depth': [None],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [8, 9, 10, 11, 12],
}

# Fresh base model for the exhaustive search.
rf = RandomForestClassifier()

# 5-fold grid search over every combination above, with n_jobs=-1.
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Show the selected combination.
grid_search.best_params_

**best model**

In [None]:
# bootstrap is the boolean True, not the string 'True': a string only behaves
# like True by being truthy and is rejected by scikit-learn versions that
# validate constructor parameters.
clf_rf = RandomForestClassifier(bootstrap = True,max_depth = None,max_features = 'sqrt',
                                min_samples_leaf = 2, min_samples_split = 11, n_estimators = 600)

# 12-fold cross-validation (estimator's default scorer) on the training partition
cv_scores_train = cross_val_score(clf_rf,X_train, y_train,cv=12)
# 12-fold cross-validation (estimator's default scorer) on the test partition
cv_scores_test = cross_val_score(clf_rf,X_test, y_test,cv=12)

clf_rf_mean_train = np.mean(cv_scores_train)
clf_rf_mean_test = np.mean(cv_scores_test)

# Fit on the whole training partition, then predict the test partition.
clf_rf.fit(X_train,y_train)
pred = clf_rf.predict(X_test)
# Plot the fold scores and confusion matrix, and record the metrics.
plot_result_cv(cv_scores_train,cv_scores_test,clf_rf,pred,'Rand_Forest')

<a id="3.2"></a>
**3.2: Conclusion**

The best classification is obtained with the Support Vector Machines model (acc. of 0.79)

In [None]:
# One row per evaluated model, built from the dicts that plot_result_cv appended to list_scores.
df_scores = pd.DataFrame(list_scores)
# Highlight the maximum of each column (axis=0) in light green.
df_scores.style.highlight_max(color = 'lightgreen', axis = 0)