# Heart Attack Prediction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the heart-attack CSV from the Kaggle input directory into a DataFrame.
csv_path='/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
data=pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics for the columns of `data`.
data.describe()

## Check the data types

In [None]:
# Show the dtype pandas assigned to each column.
data.dtypes

## Check for null values

In [None]:
# Count the missing values in each column.
data.isna().sum()

## Pair plot

In [None]:
# Pairwise relationships between all columns, coloured by the `output` column.
sns.pairplot(data=data,hue='output')

## Heart Attack by Sex

In [None]:
# Bar per `sex` value showing the aggregated `output` (barplot's default estimator is the mean).
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional argument
# as `data` and makes the rest keyword-only, so the positional form fails there.
sns.barplot(x=data.sex,y=data.output)

## Effect of cholesterol on Heart Attack  

In [None]:
# Bar per `output` value showing the aggregated `chol` (barplot's default estimator is the mean).
# x/y are passed by keyword: seaborn >= 0.12 treats the first positional argument
# as `data` and makes the rest keyword-only, so the positional form fails there.
sns.barplot(x=data.output,y=data.chol)

## Scaling of the Data

In [None]:
# Columns with more than 5 distinct values are the ones selected for scaling.
datan=data.nunique()
float_cols=[col_name for col_name,n_unique in datan.items() if n_unique>5]
float_cols

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column listed in `float_cols`, overwriting it in `data`.
# The same scaler object is refit on each column in turn.
# NOTE(review): the scaler is fit on the full frame before any train/test
# split, so test-row statistics influence the scaling — consider fitting on
# the training rows only.
s=StandardScaler()

for col in float_cols:
  scaled_col=s.fit_transform(data[[col]])
  data[col]=scaled_col

In [None]:
# Preview the first rows of `data` again.
data.head()

## Correlations

In [None]:
# Annotated correlation matrix of all columns, drawn on a 13x9 figure without a colour bar.
corr_fig,corr_ax=plt.subplots(figsize=(13,9))
corr_matrix=data.corr()
sns.heatmap(corr_matrix,annot=True,cbar=False,ax=corr_ax)

# Modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import  accuracy_score, classification_report, confusion_matrix

## Splitting the data into Train and Test

In [None]:
# Features are every column except `output`; `output` itself is the target.
X=data.drop(columns='output')
y=data.output

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train,x_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=172,
)


## RandomForest Model

In [None]:
# Exhaustive accuracy-scored search over forest size, features per split and tree depth.
param_grid=dict(
    n_estimators=[40,50,70,100,200,300,400,500],
    max_features=[2,3,4,5,6,7,9],
    max_depth=[1,2,3,4,5,6,7],
)

Grid_rf=GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,  # use every available core
)
Grid_rf.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination selected by the random-forest grid search.
Grid_rf.best_params_

In [None]:
# Predict the test split with the fitted grid search and print per-class metrics.
y_pred=Grid_rf.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))

### Training the RandomForest with best parameters

In [None]:
# Keep the hyper-parameters chosen by the random-forest grid search.
rf_param=Grid_rf.best_params_

In [None]:
# New, unfitted forest configured with the hyper-parameters stored in `rf_param`.
rf=RandomForestClassifier(**rf_param)

In [None]:
# Fit the forest on the training split, then predict the test split.
y_pred_rf=rf.fit(x_train,y_train).predict(x_test)

In [None]:
# Confusion matrix of the random-forest test predictions as an annotated heatmap.
rf_confusion=confusion_matrix(y_test,y_pred_rf)
fig=plt.figure(figsize=(10,10))
sns.heatmap(rf_confusion,annot=True,fmt='d',annot_kws=dict(size=40),cbar=False)
plt.title('RandomForest Model',fontsize='32')

In [None]:
# Per-class precision/recall/F1 for the random-forest test predictions.
print(classification_report(y_true=y_test,y_pred=y_pred_rf))

## Logistic Regression Model

In [None]:
# Accuracy-scored grid search over the inverse regularisation strength `C`
# of an L2-penalised logistic regression fitted with the SAGA solver.
# The grid previously listed 0.01 twice, so 0.1 was never evaluated; the
# duplicate is replaced to restore the evident 0.0001 ... 1.0 decade progression.
param_grid={'C':[0.0001,0.001,0.01,0.1,1.0,2.0,3.0],
            }
Grid_lr=GridSearchCV(LogisticRegression(penalty='l2',solver='saga'),
                     param_grid,
                     scoring='accuracy',
                     n_jobs=-1)
Grid_lr.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination selected by the logistic-regression grid search.
Grid_lr.best_params_

In [None]:
# Predict the test split with the fitted grid search and print per-class metrics.
y_pred=Grid_lr.predict(x_test)
print(classification_report(y_true=y_test,y_pred=y_pred))

### Training Logistic Regression with best parameters

In [None]:
# Keep the hyper-parameters chosen by the logistic-regression grid search.
lr_kwgs=Grid_lr.best_params_

In [None]:
# Rebuild the logistic regression with the tuned parameters from `lr_kwgs`.
# `penalty` and `solver` are repeated here because the grid search tuned its
# parameters on LogisticRegression(penalty='l2',solver='saga'); leaving them out
# would silently fall back to the library's default solver, so the tuned values
# would be applied to a different model than the one that was searched.
lr=LogisticRegression(penalty='l2',solver='saga',**lr_kwgs)

In [None]:
# Fit the logistic regression on the training split, then predict the test split.
y_pred_lr=lr.fit(x_train,y_train).predict(x_test)

In [None]:
# Confusion matrix of the logistic-regression test predictions as an annotated heatmap.
lr_confusion=confusion_matrix(y_test,y_pred_lr)
fig=plt.figure(figsize=(10,10))
sns.heatmap(lr_confusion,annot=True,fmt='d',annot_kws=dict(size=40),cbar=False)
plt.title('Logistic Regression Model',fontsize='32')

In [None]:
# Per-class precision/recall/F1 for the logistic-regression test predictions.
print(classification_report(y_true=y_test,y_pred=y_pred_lr))

## AdaBoost Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-1 decision trees (decision stumps).
ADA=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))

# Accuracy-scored search over the number of boosting rounds and the learning rate.
# The n_estimators list previously ended with a second 100, which made the
# search cross-validate identical candidates twice; the duplicate is removed.
param_grid={'n_estimators':[10,20,30,40,50,80,100,200],
            'learning_rate':[0.001,0.01,0.1,1]}

Ada_grid=GridSearchCV(ADA,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=-1)
Ada_grid.fit(x_train,y_train)

In [None]:
# Hyper-parameter combination selected by the AdaBoost grid search.
Ada_grid.best_params_


In [None]:
# Predict the test split with the fitted AdaBoost grid search and print per-class metrics.
ada_grid_pred=Ada_grid.predict(x_test)
print(classification_report(y_true=y_test,y_pred=ada_grid_pred))

### Training AdaBoost with the best parameters

In [None]:
# Keep the hyper-parameters chosen by the AdaBoost grid search.
ada_params=Ada_grid.best_params_

In [None]:
# Build AdaBoost from the parameters in `ada_params`, fit it on the training
# split and predict the test split.
Ada_model=AdaBoostClassifier(**ada_params)
y_pred_ad=Ada_model.fit(x_train,y_train).predict(x_test)

In [None]:
# Confusion matrix of the AdaBoost test predictions as an annotated heatmap.
ada_confusion=confusion_matrix(y_test,y_pred_ad)
fig=plt.figure(figsize=(10,10))
sns.heatmap(ada_confusion,annot=True,fmt='d',annot_kws=dict(size=40),cbar=False)
plt.title('AdaBoost Model',fontsize='32')

In [None]:
# Per-class precision/recall/F1 for the AdaBoost test predictions.
print(classification_report(y_true=y_test,y_pred=y_pred_ad))

## Combining all models 

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble of the three tuned models: each casts one vote per sample.
# Changes from the earlier list:
#   * the AdaBoost entry was labelled 'AdaGrad' (an optimiser, not this model);
#   * the forest was listed a second time as 'RandomForest2', giving four voters,
#     so a 2-2 split was possible and had to be resolved by the library's
#     tie-break rule rather than by a majority.
# With three distinct voters every binary decision has a strict majority.
estimator=[('RandomForest',rf),
           ('LogisticRegression',lr),
           ('AdaBoost',Ada_model),
           ]

voter=VotingClassifier(estimator,voting='hard')
voter.fit(x_train,y_train)


In [None]:
# Predict the test split with the voting ensemble and print per-class metrics.
voter_pred=voter.predict(x_test)
print(classification_report(y_true=y_test,y_pred=voter_pred))

We got better accuracy after combining all the models

## Experimenting with FeedForward Neural Networks

In [None]:
from keras.layers import Dense,Activation,Dropout
from keras.models import Sequential
from keras.optimizers import RMSprop

In [None]:
# Feed-forward binary classifier: five hidden ReLU layers of 20 units each and
# a single sigmoid output unit.
model_1=Sequential()

# Take the input width from the training data instead of hard-coding 13, so the
# network still builds if the feature set changes.
n_features=x_train.shape[1]

model_1.add(Dense(20,activation='relu',input_shape=(n_features,)))
model_1.add(Dense(20,activation='relu'))
model_1.add(Dense(20,activation='relu'))
model_1.add(Dense(20,activation='relu'))
model_1.add(Dense(20,activation='relu'))
model_1.add(Dense(1,activation='sigmoid'))

# `learning_rate` replaces the deprecated `lr` keyword. The former `decay=1e-6`
# argument is dropped because current Keras optimisers reject it; at this
# training length (12 epochs, batches of 32) its effect on the step size was negligible.
opt=RMSprop(learning_rate=0.001)
model_1.compile(optimizer=opt,loss='binary_crossentropy',metrics=['accuracy'])

# NOTE(review): the test split doubles as the validation set here, so the
# validation curves are not an independent estimate — confirm this is intended.
history=model_1.fit(x_train,y_train,
            batch_size=32,
            epochs=12,
            validation_data=(x_test,y_test),
            shuffle=True)

In [None]:
# Training loss (blue) and validation loss (red) per epoch on one set of axes.
fig,ax=plt.subplots()
loss_curves=(('loss','b','train loss'),('val_loss','r','valid loss'))
for history_key,line_style,legend_label in loss_curves:
    ax.plot(history.history[history_key],line_style,label=legend_label)
ax.legend();

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels. `Sequential.predict_classes`
# has been removed from Keras; thresholding `predict` at 0.5 is the documented
# equivalent for a single sigmoid output. `ravel` flattens the (n, 1) output to
# a 1-D label vector.
y_prob_nn=model_1.predict(x_test)
y_pred_nn=(y_prob_nn>0.5).astype('int32').ravel()

# Report the network's own predictions. This previously printed the report for
# `y_pred`, which holds the logistic-regression grid-search predictions.
print(classification_report(y_test,y_pred_nn))

We got an accuracy of 84%, which is lower than that of our combined model.

## Comparing Results

In [None]:
fig=plt.figure(figsize=(10,10))
sns.heatmap(confusion_matrix(y_test,y_pred_nn),annot=True,fmt='d',annot_kws={'size':40},cbar=False)
plt.title('FeedForward Neural Network',fontsize='32')


In [None]:
fig=plt.figure(figsize=(10,10))
sns.heatmap(confusion_matrix(y_test,voter.predict(x_test)),annot=True,fmt='d',annot_kws={'size':40},cbar=False)
plt.title('Combined Model',fontsize='32')


In [None]:
fig=plt.figure(figsize=(10,10))
sns.heatmap(confusion_matrix(y_test,Ada_model.predict(x_test)),annot=True,fmt='d',annot_kws={'size':40},cbar=False)
plt.title('AdaBoost Model',fontsize='32')


In [None]:
fig=plt.figure(figsize=(10,10))
sns.heatmap(confusion_matrix(y_test,y_pred_rf),annot=True,fmt='d',annot_kws={'size':40},cbar=False)
plt.title('RandomForest Model',fontsize='32')

The combined model has better accuracy than every other model except the RandomForest model; the combined model and the RandomForest model have the same accuracy of 87%.