In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import os
# Print the full path of every file found under /kaggle/input, one per line.
# The generator is lazy, so paths are printed as the directory walk proceeds.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Data loading and preprocessing

In [None]:
# Load heart.csv from the Kaggle input directory into the working DataFrame `df`.
heart_csv_path = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)

In [None]:
# Show the loaded DataFrame as this cell's output.
df

In [None]:
# Bar chart of the number of rows for each value of the `output` column (the label used later).
sns.countplot(x='output', data=df)

In [None]:
# Print each column's name, non-null count and dtype.
df.info()

# EDA

In [None]:
# In this step you get to understand your data better (e.g. bias, limitations, range of values, etc.).
# This can help you choose your ML model.
# Try to gain insights or spot interesting patterns in your data during this step.
# Check for any anomalies in the data.

In [None]:
# Summary statistics of the DataFrame's columns, shown as this cell's output.
df.describe()

In [None]:
# Pairwise plots of the columns in `df`, drawn with corner=True.
# The result is assigned to `_` so the returned object is not echoed as cell output.
_ = sns.pairplot(df, corner=True)

In [None]:
# Annotated correlation heatmap for every column of `df`.
sns.set(style="white")

# Create the figure explicitly so the heatmap size can be controlled.
fig, ax = plt.subplots(figsize=(15, 11))

correlations = df.corr()

# annot=True writes each coefficient (two decimals) into its cell; vmin/vmax pin
# the colour scale to the full [-1, 1] range of a correlation coefficient.
_ = sns.heatmap(
    correlations,
    annot=True,
    fmt='.2f',
    annot_kws={"size": 12},
    square=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
# If the first/last rows ever render cut off, widen the y-limits by half a cell:
#   bottom, top = ax.get_ylim(); ax.set_ylim(bottom + 0.5, top - 0.5)

# Data preparation for training and testing

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import StandardScaler 

In [None]:
# The label is the `output` column; every other column is a feature.
y = df['output']
X = df.drop(columns=['output'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# The four pieces are also kept together under one name because
# train_predict_recall() receives them as a single argument.
X_y_train_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=0)
(X_train, X_test, y_train, y_test) = X_y_train_test

In [None]:
# Baseline models, mostly with library-default hyperparameters.
# The trailing notes record alternative settings that were tried during tuning.
knn = KNeighborsClassifier()                   # alternative: n_neighbors=8
lr = LogisticRegression(max_iter=2000)         # alternative: C=20
svm_clf = SVC()
rf = RandomForestClassifier(random_state=77)   # alternative: n_estimators=25, max_depth=7
gboost = GradientBoostingClassifier()

In [None]:
# Pipeline

def train_predict_recall(model, X_y_train_test):
    """Fit `model` on the training split and return its recall on the test split.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)`` methods
        Fitted in place: the caller's object is trained once this returns.
    X_y_train_test : sequence of four items
        ``X_train, X_test, y_train, y_test`` in exactly that order.

    Returns
    -------
    The value of ``recall_score(y_test, predictions, average='binary')``.
    """
    train_features, test_features, train_labels, test_labels = X_y_train_test
    model.fit(train_features, train_labels)
    test_predictions = model.predict(test_features)
    return recall_score(test_labels, test_predictions, average='binary')

In [None]:
# Fit the gradient boosting baseline and show its recall on the test split.
train_predict_recall(gboost, X_y_train_test)

In [None]:
# Fit the random forest baseline and show its recall on the test split.
train_predict_recall(rf, X_y_train_test)

In [None]:
# Fit the SVC baseline and show its recall on the test split.
train_predict_recall(svm_clf, X_y_train_test)

In [None]:
# Fit the logistic regression baseline and show its recall on the test split.
train_predict_recall(lr, X_y_train_test)

In [None]:
# Fit the k-nearest-neighbours baseline and show its recall on the test split.
train_predict_recall(knn, X_y_train_test)

In [None]:
#Knn Optimization

In [None]:
# KNN optimisation: sweep n_neighbors and compare training accuracy with test accuracy.
# NOTE(review): picking a hyperparameter from test-set accuracy leaks test information;
# treat the result as exploratory and prefer cross-validation for the final choice.
training_accuracy = []
test_accuracy = []
# range(1, 11) tries n_neighbors = 1 .. 10 (the upper bound is exclusive).
neighbors_settings = range(1, 11)
for n_neighbors in neighbors_settings:
    # Rebinds the notebook-level name `knn`; after the loop it holds the last model fitted.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    # Accuracy on the data the model was fitted on ...
    training_accuracy.append(knn.score(X_train, y_train))
    # ... and on the held-out split (generalisation accuracy).
    test_accuracy.append(knn.score(X_test, y_test))

# Plot both accuracy curves against the swept parameter.
plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
print()
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()

# Report the setting that actually scored best in this run rather than a fixed number,
# so the message stays true when the data, the split or the library version changes.
# list.index returns the first maximum, i.e. the smallest n_neighbors among ties.
best_test_accuracy = max(test_accuracy)
best_n_neighbors = neighbors_settings[test_accuracy.index(best_test_accuracy)]
print("Best value for n_neighbours seems to peak at n_neighbours={} where test accuracy is highest ({:.3f})"
      .format(best_n_neighbors, best_test_accuracy))
print()
# Feedback loop - put the best n_neighbors back into the classifier created before the optimisation stage.

In [None]:
#Random Forest  Optimization n_estimators 

In [None]:
# Random forest optimisation: sweep n_estimators and compare training accuracy with test accuracy.
# NOTE(review): picking a hyperparameter from test-set accuracy leaks test information;
# treat the result as exploratory and prefer cross-validation for the final choice.
training_accuracy = []
test_accuracy = []
# range(1, 200) tries n_estimators = 1 .. 199 (the upper bound is exclusive).
estimators_settings = range(1, 200)
for n_estimators in estimators_settings:
    # Rebinds the notebook-level name `rf`; after the loop it holds the last model fitted.
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=77)
    rf.fit(X_train, y_train)
    # Accuracy on the data the model was fitted on ...
    training_accuracy.append(rf.score(X_train, y_train))
    # ... and on the held-out split (generalisation accuracy).
    test_accuracy.append(rf.score(X_test, y_test))

# Plot both accuracy curves against the swept parameter.
plt.plot(estimators_settings, training_accuracy, label="training accuracy")
plt.plot(estimators_settings, test_accuracy, label="test accuracy")
print()
plt.ylabel("Accuracy")
plt.xlabel("n_estimators")
plt.legend()

# Report the setting that actually scored best in this run rather than a fixed number.
# list.index returns the first maximum, i.e. the smallest n_estimators among ties.
best_test_accuracy = max(test_accuracy)
best_n_estimators = estimators_settings[test_accuracy.index(best_test_accuracy)]
print("Best value for n_estimators seems to peak at n_estimators_setting = {} where test accuracy is highest ({:.3f})"
      .format(best_n_estimators, best_test_accuracy))
print()
# Feedback loop - put the best n_estimators back into the classifier created before the optimisation stage.

In [None]:
#Random Forest  Optimization max_depth

In [None]:
# Random forest optimisation: sweep max_depth and compare training accuracy with test accuracy.
# NOTE(review): picking a hyperparameter from test-set accuracy leaks test information;
# treat the result as exploratory and prefer cross-validation for the final choice.
training_accuracy = []
test_accuracy = []
# range(2, 25) tries max_depth = 2 .. 24 (the upper bound is exclusive).
maxdepth_settings = range(2, 25)
for max_depth in maxdepth_settings:
    # Rebinds the notebook-level name `rf`; after the loop it holds the last model fitted.
    rf = RandomForestClassifier(max_depth=max_depth, random_state=77)
    rf.fit(X_train, y_train)
    # Accuracy on the data the model was fitted on ...
    training_accuracy.append(rf.score(X_train, y_train))
    # ... and on the held-out split (generalisation accuracy).
    test_accuracy.append(rf.score(X_test, y_test))

# Plot both accuracy curves against the swept parameter.
plt.plot(maxdepth_settings, training_accuracy, label="training accuracy")
plt.plot(maxdepth_settings, test_accuracy, label="test accuracy")
print()
plt.ylabel("Accuracy")
plt.xlabel("max_depth")
plt.legend()

# Report the setting that actually scored best in this run rather than a fixed number.
# list.index returns the first maximum, i.e. the shallowest depth among ties.
best_test_accuracy = max(test_accuracy)
best_max_depth = maxdepth_settings[test_accuracy.index(best_test_accuracy)]
print("Best value for max_depth seems to peak at maxdepth_settings = {} where test accuracy is highest ({:.3f})"
      .format(best_max_depth, best_test_accuracy))
print()
# Feedback loop - put the best max_depth back into the classifier created before the optimisation stage.

In [None]:
#Logistic Regression Optimization of C

In [None]:
# Logistic regression optimisation: sweep C and compare training accuracy with test accuracy.
# NOTE(review): picking a hyperparameter from test-set accuracy leaks test information;
# treat the result as exploratory and prefer cross-validation for the final choice.
training_accuracy = []
test_accuracy = []
# np.arange(1, 200.0, 1) yields C = 1.0, 2.0, ..., 199.0 (the stop value is exclusive).
c_settings = np.arange(1, 200.0, 1)
for c in c_settings:
    # Rebinds the notebook-level name `lr`; after the loop it holds the model with the largest C.
    lr = LogisticRegression(max_iter=2000, C=c)
    lr.fit(X_train, y_train)
    # Accuracy on the data the model was fitted on ...
    training_accuracy.append(lr.score(X_train, y_train))
    # ... and on the held-out split (generalisation accuracy).
    test_accuracy.append(lr.score(X_test, y_test))

# Plot both accuracy curves against the swept parameter.
plt.plot(c_settings, training_accuracy, label="training accuracy")
plt.plot(c_settings, test_accuracy, label="test accuracy")
print()
plt.ylabel("Accuracy")
plt.xlabel("C")
plt.legend()

# Report the setting that actually scored best in this run rather than a fixed number.
# list.index returns the first maximum, i.e. the smallest C among ties.
best_test_accuracy = max(test_accuracy)
best_c = c_settings[test_accuracy.index(best_test_accuracy)]
print("Best value for C seems to peak at c_setting = {:g} where test accuracy is highest ({:.3f})"
      .format(best_c, best_test_accuracy))
print()
# Feedback loop - put the best C back into the classifier created before the optimisation stage.

# Hyperparameters tuning

In [None]:
#GRIDSEARCH Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV

# Template estimator for the search. It is seeded like the other forests in this
# notebook so that repeated runs of the search give the same ranking.
rf = RandomForestClassifier(random_state=77)

param_grid = {
    'n_estimators': [25, 50, 100, 200, 500],
    # 'auto' is intentionally not listed: for RandomForestClassifier it was only an
    # alias of 'sqrt', and scikit-learn >= 1.3 rejects it, which makes every
    # candidate using it fail to fit inside the search.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 6, 8, 10, 12],
    'criterion': ['gini', 'entropy'],
}

# 5-fold cross-validated search on the training split only, ranked by macro-averaged
# recall; n_jobs=-1 uses all available cores.
gs_clf = GridSearchCV(rf,
                      param_grid,
                      cv=5,
                      scoring='recall_macro',
                      n_jobs=-1)
gs_clf.fit(X_train, y_train)

In [None]:
# Heading for the best_params_ value displayed by the following cell.
print("Best estimated parameters from GridSearch:")

In [None]:
# Hyperparameter combination selected by the random forest grid search (`gs_clf`).
gs_clf.best_params_

In [None]:
# Test-split recall of the best forest found by the grid search.
# train_predict_recall calls fit(), so the estimator is fitted again on X_train here.
train_predict_recall(gs_clf.best_estimator_, X_y_train_test)

In [None]:
#GRIDSEARCH Logistic Regression

In [None]:
# Grid search over the regularisation parameter C of a logistic regression.
from sklearn.model_selection import GridSearchCV

grid_values = {'C': range(1, 100)}
clf = LogisticRegression(max_iter=200000)
# Candidates are ranked by macro-averaged recall, cross-validated on the training split.
grid_clf_acc = GridSearchCV(clf, param_grid=grid_values, scoring='recall_macro')
grid_clf_acc.fit(X_train, y_train)

# Predict the test split with the fitted search object.
y_pred_acc = grid_clf_acc.predict(X_test)

# Evaluation metrics of those predictions against the true test labels.
for metric_label, metric_fn in (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
):
    print(metric_label + str(metric_fn(y_test, y_pred_acc)))

# Confusion matrix of the grid-searched logistic regression; left as the last
# expression so the notebook displays it.
confusion_matrix(y_test, y_pred_acc)

# Post-mortem analysis

In [None]:
#Random Forest Analysis

In [None]:
# Analyse the tuned forest selected by GridSearchCV. Binding it explicitly avoids
# evaluating whatever `rf` was last assigned to earlier in the notebook (at this
# point that is the untuned template estimator handed to the grid search).
rf = gs_clf.best_estimator_
# Fit on the training split so this cell gives the same kind of result whether or
# not the estimator was re-fitted by an earlier cell.
rf.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(rf.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(rf.score(X_test, y_test)))

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
print("Classification report for Random Forest Model :")
print()
rf_test_predictions = rf.predict(X_test)
rf_report = classification_report(y_test, rf_test_predictions)
print(rf_report)
print()

In [None]:
# Random forest confusion matrix on the test split, drawn as an annotated seaborn heatmap.
print("Confusion Matrix for Random Forest Model:")
plt.figure(figsize=(5, 5))
rf_confusion = confusion_matrix(y_test, rf.predict(X_test))
# fmt='' prints the counts as they are; rows are the true labels, columns the predictions.
heatmap_ax = sns.heatmap(
    rf_confusion,
    annot=True,
    fmt='',
    annot_kws={"size": 18},
    cmap=plt.cm.winter_r,
)
_ = heatmap_ax.set_ylabel('Actual', fontweight='bold')
_ = heatmap_ax.set_xlabel('Predicted', fontweight='bold')

In [None]:
#Logistic Regression Analysis

In [None]:
# Analyse the tuned logistic regression selected by GridSearchCV. Binding it
# explicitly avoids evaluating whatever `lr` was last assigned to earlier in the
# notebook (at this point that is the final model of the C sweep, i.e. the largest C tried).
lr = grid_clf_acc.best_estimator_
# Fit on the training split so this cell stands on its own.
lr.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(lr.score(X_test, y_test)))

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
print("Classification report for Logistic Regression Model :")
print()
lr_test_predictions = lr.predict(X_test)
lr_report = classification_report(y_test, lr_test_predictions)
print(lr_report)
print()

In [None]:
# Logistic regression confusion matrix on the test split, drawn as an annotated seaborn heatmap.
print("Confusion Matrix for Logistic Regression Model:")
plt.figure(figsize=(5, 5))
lr_confusion = confusion_matrix(y_test, lr.predict(X_test))
# fmt='' prints the counts as they are; rows are the true labels, columns the predictions.
heatmap_ax = sns.heatmap(
    lr_confusion,
    annot=True,
    fmt='',
    annot_kws={"size": 18},
    cmap=plt.cm.winter_r,
)
_ = heatmap_ax.set_ylabel('Actual', fontweight='bold')
_ = heatmap_ax.set_xlabel('Predicted', fontweight='bold')

In [None]:
#Logistic Regression gives the best model with Accuracy 0.84