In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Library Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

## Load the dataset and validate the data load

In [None]:
# Location of the CSV inside the notebook's input directory
heart_csv_path = "../input/heart-attack-analysis-prediction-dataset/heart.csv"
heart = pd.read_csv(heart_csv_path)

# Show the first rows to confirm the file was parsed as expected
heart.head()

In [None]:
# Convert a few attributes to the pandas categorical dtype, one column at a time
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']
for column in categorical_columns:
    heart[column] = pd.Categorical(heart[column])

In [None]:
# Check the data structure: prints the column names, non-null counts and dtypes of `heart`
heart.info()

In [None]:
# Number of missing entries in each column
missing_per_column = heart.isnull().sum()
missing_per_column

## Exploratory Data Analysis

In [None]:
# Response variable: print the count of each class, then draw the same counts as bars
output_counts = heart['output'].value_counts()
print(output_counts)
sns.countplot(data=heart, x='output');

In [None]:
# One count plot per categorical feature, bars split by the response, laid out on a 2x4 grid
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall']

fig = plt.figure(figsize=(22, 18))
fig.subplots_adjust(hspace=0.4, wspace=0.4)

# Subplot positions are 1-based and filled row by row
for position, feature in enumerate(categorical_features, start=1):
    ax = fig.add_subplot(2, 4, position)
    sns.countplot(x=feature, hue='output', data=heart, ax=ax)

plt.show()

In [None]:
# Density of age for each output class, drawn in one panel per sex
sns.displot(data=heart, x="age", hue="output", col="sex", kind="kde")

# Mean age within each output class
mean_age_by_output = heart.groupby("output")['age'].mean()
print("Average Age by Output\n")
print(mean_age_by_output)
print("\n")

In [None]:
# Density of trtbps for each output class, drawn in one panel per sex
sns.displot(data=heart, x="trtbps", hue="output", col="sex", kind="kde")

# Mean trtbps within each output class
mean_trtbps_by_output = heart.groupby("output")['trtbps'].mean()
print("Average trtbps by Output\n")
print(mean_trtbps_by_output)
print("\n")

In [None]:
# Density of chol for each output class, drawn in one panel per sex
sns.displot(data=heart, x="chol", hue="output", col="sex", kind="kde")

# Mean chol within each output class
mean_chol_by_output = heart.groupby("output")['chol'].mean()
print("Average Cholesterol by Output\n")
print(mean_chol_by_output)
print("\n")

In [None]:
# Density of thalachh for each output class, drawn in one panel per sex
sns.displot(data=heart, x="thalachh", hue="output", col="sex", kind="kde")

# Mean thalachh within each output class
mean_thalachh_by_output = heart.groupby("output")['thalachh'].mean()
print("Average thalachh by Output\n")
print(mean_thalachh_by_output)
print("\n")

In [None]:
# Density of oldpeak for each output class, drawn in one panel per sex
sns.displot(data=heart, x="oldpeak", hue="output", col="sex", kind="kde")

# Mean oldpeak within each output class
mean_oldpeak_by_output = heart.groupby("output")['oldpeak'].mean()
print("Average oldpeak by Output\n")
print(mean_oldpeak_by_output)
print("\n")

In [None]:
# Pairwise correlations of the frame, rendered as a heatmap with the coefficient written in each cell
plt.figure(figsize=(18, 12))
correlations = heart.corr()
sns.heatmap(correlations, annot=True)

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the response column from the predictors
target = heart['output']
features = heart.drop(columns=['output'])

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(features, target, test_size=0.3, random_state=101)
features_train, features_test, target_train, target_test = split_parts

## Scaling the features

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same transform to both splits
sc = StandardScaler()
sc.fit(features_train)
features_train_scaled = sc.transform(features_train)
features_test_scaled = sc.transform(features_test)

## Model - Random Forest

In [None]:
# We will run the Random Forest Classifier on GridSearch for best hyperparameters

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


# Hyperparameter grid for the random forest.
# 'auto' was removed from max_features: for classifiers it was only an alias of 'sqrt'
# (every combination was fitted twice), and scikit-learn >= 1.3 rejects the value, which
# makes those fits score NaN silently because warnings are suppressed in this notebook.
# 'log2' is searched instead so the parameter still has two distinct candidates.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['entropy', 'gini'],
}

# Exhaustive search with 10-fold cross-validation; n_jobs=-2 uses all CPU cores but one.
# The estimator's random_state is fixed so repeated runs build the same forests.
rfcgrid = GridSearchCV(RandomForestClassifier(random_state=101), param_grid, verbose=100, cv=10, n_jobs=-2)

# Fitted on the unscaled training split; the matching predict call also uses unscaled test features
rfcgrid.fit(features_train, target_train)

In [None]:
# Checking the best params: the grid combination selected by the random-forest search
rfcgrid.best_params_

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

# Predict the held-out test split with the refitted best random forest
rfcpredictions = rfcgrid.predict(features_test)

# (heading, metric) pairs in the order they are reported
rfc_report = [
    ("Confusion Matrix - Random Forest", confusion_matrix(target_test, rfcpredictions)),
    ("Accuracy Score - Random Forest", accuracy_score(target_test, rfcpredictions)),
    ("F1 Score - Random Forest", f1_score(target_test, rfcpredictions)),
    ("Classification Report - Random Forest", classification_report(target_test, rfcpredictions)),
]
for position, (heading, metric) in enumerate(rfc_report):
    if position > 0:
        print("\n")  # blank separator between consecutive sections
    print(heading)
    print(metric)

## Model - SVM

In [None]:
from sklearn.svm import SVC

# Candidate values for the support vector classifier; the search tries every combination
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
}

# 10-fold cross-validated grid search, fitted on the standardised training features
svmgrid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=10, n_jobs=-2, verbose=100)
svmgrid.fit(features_train_scaled, target_train)

In [None]:
# Check for best params: the grid combination selected by the SVM search
svmgrid.best_params_

In [None]:
# Predict the held-out test split (standardised features) with the refitted best SVM
svm_predictions = svmgrid.predict(features_test_scaled)

# (heading, metric) pairs in the order they are reported
svm_report = [
    ("Confusion Matrix - Support Vector Machines", confusion_matrix(target_test, svm_predictions)),
    ("Accuracy Score - Support Vector Machines", accuracy_score(target_test, svm_predictions)),
    ("F1 Score - Support Vector Machines", f1_score(target_test, svm_predictions)),
    ("Classification Report - Support Vector Machines", classification_report(target_test, svm_predictions)),
]
for position, (heading, metric) in enumerate(svm_report):
    if position > 0:
        print("\n")  # blank separator between consecutive sections
    print(heading)
    print(metric)

## Model - Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyperparameter grid for gradient boosting, using option names accepted by current scikit-learn:
#   * loss 'deviance' was renamed 'log_loss' (old name removed in 1.3; 'log_loss' needs >= 1.1)
#   * criterion 'mse' was renamed 'squared_error' (old name removed in 1.2)
#   * criterion 'mae' was removed in 1.1 with no replacement, so it is dropped from the grid
# With the old names those candidates fail to fit and score NaN; warnings are suppressed in this
# notebook, so the search would silently pick its winner from the few surviving combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.001, 0.01, 0.1, 1, 10],
    'criterion': ['friedman_mse', 'squared_error'],
}

# 10-fold cross-validated search; n_jobs=-2 uses all CPU cores but one.
# random_state is fixed (as for the random forest) so the selected params are reproducible.
grid = GridSearchCV(GradientBoostingClassifier(random_state=101), param_grid, verbose=True, cv=10, n_jobs=-2)
grid.fit(features_train_scaled, target_train)

In [None]:
# Check the best params: the grid combination selected by the gradient-boosting search
grid.best_params_

In [None]:
# Predict the held-out test split (standardised features) with the refitted best gradient-boosting model
grid_predictions = grid.predict(features_test_scaled)

# (heading, metric) pairs in the order they are reported
gb_report = [
    ("Confusion Matrix - Gradient Boosting", confusion_matrix(target_test, grid_predictions)),
    ("Accuracy Score - Gradient Boosting", accuracy_score(target_test, grid_predictions)),
    ("F1 Score - Gradient Boosting", f1_score(target_test, grid_predictions)),
    ("Classification Report - Gradient Boosting", classification_report(target_test, grid_predictions)),
]
for position, (heading, metric) in enumerate(gb_report):
    if position > 0:
        print("\n")  # blank separator between consecutive sections
    print(heading)
    print(metric)

## We see that SVM performs best, with 88% accuracy and an F1 score of 0.88