In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Reading the DataFrame :

In [None]:
# Load the heart-attack dataset from the Kaggle input directory and preview the first rows.
heart_csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(heart_csv_path)
df.head()

## Gathering Basic Info about the Data :

In [None]:
# Print each column's dtype and non-null count to spot missing values early.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# (rows, columns) of the raw frame.
df.shape

## Exploratory Data Analysis :

In [None]:
df1 = df.copy()
# A separate copy of df is used for visualization, so the edits made during
# EDA (label mapping, row drops) never touch the original frame.

### Checking for Class Imbalance :

In [None]:
# Count how many rows fall into each class of the target column.
df1['output'].value_counts()

### *The output classes are slightly imbalanced, but not enough to be a concern*

In [None]:
# Record which columns are treated as categorical and which as numerical.
# The messages are plain text; the quotes around every column name are balanced
# and there is no trailing whitespace inside the strings.
print("Categorical Variables: 'sex', 'cp', 'fbs', 'restecg', 'exng', 'thall', 'caa', 'slp'")
print("Numerical Variables: 'age', 'trtbps', 'chol', 'thalachh', 'oldpeak'")

### *Separate research on the dataset shows that 'thall = 0' and 'caa = 4' are not valid categories, so such values are replaced by the median of the column*

In [None]:
# Show the rows whose `thall` value is 0.
df1[df1['thall'] == 0]

In [None]:
# Show the rows whose `caa` value is 4.
df1[df1['caa'] == 4]

In [None]:
# Replace thall == 0 with 2.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df1['thall']`: that form mutates a temporary
# Series and leaves df1 unchanged when pandas Copy-on-Write is active.
df1['thall'] = df1['thall'].replace({0: 2})

In [None]:
# Replace caa == 4 with 1.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df1['caa']`: that form mutates a temporary
# Series and leaves df1 unchanged when pandas Copy-on-Write is active.
df1['caa'] = df1['caa'].replace({4: 1})

### Displaying the unique category values of the categorical features

In [None]:
# Print the distinct codes present in each listed column, one array per line.
columns_to_inspect = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'thall', 'slp', 'output']
for column_name in columns_to_inspect:
    print(df1[column_name].unique())

### Finding Correlations

In [None]:
# Pairwise correlation matrix of all columns (still integer-coded at this point).
corr_df1 = df1.corr()

# Rank every column by its correlation with the target, strongest positive first.
output_correlations = corr_df1['output']
output_correlations.sort_values(ascending=False)

In [None]:
# Annotated heatmap of the correlation matrix on an explicitly created 12x10 figure.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_df1, annot=True, ax=ax)

### *Based on the correlation data, chest pain (cp), thalachh and slp have a high positive correlation with the chances of a heart attack.*

### *Similarly, caa, exng, oldpeak and thall have a high negative correlation with the chances of a heart attack.*

### *Sex and age seem to moderately affect the chances of a heart attack.*

### *Cholesterol and tbs seem to have a slight effect on the chances of a heart attack.*

### Replacing category codes with their names for easier visualizations

In [None]:
# Human-readable labels for each integer-coded column, keyed by column name.
# NOTE: this cell is not idempotent. Running it a second time maps the
# already-replaced strings to NaN, so re-create df1 before re-running it.
category_labels = {
    'sex': {1: "Male", 0: "Female"},
    'cp': {3: "Asymptomatic", 2: "Non-Anginal", 1: "Atypical", 0: "Typical"},
    'fbs': {1: ">=120", 0: "<120"},
    'restecg': {
        0: "Normal",
        1: " ST-T wave abnormality",
        2: "probable or definite left ventricular hypertrophy",
    },
    'exng': {1: "Yes", 0: "No"},
    'thall': {1: "Fixed Defect", 2: "Normal", 3: "Reversible Defect"},
    'slp': {0: "Downsloping", 1: "Flat", 2: "Upsloping"},
    'output': {1: "High chances", 0: "Low Chances"},
}

for column_name, labels in category_labels.items():
    df1[column_name] = df1[column_name].map(labels)

In [None]:
# Preview the frame after the integer codes were replaced by their labels.
df1.head()

### Histogram Plots for the Numerical Variables against Output Label :

In [None]:
# Age distribution in 30 bins, coloured by the outcome label.
ax = sns.histplot(data=df1, x='age', hue='output', bins=30)
ax.set_title("Heart Attack Counts w.r.t Age");

People aged roughly 37-55 show higher chances of a heart attack, while those aged 55-70 show lower chances.

In [None]:
# Resting blood pressure distribution in 40 bins, coloured by the outcome label.
ax = sns.histplot(data=df1, x='trtbps', hue='output', bins=40)
ax.set_title("Heart Attack Counts w.r.t Resting Blood Pressure");

Resting blood pressure in the range of 120-140 have higher chances of heart attacks.

In [None]:
# Cholesterol distribution in 40 bins, coloured by the outcome label.
ax = sns.histplot(data=df1, x='chol', hue='output', bins=40)
ax.set_title("Heart Attack Counts w.r.t Cholesterol Level in mg/dl");

Cholesterol Level in the range of 200 - 270 have higher chances of heart Attacks

In [None]:
# `oldpeak` distribution in 30 bins, coloured by the outcome label.
ax = sns.histplot(data=df1, x='oldpeak', hue='output', bins=30)
ax.set_title("Heart Attack Counts w.r.t ST depression induced by exercise relative to rest");

Range between 0-0.8 have higher chances of Heart Attacks

### Plotting CountPlots for the Categorical Variables against Output Label :

In [None]:
# Outcome counts, split by sex, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='sex', ax=ax)
ax.set_title("Heart Attack Counts w.r.t Sex");

Males have higher chances of Heart Attacks. But this attribute shouldn't be a deciding factor

In [None]:
# Outcome counts, split by chest-pain type, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='cp', ax=ax)
ax.set_title("Heart Attack Counts w.r.t Chest Pain");

Non-anginal chest pain is associated with more heart attacks than the other types of pain. Typical pain is strongly associated with lower chances of a heart attack.

In [None]:
# Outcome counts, split by fasting blood sugar group, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='fbs', ax=ax)
ax.set_title("Heart Attack Counts w.r.t Fasting Blood Sugar");

No firm conclusion can be derived from here. Also evident from its low correlation (-0.028..) with chances of heart attack

In [None]:
# Outcome counts, split by resting ECG result, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='restecg', ax=ax)
ax.set_title("Heart Attack Counts w.r.t Resting ECG Results");

Having an ST-T wave abnormality increases the chances of a heart attack.

In [None]:
# Outcome counts, split by exercise-induced angina, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='exng', ax=ax)
ax.set_title("Heart Attack Counts w.r.t Exercise Induced Angina");

Absence of exercise-induced angina increases the chances of a heart attack.

In [None]:
# Outcome counts, split by `thall` category, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='thall', ax=ax)
ax.set_title("Heart Attack Counts w.r.t Thalium Stress Test Results");

Thallium Test Results indicating Type 2 shows higher chances of Heart Attacks

In [None]:
# Outcome counts, split by `slp` category, on an explicitly created 8x5 figure.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df1, x='output', hue='slp', ax=ax)
ax.set_title("Heart Attack Counts w.r.t The slope of the peak exercise ST segment");

Upsloping of the peak exercise ST segment is associated with more heart attacks.

### Checking for Duplicate Rows :

In [None]:
# Show every row that exactly repeats an earlier row (the first occurrence is not flagged).
df1[df1.duplicated()]

### *Seems there is 1 row of duplicate data. We'll remove that*

In [None]:
# Keep only the first occurrence of each repeated row, then preview the result.
df1 = df1.drop_duplicates()
df1.head()

### Checking for significant outliers

In [None]:
# Box plot of cholesterol to look for outliers.
# The Series is passed explicitly as `x`: a bare positional argument meant `x`
# in older seaborn releases and means `data` in newer ones, so the plot's
# orientation would otherwise depend on the installed version.
sns.boxplot(x=df1['chol'])

In [None]:
# Box plot of `thalachh` to look for outliers.
# The Series is passed explicitly as `x`: a bare positional argument meant `x`
# in older seaborn releases and means `data` in newer ones, so the plot's
# orientation would otherwise depend on the installed version.
sns.boxplot(x=df1['thalachh'])

In [None]:
# Box plot of resting blood pressure (`trtbps`) to look for outliers.
# The Series is passed explicitly as `x`: a bare positional argument meant `x`
# in older seaborn releases and means `data` in newer ones, so the plot's
# orientation would otherwise depend on the installed version.
sns.boxplot(x=df1['trtbps'])

### *There seems to be one concerning outlier in the cholesterol boxplot, with chol > 500, so we'll remove the row holding that value. The outliers of the other features don't seem to be of much concern*

In [None]:
# Remove every row whose cholesterol reading exceeds 500.
chol_outlier_index = df1.index[df1['chol'] > 500]
df1.drop(index=chol_outlier_index, inplace=True)

In [None]:
# Largest remaining cholesterol value; confirms the rows above 500 are gone.
df1['chol'].max()

## Moving to Prediction 

In [None]:
# Start modelling from a fresh copy of the raw frame: df1 now holds string
# labels from the EDA section, whereas df2 keeps the original numeric codes.
df2 = df.copy()
df2.head()

In [None]:
# (rows, columns) before any rows are removed.
df2.shape

### *Removing the cholesterol outlier and 1 duplicate row found during EDA*

In [None]:
# Keep only the first occurrence of each repeated row, then preview the result.
df2 = df2.drop_duplicates()
df2.head()

In [None]:
# Remove every row whose cholesterol reading exceeds 500.
chol_outlier_index = df2.index[df2['chol'] > 500]
df2.drop(index=chol_outlier_index, inplace=True)

### Now separating the data into input and output variables :

In [None]:
# Features are every column except the target; the target is the `output` column.
X = df2.drop(columns='output')
y = df2['output']

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()

In [None]:
# Print the shape of the features and of the target, in that order.
for part in (X, y):
    print(part.shape)

## Splitting into Training and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=101)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Print the shapes of the four split parts: train X, train y, test X, test y.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

## Scaling the Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling parameters from the training features only, then apply
# them to that same training set.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Display the scaled training array.
X_train_scaled

In [None]:
# Apply the scaler that was fitted on the training data to the test features
# (transform only, no refit), then display the result.
X_test_scaled = scaler.transform(X_test)
X_test_scaled

In [None]:
# Print the shape of the scaled training set, then of the scaled test set.
for scaled_part in (X_train_scaled, X_test_scaled):
    print(scaled_part.shape)

## Defining Machine Learning Models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import VotingClassifier

# Base estimators. The tree-based and boosting models draw random numbers
# while training, so they are seeded with the same value used for the
# train/test split (101); otherwise their accuracies change on every run.
kn_clf = KNeighborsClassifier()
log_clf = LogisticRegression()
svm_clf = SVC()
dt_clf = DecisionTreeClassifier(random_state=101)
rnd_clf = RandomForestClassifier(random_state=101)
ada_clf = AdaBoostClassifier(random_state=101)
gdb_clf = GradientBoostingClassifier(random_state=101)

# (name, estimator) pairs, shared by the voting ensemble and the evaluation loop.
estimators_list = [
    ('KNeighborsClassifier', kn_clf),
    ('LogisticRegression', log_clf),
    ('SVC', svm_clf),
    ('DecisionTreeClassifier', dt_clf),
    ('RandomForestClassifier', rnd_clf),
    ('AdaBoostClassifier', ada_clf),
    ('GradientBoostingClassifier', gdb_clf),
]

# Hard voting: the ensemble predicts the class chosen by the majority of the
# base estimators.
voting_clf = VotingClassifier(estimators=estimators_list, voting='hard')
voting_clf.fit(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each base estimator on the scaled training data and report its accuracy
# on the held-out test set. The hard-voting ensemble is appended to the list
# so that it is evaluated too and can be compared with its members.
for name, clf in estimators_list + [('VotingClassifier', voting_clf)]:
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    print(name, accuracy_score(y_test, y_pred))

### *Considering Random Forest as our Best Model:*

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

# Train a seeded random forest on the scaled training data and predict the test set.
rf = RandomForestClassifier(random_state=101)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)

# Gather the test-set metrics first, then print them in a fixed order.
rf_report = [
    ("Accuracy:", accuracy_score(y_test, rf_pred)),
    ("Confusion matrix:\n", confusion_matrix(y_test, rf_pred)),
    ("Precision Score:\n", precision_score(y_test, rf_pred)),
    ("Recall Score:\n", recall_score(y_test, rf_pred)),
]
for label, value in rf_report:
    print(label, value)

### *We need a higher Recall score compared to the Precision score. So the above results are good enough*

### ROC-AUC Score:

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

# 5-fold cross-validated class probabilities on the training set: each row is
# scored by a forest that was trained on the other folds.
y_proba = cross_val_predict(rf, X_train_scaled, y_train, cv=5, method='predict_proba')
# Second probability column, presumably the positive class given the 0/1 labels.
y_scores = y_proba[:, 1]
roc_auc_score(y_train, y_scores)


In [None]:
# ROC-AUC on the held-out test set, scored by the forest that was fitted on
# the training data. Cross-validating on the test set would instead train
# fresh forests on test folds and would not measure the fitted model.
y_proba = rf.predict_proba(X_test_scaled)
# Second probability column: the class listed second in rf.classes_.
y_scores = y_proba[:, 1]
roc_auc_score(y_test, y_scores)


### Predicting on a random instance:

In [None]:
# Take the scaled test row at position 42 and reshape it to a single-row 2-D
# array before handing it to the model.
X_test_random = X_test_scaled[42].reshape(1,-1)
X_test_random

In [None]:
# True label at the same position (42) in the test set, for comparison.
y_test_random = y_test.iloc[42]
y_test_random

In [None]:
# Predicted label for the selected row; compare it with y_test_random.
print(rf.predict(X_test_random))