<div align="center">

# Heart Attack Extensive EDA, Visualizations and Prediction
</div>

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score, precision_score, classification_report, plot_confusion_matrix, roc_curve, auc

import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Read the heart-attack CSV into a DataFrame and preview its first rows.
csv_path = "../input/heart-attack-analysis-prediction-dataset/heart.csv"
data = pd.read_csv(csv_path)
data.head()

## Exploring the Data

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

## Exploratory Data Analysis

### Checking the Data Distribution

In [None]:
# Share of each target class as a percentage of all rows.
val_counts = data["output"].value_counts()
total_rows = data.shape[0]
no_heart_attack = (val_counts[0] / total_rows) * 100
heart_attack = (val_counts[1] / total_rows) * 100

# Round both figures the same way; mixing floor for one class and ceil for
# the other can misreport the split (e.g. 54.6 / 45.4 would print 54 / 46).
print(f"Heart Attack: {heart_attack:.0f}%")
print(f"No Heart Attack: {no_heart_attack:.0f}%")

print()

sns.barplot(x=["No Heart Attack", "Heart Attack"], y=[no_heart_attack, heart_attack])
plt.show()

<b>Observation: </b>The Heart Attack percentage is 54 and the No Heart Attack percentage is 46. So, the dataset is reasonably balanced and there is no need to rebalance the data.

## Bivariate Analysis

### Plots for columns with categorical values

<b>Checking the distribution of Sex with respect to Heart Disease</b>

In [None]:
# Record counts per sex, split by target class.
# plt.figure returns a Figure, so it is not bound to a name like `ax`
# (the binding was unused and misleading).
plt.figure(figsize=(9, 5))
sns.countplot(data=data, x="sex", hue="output")
plt.title("Distribution of Gender w.r.t Heart Attack")
plt.show()

In [None]:
# Heart-attack rate within each sex group; this cell labels sex == 1 as
# male and sex == 0 as female.
val_counts = data.groupby("sex")["output"].value_counts()
male_counts = val_counts[1]
female_counts = val_counts[0]

male_heart_attack_percentage = male_counts[1] / (male_counts[1] + male_counts[0]) * 100
female_heart_attack_percentage = female_counts[1] / (female_counts[1] + female_counts[0]) * 100

for label, rate in (
    ("Male", male_heart_attack_percentage),
    ("Female", female_heart_attack_percentage),
):
    print(f"{label} heart Attack: {rate}%")

In [None]:
# Horizontal bars of heart-attack rate by sex, each annotated with its value.
plt.figure(figsize=(9, 5))
x = ["Male", "Female"]
y = [male_heart_attack_percentage, female_heart_attack_percentage]
plt.barh(x, y)

# Put a label at the tip of each bar: first four characters of the rate.
for bar_pos, rate in enumerate(y):
    label = str(rate)[:4] + "%"
    plt.text(rate, bar_pos, label)

plt.title("Gender vs Heart Attack")
plt.show()

<b>Observation: </b>The heart attack percentage for females is 75% and for males is around 45%, i.e. females have a higher chance of having a heart attack.

<b>Checking the distribution of Exercise Induced Angina (exng) with respect to Heart Disease</b>

In [None]:
# Record counts per exercise-induced-angina flag, split by target class.
plt.figure(figsize=(9, 5))
sns.countplot(x="exng", hue="output", data=data)
plt.title("Distribution of Exercise Induced Angina w.r.t Heart Attack")
plt.show()

In [None]:
# Heart-attack rate for records with (exng == 1) and without (exng == 0)
# exercise-induced angina.
exang_val_counts = data.groupby("exng")["output"].value_counts()
with_exang = exang_val_counts[1]
without_exang = exang_val_counts[0]

exang_heart_attack_percentage = with_exang[1] / (with_exang[1] + with_exang[0]) * 100
no_exang_heart_attack_percentage = without_exang[1] / (without_exang[1] + without_exang[0]) * 100

print(f"Exang heart Attack: {exang_heart_attack_percentage}%")
print(f"No Exang heart Attack: {no_exang_heart_attack_percentage}%")

# Horizontal bar chart of the two rates, annotated with their values.
plt.figure(figsize=(9, 5))
x = ["Exang", "No Exang"]
y = [exang_heart_attack_percentage, no_exang_heart_attack_percentage]
plt.barh(x, y)

for bar_pos, rate in enumerate(y):
    label = str(rate)[:4] + "%"
    plt.text(rate, bar_pos, label)

plt.title("Exercise Induced Angina vs Heart Attack")
plt.show()

<b>Observation: </b>From the above data we can see that people having Exercise Induced Angina have a heart attack percentage of 23.2%, whereas people with no Exercise Induced Angina have a heart attack percentage of 69.6%. Therefore, people with Exercise Induced Angina have a lower chance of getting a heart attack.

<b>Checking the distribution of Chest Pain with respect to Heart Disease</b>

In [None]:
# Record counts per chest-pain type, split by target class.
plt.figure(figsize=(9, 5))
sns.countplot(x="cp", hue="output", data=data)
plt.title("Distribution of Chest Pain w.r.t Heart Attack")
plt.show()

In [None]:
# Heart-attack rate for each of the four chest-pain codes (0-3).
cp_val_counts = data.groupby("cp")["output"].value_counts()

x, y = [], []
for cp_code in range(4):
    counts = cp_val_counts[cp_code]
    temp_per = counts[1] / (counts[1] + counts[0]) * 100
    x.append("CP_" + str(cp_code))
    y.append(temp_per)
    print(f"CP_{cp_code} Heart Attack Percentage: {temp_per}%")

print()

# Plot in reverse order so CP_0 ends up as the top bar.
labels_reversed = list(reversed(x))
rates_reversed = list(reversed(y))

plt.figure(figsize=(9, 5))
plt.barh(labels_reversed, rates_reversed)

for bar_pos, rate in enumerate(rates_reversed):
    plt.text(rate, bar_pos, str(rate)[:4] + "%")

plt.title("Chest Pain vs Heart Attack")
plt.show()

<ul>
    <li>Value 0: typical angina</li>
    <li>Value 1: atypical angina</li>
    <li>Value 2: non-anginal pain</li>
    <li>Value 3: asymptomatic</li>
</ul>
<br/>
<b>Observation: </b> People with chest pain type 1 (atypical angina) are more prone to heart attack when compared to other chest pain categories.

<b>Checking the distribution of Fasting Blood Sugar with respect to Heart Disease</b>

In [None]:
# Record counts per fasting-blood-sugar flag, split by target class.
plt.figure(figsize=(9, 5))
sns.countplot(x="fbs", hue="output", data=data)
plt.title("Distribution of Fasting Blood Sugar w.r.t Heart Attack")
plt.show()

In [None]:
# Heart-attack rate for records with (fbs == 1) and without (fbs == 0) the
# fasting-blood-sugar flag.
fbs_val_counts = data.groupby("fbs")["output"].value_counts()
fbs_heart_attack_percentage = (fbs_val_counts[1][1] / (fbs_val_counts[1][1]+fbs_val_counts[1][0])) * 100
no_fbs_exang_heart_attack_percentage = (fbs_val_counts[0][1] / (fbs_val_counts[0][1]+fbs_val_counts[0][0])) * 100

# Print the FBS rates computed above, not the exercise-angina rates left
# over from the earlier cell.
print(f"FBS heart Attack: {fbs_heart_attack_percentage}%")
print(f"No FBS heart Attack: {no_fbs_exang_heart_attack_percentage}%")

print()

# Horizontal bar chart of the two rates, annotated with their values.
plt.figure(figsize = (9,5))
x = ["FBS","No FBS"]
y = [fbs_heart_attack_percentage,no_fbs_exang_heart_attack_percentage]
plt.barh(x, y)

for index, value in enumerate(y):
    plt.text(value, index,
             str(value)[:4]+"%")

plt.title("Fasting Blood Sugar(FBS) vs Heart Attack")
plt.show()

<b>Observation: </b>There is not much difference between people with FBS and people without FBS; the difference is only about 4.9%.

<b>Checking the distribution of resting electrocardiographic results with respect to Heart Disease</b>

In [None]:
# Record counts per resting-ECG code, split by target class.
plt.figure(figsize=(9, 5))
sns.countplot(x="restecg", hue="output", data=data)
plt.title("Distribution of Resting Electrocardiographic Results w.r.t Heart Attack")
plt.show()

# Heart-attack rate for each of the three resting-ECG codes (0-2).
restecg_val_counts = data.groupby("restecg")["output"].value_counts()

x, y = [], []
for ecg_code in range(3):
    counts = restecg_val_counts[ecg_code]
    temp_per = counts[1] / (counts[1] + counts[0]) * 100
    x.append("RESTECG_" + str(ecg_code))
    y.append(temp_per)
    print(f"RESTECG_{ecg_code} Heart Attack Percentage: {temp_per}%")

print()

# Plot in reverse order so RESTECG_0 ends up as the top bar.
labels_reversed = list(reversed(x))
rates_reversed = list(reversed(y))

plt.figure(figsize=(9, 5))
plt.barh(labels_reversed, rates_reversed)

for bar_pos, rate in enumerate(rates_reversed):
    plt.text(rate, bar_pos, str(rate)[:4] + "%")

plt.title("Resting Electrocardiographic Results vs Heart Attack")
plt.show()

<ul>
    <li>RESTECG_0: normal</li>
    <li>RESTECG_1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)</li>
    <li>RESTECG_2: showing probable or definite left ventricular hypertrophy by Estes' criteria</li>
</ul>
<b>Observation: </b>People with RESTECG 1 are more prone to heart attack, followed by RESTECG 0 and RESTECG 2.

### Plots for columns with Continuous values

<b>Checking the distribution of Age with respect to Heart Disease</b>

In [None]:
# Age histogram (2-year bins), split by target class.
# plt.figure returns a Figure, so it is not bound to a name like `ax`
# (the binding was unused and misleading).
plt.figure(figsize=(9, 5))
sns.histplot(data=data, x="age", hue="output", binwidth=2)
plt.title("Distribution of Age w.r.t Heart Attack")
plt.show()

<b>Observation: </b>People with age between 50-70 years are more prone to Heart Attack.

In [None]:
# Continuous features plus the target, which is used only for colouring.
continuous_var_cols = [
    "age",
    "trtbps",
    "chol",
    "thalachh",
    "oldpeak",
    "output",
]

# Pairwise scatter/density grid of the continuous columns, coloured by target.
continuous_subset = data[continuous_var_cols]
sns.pairplot(continuous_subset, hue="output")
plt.show()

<b>Observation: </b>In the above plot we can see that no two columns are highly correlated. Hence, there is no need for feature elimination among the columns with continuous data.

<b>Distribution plot for all columns with continuous variables</b>

In [None]:
# One histogram + KDE per continuous feature, split by target class.
# Exclude the target by name rather than relying on it being the last entry.
feature_cols = [col for col in continuous_var_cols if col != "output"]

for col in feature_cols:
    plt.figure(figsize=(9, 5))
    # No fixed binwidth here: binwidth is expressed in data units, so one
    # value cannot suit columns with different ranges. Let seaborn pick the
    # bins for each column instead.
    sns.histplot(
        data=data,
        x=col,
        hue="output",
        kde=True,
    )
    plt.title(f"Distribution of {col.upper()} w.r.t Heart Attack")
    plt.show()
    print()

<b>Checking Correlation Among Attributes</b>

In [None]:
# Pairwise correlations between the continuous columns and the target,
# drawn as an annotated heatmap centred on zero.
continuous_subset = data[continuous_var_cols]
corr_matrix = continuous_subset.corr()

plt.figure(figsize=(11, 7))
sns.heatmap(corr_matrix, annot=True, center=0)
plt.title("Correlation among Attributes")
plt.show()

## Building a Predictive Model

In [None]:
# Separate the target from the features, then hold out 20% of the rows for
# testing; the fixed seed keeps the split repeatable.
y = data["output"]
X = data.drop(columns=["output"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

for split_name, features, target in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name} Data: {features.shape}, {target.shape}")

### Checking for best baseline model

In [None]:
# Candidate baseline classifiers. Every estimator that accepts a seed gets
# one so the comparison is repeatable across runs (DecisionTree and AdaBoost
# were previously unseeded).
all_models = {
    "LogisticRegressionModel": LogisticRegression(),
    "RandomForestModel": RandomForestClassifier(random_state=18),
    "XGBoostClassifier": XGBClassifier(random_state=18),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=15),
    "SVMClassifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=18),
    "AdaBoostClassifier": AdaBoostClassifier(
        base_estimator=RandomForestClassifier(random_state=18),
        random_state=18,
    ),
}

model_names = []
model_scores = []

# Fit each model behind a StandardScaler and score it on the held-out test set.
for model_name, model in all_models.items():
    pipeline = make_pipeline(StandardScaler(), model)
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    model_names.append(model_name)
    model_scores.append(accuracy * 100)
    # Format numerically; slicing the string form mis-prints values such as 100.0.
    print(f"{model_name} got {accuracy * 100:.2f}% Accuracy.")

# Horizontal bar chart of test accuracy per model, annotated with the value.
plt.figure(figsize=(9, 5))
plt.barh(model_names, model_scores)

for index, value in enumerate(model_scores):
    plt.text(value, index, f"{value:.1f}%")

plt.title("Models vs Accuracy")
plt.show()

From the above results we can see that the Random Forest Classifier performs better than the other models. So, let us go ahead with RandomForestClassifier.

### Optimizing Random Forest Classifier

In [None]:
# Search space for the random-forest hyperparameter search.
param_dist = {
    'max_depth': [3, 5, 7, 10, None],
    'n_estimators': [10, 50, 100, 150, 200, 250, 300, 400, 500],
    # randint's upper bound is exclusive. Tie it to the number of feature
    # columns so no candidate asks for more features per split than exist.
    'max_features': randint(1, X_train.shape[1] + 1),
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_leaf': randint(1, 15),
}

In [None]:
# Standardize the training features (scaler fitted on the training split
# only); the result feeds the hyperparameter search in the next cell.
train_x = StandardScaler().fit_transform(X_train)

In [None]:
# Randomized search over param_dist: 100 sampled candidates, 5-fold CV each.
rf_classifier = RandomForestClassifier(n_jobs=-1, random_state=18)
search_clfr = RandomizedSearchCV(
    rf_classifier,
    param_distributions=param_dist,
    n_jobs=-1,
    n_iter=100,
    cv=5,
    # Seed the candidate sampling so the search (and therefore the reported
    # best parameters) is repeatable across runs.
    random_state=18,
)
search_clfr.fit(train_x, y_train)

In [None]:
# Report the best parameter set and its score from the search, then display
# the refitted best estimator as the cell output.
params, score = search_clfr.best_params_, search_clfr.best_score_
for item in (params, score):
    print(item)
search_clfr.best_estimator_

In [None]:
# Final random forest with hand-pinned hyperparameters.
# NOTE(review): presumably copied from an earlier search run — confirm they
# still match search_clfr.best_params_.
final_rf_params = dict(
    bootstrap=False,
    max_depth=10,
    max_features=2,
    min_samples_leaf=7,
    n_jobs=-1,
    random_state=18,
)
final_rf_model = RandomForestClassifier(**final_rf_params)

# Scale, then classify; fit on the raw training split.
prediction_pipeline = make_pipeline(StandardScaler(), final_rf_model)
prediction_pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the tuned pipeline on the held-out test set.
prediction = prediction_pipeline.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, prediction)*100}")
print(f"Precision: {precision_score(y_test, prediction)}")

# Predicted probability of the positive class (column 1) drives the ROC curve.
class_probabilities = prediction_pipeline.predict_proba(X_test)
preds = class_probabilities[:, 1]

fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)

print(f"AUC for our classifier is: {roc_auc}")

print()

# Plotting the ROC
plt.figure(figsize = (9,7))
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

print()

# Confusion matrix for the same tuned pipeline evaluated above. Use
# prediction_pipeline, not `pipeline`, which is the last model left over
# from the baseline comparison loop.
fig, ax = plt.subplots(figsize=(11, 7))
plot_confusion_matrix(prediction_pipeline, X_test, y_test, ax=ax)
plt.grid(False)
plt.title("Confusion Matrix")
plt.show()

## Conclusion

After extensive exploratory data analysis (EDA) and experimenting with a variety of classification models, these were the observations:

1. The heart attack percentage for females is 75% and for males is around 45%, i.e. females have a higher chance of having a heart attack.
2. People having Exercise Induced Angina have a heart attack percentage of 23.2%, whereas people with no Exercise Induced Angina have a heart attack percentage of 69.6%. Therefore, people with Exercise Induced Angina have a lower chance of getting a heart attack.
3. People with chest pain type 1 (atypical angina) are more prone to heart attack.
4. People with RESTECG 1 are more prone to heart attack, followed by RESTECG 0 and RESTECG 2.
5. Younger people have a higher chance of heart attack.
6. People with age between 50-70 years are more prone to Heart Attack.
7. Out of all the classification models tried, the Random Forest Classifier performed better than the other models, with an AUC of 0.942, an accuracy of 90.16% and a precision of 0.86.

<table>
    <tr>
        <th>Metrics</th>
        <th>Random Forest Classifier</th>
    </tr>
    <tr>
        <th>Accuracy</th>
        <td>90.16%</td>
    </tr>
    <tr>
        <th>Precision</th>
        <td>0.86</td>
    </tr>
    <tr>
        <th>AUC</th>
        <td>0.942</td>
    </tr>
</table>
<br/>

<div align="center">
<b>PLEASE DO UPVOTE IF YOU LIKED THIS NOTEBOOK</b>
</div>