In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score , confusion_matrix
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the heart-attack dataset from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(DATA_PATH)
df.head()  # head() shows the first 5 rows by default

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Frames read by the EDA helpers: the full table plus one subset per target class.
df_all = df.copy()
df_more = df.loc[df['output'] == 1]  # rows with output == 1 (plotted as "More Chance")
df_less = df.loc[df['output'] == 0]  # rows with output == 0 (plotted as "Less Chance")

In [None]:
def plot_pie(column, title="All"):
    """Show a pie chart of how often each value of ``column`` occurs in ``df_all``.

    Parameters
    ----------
    column : label of the ``df_all`` column to summarise.
    title : text placed above the chart.
    """
    _, ax = plt.subplots(1, 1)
    counts = df_all[column].value_counts()
    # Each wedge is labelled with its value and its share to two decimals.
    ax.pie(counts, autopct='%1.2f%%', labels=counts.index)
    ax.set_title(title)
    plt.show()
    
def plot_hist(column, title="all"):
    """Show a density-normalised histogram of ``df_all[column]``.

    Parameters
    ----------
    column : label of the ``df_all`` column to plot.
    title : text placed above the chart.
    """
    values = df_all[column]
    ax = plt.gca()
    # density=True rescales the bars so the histogram integrates to 1.
    ax.hist(values, density=True)
    ax.set_title(title)
    plt.show()

def plot_bar(column, sort=False, title="all"):
    """Show a bar chart of value frequencies for ``df_all[column]``.

    Parameters
    ----------
    column : label of the ``df_all`` column to summarise.
    sort : when true, order the bars by value instead of by frequency.
    title : text placed above the chart.
    """
    counts = df_all[column].value_counts()
    if sort:
        counts = counts.sort_index()
    plt.bar(counts.index, counts)
    plt.title(title)
    plt.show()
    
def plot_bar_compare(column, sort=False):
    """Show stacked bar charts of value frequencies for the two outcome groups.

    The top panel is drawn from ``df_less`` and the bottom panel from
    ``df_more``. The panels are titled 'Less Chance' / 'More Chance' to match
    the other comparison helpers in this notebook.

    Parameters
    ----------
    column : label of the column to summarise in both groups.
    sort : when true, order the bars by value instead of by frequency.
    """
    counts_more = df_more[column].value_counts()
    counts_less = df_less[column].value_counts()
    if sort:
        counts_more = counts_more.sort_index()
        counts_less = counts_less.sort_index()

    fig, axs = plt.subplots(2, 1)
    # top=2 stretches the figure so the two stacked panels are not squashed.
    plt.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    axs[0].bar(counts_less.index, counts_less)
    axs[0].set_title('Less Chance')
    axs[1].bar(counts_more.index, counts_more)
    axs[1].set_title('More Chance')
    plt.show()

def plot_hist_compare(column, bins=5):
    """Show histograms of ``column`` for both outcome groups on one chart.

    ``df_less`` is drawn in cyan and ``df_more`` in red.

    Parameters
    ----------
    column : label of the column to plot for both groups.
    bins : number of bins (or bin edges) forwarded to ``plt.hist``.
    """
    # Forward `bins`; it was previously accepted but silently ignored.
    plt.hist([df_less[column], df_more[column]], bins=bins, color=['c', 'r'])
    plt.legend(('Less Chance', 'More Chance'))
    plt.show()
    
def plot_pie_compare(column):
    """Show stacked pie charts of ``column`` for the two outcome groups.

    The top pie is drawn from ``df_less`` ('Less Chance') and the bottom pie
    from ``df_more`` ('More Chance').

    Parameters
    ----------
    column : label of the column to summarise in both groups.
    """
    counts_more = df_more[column].value_counts()
    counts_less = df_less[column].value_counts()

    _, (ax_less, ax_more) = plt.subplots(2, 1)
    # top=2 stretches the figure so the two stacked pies are not squashed.
    plt.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)

    ax_less.pie(counts_less, autopct='%1.2f%%', labels=counts_less.index)
    ax_less.title.set_text('Less Chance')

    ax_more.pie(counts_more, autopct='%1.2f%%', labels=counts_more.index)
    ax_more.title.set_text('More Chance')

    plt.show()

def plot_boxplot(column, title=""):
    """Draw a box plot of ``df_all[column]`` for each value of ``output``.

    Parameters
    ----------
    column : label of the ``df_all`` column placed on the y axis.
    title : text placed above the chart.
    """
    ax = sns.boxplot(
        data=df_all,
        x="output",
        y=column,
        hue="output",
        palette=["c", "r"],
    )
    ax.set_title(title, fontsize=15)

def check_median(column):
    """Print the median of ``column`` for the less-chance and more-chance groups.

    The median is read from the '50%' entry of ``Series.describe()``.

    Parameters
    ----------
    column : label of the column to summarise in ``df_less`` and ``df_more``.
    """
    more_stats = df_more[column].describe()
    less_stats = df_less[column].describe()
    for label, stats in (('Less Chance', less_stats), ('More Chance', more_stats)):
        print(f'{label}: {stats["50%"]}')

def check_most(column):
    """Print the most frequent value of ``column`` in each outcome group.

    ``value_counts()`` orders values by descending frequency, so the first
    index entry is the most common value.

    Parameters
    ----------
    column : label of the column to summarise in ``df_less`` and ``df_more``.
    """
    more_counts = df_more[column].value_counts()
    less_counts = df_less[column].value_counts()
    for label, counts in (('Less Chance', less_counts), ('More Chance', more_counts)):
        print(f'{label}: {counts.index[0]}')

# Checking for imbalanced data

In [None]:
# Pie chart of the target column to see how the two classes are split.
plot_pie('output')

The dataset is balanced

# Check for missing data

In [None]:
# Number of missing values in each column.
df.isnull().sum()

No missing data found

# EDA

## age

In [None]:
# Histogram of age for all rows, then split by outcome group.
plot_hist('age')
plot_hist_compare('age')

In [None]:
# Median age in each outcome group.
check_median('age')

No Clear Difference

## sex

In [None]:
# Share of each sex label overall, then within each outcome group.
plot_pie('sex')
plot_pie_compare('sex')

The share of sex label 0 in the "more chance" group is 26.25 percentage points higher than in the "less chance" group.

## cp

In [None]:
# Share of each cp value overall, then within each outcome group.
plot_pie('cp')
plot_pie_compare('cp')

In [None]:
# Most frequent cp value in each outcome group.
check_most('cp')

cp 2 dominates the "more chance" group and cp 0 dominates the "less chance" group

## trtbps 

In [None]:
# Histogram of trtbps for all rows.
plot_hist('trtbps')

In [None]:
# Box plot of trtbps for each value of output.
plot_boxplot('trtbps')

In [None]:
# Median trtbps in each outcome group.
check_median('trtbps')

No Clear Difference

## chol 

In [None]:
# Histogram of chol for all rows, then split by outcome group.
plot_hist('chol')
plot_hist_compare('chol')

In [None]:
# Box plot of chol for each value of output.
plot_boxplot('chol')

In [None]:
# Median chol in each outcome group.
check_median('chol')

## fbs

In [None]:
# Share of each fbs value overall, then within each outcome group.
plot_pie("fbs")
plot_pie_compare('fbs')

In [None]:
# Most frequent fbs value in each outcome group.
check_most('fbs')

No Clear Difference

## restecg

In [None]:
# Share of each restecg value overall, then within each outcome group.
plot_pie("restecg")
plot_pie_compare('restecg')

In [None]:
# Most frequent restecg value in each outcome group.
check_most("restecg")

less chance group is dominated by restecg 0 and more chance group is dominated by restecg 1

## thalachh

In [None]:
# Histogram of thalachh for all rows, then split by outcome group.
plot_hist('thalachh')
plot_hist_compare('thalachh')

In [None]:
# Box plot of thalachh for each value of output.
plot_boxplot('thalachh')

In [None]:
# Median thalachh in each outcome group.
check_median('thalachh')

the value of thalachh (maximum heart rate achieved) from "more chance" group is higher than "less chance" group

## exng

In [None]:
# Share of each exng value overall, then within each outcome group.
plot_pie('exng')
plot_pie_compare('exng')

In [None]:
# Most frequent exng value in each outcome group.
check_most('exng')

less chance group is dominated by exng 1 and more chance group is dominated by exng 0

## oldpeak

In [None]:
# Histogram of oldpeak for all rows, then split by outcome group.
plot_hist('oldpeak')
plot_hist_compare('oldpeak')

In [None]:
# Box plot of oldpeak for each value of output.
plot_boxplot('oldpeak')

In [None]:
# Median oldpeak in each outcome group.
check_median('oldpeak')

the median of oldpeak from more chance group is lower than less chance group

## slp

In [None]:
# Share of each slp value overall, then within each outcome group.
plot_pie('slp')
plot_pie_compare('slp')

In [None]:
# Most frequent slp value in each outcome group.
check_most('slp')

less chance group is dominated by slp 1 and more chance group is dominated by slp 2

## caa

In [None]:
# Share of each caa value overall, then within each outcome group.
plot_pie('caa')
plot_pie_compare('caa')

In [None]:
# Most frequent caa value in each outcome group.
check_most('caa')

The share of caa 0 in the "more chance" group is 46.18 percentage points higher than in the "less chance" group.

## thall

In [None]:
# Share of each thall value overall, then within each outcome group.
plot_pie('thall')
plot_pie_compare('thall')

In [None]:
# Most frequent thall value in each outcome group.
check_most('thall')

less chance group is dominated by thall 3 and more chance group is dominated by thall 2

# Data Visualization Result

| | Less Chance | More Chance | Note |
| :- | :-: | :-: | :-: |
| age (Median) | 58 | 52| No Clear Difference
| sex (Most) | 1 | 1 | the share of sex label 0 in the "more chance" group is 26.25 percentage points higher than in the "less chance" group
| cp (Most) | 0 | 2 | less chance group is dominated by cp 0 and more chance group is dominated by cp 2
| trtbps (Median) | 130 | 130 | No Clear Difference
| chol (Median) | 249 | 234 | the median of the "less chance" group is a little higher than that of the "more chance" group, but there is no clear difference between the two groups
| fbs (Most) | 0 | 0  | No Clear Difference
| restecg (Most)| 0 | 1 | less chance group is dominated by restecg 0 and more chance group is dominated by restecg 1
| thalachh (Median) | 142 | 161 | the value of thalachh (maximum heart rate achieved) from "more chance" group is higher than "less chance" group
| exng (Most) | 1 | 0 | less chance group is dominated by exng 1 and more chance group is dominated by exng 0
| oldpeak (Median)| 1.4 | 0.2 | the median of oldpeak from more chance group is lower than less chance group 
| slp (Most) | 1 | 2|less chance group is dominated by slp 1 and more chance group is dominated by slp 2
|caa (Most) |0|0|the share of caa 0 in the "more chance" group is 46.18 percentage points higher than in the "less chance" group
|thall (Most) |3|2| less chance group is dominated by thall 3 and more chance group is dominated by thall 2

# Data Preprocessing

In [None]:
# Target: the `output` label.
y = df['output'].copy()

# Predictors: every column except the target and age / trtbps / fbs
# (presumably dropped because the EDA summary marks them "No Clear Difference").
X = df.drop(columns=['output', 'age', 'trtbps', 'fbs'])

In [None]:
#transform categorical data
# One-hot encode the categorical predictors; drop_first=True omits the first
# level of each feature so the dummy columns are not perfectly collinear.
# NOTE: re-running this cell on its own fails, because the listed columns no
# longer exist in X after encoding — re-run the previous cell first.
categorical_cols = ['cp', 'restecg', 'exng', 'slp', 'caa', 'thall']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [None]:
# Feature names after one-hot encoding.
X.columns

In [None]:
#Split to data train and test
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test-set information leaks into the fit.
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Import ML Libraries
# Import ML Libraries
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Fixed seed for every learner that uses randomness, so the metrics printed
# by the evaluation loop are the same on every Restart & Run All.
SEED = 1234

# [estimator, display name] pairs consumed by the evaluation loop.
classifiers = [
    [CatBoostClassifier(verbose=0, random_seed=SEED), 'CatBoost Classifier'],
    [XGBClassifier(random_state=SEED), 'XGB Classifier'],
    [RandomForestClassifier(random_state=SEED), 'Random Forest'],
    [KNeighborsClassifier(), 'K-Nearest Neighbours'],
    [SGDClassifier(random_state=SEED), 'SGD Classifier'],
    [SVC(), 'SVC'],
    [LGBMClassifier(random_state=SEED), 'LGBM Classifier'],
    [GaussianNB(), 'GaussianNB'],
    [DecisionTreeClassifier(random_state=SEED), 'Decision Tree Classifier'],
    [LogisticRegression(), 'Logistic Regression'],
]

# Prediction

In [None]:
# Fit every candidate on the scaled training split and print its test metrics
# (each score is reported as a percentage).
for model, name in classifiers:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(name)
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))

    metric_table = (
        ("Accuracy : ", accuracy_score),
        ("Recall : ", recall_score),
        ("Precision : ", precision_score),
        ("F1 score : ", f1_score),
    )
    for label, metric in metric_table:
        print(label, metric(y_test, y_pred) * 100)

# Conclusion

The best algorithm for predicting Heart Attack is **K-Nearest Neighbours**:

* accuracy: 84.61538461538461
* recall: 88.23529411764706
* precision: 84.90566037735849
* F1: 86.53846153846155