#       Heart attack analysis and prediction

**Import necessary tools**
* pandas
* numpy
* matplotlib
* seaborn
* sklearn
* scipy

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from scipy import stats
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import GaussianNB
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries imported above - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load dataset

In [None]:
# Read the heart-attack CSV into the DataFrame used by every cell below.
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Split the columns into low-cardinality ("categorical") and high-cardinality
# ("numerical") groups. A single threshold is used for both lists so that every
# column lands in exactly one of them (a column with exactly 10 distinct values
# previously fell into neither list).
CARDINALITY_THRESHOLD = 10
unique_counts = df.nunique()

cat_col = [i for i in df.columns if unique_counts[i] < CARDINALITY_THRESHOLD]
print('Categorical features of dataset:',cat_col)
print('\n')

num_col = [i for i in df.columns if unique_counts[i] >= CARDINALITY_THRESHOLD]
print('Numerical features of dataset:',num_col)

In [None]:
# Report (rows, columns); the label previously read 'Sahpe'.
print('Shape of dataset:',df.shape)

In [None]:
# Total number of missing cells across the whole frame.
total_nulls = df.isna().sum().sum()
print('Null values in dataset:', total_nulls)

In [None]:
# Number of rows per value of the target column `output`.
# value_counts() orders the result by descending count, not by label.
val = df['output'].value_counts()
val

In [None]:
# Pie chart of the target distribution.
# `val` comes from value_counts(), which is ordered by descending frequency, so
# labels and colours are looked up from the class values in val.index instead of
# being hard-coded in 0, 1 order (which mislabels the wedges whenever class 1
# is the more frequent one).
risk_labels = {0: '0(Low risk)', 1: '1(High risk)'}
risk_colors = {0: 'g', 1: 'r'}

plt.pie(val,
        autopct='%1.2f%%',
        labels=[risk_labels[c] for c in val.index],
        colors=[risk_colors[c] for c in val.index])

plt.legend()
plt.show()

**Analysis**

In [None]:
# Number of rows per value of `sex`.
df['sex'].value_counts()

## **CP**

In [None]:
# Mean of `output` for each value of `cp`.
cp_rates = df[['cp', 'output']].groupby(['cp'], as_index=False).mean()
print(cp_rates)

## fbs 

In [None]:
# Mean of `output` for each value of `fbs`.
fbs_rates = df[['fbs', 'output']].groupby(['fbs'], as_index=False).mean()
print(fbs_rates)

## exng

In [None]:
# Mean of `output` for each value of `exng`.
exng_rates = df[['exng', 'output']].groupby(['exng'], as_index=False).mean()
print(exng_rates)

# slp

In [None]:
# Mean of `output` for each value of `slp`.
slp_rates = df[['slp', 'output']].groupby(['slp'], as_index=False).mean()
print(slp_rates)

# caa

In [None]:
# Mean of `output` for each value of `caa`.
caa_rates = df[['caa', 'output']].groupby(['caa'], as_index=False).mean()
print(caa_rates)

# thall

In [None]:
# Mean of `output` for each value of `thall`.
thall_rates = df[['thall', 'output']].groupby(['thall'], as_index=False).mean()
print(thall_rates)

# trtbps

In [None]:
# Cut `trtbps` into 5 bins and store the bin label as a helper column on df,
# then print the mean of `output` per bin.
df['_trtbps_'] = pd.cut(df['trtbps'], bins=5)
trtbps_rates = df[['_trtbps_', 'output']].groupby(['_trtbps_'], as_index=False).mean()
print(trtbps_rates)

# age

In [None]:
# Cut `age` into 5 bins and store the bin label as a helper column on df.
df['_age_'] = pd.cut(df['age'], bins=5)

In [None]:
# Mean of `output` per age bin (bins kept as the index).
age_rates = df[['_age_', 'output']].groupby(['_age_'], as_index=True).mean()
print(age_rates)

In [None]:
# Mean of `output` per `exng` value, this time with `exng` as the index.
exng_rates_indexed = df[['exng', 'output']].groupby(['exng'], as_index=True).mean()
print(exng_rates_indexed)

In [None]:
# Show the column index; at this point it includes the helper columns
# `_trtbps_` and `_age_` added by the binning cells.
df.columns

In [None]:
# Swarm plot of `thalachh` per `caa` value, coloured by `output`.
sns.swarmplot(data=df, x='caa', y='thalachh', hue='output')
plt.show()

In [None]:
# Swarm plot of `thalachh` per `fbs` value, coloured by `output`.
sns.swarmplot(data=df, x='fbs', y='thalachh', hue='output')
plt.show()

In [None]:
# Swarm plot of `oldpeak` per `fbs` value, coloured by `output`.
sns.swarmplot(data=df, x='fbs', y='oldpeak', hue='output')
plt.show()

# Conclusion of analysis:
* Patients with a lower oldpeak have a higher chance of heart attack
* Patients with a higher thalachh have a higher chance of heart attack

In [None]:
# Box plot of `trtbps` to eyeball outliers.
sns.boxplot(y=df['trtbps'])

In [None]:
# Box plot of `chol` to eyeball outliers.
sns.boxplot(y=df['chol'])

In [None]:
# Box plot of `thalachh` to eyeball outliers.
sns.boxplot(y=df['thalachh'])

In [None]:
# Box plot of `oldpeak` to eyeball outliers.
sns.boxplot(y=df['oldpeak'])

***The box plots of the numerical features show that outliers are present; removing outliers is best for ML model accuracy.***

In [None]:
# Absolute z-score of every numerical column (computed column-wise).
# The columns are converted to a plain NumPy array first: depending on the SciPy
# version, stats.zscore on a DataFrame may return a DataFrame, in which case the
# positional lookup out[28][2] below would be treated as a column lookup and fail.
out = np.abs(stats.zscore(df[num_col].to_numpy(dtype=float), axis=0))
threshold=3
# Tuple of (row positions, column positions) where |z| reaches the threshold.
# NOTE(review): this cell only reports the outliers; no rows are removed here.
print(np.where(out>=threshold))
print(('value of z[28][2]='),(out[28][2]))

In [None]:
# First figure: one box plot per column, side by side.
fig, first_row_axes = plt.subplots(1, 3, figsize=(10, 5))
for column, axis in zip(('age', 'trtbps', 'chol'), first_row_axes):
    sns.boxplot(y=df[column], ax=axis)

# Second figure: the two remaining numerical columns.
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(15, 5))
sns.boxplot(y=df['thalachh'], ax=left_ax)
sns.boxplot(y=df['oldpeak'], ax=right_ax)

# Preprocessing data for ML

In [None]:
# Re-check the column order before selecting the feature columns.
df.columns

In [None]:
# Feature matrix: every column except the target and the helper bin columns.
# The columns are dropped by name rather than with iloc[:, :-3], which silently
# removes real features if the binning cells were skipped or more columns are
# added. errors='ignore' keeps this working when the helper columns are absent.
non_feature_cols = ['output', '_trtbps_', '_age_']
X = df.drop(columns=non_feature_cols, errors='ignore')
# Target vector.
y= df['output']

1. Split data for training and testing

In [None]:
# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
split_options = dict(test_size=0.25, random_state=25)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Decision Tree

In [None]:
# Test accuracy of a decision tree for each split criterion and max_depth 1..19.
# random_state is fixed because scikit-learn's tree randomly permutes features
# when searching for splits, so an unseeded sweep gives a different curve on
# every run.
# NOTE(review): depth is being chosen on the test set, so the reported test
# accuracy is optimistic - consider a validation split or cross-validation.
cret = {'gini':[],'entropy':[]}
for i in cret.keys():
    for j in range(1,20):
        dec = DecisionTreeClassifier(criterion=i,max_depth=j,random_state=25)
        dec.fit(X_train,y_train)
        y_pred = dec.predict(X_test)
        cret[i].append(accuracy_score(y_test,y_pred))

In [None]:
# Plot accuracy against the actual max_depth. The sweep starts at depth 1, so
# plotting the bare lists (x = 0, 1, 2, ...) shifted every point one depth left.
depths = range(1, len(cret['gini']) + 1)
plt.plot(depths,cret['gini'],c='r',marker='o',label='gini')
plt.plot(depths,cret['entropy'],c='g',marker='+',label='entropy')
plt.xlabel('max_depth')
plt.ylabel('test accuracy')
plt.legend()
plt.show()

In [None]:
# Final decision tree (entropy criterion, depth 5). random_state is fixed so the
# fitted tree, its plot and its scores are the same on every run.
dec = DecisionTreeClassifier(criterion='entropy',max_depth=5,random_state=25)
dec.fit(X_train,y_train)
y_pred = dec.predict(X_test)

In [None]:
# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases validate this parameter and reject a pandas Index.
plt.figure(figsize=[10,5])

_ = plot_tree(dec,filled=True,feature_names=list(X.columns),node_ids=True)

In [None]:
# Accuracy of the final decision tree on both splits; the test accuracy (as a
# percentage) is kept for the model comparison.
tree_test_acc = dec.score(X_test, y_test)
tree_train_acc = dec.score(X_train, y_train)
print('Test accuracy:', tree_test_acc)
print('Train accuracy:', tree_train_acc)
decission = round(tree_test_acc * 100, 2)
decission

# Random Forest

In [None]:
# Fit a seeded random forest of 100 depth-3 trees and predict the test split.
forest_params = dict(n_estimators=100, max_depth=3, random_state=2)
rnd = RandomForestClassifier(**forest_params).fit(X_train, y_train)
predict = rnd.predict(X_test)

In [None]:
# Accuracy of the random forest on both splits.
print('Test accuracy:',rnd.score(X_test,y_test))
print('Train accuracy:',rnd.score(X_train,y_train))
# The comparison score uses the TEST accuracy, like every other model; it
# previously stored the train accuracy, which inflated the forest's result.
rand = round(rnd.score(X_test,y_test)*100,2)
rand

# KNN

In [None]:
# Test accuracy of KNN for each weighting scheme over odd k = 1, 3, ..., 19.
wt = {'uniform': [], 'distance': []}
for weighting, scores in wt.items():
    for k in range(1, 21, 2):
        neig = KNeighborsClassifier(n_neighbors=k, weights=weighting)
        neig.fit(X_train, y_train)
        y_pred = neig.predict(X_test)
        scores.append(accuracy_score(y_test, y_pred))

In [None]:
# Plot accuracy against the actual k. The sweep uses odd k starting at 1, so
# plotting the bare lists (x = 0, 1, 2, ...) did not show the neighbour count.
k_values = range(1, 2 * len(wt['uniform']), 2)
plt.plot(k_values,wt['uniform'],marker='o',c='r',label='uniform')
plt.plot(k_values,wt['distance'],marker='o',c='g',label='distance')
plt.xlabel('n_neighbors')
plt.ylabel('test accuracy')
plt.legend()
plt.show()

In [None]:
# Test accuracy of distance-weighted KNN for Minkowski p = 1 and p = 2 over odd
# k = 1, 3, ..., 19. Note that this reuses (overwrites) the name `wt`.
wt = {'1': [], '2': []}
for p_key, scores in wt.items():
    for k in range(1, 21, 2):
        neig = KNeighborsClassifier(n_neighbors=k, weights='distance', p=int(p_key))
        neig.fit(X_train, y_train)
        y_pred = neig.predict(X_test)
        scores.append(accuracy_score(y_test, y_pred))

In [None]:
# Plot accuracy against the actual k. The sweep uses odd k starting at 1, so
# plotting the bare lists (x = 0, 1, 2, ...) did not show the neighbour count.
k_values = range(1, 2 * len(wt['1']), 2)
plt.plot(k_values,wt['1'],marker='o',c='r',label='1')
plt.plot(k_values,wt['2'],marker='o',c='g',label='2')
plt.xlabel('n_neighbors')
plt.ylabel('test accuracy')
plt.legend()
plt.show()

In [None]:
# Final KNN model: distance-weighted, Manhattan metric (p=1), 16 neighbours.
# NOTE(review): 16 is not one of the odd k values evaluated in the sweeps -
# confirm this is the intended setting.
knn_params = dict(n_neighbors=16, weights='distance', p=1)
neig = KNeighborsClassifier(**knn_params)
neig.fit(X_train, y_train)
y_pred = neig.predict(X_test)

In [None]:
# Test accuracy of the final KNN model, kept as a percentage for the comparison.
knn_test_acc = neig.score(X_test, y_test)
print('Test accuracy:', knn_test_acc)
knn = round(knn_test_acc * 100, 2)
knn

In [None]:
# Confusion matrix for the predictions currently held in y_pred (when the cells
# run in order, these are the final KNN model's test predictions).
confusion_matrix(y_test,y_pred)

# Naive Bayes

In [None]:
# Fit Gaussian naive Bayes on the training split and predict the test split.
guss = GaussianNB().fit(X_train, y_train)
predict = guss.predict(X_test)

In [None]:
# Test accuracy of naive Bayes, kept as a percentage for the comparison.
nb_test_acc = guss.score(X_test, y_test)
print('Test accuracy:', nb_test_acc)
gussian = round(nb_test_acc * 100, 2)
gussian

# Compare model score

In [None]:
# Collect the percentage score of every model into one table.
model_names = ['knn','decission tree','random forest','navi bayes']
model_scores = [knn, decission, rand, gussian]
model = pd.DataFrame({'model': model_names, 'score': model_scores})

In [None]:
# Horizontal bar chart of the score per model.
sns.barplot(data=model, x='score', y='model')

In [None]:
# Models ordered from lowest to highest score, with a fresh 0..n-1 index.
model.sort_values('score', ascending=True, ignore_index=True)