# <center> Heart Attack Prediction </center>

# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

# Importing Data

In [None]:
# Load the heart-attack CSV into a DataFrame.
csv_path = '../input/heart-attack-analysis-prediction-dataset/heart.csv'
data = pd.read_csv(csv_path)

In [None]:
# Give the label column ('output') the more descriptive name 'Disease'.
data = data.rename(columns={'output': 'Disease'})

In [None]:
# Preview the first rows to sanity-check the load and the rename.
data.head()

# Understanding the data features

Age : Age of the patient<br>

Sex : Sex of the patient<br>

exng : exercise-induced angina (1 = yes; 0 = no)<br>

caa : number of major vessels (0-3)<br>

cp : chest pain type<br>
Value 1: typical angina<br>
Value 2: atypical angina<br>
Value 3: non-anginal pain<br>
Value 4: asymptomatic<br>

trtbps : resting blood pressure (in mm Hg)<br>

chol : cholesterol in mg/dl fetched via BMI sensor<br>

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)<br>

restecg : resting electrocardiographic results<br>
Value 0: normal<br>
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)<br>
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria<br>

thalachh : maximum heart rate achieved<br>
<br>
Disease (called `output` in the raw file) : 0 = less chance of heart attack, 1 = more chance of heart attack

# EDA

In [None]:
# Print column dtypes and non-null counts.
data.info()

In [None]:
# Per-class mean of every feature, transposed so each feature is a row.
data.groupby('Disease').mean().T

In [None]:
# Class balance of the label column. The Series is passed as `x=` explicitly:
# recent seaborn releases accept only `data` positionally, so a bare
# positional Series is no longer interpreted as the variable to count.
sns.countplot(x=data['Disease'])

In [None]:
# Columns plotted as histograms in the next cell.
continuous = [
    'oldpeak',
    'thalachh',
    'chol',
    'trtbps',
    'age',
]
# Columns plotted as count plots.
discrete = [
    'sex',
    'cp',
    'fbs',
    'restecg',
    'exng',
    'slp',
    'caa',
    'thall',
]

In [None]:
# One outline histogram per continuous column on a 2x3 grid, split by class.
plt.figure(figsize=(18, 10))
for position, column in enumerate(continuous, start=1):
    plt.subplot(2, 3, position)
    sns.histplot(
        x=data[column],
        hue=data['Disease'],
        kde=False,
        palette='inferno',
        fill=False,
        element='poly',
    )

In [None]:
# One count plot per discrete column on a 3x3 grid, split by class.
plt.figure(figsize=(20, 12))
for position, column in enumerate(discrete, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=data[column], hue=data['Disease'], palette='inferno')

In [None]:
# Interactive 3-D scatter of three features, coloured by the label.
px.scatter_3d(
    data_frame=data,
    x='thalachh',
    y='oldpeak',
    z='caa',
    color='Disease',
)

# Feature Selection

In [None]:
# Separate the label (Y) from the feature matrix (X).
Y = data['Disease']
X = data.drop(columns='Disease')

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every feature against the label with the chi-squared statistic.
# k='all' keeps every column; only the scores are used here.
fs = SelectKBest(score_func=chi2, k='all')
fs.fit(X, Y)

# Each feature's share of the total score, in percent. The total is computed
# once instead of being re-summed for every feature.
total_score = sum(fs.scores_)
per = [round((score / total_score) * 100, 3) for score in fs.scores_]

features_data = pd.DataFrame(
    {'Feature': X.columns, 'Scores': fs.scores_, 'Importance (%)': per}
).sort_values(by=['Scores'], ascending=False)

plt.figure(figsize=(9, 4))
# x/y are passed by keyword: recent seaborn releases reject positional x/y.
sns.barplot(x='Importance (%)', y='Feature', orient='h', data=features_data, palette='CMRmap')

# Features whose share of the total score is below 0.005 %.
insignificant = features_data.loc[features_data['Importance (%)']<0.005]['Feature'].unique()
features_data = features_data.set_index('Feature')
features_data

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Test Train split

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=100)

# Model Selection

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# XGBoost. The estimator is bound to the name `xgb`, so the xgboost module is
# deliberately not aliased to that same name.
from xgboost import XGBClassifier
xgb = XGBClassifier()

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

# KNN: pick the number of neighbours by 5-fold cross-validation on the
# training data only. Tuning k against the test set would leak test
# information into the model and inflate the accuracy reported on it.
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
accuracy = [
    [cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, Y_train, cv=5).mean(), k]
    for k in range(1, 40)
]
# max() returns the first maximum, so ties resolve to the smallest k.
temp = max(accuracy, key=lambda entry: entry[0])
knn = KNeighborsClassifier(n_neighbors=temp[1])

# SVM with default hyper-parameters
from sklearn.svm import SVC
svc = SVC()

# SVM with an RBF kernel tuned by grid search over C and gamma; refit=True
# retrains the best configuration so `grid` can be used as a fitted model.
param_grid = {'C': [0.1,1, 10, 100, 1000,2000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

print('Models Imported')

In [None]:
# Fit every candidate on the training split and record its test accuracy.
models = [xgb, lr, rfc, knn, svc, grid]
model_acc = []
for estimator in models:
    estimator.fit(X_train, Y_train)
    predictions = estimator.predict(X_test)
    model_acc.append(accuracy_score(Y_test, predictions))

# From here on `models` is a table pairing each estimator with its accuracy.
models = pd.DataFrame({'Models': models, 'Accuracy': model_acc})

In [None]:
# Rank the models by accuracy, best first, with a fresh 0..n-1 index.
models = models.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

# Keep the top estimator object before the column is replaced by names.
best = models['Models'][0]

# Reduce each estimator's text form to the part before the first "(".
name_parts = models['Models'].astype(str).str.split("(", n=2, expand=True)
models['Models'] = name_parts[0]
models

In [None]:
# Announce the top-ranked model and show its per-class metrics on the test split.
best_name = models['Models'][0]
best_accuracy_pct = round((models['Accuracy'][0]*100),2)
print('Hence the best model is',best_name,'with an accuracy of',best_accuracy_pct,'%')
print('\nThe classification report is:')
best_predictions = best.predict(X_test)
print(classification_report(Y_test,best_predictions))

In [None]:
# Row-normalised confusion matrix of the best model on the test split.
# `plot_confusion_matrix` was removed in scikit-learn 1.2, so use its
# replacement `ConfusionMatrixDisplay.from_estimator` when available and fall
# back to the old helper only on releases that predate it.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(best, X_test, Y_test, normalize='true', cmap='viridis')
else:
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(best, X_test, Y_test, normalize='true', cmap='viridis')