# HEART DISEASE

![](https://www.canwelivebetter.bayer.com/sites/default/files/2018-05/NEW_Heartbeat_looping_GIF_NORMAL_0.gif)

## 1. Introduction
### 1.1. Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.2f}'.format

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,RobustScaler,label_binarize,RobustScaler
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,plot_roc_curve,auc,precision_recall_curve,plot_precision_recall_curve,average_precision_score
from sklearn.multiclass import OneVsRestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeClassifier

### 1.2. Importing Dataset

In [None]:
# Load the heart-disease CSV into `df` and preview its first rows.
df = pd.read_csv('../input/heart-disease-uci/heart.csv')
df.head()

In [None]:
# Print pandas' summary of `df` (its columns and their dtypes).
df.info()

In [None]:
# Descriptive statistics for the numeric columns of `df`.
df.describe()

In [None]:
# Report the (rows, columns) shape of the loaded frame.
print('DF Shape:', df.shape)

In [None]:
# Transposed summary statistics: one row per variable, one column per statistic.
desc = df.describe().T

variable_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
statistic_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
df1 = pd.DataFrame(data=desc, index=variable_names, columns=statistic_names)

# Render the table as an annotated heatmap (values shown with no decimals).
f, ax = plt.subplots(figsize=(12, 12))
heatmap_options = {
    "annot": True,
    "cmap": "Reds",
    "fmt": '.0f',
    "ax": ax,
    "linewidths": 5,
    "cbar": False,
    "annot_kws": {"size": 16},
}
sns.heatmap(df1, **heatmap_options)

plt.xticks(size=18)
plt.yticks(size=12, rotation=0)
plt.ylabel("Variables")
plt.title("Descriptive Statistics", size=16)
plt.show()

### 1.3. Missing Values

In [None]:
# Number of missing (null) values per column.
df.isnull().sum()

In [None]:
# Visual check for missing data: missingno bar chart over the columns of `df`.
import missingno as msno
msno.bar(df)
plt.show()

## 2. Data Visualization

### 2.1 Corr Heat Map

In [None]:
# Pairwise correlation matrix of all columns.
corr = df.corr()

# Boolean mask covering the upper triangle (diagonal included) so that only
# the lower triangle of the symmetric matrix is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    square=True,
    linewidths=.5,
    cbar_kws={'shrink': .5},
)

In [None]:
#df.drop(['restecg','fbs','chol','trestbps'],axis=1,inplace=True)

In [None]:
# Correlation of every feature with `target`, strongest positive first.
# The `target` row is selected by label instead of sorting the matrix and
# taking the first row, which only worked because the self-correlation (1.0)
# happened to sort to the top.
corr.loc['target'].sort_values(ascending=False)

### 2.2 Count Plot

In [None]:
# Frequency of each category for six discrete features, on a 2x3 grid.
plt.figure(figsize=(20, 10))
sns.set_theme(style='darkgrid')
for panel, column in enumerate(['fbs', 'restecg', 'slope', 'ca', 'exang', 'thal'], start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(data=df, x=column)
plt.show()

In [None]:
# Same six count plots as before, with bars split by the `target` class.
plt.figure(figsize=(20, 10))
sns.set_theme(style='darkgrid')
for panel, column in enumerate(['fbs', 'restecg', 'slope', 'ca', 'exang', 'thal'], start=1):
    plt.subplot(2, 3, panel)
    sns.countplot(data=df, x=column, hue='target')
plt.show()

### 2.3 Hist Plot

In [None]:
# One histogram per column of `df`, drawn on a single 20x14 figure.
df.hist(figsize=(20,14))
plt.show()

### 2.4 Dist Plot

In [None]:
# Distribution of age (histogram + KDE).
# displot is a figure-level function that creates its own figure, so the size
# is set through `height` (default aspect 1 -> 7x7) rather than through a
# separate plt.figure(), which only produced an extra empty figure.
ax = sns.displot(df.age, color='green', kde=True, label='Age', height=7)

In [None]:
# Distribution of trestbps (histogram + KDE).
# displot is a figure-level function that creates its own figure, so the size
# is set through `height` (default aspect 1 -> 7x7) rather than through a
# separate plt.figure(), which only produced an extra empty figure.
ax = sns.displot(df.trestbps, color='blue', kde=True, label='Trestbps', height=7)

In [None]:
# Distribution of chol (histogram + KDE).
# displot is a figure-level function that creates its own figure, so the size
# is set through `height` (default aspect 1 -> 7x7) rather than through a
# separate plt.figure(), which only produced an extra empty figure.
ax = sns.displot(df.chol, label='Chol', kde=True, color='red', height=7)

In [None]:
# Distribution of thalach (histogram + KDE).
# displot is a figure-level function that creates its own figure, so the size
# is set through `height` (default aspect 1 -> 7x7) rather than through a
# separate plt.figure(), which only produced an extra empty figure.
ax = sns.displot(df.thalach, label='Thalach', kde=True, color='green', height=7)

In [None]:
# Distribution of oldpeak (histogram + KDE).
# displot is a figure-level function that creates its own figure, so the size
# is set through `height` (default aspect 1 -> 7x7) rather than through a
# separate plt.figure(), which only produced an extra empty figure.
ax = sns.displot(df.oldpeak, label='oldpeak', color='blue', kde=True, height=7)

In [None]:
# Compare the trestbps distribution of the two target classes on one axes.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is its
# documented replacement (density-normalised histogram plus KDE curve).
plt.figure(figsize=(12,10))
sns.histplot(df.loc[df['target'] == 0, 'trestbps'], color='green',
             kde=True, stat='density', label='target = 0')
sns.histplot(df.loc[df['target'] == 1, 'trestbps'], color='red',
             kde=True, stat='density', label='target = 1')
plt.title('Target vs Trestbps')
plt.xlim([30,330])
# Without a legend the two colours cannot be attributed to a class.
plt.legend()

In [None]:
# KDE of trestbps per target class, one facet per value of `sex`.
sns.displot(data=df, x='trestbps', hue='target', col='sex', kind='kde')

In [None]:
# Empirical CDF of thalach per target class, one facet per value of `sex`.
sns.displot(data=df, x = 'thalach',hue='target',col='sex',kind='ecdf')

### 2.5 Violin Plot

In [None]:
# Violin plots of `target` against six discrete features, on a 2x3 grid.
plt.figure(figsize=(13, 13))
sns.set_theme(style='darkgrid')
for panel, column in enumerate(['cp', 'fbs', 'restecg', 'exang', 'slope', 'ca'], start=1):
    plt.subplot(2, 3, panel)
    sns.violinplot(x=column, y='target', data=df)
plt.show()

### 2.6 Box Plot

In [None]:
# Horizontal box plots of six features, on a 2x3 grid.
plt.figure(figsize=(18, 10))
sns.set_theme(style='darkgrid')
for panel, column in enumerate(['thal', 'oldpeak', 'thalach', 'chol', 'trestbps', 'age'], start=1):
    plt.subplot(2, 3, panel)
    sns.boxplot(x=column, data=df)
plt.show()

In [None]:
# Mean of `thal`; the value is computed but not stored.
df['thal'].mean()
# Recode thal == 0 as category 2, then plot the resulting distribution.
thal_is_zero = df['thal'] == 0
df.loc[thal_is_zero, 'thal'] = 2
sns.displot(data=df, x='thal')

In [None]:
# Distribution of oldpeak.
sns.displot(data=df,x='oldpeak')

In [None]:
# Snapshot of `df` taken before the binning below.
# NOTE(review): `df2` is later rebound to the model-results table, and
# `cat_cols` / `con_cols` are not referenced again in the visible source.
df2 = df.copy()
# Columns treated as categorical.
cat_cols = ['sex','cp','fbs','restecg','exang','slope','ca','thal']
# Columns treated as continuous.
con_cols = ['age','trestbps','chol','thalach','oldpeak']

In [None]:
# Discretise continuous features into ordinal codes and merge some category
# levels. Every interval is right-inclusive, i.e. (low, high]; the first bin
# has no lower bound and the last bin has no upper bound.

# Thalach: <=110 -> 0, (110,130] -> 1, (130,150] -> 2, (150,170] -> 3,
#          (170,190] -> 4, >190 -> 5
thalach_edges = [-np.inf, 110, 130, 150, 170, 190, np.inf]
df['thalach'] = pd.cut(df['thalach'], bins=thalach_edges, labels=False)

# Thal: levels 0 and 1 are merged into level 2
df['thal'] = df['thal'].replace({0: 2, 1: 2})

# Slope: level 0 is merged into level 1
df['slope'] = df['slope'].replace(0, 1)

# Restecg: level 2 is merged into level 1
df['restecg'] = df['restecg'].replace(2, 1)

df['age'] = df['age'].astype(float)
df['trestbps'] = df['trestbps'].astype(float)
df['chol'] = df['chol'].astype(float)

# Chol: 50-unit bands from 100 to 400 -> codes 0..6, and >400 -> 7.
# The top band previously received code 0, which made the highest cholesterol
# values indistinguishable from the lowest (<=100) band.
chol_edges = [-np.inf, 100, 150, 200, 250, 300, 350, 400, np.inf]
df['chol'] = pd.cut(df['chol'], bins=chol_edges, labels=False).astype(float)

# trestbps: <=90 -> 0, (90,120] -> 1, (120,150] -> 2, (150,190] -> 3, >190 -> 4
trestbps_edges = [-np.inf, 90, 120, 150, 190, np.inf]
df['trestbps'] = pd.cut(df['trestbps'], bins=trestbps_edges, labels=False).astype(float)


In [None]:
# Split age into 5 equal-width bands and show the mean of `target` per band.
df['AgeBand'] = pd.cut(df['age'], 5)
age_band_means = df[['AgeBand', 'target']].groupby(['AgeBand'], as_index=False).mean()
age_band_means.sort_values(by='AgeBand', ascending=True)

In [None]:
# Replace age by an ordinal code using right-inclusive intervals:
# <=39 -> 0, (39,48] -> 1, (48,58] -> 2, (58,68] -> 3, >68 -> 4.
age_edges = [-np.inf, 39, 48, 58, 68, np.inf]
age_codes = pd.cut(df['age'], bins=age_edges, labels=False)
# Keep the column's existing dtype.
df['age'] = age_codes.astype(df['age'].dtype)

In [None]:
# Remove the helper AgeBand column (in place) and preview the frame.
df.drop('AgeBand',inplace=True,axis=1)
df.head()

In [None]:
# Split oldpeak into 5 equal-width bands and show the mean of `target` per band.
df['OldpeakBand'] = pd.cut(df['oldpeak'], 5)
oldpeak_band_means = df[['OldpeakBand', 'target']].groupby(['OldpeakBand'], as_index=False).mean()
oldpeak_band_means.sort_values(by='OldpeakBand', ascending=True)

In [None]:
# Replace oldpeak by an ordinal code using right-inclusive intervals:
# <=1.24 -> 0, (1.24,2.48] -> 1, (2.48,3.72] -> 2, (3.72,4.96] -> 3, >4.96 -> 4.
# The last rule previously tested `> 6.2`, so values in (4.96, 6.2] matched no
# rule and kept their raw value instead of receiving code 4.
oldpeak_edges = [-np.inf, 1.24, 2.48, 3.72, 4.96, np.inf]
df['oldpeak'] = pd.cut(df['oldpeak'], bins=oldpeak_edges, labels=False).astype(float)

In [None]:
# Remove the helper OldpeakBand column (in place) and preview the frame.
df.drop('OldpeakBand',inplace=True,axis=1)
df.head()

In [None]:
# Feature matrix: every column except the label.
X = df.drop(['target'],axis=1)
# Label as a 1-D Series. scikit-learn estimators expect a 1-D target; the
# previous one-column DataFrame was a column vector that estimators had to
# convert (with a DataConversionWarning) on every fit.
y = df['target']

In [None]:
# Report the shapes of the feature matrix and the label.
print('X Shape', X.shape)
print('Y Shape',y.shape)

In [None]:
# Hold out 33% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Report the shape of each of the four resulting pieces.
split_parts = (
    ('x_train', X_train),
    ('x_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for part_name, part in split_parts:
    print('Number transations {} df'.format(part_name), part.shape)

In [None]:
# Candidate classifiers as [display name, estimator] pairs.
# Every estimator that accepts a seed gets random_state=0 so that repeated
# runs of the notebook produce the same comparison table.
models = [
    ['RidgeClassifier', RidgeClassifier()],
    # 'logloss' is the binary log-loss metric matching objective
    # 'binary:logistic' ('mlogloss' is the multiclass variant).
    ['XGBClassifier', XGBClassifier(learning_rate=0.1, objective='binary:logistic',
                                    random_state=0, eval_metric='logloss')],
    ['Logistic Regression', LogisticRegression(random_state=0)],
    ['SVM', SVC(random_state=0)],
    ['KNeigbors', KNeighborsClassifier()],
    ['GaussianNB', GaussianNB()],
    ['BernoulliNB', BernoulliNB()],
    ['DecisionTree', DecisionTreeClassifier(random_state=0)],
    ['RandomForest', RandomForestClassifier(random_state=0)],
    ['AdaBoostClassifier', AdaBoostClassifier(random_state=0)],
    ['MLPClassifier', MLPClassifier(random_state=0)],
    ['ExtraTreesClassifier', ExtraTreesClassifier(random_state=0)],
]

In [None]:
# Fit every candidate model, score it on the held-out test set and with
# 10-fold cross-validation on the training set, print a report, and collect
# one result row per model in `lst_1`.
lst_1 = []
# Estimators expect a 1-D target; flatten in case y_train is a column vector.
y_train_1d = np.ravel(y_train)
for model_name, model in models:
    model.fit(X_train, y_train_1d)
    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric and needs continuous scores; hard 0/1
    # predictions collapse the curve to a single operating point.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)

    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # k-Fold Validation (cross_val_score fits fresh clones of the estimator)
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train_1d, cv=10)

    roc = roc_auc_score(y_test, y_score)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(model_name, ':')
    print(cm)
    print('Accuracy Score: ', accuracy)
    print('')
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print('')
    print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
    print('')
    # The metrics below are fractions in [0, 1]; scale them so the printed
    # '%' suffix is truthful.
    print('ROC AUC Score: {:.2f} %'.format(roc*100))
    print('')
    print('Precision: {:.2f} %'.format(precision*100))
    print('')
    print('Recall: {:.2f} %'.format(recall*100))
    print('')
    print('F1 Score: {:.2f} %'.format(f1*100))
    print('-'*40)
    print('')

    # Result row: accuracy figures in percent, remaining metrics as fractions.
    lst_1.append([
        model_name,
        accuracy*100,
        accuracies.mean()*100,
        accuracies.std()*100,
        roc,
        precision,
        recall,
        f1,
    ])

In [None]:
# Build the comparison table from the collected rows and rank it by test
# accuracy, breaking ties by mean cross-validation accuracy (best first).
result_columns = ['Model', 'Accuracy', 'K-Fold Mean Accuracy', 'Std.Deviation',
                  'ROC_AUC', 'Precision', 'Recall', 'F1 Score']
df2 = pd.DataFrame(lst_1, columns=result_columns)
df2 = df2.sort_values(by=['Accuracy', 'K-Fold Mean Accuracy'], ascending=False)
df2

# COMPARE

In [None]:
# Horizontal bar chart of test accuracy, one bar per model.
fig = plt.figure(figsize=(12, 12))
sns.barplot(data=df2, x='Accuracy', y='Model', color='b')
plt.title('Model Compare Graphic');

In [None]:
# Hyper-parameter search spaces: (estimator, list of parameter grids) pairs.
gaussian_nb_grid = [{'var_smoothing': np.logspace(0,-9, num=100)}]
xgb_grid = [{'learning_rate': [0.01, 0.05, 0.1], 'eval_metric': ['error']}]
knn_grid = [{'n_neighbors':[5,7,8,10], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}]
decision_tree_grid = [{'criterion':['gini','entropy'],'random_state':[0]}]
random_forest_grid = [{'n_estimators':[100,150,200],'criterion':['gini','entropy'],'random_state':[0]}]

grid_models = [
    (GaussianNB(), gaussian_nb_grid),
    (XGBClassifier(), xgb_grid),
    (KNeighborsClassifier(), knn_grid),
    (DecisionTreeClassifier(), decision_tree_grid),
    (RandomForestClassifier(), random_forest_grid),
]

In [None]:
# Run a 10-fold, accuracy-scored grid search for every (estimator, grid) pair
# and report the best cross-validated score together with the winning
# parameters.
# Estimators expect a 1-D target; flatten in case y_train is a column vector.
y_train_1d = np.ravel(y_train)
for i,j in grid_models:
    grid = GridSearchCV(estimator=i,param_grid = j, scoring = 'accuracy',cv = 10)
    grid.fit(X_train,y_train_1d)
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print(' {}: \n Best Accuracy: {:.2f} %'.format(i,best_accuracy*100))
    # The best parameters were computed but never shown, so the search result
    # could not be acted on.
    print(' Best Parameters: ', best_param)
    print('')
    print('-'*25)
    print('')

In [None]:
# Final model: SVC with probability estimates, evaluated on the test set.
# random_state is fixed because the probability calibration involves internal
# randomised cross-validation; without a seed y_prob (and hence the ROC AUC)
# changes from run to run.
classifier = SVC(probability=True, random_state=0)
# Estimators expect a 1-D target; flatten in case y_train is a column vector.
classifier.fit(X_train, np.ravel(y_train))
y_pred = classifier.predict(X_test)
# Probability of the positive class, used as the ranking score for ROC AUC.
y_prob = classifier.predict_proba(X_test)[:,1]
cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ',accuracy_score(y_test, y_pred))

# Visualizing Confusion Matrix
plt.figure(figsize = (8, 5))
sns.heatmap(cm, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15}, 
            yticklabels = ['No', 'yes'], xticklabels = ['Predicted no', 'Predicted yes'])
plt.yticks(rotation = 0)
plt.show()

***Thanks for reading. Please vote.***