In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from colorama import Fore, Back, Style 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
from statsmodels.formula.api import ols
import plotly.graph_objs as gobj

init_notebook_mode(connected=True)
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff

%matplotlib inline

import xgboost
import lightgbm
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [None]:
# Read the heart-failure clinical records CSV into a DataFrame and preview the first rows
heart_df = pd.read_csv(
    '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
heart_df.head()

# Data Preparation

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
heart_df.info()

# Summary statistics for every column; shown as the cell's rich output
heart_df.describe(include='all')

In [None]:
# Count missing values per column, once via isnull() and once via isna().
# (pandas documents isnull as an alias of isna, so both reports should agree.)
null_counts = heart_df.isnull().sum(axis=0)
print('null value\n', null_counts, '\n')

nan_counts = heart_df.isna().sum(axis=0)
print('NaN value\n', nan_counts, '\n')

# Data Exploration

In [None]:
# Quick look at the relationship between age and the death event.
# Left panel: box plot of age for each DEATH_EVENT value (all points shown).
# Right panel: heatmap of the age / DEATH_EVENT correlation matrix.

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[2, 1],
    row_heights=[1])

fig.add_trace(
    go.Box(x=heart_df['DEATH_EVENT'], y=heart_df['age'],
           boxpoints='all'),
    row=1, col=1)

# correlation table
Z = heart_df[['age', 'DEATH_EVENT']].corr()
X = list(Z.index)
Y = list(Z.columns)

# zmin=-1 pins the colour scale to [-1, 1], matching the other feature plots
fig.add_trace(
    go.Heatmap(x=X, y=Y, z=Z,
               zmax=1, zmid=0, zmin=-1),
    row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Age', col=1)
fig.update_layout(title_text='Death Event wise Age & Correlation')

fig.show()

In [None]:
# Next factor: anaemia.
# Left panel: counts of anaemia No/Yes, one histogram trace per DEATH_EVENT value.
# Right panel: heatmap of the anaemia / DEATH_EVENT correlation matrix.

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[2, 1],
    row_heights=[1])

# Work on an explicit copy and assign the relabelled column back: a chained
# `frame[col].replace(..., inplace=True)` is not guaranteed to update the frame
# (it warns in recent pandas and is a no-op under Copy-on-Write).
anaemia = heart_df[['anaemia', 'DEATH_EVENT']].copy()
anaemia['anaemia'] = anaemia['anaemia'].replace({0: 'No', 1: 'Yes'})
anaemia_y = anaemia[anaemia['DEATH_EVENT'] == 1]
anaemia_n = anaemia[anaemia['DEATH_EVENT'] == 0]

fig.add_trace(go.Histogram(histfunc="count", y=anaemia_n['DEATH_EVENT'], x=anaemia_n['anaemia'], name="DEATH EVENT - NO"),
              row=1, col=1)
fig.add_trace(go.Histogram(histfunc="count", y=anaemia_y['DEATH_EVENT'], x=anaemia_y['anaemia'], name="DEATH EVENT - YES"),
              row=1, col=1)

# correlation table (computed on the original numeric 0/1 column)
Z = heart_df[['anaemia', 'DEATH_EVENT']].corr()
X = list(Z.index)
Y = list(Z.columns)

fig.add_trace(
    go.Heatmap(x=X, y=Y, z=Z,
               zmax=1, zmid=0, zmin=-1),
    row=1, col=2)

fig.update_xaxes(title_text='Anaemia', col=1)
fig.update_yaxes(title_text='Death Event', col=1)
fig.update_layout(title_text='Death Event wise Anaemia & Correlation')

# Anchor the legend inside the top-left corner of the figure
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

fig.show()

In [None]:
# Creatinine phosphokinase vs. death event.
# Left panel: box plot per DEATH_EVENT value (all points shown).
# Right panel: heatmap of the two-column correlation matrix from DataFrame.corr().
feature = 'creatinine_phosphokinase'

corr = heart_df[[feature, 'DEATH_EVENT']].corr()

box_trace = go.Box(x=heart_df['DEATH_EVENT'], y=heart_df[feature], boxpoints='all')
heat_trace = go.Heatmap(x=list(corr.index), y=list(corr.columns), z=corr,
                        zmin=-1, zmid=0, zmax=1)

fig = make_subplots(rows=1, cols=2, column_widths=[2, 1], row_heights=[1])
fig.add_trace(box_trace, row=1, col=1)
fig.add_trace(heat_trace, row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Creatinine Phosphokinase', col=1)
fig.update_layout(title_text='Death Event wise Creatinine Phosphokinase & Correlation')

fig.show()

In [None]:
# Diabetes vs. death event.
# Left panel: counts of diabetes No/Yes, one histogram trace per DEATH_EVENT value.
# Right panel: heatmap of the diabetes / DEATH_EVENT correlation matrix.

i = 'diabetes'

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[2, 1],
    row_heights=[1])

# Explicit copy + assignment instead of a chained in-place replace, which is
# not guaranteed to modify the frame (no-op under pandas Copy-on-Write).
subset = heart_df[[i, 'DEATH_EVENT']].copy()
subset[i] = subset[i].replace({0: 'No', 1: 'Yes'})
subset_died = subset[subset['DEATH_EVENT'] == 1]
subset_survived = subset[subset['DEATH_EVENT'] == 0]

fig.add_trace(go.Histogram(histfunc="count", y=subset_survived['DEATH_EVENT'], x=subset_survived[i], name="DEATH EVENT - NO"),
              row=1, col=1)
fig.add_trace(go.Histogram(histfunc="count", y=subset_died['DEATH_EVENT'], x=subset_died[i], name="DEATH EVENT - YES"),
              row=1, col=1)

# correlation table (computed on the original numeric 0/1 column)
Z = heart_df[[i, 'DEATH_EVENT']].corr()
X = list(Z.index)
Y = list(Z.columns)

fig.add_trace(
    go.Heatmap(x=X, y=Y, z=Z,
               zmax=1, zmid=0, zmin=-1),
    row=1, col=2)

fig.update_xaxes(title_text='Diabetes', col=1)
fig.update_yaxes(title_text='Death Event', col=1)
fig.update_layout(title_text='Death Event wise Diabetes & Correlation')

# Anchor the legend inside the top-left corner of the figure
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

fig.show()

In [None]:
# Ejection fraction vs. death event.
# Left panel: box plot per DEATH_EVENT value (all points shown).
# Right panel: heatmap of the two-column correlation matrix from DataFrame.corr().
feature = 'ejection_fraction'

corr = heart_df[[feature, 'DEATH_EVENT']].corr()

box_trace = go.Box(x=heart_df['DEATH_EVENT'], y=heart_df[feature], boxpoints='all')
heat_trace = go.Heatmap(x=list(corr.index), y=list(corr.columns), z=corr,
                        zmin=-1, zmid=0, zmax=1)

fig = make_subplots(rows=1, cols=2, column_widths=[2, 1], row_heights=[1])
fig.add_trace(box_trace, row=1, col=1)
fig.add_trace(heat_trace, row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Ejection Fraction', col=1)
fig.update_layout(title_text='Death Event wise Ejection Fraction & Correlation')

fig.show()

In [None]:
# High blood pressure vs. death event.
# Left panel: counts of high_blood_pressure No/Yes, one histogram trace per DEATH_EVENT value.
# Right panel: heatmap of the high_blood_pressure / DEATH_EVENT correlation matrix.

i = 'high_blood_pressure'

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[2, 1],
    row_heights=[1])

# Explicit copy + assignment instead of a chained in-place replace, which is
# not guaranteed to modify the frame (no-op under pandas Copy-on-Write).
subset = heart_df[[i, 'DEATH_EVENT']].copy()
subset[i] = subset[i].replace({0: 'No', 1: 'Yes'})
subset_died = subset[subset['DEATH_EVENT'] == 1]
subset_survived = subset[subset['DEATH_EVENT'] == 0]

fig.add_trace(go.Histogram(histfunc="count", y=subset_survived['DEATH_EVENT'], x=subset_survived[i], name="DEATH EVENT - NO"),
              row=1, col=1)
fig.add_trace(go.Histogram(histfunc="count", y=subset_died['DEATH_EVENT'], x=subset_died[i], name="DEATH EVENT - YES"),
              row=1, col=1)

# correlation table (computed on the original numeric 0/1 column)
Z = heart_df[[i, 'DEATH_EVENT']].corr()
X = list(Z.index)
Y = list(Z.columns)

fig.add_trace(
    go.Heatmap(x=X, y=Y, z=Z,
               zmax=1, zmid=0, zmin=-1),
    row=1, col=2)

fig.update_xaxes(title_text='High Blood Pressure', col=1)
fig.update_yaxes(title_text='Death Event', col=1)
fig.update_layout(title_text='Death Event wise High Blood Pressure & Correlation')

# Anchor the legend inside the top-left corner of the figure
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

fig.show()

In [None]:
# Platelets vs. death event.
# Left panel: box plot per DEATH_EVENT value (all points shown).
# Right panel: heatmap of the two-column correlation matrix from DataFrame.corr().
feature = 'platelets'

corr = heart_df[[feature, 'DEATH_EVENT']].corr()

box_trace = go.Box(x=heart_df['DEATH_EVENT'], y=heart_df[feature], boxpoints='all')
heat_trace = go.Heatmap(x=list(corr.index), y=list(corr.columns), z=corr,
                        zmin=-1, zmid=0, zmax=1)

fig = make_subplots(rows=1, cols=2, column_widths=[2, 1], row_heights=[1])
fig.add_trace(box_trace, row=1, col=1)
fig.add_trace(heat_trace, row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Platelets', col=1)
fig.update_layout(title_text='Death Event wise Platelets & Correlation')

fig.show()

In [None]:
# Serum creatinine vs. death event.
# Left panel: box plot per DEATH_EVENT value (all points shown).
# Right panel: heatmap of the two-column correlation matrix from DataFrame.corr().
feature = 'serum_creatinine'

corr = heart_df[[feature, 'DEATH_EVENT']].corr()

box_trace = go.Box(x=heart_df['DEATH_EVENT'], y=heart_df[feature], boxpoints='all')
heat_trace = go.Heatmap(x=list(corr.index), y=list(corr.columns), z=corr,
                        zmin=-1, zmid=0, zmax=1)

fig = make_subplots(rows=1, cols=2, column_widths=[2, 1], row_heights=[1])
fig.add_trace(box_trace, row=1, col=1)
fig.add_trace(heat_trace, row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Serum Creatinine', col=1)
fig.update_layout(title_text='Death Event wise Serum Creatinine & Correlation')

fig.show()

In [None]:
# Serum sodium vs. death event.
# Left panel: box plot per DEATH_EVENT value (all points shown).
# Right panel: heatmap of the two-column correlation matrix from DataFrame.corr().
feature = 'serum_sodium'

corr = heart_df[[feature, 'DEATH_EVENT']].corr()

box_trace = go.Box(x=heart_df['DEATH_EVENT'], y=heart_df[feature], boxpoints='all')
heat_trace = go.Heatmap(x=list(corr.index), y=list(corr.columns), z=corr,
                        zmin=-1, zmid=0, zmax=1)

fig = make_subplots(rows=1, cols=2, column_widths=[2, 1], row_heights=[1])
fig.add_trace(box_trace, row=1, col=1)
fig.add_trace(heat_trace, row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Serum Sodium', col=1)
fig.update_layout(title_text='Death Event wise Serum Sodium & Correlation')

fig.show()

In [None]:
# Sex vs. death event.
# Left panel: counts per sex category, one histogram trace per DEATH_EVENT value.
# Right panel: heatmap of the sex / DEATH_EVENT correlation matrix.

i = 'sex'

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[2, 1],
    row_heights=[1])

# Explicit copy + assignment instead of a chained in-place replace, which is
# not guaranteed to modify the frame (no-op under pandas Copy-on-Write).
# 'No'/'Yes' is meaningless for sex, so the categories are labelled by name.
# NOTE(review): the dataset description encodes sex as 0 = female, 1 = male —
# confirm against the data card before relying on these labels.
subset = heart_df[[i, 'DEATH_EVENT']].copy()
subset[i] = subset[i].replace({0: 'Female', 1: 'Male'})
subset_died = subset[subset['DEATH_EVENT'] == 1]
subset_survived = subset[subset['DEATH_EVENT'] == 0]

fig.add_trace(go.Histogram(histfunc="count", y=subset_survived['DEATH_EVENT'], x=subset_survived[i], name="DEATH EVENT - NO"),
              row=1, col=1)
fig.add_trace(go.Histogram(histfunc="count", y=subset_died['DEATH_EVENT'], x=subset_died[i], name="DEATH EVENT - YES"),
              row=1, col=1)

# correlation table (computed on the original numeric 0/1 column)
Z = heart_df[[i, 'DEATH_EVENT']].corr()
X = list(Z.index)
Y = list(Z.columns)

fig.add_trace(
    go.Heatmap(x=X, y=Y, z=Z,
               zmax=1, zmid=0, zmin=-1),
    row=1, col=2)

fig.update_xaxes(title_text='Sex', col=1)
fig.update_yaxes(title_text='Death Event', col=1)
fig.update_layout(title_text='Death Event wise Sex & Correlation')

# Anchor the legend inside the top-left corner of the figure
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

fig.show()

In [None]:
# Smoking vs. death event.
# Left panel: counts of smoking No/Yes, one histogram trace per DEATH_EVENT value.
# Right panel: heatmap of the smoking / DEATH_EVENT correlation matrix.

i = 'smoking'

fig = make_subplots(
    rows=1, cols=2,
    column_widths=[2, 1],
    row_heights=[1])

# Explicit copy + assignment instead of a chained in-place replace, which is
# not guaranteed to modify the frame (no-op under pandas Copy-on-Write).
subset = heart_df[[i, 'DEATH_EVENT']].copy()
subset[i] = subset[i].replace({0: 'No', 1: 'Yes'})
subset_died = subset[subset['DEATH_EVENT'] == 1]
subset_survived = subset[subset['DEATH_EVENT'] == 0]

fig.add_trace(go.Histogram(histfunc="count", y=subset_survived['DEATH_EVENT'], x=subset_survived[i], name="DEATH EVENT - NO"),
              row=1, col=1)
fig.add_trace(go.Histogram(histfunc="count", y=subset_died['DEATH_EVENT'], x=subset_died[i], name="DEATH EVENT - YES"),
              row=1, col=1)

# correlation table (computed on the original numeric 0/1 column)
Z = heart_df[[i, 'DEATH_EVENT']].corr()
X = list(Z.index)
Y = list(Z.columns)

fig.add_trace(
    go.Heatmap(x=X, y=Y, z=Z,
               zmax=1, zmid=0, zmin=-1),
    row=1, col=2)

fig.update_xaxes(title_text='Smoking', col=1)
fig.update_yaxes(title_text='Death Event', col=1)
fig.update_layout(title_text='Death Event wise Smoking & Correlation')

# Anchor the legend inside the top-left corner of the figure
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

fig.show()

In [None]:
# The `time` column vs. death event.
# Left panel: box plot per DEATH_EVENT value (all points shown).
# Right panel: heatmap of the two-column correlation matrix from DataFrame.corr().
feature = 'time'

corr = heart_df[[feature, 'DEATH_EVENT']].corr()

box_trace = go.Box(x=heart_df['DEATH_EVENT'], y=heart_df[feature], boxpoints='all')
heat_trace = go.Heatmap(x=list(corr.index), y=list(corr.columns), z=corr,
                        zmin=-1, zmid=0, zmax=1)

fig = make_subplots(rows=1, cols=2, column_widths=[2, 1], row_heights=[1])
fig.add_trace(box_trace, row=1, col=1)
fig.add_trace(heat_trace, row=1, col=2)

fig.update_xaxes(title_text='Death Event', col=1)
fig.update_yaxes(title_text='Time', col=1)
fig.update_layout(title_text='Death Event wise Time & Correlation')

fig.show()

In [None]:
# Correlation matrix across all columns of the dataset
corr_df = heart_df.corr()

# Boolean mask hiding the upper triangle (including the diagonal) so each pair
# is shown once. The builtin `bool` is used because the `np.bool` alias was
# removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr_df, dtype=bool))

plt.figure(figsize=(20, 8))
sns.heatmap(corr_df,
            vmin=-1, vmax=1,
            annot=True,
            cmap='PiYG',
            mask=mask,
            linewidths=.5)
plt.show()

In [None]:
# Ranking table: correlation of every feature with DEATH_EVENT, ranked by
# absolute value (highest rank = strongest correlation, listed first).

# Drop the target's own row by label; this returns a new frame, so the column
# assignments below do not write into a slice of corr_df.
corr_death = corr_df[['DEATH_EVENT']].drop(index='DEATH_EVENT')
corr_death['ABS_DEATH_EVENT'] = corr_death['DEATH_EVENT'].abs()
corr_death['Rank'] = corr_death['ABS_DEATH_EVENT'].rank()
corr_death = corr_death.sort_values(by='Rank', ascending=False)
corr_death

In [None]:
# Correlation ranking charts:
# left  = heatmap of the rank of each feature,
# right = horizontal bars of the signed correlation with DEATH_EVENT, annotated with values.
fig, (ax_rank, ax_bar) = plt.subplots(1, 2, figsize=(16, 8))

# vmin/vmax must be scalars, so take them from the Series rather than a one-column frame
rank_min = corr_death['Rank'].min()
rank_max = corr_death['Rank'].max()
sns.heatmap(corr_death[['Rank']],
            vmin=rank_min, vmax=rank_max,
            cmap='Blues',
            linewidths=.5,
            ax=ax_rank)

# Sort once and reuse for both the bars and their labels
death_corr_sorted = corr_death.sort_values(by='Rank', ascending=True)['DEATH_EVENT']
death_corr_sorted.plot(kind='barh', color='LightBlue', width=1, edgecolor='black', ax=ax_bar)
for position, value in enumerate(death_corr_sorted):
    label = "{:.2f}".format(value)
    # Place the label halfway along the bar, slightly below its centre line
    ax_bar.annotate(label, xy=(value/2, position-0.1), fontsize=15)

ax_bar.axis('off')

plt.show()

# Modeling

## Logistic Regression

In [None]:
# Logistic regression: measure test accuracy while adding features in order of
# absolute correlation with DEATH_EVENT (top 2 first, then top 3, ... up to all).
# NOTE(review): the scaler is fitted on the full data before the train/test split
# and the feature ranking also uses the full data, so the test accuracy is
# optimistically biased. Left unchanged here so reported numbers stay comparable.
columns = list(corr_death.index)
n_columns = len(columns)

transform = preprocessing.StandardScaler()

lr = LogisticRegression()

y = heart_df['DEATH_EVENT']

# Collect one record per feature count and build the table once
# (DataFrame.append was removed in pandas 2.0).
acc_records = []
for i in range(2, n_columns + 1):
    features = columns[0:i]
    print(features)
    x = transform.fit_transform(heart_df[features])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    # float() works whether accuracy_score returns a Python float or a NumPy scalar
    acc = float(accuracy_score(y_test, y_hat)) * 100

    print('Accuracy: ', '{:.2f}%'.format(acc), '\n')
    acc_records.append({'Columns': i, 'Accuracy': acc})

acc_table = pd.DataFrame(acc_records, columns=['Columns', 'Accuracy'])

# Row with the highest accuracy; on ties idxmax keeps the first one (fewest features)
best_row = acc_table.loc[acc_table['Accuracy'].idxmax()]
best = int(best_row['Columns'])
print("Best number of columns: ", best)
best_acc = best_row['Accuracy']
ymin = acc_table['Accuracy'].min()

acc_table.set_index('Columns').plot(kind='line',
                                    figsize=(25, 8),
                                    marker='o',
                                    legend=False)
# The model with `best` columns was trained on columns[0:best] (not best+1)
plt.annotate('Best Accuracy: ' + '{:.2f}%'.format(best_acc) + ' ' + str(columns[0:best]),
             xy=(best-0.6, best_acc+0.05),
             fontsize=12,
             color='red')

plt.title('Accuracy of Logistic Regression with difference factors \n', fontsize=20)
plt.xlabel('Columns', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)
plt.ylim(ymin-0.2, best_acc+0.5)

plt.show()

In [None]:
# Refit logistic regression on the best-scoring feature subset and report its
# confusion matrix and accuracy on the same train/test split as the sweep.
best_features = columns[0:best]
print(columns)
print(best_features)
best_x = heart_df[best_features]
y = heart_df['DEATH_EVENT']
best_x = transform.fit_transform(best_x)
x_train, x_test, y_train, y_test = train_test_split(best_x, y, test_size=0.2, random_state=2)
lr.fit(x_train, y_train)
best_y_hat = lr.predict(x_test)

cm = confusion_matrix(y_test, best_y_hat)
# plot_confusion_matrix creates its own figure, so no plt.figure() beforehand
# (that would only leave an extra empty figure in the output).
plot_confusion_matrix(cm, figsize=(12, 8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("Logistic Regression Model - Confusion Matrix")
plt.xticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.show()

lr_best_acc = accuracy_score(y_test, best_y_hat)*100

print('Accuracy: ', '{:.2f}%'.format(lr_best_acc), '\n')

## Support Vector

In [None]:
# SVM: measure test accuracy while adding features in order of absolute
# correlation with DEATH_EVENT (top 2 first, then top 3, ... up to all).
# NOTE(review): the scaler is fitted on the full data before the train/test split
# and the feature ranking also uses the full data, so the test accuracy is
# optimistically biased. Left unchanged here so reported numbers stay comparable.
columns = list(corr_death.index)
n_columns = len(columns)

transform = preprocessing.StandardScaler()

svm = SVC()

y = heart_df['DEATH_EVENT']

# Collect one record per feature count and build the table once
# (DataFrame.append was removed in pandas 2.0).
acc_records = []
for i in range(2, n_columns + 1):
    features = columns[0:i]
    print(features)
    x = transform.fit_transform(heart_df[features])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    svm.fit(x_train, y_train)
    y_hat = svm.predict(x_test)
    # float() works whether accuracy_score returns a Python float or a NumPy scalar
    acc = float(accuracy_score(y_test, y_hat)) * 100

    print('Accuracy: ', '{:.2f}%'.format(acc), '\n')
    acc_records.append({'Columns': i, 'Accuracy': acc})

acc_table = pd.DataFrame(acc_records, columns=['Columns', 'Accuracy'])

# Row with the highest accuracy; on ties idxmax keeps the first one (fewest features)
best_row = acc_table.loc[acc_table['Accuracy'].idxmax()]
best = int(best_row['Columns'])
print("Best number of columns: ", best)
best_acc = best_row['Accuracy']
ymin = acc_table['Accuracy'].min()

acc_table.set_index('Columns').plot(kind='line',
                                    figsize=(25, 8),
                                    marker='o',
                                    legend=False)
# The model with `best` columns was trained on columns[0:best] (not best+1)
plt.annotate('Best Accuracy: ' + '{:.2f}%'.format(best_acc) + ' ' + str(columns[0:best]),
             xy=(best-0.6, best_acc+0.05),
             fontsize=12,
             color='red')

# Title names the model actually evaluated in this section
plt.title('Accuracy of Support Vector Machine with difference factors \n', fontsize=20)
plt.xlabel('Columns', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)
plt.ylim(ymin-0.5, best_acc+0.5)

plt.show()

In [None]:
# Refit the SVM on the best-scoring feature subset and report its confusion
# matrix and accuracy on the same train/test split as the sweep.
best_features = columns[0:best]
print(columns)
print(best_features)
best_x = heart_df[best_features]
y = heart_df['DEATH_EVENT']
best_x = transform.fit_transform(best_x)
x_train, x_test, y_train, y_test = train_test_split(best_x, y, test_size=0.2, random_state=2)
svm.fit(x_train, y_train)
best_y_hat = svm.predict(x_test)

cm = confusion_matrix(y_test, best_y_hat)
# plot_confusion_matrix creates its own figure, so no plt.figure() beforehand
# (that would only leave an extra empty figure in the output).
plot_confusion_matrix(cm, figsize=(12, 8), hide_ticks=True, cmap=plt.cm.Blues)
# Title names the model actually evaluated in this section
plt.title("Support Vector Machine Model - Confusion Matrix")
plt.xticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.show()

svm_best_acc = accuracy_score(y_test, best_y_hat)*100

print('Accuracy: ', '{:.2f}%'.format(svm_best_acc), '\n')

## KNN

In [None]:
# KNN: grid over the number of top-correlated features (2..all) and the number
# of neighbours k (1..14), recording test accuracy for every combination.
# NOTE(review): the scaler is fitted on the full data before the train/test split
# and the feature ranking also uses the full data, so the test accuracy is
# optimistically biased. Left unchanged here so reported numbers stay comparable.
columns = list(corr_death.index)
n_columns = len(columns)

transform = preprocessing.StandardScaler()

y = heart_df['DEATH_EVENT']

# Collect one record per (feature count, k) and build the table once
# (DataFrame.append was removed in pandas 2.0).
kacc_records = []
for i in range(2, n_columns + 1):
    features = columns[0:i]
    print(features)
    # Scaling and splitting depend only on the feature set, not on k,
    # so they are done once per feature set instead of once per k.
    x = transform.fit_transform(heart_df[features])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    for k in range(1, 15):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)
        y_hat = knn.predict(x_test)
        # float() works whether accuracy_score returns a Python float or a NumPy scalar
        acc = float(accuracy_score(y_test, y_hat)) * 100
        kacc_records.append({'Columns': i, 'K': k, 'Accuracy': acc})

kacc_table = pd.DataFrame(kacc_records, columns=['Columns', 'K', 'Accuracy'])

# Show the five best combinations (a bare expression mid-cell would not be displayed)
print(kacc_table.sort_values(by='Accuracy', ascending=False).head())

# Best combination; on ties idxmax keeps the first one (fewest features, then smallest k)
best_row = kacc_table.loc[kacc_table['Accuracy'].idxmax()]
c_best = int(best_row['Columns'])
k_best = int(best_row['K'])

print('\nBest K: ', k_best)
print('\nBest Columns: ', c_best)

In [None]:
# Refit KNN with the best (feature count, k) pair and report its confusion
# matrix and accuracy on the same train/test split as the sweep.

# Select the KNN feature subset BEFORE printing it; printing first would show
# the stale `best_features` left over from the previous model section.
best_features = columns[0:int(c_best)]
knn = KNeighborsClassifier(n_neighbors=int(k_best))

print(columns)
print(best_features)
print(k_best)

best_x = heart_df[best_features]
y = heart_df['DEATH_EVENT']
best_x = transform.fit_transform(best_x)
x_train, x_test, y_train, y_test = train_test_split(best_x, y, test_size=0.2, random_state=2)
knn.fit(x_train, y_train)
best_y_hat = knn.predict(x_test)

cm = confusion_matrix(y_test, best_y_hat)
# plot_confusion_matrix creates its own figure, so no plt.figure() beforehand
# (that would only leave an extra empty figure in the output).
plot_confusion_matrix(cm, figsize=(12, 8), hide_ticks=True, cmap=plt.cm.Blues)
# Title names the model actually evaluated in this section
plt.title("KNN Model - Confusion Matrix")
plt.xticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.show()

knn_best_acc = accuracy_score(y_test, best_y_hat)*100

print('Accuracy: ', '{:.2f}%'.format(knn_best_acc), '\n')

## Tree

In [None]:
# Decision tree: measure test accuracy while adding features in order of
# absolute correlation with DEATH_EVENT (top 2 first, then top 3, ... up to all).
# NOTE(review): the scaler is fitted on the full data before the train/test split
# and the feature ranking also uses the full data, so the test accuracy is
# optimistically biased. Left unchanged here so the other steps stay comparable.
columns = list(corr_death.index)
n_columns = len(columns)

transform = preprocessing.StandardScaler()

# Fixed random_state so repeated fits on the same data give the same tree;
# otherwise a later refit may not reproduce the accuracy found in this sweep.
tree = DecisionTreeClassifier(random_state=2)

y = heart_df['DEATH_EVENT']

# Collect one record per feature count and build the table once
# (DataFrame.append was removed in pandas 2.0).
acc_records = []
for i in range(2, n_columns + 1):
    features = columns[0:i]
    print(features)
    x = transform.fit_transform(heart_df[features])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
    tree.fit(x_train, y_train)
    y_hat = tree.predict(x_test)
    # float() works whether accuracy_score returns a Python float or a NumPy scalar
    acc = float(accuracy_score(y_test, y_hat)) * 100

    print('Accuracy: ', '{:.2f}%'.format(acc), '\n')
    acc_records.append({'Columns': i, 'Accuracy': acc})

acc_table = pd.DataFrame(acc_records, columns=['Columns', 'Accuracy'])

# Row with the highest accuracy; on ties idxmax keeps the first one (fewest features)
best_row = acc_table.loc[acc_table['Accuracy'].idxmax()]
best = int(best_row['Columns'])
print("Best number of columns: ", best)
best_acc = best_row['Accuracy']
ymin = acc_table['Accuracy'].min()

acc_table.set_index('Columns').plot(kind='line',
                                    figsize=(25, 8),
                                    marker='o',
                                    legend=False)
# The model with `best` columns was trained on columns[0:best] (not best+1)
plt.annotate('Best Accuracy: ' + '{:.2f}%'.format(best_acc) + ' ' + str(columns[0:best]),
             xy=(best-0.6, best_acc+0.05),
             fontsize=12,
             color='red')

# Title names the model actually evaluated in this section
plt.title('Accuracy of Decision Tree with difference factors \n', fontsize=20)
plt.xlabel('Columns', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)
plt.ylim(ymin-0.5, best_acc+0.5)

plt.show()

In [None]:
# Refit the decision tree on the best-scoring feature subset and report its
# confusion matrix and accuracy on the same train/test split as the sweep.
best_features = columns[0:best]
print(columns)
print(best_features)
best_x = heart_df[best_features]
y = heart_df['DEATH_EVENT']
best_x = transform.fit_transform(best_x)
x_train, x_test, y_train, y_test = train_test_split(best_x, y, test_size=0.2, random_state=2)
tree.fit(x_train, y_train)
best_y_hat = tree.predict(x_test)

cm = confusion_matrix(y_test, best_y_hat)
# plot_confusion_matrix creates its own figure, so no plt.figure() beforehand
# (that would only leave an extra empty figure in the output).
plot_confusion_matrix(cm, figsize=(12, 8), hide_ticks=True, cmap=plt.cm.Blues)
# Title names the model actually evaluated in this section
plt.title("Decision Tree Model - Confusion Matrix")
plt.xticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed", "Heart Fail"], fontsize=16)
plt.show()

tree_best_acc = accuracy_score(y_test, best_y_hat)*100

print('Accuracy: ', '{:.2f}%'.format(tree_best_acc), '\n')

# Summary

In [None]:
# Compare the best accuracy reached by each model in a single bar chart,
# highlighting the top-scoring model(s) in green.
acc_list = [lr_best_acc, svm_best_acc, knn_best_acc, tree_best_acc]

# One row per model; built directly in long form with 1-D columns
bar_df = pd.DataFrame({'value': acc_list}, index=['lr', 'svm', 'knn', 'tree'])
bar_df['color'] = np.where(bar_df['value'] == bar_df['value'].max(), 'green', 'grey')

plt.figure(figsize=(20, 8))
plt.bar(x=bar_df.index, height=bar_df['value'],
        color=bar_df['color'])

# Write each accuracy at mid-height of its bar
for position, value in enumerate(acc_list):
    label = '{:.2f}%'.format(value)
    plt.annotate(label, xy=(position-0.05, value/2), fontsize=12)

plt.title('Accuracy of the Models\n', fontsize=20)
plt.xlabel('Model\n', fontsize=12)
plt.ylabel('Accuracy\n', fontsize=12)
plt.xticks(rotation=0, fontsize=12)

plt.show()