In [None]:
import numpy as np
import pandas as pd 
from collections import Counter
import re
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever name this install provides.
plt.style.use('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
              else 'seaborn-bright')

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score, roc_curve, auc 

from scipy import stats
from scipy.stats import norm, skew, boxcox
import statsmodels.formula.api as sm

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the heart-failure clinical records CSV into `data`.
# NOTE(review): absolute Kaggle path — the notebook will not run elsewhere without changing it.
data = pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

## Data Overview

In [None]:
# Print the column summary, then display the first rows.
data.info()
data.head()

In [None]:
# Summary statistics, transposed so each row describes one column of `data`.
data.describe().T

In [None]:
# Report the (rows, columns) shape of the loaded frame.
print('Data shape: {}'.format(data.shape))

In [None]:
# Class balance of the target variable.
plt.figure(figsize=(7, 5))
# Pass the column by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, not `x`, so a bare Series is no longer read as the x variable.
sns.countplot(x='DEATH_EVENT', data=data)
plt.xlabel('Death Event')
plt.ylabel('Count')

##### Function for Distribution Plotting

In [None]:
def plotting(col):
    """Plot the distribution of ``data[col]`` with a fitted normal curve.

    Opens a new 6x4 figure, draws the seaborn distribution plot of the column
    taken from the module-level ``data`` frame with a normal fit overlaid,
    and adds a legend showing the fitted mean and standard deviation.

    Parameters
    ----------
    col : str
        Name of a numeric column in ``data``.
    """
    plt.figure(figsize=(6, 4))
    # NOTE(review): `distplot` is deprecated in recent seaborn releases; it is
    # kept here because it draws the `fit=norm` overlay in one call.
    sns.distplot(data[col], fit=norm)

    (mu, sigma) = norm.fit(data[col])

    # Raw string: '\m' and '\s' are invalid escape sequences in a normal
    # literal (SyntaxWarning on Python 3.12+); the rendered text is unchanged.
    plt.legend([r'Normal dist. ($\mu$ {:.2f} and $\sigma$ {:.2f})'.format(mu, sigma)], loc='upper right')
    # Default title so the figure is identifiable; callers may override it.
    plt.title('Distribution of {}'.format(col))

In [None]:
# Numeric columns whose distributions are plotted, one figure per column.
num_cols = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]
for col in num_cols:
    plotting(col)

##### Data Correlation

In [None]:
# Pairwise correlation of all columns, annotated to two decimals.
plt.figure(figsize=(8, 6))
sns.heatmap(data=data.corr(), fmt='.2f', annot=True)
plt.title('Correlation')

## Data Preprocessing

##### Function for Finding Outliers

In [None]:
def outliers(data, features, whisker=1.5, min_features=2):
    """Return index labels of rows that are IQR outliers in several features.

    For each column in ``features`` a value is an outlier when it lies more
    than ``whisker`` times the inter-quartile range below the 25th percentile
    or above the 75th percentile. A row is reported only when it is an
    outlier in at least ``min_features`` of the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named in ``features``.
    features : iterable of str
        Column names to examine.
    whisker : float, default 1.5
        Multiplier applied to the IQR to obtain the fence distance.
    min_features : int, default 2
        Minimum number of columns in which a row must be an outlier.
        The default reproduces the original "more than one feature" rule.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    hits = Counter()

    for col in features:
        q1, q3 = np.percentile(data[col], [25, 75])
        step = (q3 - q1) * whisker

        flagged = (data[col] < q1 - step) | (data[col] > q3 + step)
        # One count per column in which the row falls outside the fences.
        hits.update(data[flagged].index)

    return [label for label, n_hits in hits.items() if n_hits >= min_features]

##### Outliers Themselves

In [None]:
# Rows flagged by `outliers` across the numeric columns.
# Reuse `num_cols` instead of repeating the same seven column names.
data.loc[outliers(data, num_cols)]

In [None]:
# Drop the flagged rows and renumber the index.
# Reuse `num_cols` instead of repeating the same seven column names.
# NOTE(review): rebinding `data` makes this cell non-idempotent — re-running it
# recomputes the fences on the already-filtered frame.
data = data.drop(outliers(data, num_cols), axis=0).reset_index(drop=True)

##### As a Result - no Outliers

In [None]:
# Re-run the detection on the filtered frame to see what is still flagged.
# Reuse `num_cols` instead of repeating the same seven column names.
data.loc[outliers(data, num_cols)]

In [None]:
# Skewness of every column (missing values ignored), most skewed first.
skewed_values = (
    data.apply(lambda column: skew(column.dropna()))
        .sort_values(ascending=False)
        .to_frame('Skewed Values')
)
skewed_values

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('creatinine_phosphokinase')
plt.title('Creatinine Phosphokinase before Transformation')
plt.xlabel('Creatinine Phosphokinase')
plt.ylabel('Density')

In [None]:
# Box-Cox transform the column in place; `value_cp` receives the second value returned by `boxcox`.
# NOTE(review): overwrites the raw column, so re-running this cell transforms it again.
data['creatinine_phosphokinase'], value_cp = boxcox(data['creatinine_phosphokinase'])

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('creatinine_phosphokinase')
plt.title('Creatinine Phosphokinase after Transformation')
plt.xlabel('Creatinine Phosphokinase')
plt.ylabel('Density')

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('serum_creatinine')
plt.title('Serum Creatinine before Transformation')
plt.xlabel('Serum Creatinine')
plt.ylabel('Density')

In [None]:
# Box-Cox transform the column in place; `value_sc` receives the second value returned by `boxcox`.
# NOTE(review): overwrites the raw column, so re-running this cell transforms it again.
data['serum_creatinine'], value_sc = boxcox(data['serum_creatinine'])

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('serum_creatinine')
plt.title('Serum Creatinine after Transformation')
plt.xlabel('Serum Creatinine')
plt.ylabel('Density')

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('platelets')
plt.title('Platelets before Transformation')
plt.xlabel('Platelets')
plt.ylabel('Density')

In [None]:
# Box-Cox transform the column in place; `value_p` receives the second value returned by `boxcox`.
# NOTE(review): overwrites the raw column, so re-running this cell transforms it again.
data['platelets'], value_p = boxcox(data['platelets'])

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('platelets')
plt.title('Platelets after Transformation')
plt.xlabel('Platelets')
plt.ylabel('Density')

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('ejection_fraction')
plt.title('Ejection Fraction before Transformation')
plt.xlabel('Ejection Fraction')
plt.ylabel('Density')

In [None]:
# Box-Cox transform the column in place; `value_ef` receives the second value returned by `boxcox`.
# NOTE(review): overwrites the raw column, so re-running this cell transforms it again.
data["ejection_fraction"], value_ef = boxcox(data["ejection_fraction"])

In [None]:
# Reuse the shared `plotting` helper (new figure, distribution plot, normal
# fit, legend) instead of repeating its body, then add this cell's labels.
plotting('ejection_fraction')
plt.title('Ejection Fraction after Transformation')
plt.xlabel('Ejection Fraction')
plt.ylabel('Density')

In [None]:
# Recompute per-column skewness (missing values ignored), most skewed first.
skewed_values = (
    data.apply(lambda column: skew(column.dropna()))
        .sort_values(ascending=False)
        .to_frame('Skewed Values')
)
skewed_values

In [None]:
# Number of rows per target class.
data.DEATH_EVENT.value_counts()

##### Resampling classes

In [None]:
data_majority = data[data['DEATH_EVENT']==0]
data_minority = data[data['DEATH_EVENT']==1]

In [None]:
data_majority.shape, data_minority.shape

In [None]:
from sklearn.utils import resample

data_minority_upsampled = resample(data_minority, 
                                 replace=True,     
                                 n_samples=198,   
                                 random_state=123)

In [None]:
data = pd.concat([data_majority, data_minority_upsampled])

In [None]:
data.DEATH_EVENT.value_counts()

## Modeling

In [None]:
X = data.drop(['DEATH_EVENT'], axis=1)
y = data.DEATH_EVENT

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Random Forest

In [None]:
# Fit a forest on every feature and plot its importance ranking.
plt.figure(figsize=(6, 4))

# Seed the forest: without random_state the importances change from run to
# run, so a feature list chosen from this plot would not be reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.title('Random Forest feature importances')

In [None]:
# Feature subset used for the tuned random forest (same columns for train and test).
rfc_features = ['time', 'serum_creatinine', 'ejection_fraction', 'age', 'creatinine_phosphokinase']
X_train_RFC = X_train[rfc_features]
X_test_RFC = X_test[rfc_features]

In [None]:
# Base estimator for the grid search; seeded so the search result is reproducible.
RFC_clf = RandomForestClassifier(random_state=42)

In [None]:
# Search space for the random forest.
# 'auto' was an alias of 'sqrt' for classifiers and is rejected by
# scikit-learn >= 1.3, so it is replaced by None (use all features), which is
# a genuinely different setting.
params = {
    'n_estimators': range(10, 100, 20),
    'max_depth': range(1, 10), 
    'n_jobs': [-1], 
    'max_features': ['sqrt', None]
}

In [None]:
# 5-fold grid search over `params` on the reduced feature set.
grid_clf_RFC = GridSearchCV(
    estimator=RFC_clf,
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_clf_RFC.fit(X_train_RFC, y_train)

In [None]:
# Take the best estimator found by the grid search and predict on the test
# features: hard labels and per-class probabilities.
RFC_clf = grid_clf_RFC.best_estimator_
y_pred_RFC = RFC_clf.predict(X_test_RFC)
y_pred_prob_RFC = RFC_clf.predict_proba(X_test_RFC)

In [None]:
# Text report and confusion matrix for the random forest, then its F1 score.
report_RFC = classification_report(y_test, y_pred_RFC)
matrix_RFC = confusion_matrix(y_test, y_pred_RFC)
print(report_RFC, matrix_RFC)
RFC_f1_score = f1_score(y_test, y_pred_RFC)

In [None]:
# ROC curve of the random forest from the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_RFC[:, 1])
roc_auc_RFC = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc_RFC)
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="lower right")
plt.show()

### Decision Tree

In [None]:
# Fit a decision tree on every feature and plot its importance ranking.
plt.figure(figsize=(6, 4))

# Seed the tree: without random_state the fitted tree, and therefore the
# ranking, can differ between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.title('Decision Tree feature importances')

In [None]:
# Feature subset used for the tuned decision tree (same columns for train and test).
dt_features = ['serum_creatinine', 'creatinine_phosphokinase', 'ejection_fraction', 'time']
X_train_DT = X_train[dt_features]
X_test_DT = X_test[dt_features]

In [None]:
# Base estimator for the grid search; seeded so the search result is reproducible.
DT_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Search space for the decision tree.
# The original `(10, 50, 10)` was a tuple, so only the values 10 and 50 were
# tried (10 twice). A range with step 10 is what was meant; the stop of 51
# keeps 50, which the original search did evaluate.
params = {
    'max_depth': range(1, 10), 
    'min_samples_split': range(10, 51, 10)
}

In [None]:
# 5-fold grid search over `params` on the reduced feature set.
grid_clf_DT = GridSearchCV(
    estimator=DT_clf,
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_clf_DT.fit(X_train_DT, y_train)

In [None]:
# Take the best estimator found by the grid search and predict on the test
# features: hard labels and per-class probabilities.
DT_clf = grid_clf_DT.best_estimator_
y_pred_DT = DT_clf.predict(X_test_DT)
y_pred_prob_DT = DT_clf.predict_proba(X_test_DT)

In [None]:
# Text report and confusion matrix for the decision tree, then its F1 score.
report_DT = classification_report(y_test, y_pred_DT)
matrix_DT = confusion_matrix(y_test, y_pred_DT)
print(report_DT, matrix_DT)
DT_f1_score = f1_score(y_test, y_pred_DT)

In [None]:
# ROC curve of the decision tree from the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_DT[:, 1])
roc_auc_DT = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc_DT)
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="lower right")
plt.show()

### k-Nearest Neighbors

In [None]:
# Mean 5-fold cross-validation score for each candidate number of neighbours.
# NOTE(review): the features are not scaled before the distance-based model — confirm this is intended.
n_neighbors = range(2, 20)

# Collect one record per k and build the frame once: `DataFrame.append` was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
score_records = []
for neighbor in n_neighbors:
    kNN_clf = KNeighborsClassifier(n_neighbors=neighbor)

    # cross_val_score fits its own clones, so no separate fit is needed here.
    mean_cross_val_score = cross_val_score(kNN_clf, X_train, y_train, cv = 5).mean()

    score_records.append({'neighbors': neighbor,
                          'cross_val_score': mean_cross_val_score})

scores_data = pd.DataFrame(score_records, columns=['neighbors', 'cross_val_score'])

scores_data.head()

In [None]:
# Cross-validation score as a function of the number of neighbours.
plt.figure(figsize=(6, 4))
sns.lineplot(data=scores_data, x='neighbors', y='cross_val_score')

In [None]:
# Show the neighbour counts with the highest cross-validation scores.
scores_data.sort_values(by='cross_val_score', ascending=False).head()

In [None]:
# Take k from the cross-validation table instead of a hard-coded 6, so the
# choice stays correct when the data or the split changes.
best_k = int(scores_data.loc[scores_data['cross_val_score'].idxmax(), 'neighbors'])
kNN_clf = KNeighborsClassifier(n_neighbors=best_k)

kNN_clf.fit(X_train, y_train)

In [None]:
# Predict on the test features: hard labels and per-class probabilities.
y_pred_kNN = kNN_clf.predict(X_test)
y_pred_prob_kNN = kNN_clf.predict_proba(X_test)

In [None]:
# Text report and confusion matrix for k-nearest neighbours, then its F1 score.
report_kNN = classification_report(y_test, y_pred_kNN)
matrix_kNN = confusion_matrix(y_test, y_pred_kNN)
print(report_kNN, matrix_kNN)
kNN_f1_score = f1_score(y_test, y_pred_kNN)

In [None]:
# ROC curve of k-nearest neighbours from the positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_kNN[:, 1])
roc_auc_kNN = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc_kNN)
ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC')
ax.legend(loc="lower right")
plt.show()

##### As we can see, Random Forest performed better than the other models

In [None]:
# Side-by-side F1 and ROC-AUC scores of the three models.
model_names = ['Random Forest', 'Decision Tree', 'k-Nearest Neighbors']
f1_scores = [RFC_f1_score, DT_f1_score, kNN_f1_score]
roc_auc_scores = [roc_auc_RFC, roc_auc_DT, roc_auc_kNN]

scores_comparison = pd.DataFrame({
    'Model': model_names,
    'F1 Score': f1_scores,
    'ROC-AUC Score': roc_auc_scores,
})
scores_comparison