### Import all necessary libraries

In [None]:
# Built-in packages
import json

# Third party packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

# Charting options
sns.set(context= "notebook", color_codes=True)
%matplotlib inline

### Read the dataset

In [None]:
# Load the clinical records and declare the column groups used by later cells
df = pd.read_csv(
    "/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

# Every column except the last one in the file
col_names = df.columns[:-1]
target_col = ["DEATH_EVENT"]

# Columns handled as numeric measurements (standardised later on)
num_cols = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",
]

# Columns handled as binary flags
bin_cols = [
    "high_blood_pressure",
    "sex",
    "smoking",
    "anaemia",
    "diabetes",
]

df.head()

In [None]:
# Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Keep the summary table in a variable: draw_axvlines reads the mean and
# quartiles from it when annotating the distribution plots.
df_summary = df.describe()
df_summary

Looking at the max and Q3 values, it seems there are a few outliers in the columns **creatinine_phosphokinase**, **ejection_fraction**, **platelets** and **serum_creatinine**.

In [None]:
def draw_axvlines(plt, col, summary=None):
    """Mark the mean and quartiles of ``col`` with vertical lines and add a legend.

    Parameters
    ----------
    plt : object exposing ``axvline`` and ``legend``
        The target to draw on (the callers in this notebook pass a matplotlib
        Axes). The parameter name is kept for backward compatibility; inside
        this function it shadows the ``matplotlib.pyplot`` alias.
    col : str
        Column whose statistics are drawn.
    summary : pandas.DataFrame, optional
        Table with "mean", "25%", "50%" and "75%" rows and ``col`` among its
        columns (the layout produced by ``DataFrame.describe()``). Defaults to
        the notebook-level ``df_summary``.
    """
    stats = df_summary if summary is None else summary

    # (legend label, row in the summary table, line colour)
    markers = (
        ("Mean", "mean", "g"),
        ("25%", "25%", "b"),
        ("50%", "50%", "navy"),
        ("75%", "75%", "purple"),
    )

    # Label every line itself and pass exactly these handles to the legend.
    # Passing bare labels would attach them to whatever lines already exist on
    # the axes (e.g. a density curve drawn earlier), shifting every label by one.
    handles = [
        plt.axvline(stats.loc[row, col], color=colour, label=label)
        for label, row, colour in markers
    ]
    plt.legend(handles=handles);

# One row per numeric column: box plot on the left, histogram/KDE with mean and
# quartile markers on the right.
# Each entry is (column, text used in the axis label, number of histogram bins);
# bins=None leaves the bin count to seaborn.
dist_plot_specs = [
    ("age", "Age", 10),
    ("creatinine_phosphokinase", "creatinine_phosphokinase", 10),
    ("platelets", "platelets", None),
    ("serum_creatinine", "serum_creatinine", None),
    ("ejection_fraction", "ejection_fraction", None),
]

fig, axes = plt.subplots(len(dist_plot_specs), 2, figsize = (20,20));
fig.suptitle('Distribution charts for age, creatinine_phosphokinase, platelets, serum_creatinine and ejection_fraction.');

for row, (col, label, bins) in enumerate(dist_plot_specs):
    box_ax, hist_ax = axes[row]

    # x is passed by keyword so the box stays horizontal regardless of how the
    # installed seaborn version interprets a positional first argument
    sns.boxplot(x = df[col], ax = box_ax, color = "mediumslateblue");
    box_ax.set(xlabel = f'Distribution of {label}');

    pp = sns.distplot(df[col], ax = hist_ax, bins = bins, color = "mediumslateblue");
    hist_ax.set(xlabel = f'Distribution of {label}');
    draw_axvlines(pp, col);

As suspected, all four columns have a lot of outliers.

In [None]:
def groupby_get_cc_count(tdf, col):
    """Count the rows for every (``col``, ``DEATH_EVENT``) combination in ``tdf``.

    Returns a DataFrame with a default integer index and the columns
    ``DEATH_EVENT``, ``col`` and ``count`` (in that order), one row per
    combination that occurs in ``tdf``.
    """
    pair_counts = tdf.groupby([col, "DEATH_EVENT"])["DEATH_EVENT"].count()
    # Name the values "count" so they do not clash with the DEATH_EVENT index level
    flat = pair_counts.rename("count").reset_index()
    return flat[["DEATH_EVENT", col, "count"]]

In [None]:
# DEATH_EVENT counts for each binary column, one panel per column, laid out as a
# row of three panels followed by a row of two.
# x / y / hue are given by keyword together with data=: seaborn >= 0.12 no longer
# accepts x and y positionally.
for panel_cols in (["high_blood_pressure", "sex", "smoking"], ["anaemia", "diabetes"]):
    fig, axes = plt.subplots(1, len(panel_cols), figsize = (20,5));

    for ix, i in enumerate(panel_cols):
        xx = groupby_get_cc_count(df[[i, "DEATH_EVENT"]], i)
        sns.barplot(x = i, y = "count", hue = "DEATH_EVENT", data = xx, palette = "cividis", ax = axes[ix]);

In [None]:
# time against serum_sodium, coloured by DEATH_EVENT
sns.scatterplot(
    data = df[["serum_sodium", "time", "DEATH_EVENT"]],
    x = "time",
    y = "serum_sodium",
    hue = "DEATH_EVENT",
);

In [None]:
# Class balance of the target column.
# The count table is built with explicit column names because the names produced
# by a bare value_counts().reset_index() differ between pandas 1.x and 2.x.
death_event_counts = (
    df[target_col[0]]
    .value_counts()
    .rename_axis(target_col[0])
    .reset_index(name = "count")
)
sns.barplot(x = target_col[0], y = "count", data = death_event_counts, palette = "cividis");

### SMOTE (Synthetic Minority Oversampling Technique) for balancing data

In [None]:
# Feature columns = every column except the target
col_names = list(df.columns)
col_names.remove(target_col[0])

X = df[col_names]
y = df[target_col[0]]

# Fixed seed so the synthetic minority samples are identical on every run.
# NOTE(review): the resampling is applied to the full dataset, before the
# train/test split performed further down. Synthetic rows interpolated from rows
# that later land in the test set can therefore end up in the training set;
# consider fitting SMOTE on the training portion only.
oversample = SMOTE(random_state = 42)
X, y = oversample.fit_resample(X, y)

# Re-assemble the balanced data under the original column names
df = pd.concat([pd.DataFrame(X), pd.DataFrame(y)], axis=1)
df.columns = col_names + target_col
df.head()   # preview only, instead of rendering the whole frame

In [None]:
# Class balance of the target column as df stands at this point in the notebook.
# The count table is built with explicit column names because the names produced
# by a bare value_counts().reset_index() differ between pandas 1.x and 2.x.
death_event_counts = (
    df[target_col[0]]
    .value_counts()
    .rename_axis(target_col[0])
    .reset_index(name = "count")
)
sns.barplot(x = target_col[0], y = "count", data = death_event_counts, palette = "cividis");

In [None]:
# Plain alias of df (no copy is made); df_train is rebuilt from scratch in the next cell
df_train = df

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

# Standardize the numeric columns to get them on the same scale
scaled = std.fit_transform(df[num_cols])

# Give the scaled frame df's own index: pd.concat(axis=1) aligns rows by index
# label, so a fresh 0..n-1 index would misalign (and introduce NaNs) whenever df
# does not carry the default index.
scaled = pd.DataFrame(scaled, columns=num_cols, index=df.index)

df_train = pd.concat([scaled, df[bin_cols + target_col]], axis=1)

df_train.head()

In [None]:
# Pairwise correlations between all columns of df_train, annotated with the values
corr_matrix = df_train.corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr_matrix, fmt='.2g', annot=True);

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Seeded so the importance ranking is the same on every run
model = ExtraTreesClassifier(random_state = 42)

# Use every column of df_train except the target as a feature
col_names = list(df_train.columns)
col_names.remove(target_col[0])

X = df_train[col_names]
y = df_train[target_col[0]]

model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
# Columns kept as model inputs
col_names = [
    "age",
    "serum_creatinine",
    "serum_sodium",
    "ejection_fraction",
    "time",
]

X = df_train[col_names]      # independent columns
y = df_train[target_col]     # target; a one-column DataFrame because target_col is a list

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split stable across runs
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# y was split as a one-column DataFrame; pull the target out as a Series
target_name = target_col[0]
train_y = train_y[target_name]
test_y = test_y[target_name]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, accuracy_score
conf_matrix_all = {}   # model name -> confusion matrix of the latest run under that name
a = []                 # most recent confusion matrix (overwritten on every call)


def death_event_prediction(name, algo, training_x, testing_x, training_y, testing_y, plot) :
    """Fit ``algo``, print its test-set metrics and optionally plot them.

    Parameters
    ----------
    name : str
        Key under which the confusion matrix is stored in ``conf_matrix_all``.
    algo : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    training_x, training_y
        Features and labels passed to ``algo.fit``.
    testing_x, testing_y
        Features and labels every metric is computed on.
    plot : bool
        When truthy, draw the confusion matrix and the ROC curve side by side.

    Side effects
    ------------
    Prints the classification report, the ROC AUC and the accuracy; stores the
    confusion matrix in ``conf_matrix_all[name]`` and in the global ``a``.
    Returns ``None``.
    """
    global a
    algo.fit(training_x, training_y)                          # Fit the training data set to the algorithm passed.
    predictions = algo.predict(testing_x)                     # Hard class predictions
    probabilities = algo.predict_proba(testing_x)             # Per-class probabilities
    positive_scores = probabilities[:, 1]                     # Score of the second class column

    conf_matrix = confusion_matrix(testing_y, predictions)    # Get confusion matrix using the predictions
    conf_matrix_all[name] = conf_matrix                       # Save confusion matrix values to a dictionary
    a = conf_matrix

    print("Classification report:")                           # Print the classification report
    print(classification_report(testing_y, predictions))

    # The AUC has to be computed from the continuous scores. Feeding it the hard
    # 0/1 predictions collapses the ROC curve to a single operating point.
    model_roc_auc = roc_auc_score(testing_y, positive_scores)
    fpr, tpr, thresholds = roc_curve(testing_y, positive_scores)

    print ("Area under the curve: ", model_roc_auc)
    print(accuracy_score(testing_y, predictions))

    if plot:
        fig, axes = plt.subplots(1,2, figsize=(25, 5))

        # Reverse both axes so the matrix reads [[TP, FN], [FP, TN]], matching the
        # quadrant names and the [1, 0] tick labels used below.
        conf_matrix = np.flip(conf_matrix)

        quadrant_names = np.array([['\nTP','\nFN'],['\nFP','\nTN']])
        labels = np.char.add(conf_matrix.astype(str), quadrant_names)
        sns.heatmap(conf_matrix, fmt='', annot = labels, ax=axes[0], cmap="YlGnBu", xticklabels=[1, 0], yticklabels=[1, 0]);   # Plot the confusion matrix
        axes[0].set(xlabel='Predicted', ylabel='Actual')

        # Draw on axes[1] explicitly instead of relying on pyplot's "current axes".
        # A plain matplotlib line is used for the ROC curve: seaborn's lineplot
        # aggregates points that share an x value, which would average away the
        # vertical steps of the curve.
        axes[1].set_title('Receiver Operating Characteristic')
        axes[1].plot(fpr, tpr)                                                     # Plot the ROC curve
        axes[1].plot([0, 1], [0, 1],'--')                                          # Plot the diagonal line
        axes[1].set_xlim([0, 1])                                                   # Set x-axis limit to 0 and 1
        axes[1].set_ylim([0, 1])                                                   # Set y-axis limit to 0 and 1
        axes[1].set(xlabel = 'False Positive Rate', ylabel = 'True Positive Rate');
        plt.show();

In [None]:
# Logistic regression
lr = LogisticRegression(
    penalty="l2",
    C=1e2,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
)

death_event_prediction(
    "Logistic Regression",
    lr,
    train_X,
    test_X,
    train_y,
    test_y,
    plot=True,
)

In [None]:
# K-nearest neighbours: 10 neighbours, Manhattan metric, distance-weighted votes
knn = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    metric="manhattan",
    metric_params=None,
    algorithm='auto',
    leaf_size=30,
)

death_event_prediction(
    "K-nearest Neighbors",
    knn,
    train_X,
    test_X,
    train_y,
    test_y,
    plot=True,
)

In [None]:
# Support vector classifier with a linear kernel.
# probability=True is required because death_event_prediction calls predict_proba.
svc = SVC(
    kernel='linear',
    C=2.0,
    degree=2,
    gamma=1.0,
    coef0=0.0,
    probability=True,
    shrinking=True,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    max_iter=-1,
    random_state=None,
    verbose=False,
)

death_event_prediction(
    "Support Vector Classifier",
    svc,
    train_X,
    test_X,
    train_y,
    test_y,
    plot=True,
)

In [None]:
# Decision tree limited to depth 10.
# min_impurity_split and presort are not passed: both were removed from
# scikit-learn's DecisionTreeClassifier and raise a TypeError on current releases.
# They were previously given as None / 'deprecated', i.e. not actually in use.
# random_state is fixed so the fitted tree is the same on every run.
dtc = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=10, min_samples_split=2,
                             min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None,
                             random_state=42, max_leaf_nodes=None, min_impurity_decrease=0.0,
                             class_weight=None, ccp_alpha=0.0)

death_event_prediction("Decision Tree", dtc, train_X, test_X, train_y, test_y, plot=True)

In [None]:
# Random forest: 100 entropy-criterion trees of depth <= 15.
# max_features='sqrt' replaces 'auto', which scikit-learn removed; for classifiers
# 'auto' meant sqrt(n_features). min_impurity_split was removed as well and had
# been passed as None, so it is simply dropped.
# random_state is fixed so the forest is the same on every run.
rfc = RandomForestClassifier(n_estimators = 100, max_depth = 15, criterion = "entropy",
                               min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt',
                               max_leaf_nodes=None, min_impurity_decrease=0.0,
                               bootstrap=True, oob_score=False, n_jobs=None, random_state=42, verbose=0,
                               warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

death_event_prediction("Random Forest", rfc,train_X,test_X,train_y,test_y, plot=True)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier: 100 boosting rounds, learning rate 0.2, trees of depth <= 16
xgc = XGBClassifier(
    objective='binary:logistic',
    booster='gbtree',
    base_score=0.5,
    n_estimators=100,
    learning_rate=0.2,
    max_depth=16,
    min_child_weight=1,
    max_delta_step=0,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    missing=None,
    random_state=0,
)

death_event_prediction(
    "XGBoost",
    xgc,
    train_X,
    test_X,
    train_y,
    test_y,
    plot=True,
)

In [None]:
import math
# Side-by-side confusion matrices for every model stored in conf_matrix_all.
fig, axes = plt.subplots(2,3, figsize = (20, 12))

# Quadrant names for a matrix that has been flipped to [[TP, FN], [FP, TN]]
quadrant_names = np.array([['\nTP','\nFN'],['\nFP','\nTN']])

# zip stops at the shorter of the two sequences: with fewer stored models than
# panels the remaining panels stay empty, and models beyond the sixth are not
# drawn. No exception handling is needed, so genuine plotting errors surface
# instead of being silently swallowed.
for ax, (model_name, matrix) in zip(axes.flat, conf_matrix_all.items()):
    conf_matrix = np.flip(matrix)
    labels = np.char.add(conf_matrix.astype(str), quadrant_names)

    sns.heatmap(conf_matrix, fmt='', annot = labels, ax=ax, cmap="YlGnBu", xticklabels=[1, 0], yticklabels=[1, 0]);
    ax.set(title=model_name)