# Heart failure dataset

This notebook is based on the heart failure dataset, and is my second notebook after Titanic.

My hope with this notebook is to increase my machine learning knowledge and craftsmanship, so in the spirit of Kaggle I invite you to comment if you find anything that could be improved.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

# Relative path to the heart-failure CSV; the whole file is read into `df`.
filepath = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
df = pd.read_csv(filepath_or_buffer=filepath)

# Last expression of the cell: shows which seaborn version is in use.
sns.__version__

# Looking at the data

In [None]:
# The entire dataframe
df

In [None]:
# The column headings (the features plus the DEATH_EVENT target)
df.columns

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Summary statistics for each numeric column
df.describe()

Time ranges from 4 to 285; I guess it is counted in days.

In [None]:
# Any missing values? Count the NaN entries in each column.
df.isna().sum()

Nope, no missing data.

In [None]:
# How many?
# Each of these columns is a 0/1 flag, so summing it counts the rows where the flag is 1.
# NOTE(review): the dataset documentation codes `sex` as 1 = male, 0 = female, so
# `df.sex.sum()` is the number of MALE patients; the original label said "female".
# Confirm against the data dictionary.

print(f'Total number of patients: {len(df)}')
print(f'Number of patients with anaemia: {df.anaemia.sum()}')
print(f'Number of patients with diabetes: {df.diabetes.sum()}')
print(f'Number of patients with high blood pressure: {df.high_blood_pressure.sum()}')
print(f'Number of female patients: {(df.sex == 0).sum()}')
print(f'Number of male patients: {df.sex.sum()}')
print(f'Number of smokers: {df.smoking.sum()}')
print(f'Number of deaths or exits: {df.DEATH_EVENT.sum()}')

### Displaying the data in graphs

In [None]:
# first displaying the numerical data.

# Figure 1: three histograms followed by an age vs. ejection-fraction scatter plot.
f, axs = plt.subplots(ncols=4, figsize=(15, 6))

for panel, column in zip(axs[:3], ("age", "creatinine_phosphokinase", "ejection_fraction")):
    sns.histplot(data=df, x=column, ax=panel)
sns.scatterplot(data=df, x="age", y="ejection_fraction", ax=axs[3])
f.subplots_adjust(wspace=0.4)

# Figure 2: one density plot per variable, split by DEATH_EVENT.
# `axs.flat` walks the 3x2 grid row by row, which is the order the columns are listed in.
f, axs = plt.subplots(3, 2, figsize=(15, 12))

kde_columns = (
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'age',
)
for panel, column in zip(axs.flat, kde_columns):
    sns.kdeplot(data=df, x=column, hue="DEATH_EVENT", ax=panel)

# The cell's displayed value is the last Axes that was drawn on.
axs[2, 1]

Age, ejection fraction, serum creatinine and serum sodium seem to be distributed differently for patients with a death event compared to the other patients.

In [None]:
# Plotting as box plots

f, axs = plt.subplots(nrows=3, ncols=2, figsize=(17, 12))

# Grid position (row, column) -> variable drawn there, split by DEATH_EVENT.
boxplot_layout = {
    (0, 0): 'serum_creatinine',
    (0, 1): 'creatinine_phosphokinase',
    (1, 1): 'platelets',
    (1, 0): 'serum_sodium',
    (2, 0): 'age',
    (2, 1): 'ejection_fraction',
}
for position, variable in boxplot_layout.items():
    sns.boxplot(data=df, x='DEATH_EVENT', y=variable, ax=axs[position])

f.subplots_adjust(wspace=0.2, hspace=0.3)

It seems that age and ejection fraction are different for patients who experienced an event.
I also notice outliers.

In [None]:
# Then lets look at the nominal data
# Barplot and interpretation
sns.barplot(x='high_blood_pressure', y='DEATH_EVENT', data=df)

# DEATH_EVENT is summed as a 0/1 flag elsewhere in this notebook, so its mean within a
# group equals events / group size, i.e. the share of that group that had an event.
event_rate_by_bp = df.groupby('high_blood_pressure')['DEATH_EVENT'].mean() * 100

# high_blood_pressure == 0 means "no high blood pressure", not "low blood pressure".
print('Percentage of patients without high blood pressure who had an event:', round(event_rate_by_bp.loc[0]))
print('Percentage of patients with high blood pressure who had an event:', round(event_rate_by_bp.loc[1]))

High blood pressure seems to correlate with risk of event.

In [None]:
# More plots!

f, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 9))

# Grid position (row, column) -> binary column whose mean DEATH_EVENT is plotted there.
barplot_layout = {
    (0, 0): 'smoking',
    (1, 0): 'sex',
    (1, 1): 'anaemia',
    (0, 1): 'diabetes',
}
for position, flag in barplot_layout.items():
    sns.barplot(x=flag, y='DEATH_EVENT', data=df, ax=axs[position])

# The cell's displayed value is the last Axes that was drawn on.
axs[0, 1]

Anaemia might also be related to events.

In [None]:
# Another way of creating plots.
# In these plots age is used as a reference.

binary_columns = ['anaemia', 'high_blood_pressure', 'smoking', 'diabetes']

# One separate figure per binary column: age distribution per flag value, split by outcome.
for flag in binary_columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=flag, y='age', hue='DEATH_EVENT', data=df)
    plt.show()

It is interesting that in the bar plots above smoking does not seem to be an important feature, but when plotted in relation to age it does.
Also, one might think that smoking is bad for your heart, but it seems to be the opposite in this dataset.

It is also interesting that diabetes seems to cancel out the effect of age on death events.
The data is complex; maybe there is a need for combined features?

In [None]:
# lets see if scatter plots can reveal anything.

f, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 9))

# Age against each variable, coloured by outcome; `axs.flat` fills the grid row by row.
scatter_columns = ('serum_creatinine', 'serum_sodium', 'creatinine_phosphokinase', 'platelets')
for panel, column in zip(axs.flat, scatter_columns):
    sns.scatterplot(data=df, x=column, y='age', hue='DEATH_EVENT', ax=panel)

# These two calls take no `ax`, so they are not drawn into the 2x2 grid above.
sns.lmplot(x='serum_sodium', y='age', hue='DEATH_EVENT', data=df)
sns.lmplot(x='time', y='age', hue='DEATH_EVENT', data=df)

In the last lmplot one can see that older patients are more likely to have an early event.
I cannot find anything very interesting in the other plots.

In [None]:
# One last scatter plot, just because it looks nice in 3D.

fig = plt.figure(figsize=(17, 12))
ax = fig.add_subplot(111, projection='3d')

x = df['age']
y = df['serum_creatinine']
z = df['ejection_fraction']

ax.set_xlabel("Age")
ax.set_ylabel("Serum creatinine")
ax.set_zlabel("Ejection fraction")  # spelling fixed (was "Ejektion")

# Point colour encodes DEATH_EVENT.
scatter = ax.scatter(x, y, z, c=df['DEATH_EVENT'], marker='o')

# Build the legend from the distinct colour values used by the scatter.
legend = ax.legend(*scatter.legend_elements(),
                   loc="upper right", title="Death events")
ax.add_artist(legend)

plt.show()

One might be able to see that patients at high age are more likely to have an event, similar for patients with low ejection fractions.

In [None]:
# Now lets see how the features relate to death events.

# One-column frame: correlation of every column with DEATH_EVENT, indexed by column name.
v = df.corr()['DEATH_EVENT'].to_frame()
plt.figure(figsize=(20, 12))
sns.barplot(data=v, x='DEATH_EVENT', y=v.index)

It looks like age, ejection fraction, serum creatinine, serum sodium and time are the features that correlate the most with death events.

In [None]:
# Based on the above figure I am
# setting a negative threshold of -0.2 and a positive of 0.2.

df_corr = df.corr()

# Boolean mask over the columns: True where |correlation with DEATH_EVENT| exceeds 0.2.
# (Equivalent to "> 0.2 or < -0.2"; previously the mask was wrapped in a one-element list.)
df_corr_true = df_corr['DEATH_EVENT'].abs() > 0.2

# Show the correlations that pass the threshold. DEATH_EVENT correlates perfectly
# with itself, so it is dropped from the listing.
print(df_corr.loc[df_corr_true, 'DEATH_EVENT'].drop('DEATH_EVENT', errors='ignore'))

Age, ejection fraction, serum creatinine and time are the true correlators...

In [None]:
# Another way of displaying this is a heatmap.

# Full pairwise correlation matrix, with the coefficient written in every cell.
plt.figure(figsize=(13, 9))
sns.heatmap(data=df.corr(), annot=True)

In [None]:
# Based on finding in the previous barplots i would like to 
# create a new column that combines smoking, diabetes and sex.
# Hopefully this could make some of the nominal data usefull to the model.

def sds(row):
    """Return the combined score ``1 + smoking - diabetes + sex``.

    Args:
        row: anything indexable by the keys 'smoking', 'diabetes' and 'sex'.
            This can be a single patient row or the whole DataFrame, in which
            case the arithmetic is applied column-wise and a Series is returned.

    Returns:
        The score for the row, or a Series of scores when given a DataFrame.
    """
    return 1 + row['smoking'] - row['diabetes'] + row['sex']


# Compute the score for all rows at once instead of a row-by-row `apply`, and keep
# the result under its own name so the function `sds` is not overwritten.
sds_scores = sds(df)

# Put the new column first. `df.insert` raises if the column already exists, so
# re-running this cell overwrites the column instead of inserting it again.
if 'sds' in df.columns:
    df['sds'] = sds_scores
else:
    df.insert(0, 'sds', sds_scores)

# Display only: the categorical view is shown as the cell output but not stored in df.
df['sds'].astype('category')

In [None]:
# Column headings again, now including the 'sds' column inserted at position 0.
df.columns

In [None]:
# Age distribution for each sds score, split by outcome.
fig, ax = plt.subplots(figsize=(12, 6))
# The freshly created axes is the current one, so passing it explicitly draws in the same place.
ax = sns.boxplot(x='sds', y='age', hue='DEATH_EVENT', data=df, ax=ax)

Combined with age, it looks like the new feature sds is useful. People with a high sds score are more likely to survive (die later).

# Feature importance
Let's see which features a random forest considers important, and plot that information.

In [None]:
from sklearn import inspection
from sklearn.ensemble import RandomForestClassifier

# Every column except the target is used as a feature.
X = df.drop(columns='DEATH_EVENT')
y = df['DEATH_EVENT']

# Fixed seed so the importances and plots are the same on every run.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(X, y)

print(f'\n Feature importance: {randomforest.feature_importances_}')
feat_importances = pd.Series(randomforest.feature_importances_, index=X.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()

fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Random forest")

# Plot every feature the model was trained on, rather than a hard-coded list
# that must be kept in sync with the columns of X.
pdp_features = list(X.columns)

# `plot_partial_dependence` was removed from newer scikit-learn in favour of
# `PartialDependenceDisplay.from_estimator`; use whichever this version provides.
_pdp_display = getattr(inspection, "PartialDependenceDisplay", None)
if hasattr(_pdp_display, "from_estimator"):
    tree_disp = _pdp_display.from_estimator(randomforest, X, pdp_features, ax=ax)
else:
    tree_disp = inspection.plot_partial_dependence(randomforest, X, pdp_features, ax=ax)
fig.subplots_adjust(wspace=0.2, hspace=0.6)

## The new feature?

Once again age, ejection fraction, serum creatinine and time win!

It does not look like my newly created feature sds is considered important, too bad. At least it did better than the single binary features.

Maybe I was misled in the earlier cell where I plotted the columns with binary values with age on the y-axis;
let's try it with time on the y-axis.

In [None]:
# Same plots as before for the binary columns, but with `time` on the y-axis.
for flag in binary_columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=flag, y='time', hue='DEATH_EVENT', data=df)
    plt.show()

Indeed, in these plots the four features do not seem important.

# Now lets make some models

In the next steps i copied a lot from this notebook on Titanic:
https://www.kaggle.com/nadintamer/titanic-survival-predictions-beginner

And this notebook:
https://www.kaggle.com/codeblogger/step-by-step-decision-tree-classifier-98-34

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# The four features selected in the analysis above.
features = ['age','ejection_fraction','serum_creatinine','time']
predictors = df[features]

target = df["DEATH_EVENT"]
X_train, X_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.22, random_state = 0)

# Standardization of the data.
# The scaler is fitted on the training set only and then reused for the validation
# set. Re-fitting it on the validation data (`fit_transform`) would scale the two
# sets with different means and variances and leak validation statistics.
# NOTE: `sc` is rebound to a StackingClassifier further down in this notebook.

sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)


In [None]:
# define model_creator

def model_creator(model, best_parameter_grid=None):
    """Build, fit and score a classifier on the notebook's train/validation split.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    and prints the validation accuracy (in percent) and F1 score.

    Args:
        model: estimator class (not an instance); it is called to build the model.
        best_parameter_grid: mapping of constructor keyword arguments. Any falsy
            value (``None``, ``{}``, ``[]``) builds the model with its defaults.

    Returns:
        The fitted estimator.
    """
    # Imported here so the function does not depend on an import made in a later cell.
    from sklearn.metrics import accuracy_score, f1_score

    f = model(**best_parameter_grid) if best_parameter_grid else model()

    f.fit(X_train, y_train)
    y_pred = f.predict(X_val)

    # Both metrics take (y_true, y_pred).
    print('Accuracy:', round(accuracy_score(y_val, y_pred)*100, 2))
    print('F1 score:', round(f1_score(y_val, y_pred), 2))
    return f

I have chosen to use 4 models:
    Random Forest Classifier,
    Gradient Boosting Classifier,
    DecisionTreeClassifier and
    Ada Boost Classifier.

This choice is mainly based on what I have seen other people do.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, accuracy_score


# Baseline random forest: the empty grid makes model_creator call the class with no arguments.
rfc=model_creator(RandomForestClassifier, best_parameter_grid={})

In [None]:
# Making grid search on Random Forest Classifier

from sklearn.model_selection import GridSearchCV

# reduced search
# 'auto' is no longer listed under max_features: for RandomForestClassifier it was
# an alias of 'sqrt' (so it only repeated the same fits) and newer scikit-learn
# versions reject it.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 6, 8],
    'criterion': ['gini', 'entropy'],
}

# 4-fold cross-validated search over every combination in the grid.
CV_randomforest = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=4)
CV_randomforest.fit(X_train, y_train)

print('Best parameters:', CV_randomforest.best_params_)

In [None]:
# Random forest rebuilt with the best parameters found by the grid search.
rfc_2=model_creator(RandomForestClassifier, best_parameter_grid=CV_randomforest.best_params_)

In [None]:
# Show the full parameter set of the tuned forest.
rfc_2.get_params()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient boosting with default settings. The empty grid is passed as a
# dict (keyword arguments), like the random forest call, rather than as a list.
gbc=model_creator(GradientBoostingClassifier, best_parameter_grid={})

In [None]:
# Making grid search for GradientboostingClassifier
#
# Two entries of the original grid use values that scikit-learn has deprecated and
# later removed:
#   * "loss": ["deviance"] - this was the classifier's default loss, so the key is
#     simply left out and the default is used.
#   * "criterion": "mae"   - dropped; only "friedman_mse" is searched.

param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 3),
    "min_samples_leaf": np.linspace(0.1, 0.5, 3),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse"],
    "subsample":[0.5, 0.8, 0.95, 1.0],
    "n_estimators":[10]
    }

# 4-fold cross-validated search, using all available cores.
CV_gbc = GridSearchCV(estimator=gbc, param_grid=param_grid, cv=4, n_jobs=-1)
CV_gbc.fit(X_train, y_train)

print('Best parameters:',CV_gbc.best_params_)

In [None]:
# Gradient boosting rebuilt with the best parameters found by the grid search.
gbc_2=model_creator(GradientBoostingClassifier, CV_gbc.best_params_)

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default settings. The empty grid is passed as a
# dict (keyword arguments), like the random forest call, rather than as a list.
dtc=model_creator(DecisionTreeClassifier, best_parameter_grid={})

In [None]:
# Search space for the decision tree; the value order inside each list is kept as it was.
param_grid = dict(
    max_depth=[1, 5, 10],
    min_samples_split=[0.001, 0.01, 0.1, 0.2, 0.02, 0.002],
    criterion=["gini", "entropy"],
    max_leaf_nodes=[2, 5, 10],
    class_weight=["balanced", None],
)

# 4-fold cross-validated grid search; `fit` returns the search object itself.
CV_decisiontree = GridSearchCV(dtc, param_grid, cv=4).fit(X_train, y_train)

print('Best parameters:', CV_decisiontree.best_params_)

In [None]:
# Decision tree rebuilt with the best parameters found by the grid search.
dtc_2 = model_creator(DecisionTreeClassifier, CV_decisiontree.best_params_)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings. The empty grid is passed as a dict (keyword
# arguments), like the random forest call, rather than as a list.
abc=model_creator(AdaBoostClassifier, best_parameter_grid={})

In [None]:
# Compute ROC curve and ROC area for each class and making it a function
# This part is from: https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python

def ROC_Curve(model):
    """Plot the ROC curve of a fitted classifier on the validation set.

    Reads the notebook globals ``X_val`` and ``y_val``. The AUC is shown in the
    legend and the diagonal is drawn as a dashed red reference line.

    Args:
        model: fitted classifier exposing ``predict_proba``.
    """
    # Imported here so the function also works before the cell that imports
    # these names at notebook level has been run.
    from sklearn.metrics import auc, roc_curve

    # Column 1 of predict_proba is used as the score of the positive class.
    preds = model.predict_proba(X_val)[:, 1]

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_val, preds)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(10,10))
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    # Limits sit slightly outside [0, 1] so the curve is not clipped at the edges.
    plt.xlim([-0.001, 1])
    plt.ylim([0, 1.001])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    
#Compute Confusion Matrix and making it a function

def Confusion_matrix(model, title=None):
    """Plot the validation-set confusion matrix of a fitted classifier.

    Reads the notebook globals ``X_val`` and ``y_val``.

    Args:
        model: fitted classifier exposing ``predict``.
        title: plot title. Defaults to one built from the model's class name, so
            plots for different models are no longer all labelled as the
            stacking classifier.
    """
    if title is None:
        title = f"{type(model).__name__} Confusion Matrix (Number)"

    # Called as confusion_matrix(y_true, y_pred): rows are the actual classes,
    # columns the predicted ones.
    cm = confusion_matrix(y_val, model.predict(X_val))
    plt.figure(figsize=(6,6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["FALSE","TRUE"],
                yticklabels=["FALSE","TRUE"],
                cbar=False)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.show()

## Stacking Classifier

Let's see if a stacking classifier performs better by combining the four models. 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc

# Base learners for the stack: the three tuned models plus the default AdaBoost.
estimators = [
    ('rfc_2', rfc_2),
    ('gbc_2', gbc_2),
    ('dtc_2', dtc_2),
    ('abc', abc)
]

# NOTE: this rebinds `sc`, which held the StandardScaler earlier in the notebook.
sc = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

sc.fit(X_train, y_train)
sc_y_pred = sc.predict(X_val)
# Column 1 of predict_proba is used as the positive-class score for the AUC.
sc_y_score = sc.predict_proba(X_val)[:, 1]

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# classification report exchanges precision and recall, and the AUC is computed
# against the wrong reference. The AUC also needs scores, not hard 0/1 labels.
print('Accuracy:', round(accuracy_score(y_val, sc_y_pred) * 100, 2))
print('F1 score:', round(f1_score(y_val, sc_y_pred),2))
print('Roc auc score',round(roc_auc_score(y_val, sc_y_score),2))
print("\n",classification_report(y_val, sc_y_pred))

Confusion_matrix(sc)
ROC_Curve(sc)

# End

So this (above) is the final model, although this stacking classifier is not much better than the previous models.

Please leave a comment if you find anything that could be improved.

Thanks for reading through :).

# Appendix

From here on it does not count, but to me it was interesting to see that if I refit a model in a loop, my accuracy could increase significantly. I know that it does not make my model better, it just had more luck on this validation set, but it is funny when this is the way Kaggle judges models.

In [None]:
# Same four base learners as in the stacking cell, as (name, estimator) pairs.
estimators = list(zip(
    ('rfc_2', 'gbc_2', 'dtc_2', 'abc'),
    (rfc_2, gbc_2, dtc_2, abc),
))

# Refit the stack 40 times and record the validation accuracy (percent) of each run.
sc_acc_list = []

for i in range(40):
    sc = StackingClassifier(final_estimator=LogisticRegression(), estimators=estimators)
    sc_y_pred = sc.fit(X_train, y_train).predict(X_val)
    run_accuracy = accuracy_score(sc_y_pred, y_val) * 100
    sc_acc_list.append(round(run_accuracy, 2))

print('Max accuracy:', max(sc_acc_list))
print('Min accuracy:', min(sc_acc_list))

In [None]:
# Refit the random forest until one fit reaches the accuracy threshold on the
# validation set. Pass 1 requires >= 89 %, pass 2 (only if pass 1 found nothing)
# requires >= 87 %; each pass tries at most 500 fits.
# `rfc_loop` ends up as the number of the pass that succeeded, or 0 if none did.
rfc_loop = 0

for pass_number, min_accuracy in enumerate((89, 87), start=1):
    for i in range(500):
        rfc.fit(X_train, y_train)
        rfc_y_pred = rfc.predict(X_val)
        acc_rfc = round(accuracy_score(y_val, rfc_y_pred) * 100, 2)

        if acc_rfc >= min_accuracy:
            rfc_loop = pass_number
            print('Model:',i,' Accuracy',acc_rfc)
            print('Model:',i,' F1 score:',round(f1_score(y_val , rfc_y_pred),2))
            # classification_report takes (y_true, y_pred); swapping them
            # exchanges precision and recall in the report.
            print("\n",classification_report(y_val, rfc_y_pred))
            Confusion_matrix(rfc)
            ROC_Curve(rfc)
            break

    if rfc_loop:
        # A good enough fit was found in this pass; skip the looser threshold.
        break

if rfc_loop == 0:
    print('No rfc model is that good')
else:
    print('Loop number: ', rfc_loop)

In [None]:
# Refit the gradient boosting model until one fit exceeds the accuracy threshold
# on the validation set. Pass 1 requires > 0.86, pass 2 (only if pass 1 found
# nothing) requires > 0.84; each pass tries at most 500 fits.
# `gbc_loop` ends up as the number of the pass that succeeded, or 0 if none did.
gbc_loop = 0

for pass_number, min_accuracy in enumerate((0.86, 0.84), start=1):
    for i in range(500):
        gbc.fit(X_train, y_train)
        gbc_y_pred = gbc.predict(X_val)
        gbc_acc = accuracy_score(y_val, gbc_y_pred)

        if gbc_acc > min_accuracy:
            gbc_loop = pass_number
            print(f'Model:{i}, Accuracy: {round(gbc_acc*100, 2)}')
            print(f'Model:{i}, F1 score: {round(f1_score(y_val , gbc_y_pred), 2)}')
            # classification_report takes (y_true, y_pred); swapping them
            # exchanges precision and recall in the report.
            print("\n",classification_report(y_val, gbc_y_pred))
            Confusion_matrix(gbc)
            ROC_Curve(gbc)
            break

    if gbc_loop:
        # A good enough fit was found in this pass; skip the looser threshold.
        break

if gbc_loop == 0:
    print('No gbc model is that good')
else:
    print('Loop number: ', gbc_loop)