# Heart Failure Predictions

Cardiovascular diseases (CVDs) are the leading cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worldwide.
Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.
Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, unhealthy diet and obesity, physical inactivity and harmful use of alcohol using population-wide strategies.
People with cardiovascular disease or who are at high cardiovascular risk, due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or already established disease, need early detection and management wherein a machine learning model can be of great help.


## Working environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session/'

In [None]:
# List of libraries used in this project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from mlxtend.plotting import plot_confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Colors and style settings
# Global seaborn appearance: 'ticks' axes style and the 'Set1' colour palette.
sns.set_style(style='ticks')
sns.set_palette(palette="Set1")

In [None]:
# Load the heart-failure clinical records from the Kaggle input directory.
dataset = pd.read_csv(
    filepath_or_buffer='../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv',
)

## Exploratory analysis

In [None]:
# Preview the first five rows.
dataset.head(n=5)

In [None]:
# Show the (rows, columns) shape, then the column labels, one per line.
print(dataset.shape, dataset.columns, sep='\n')

In [None]:
# Flag the columns stored with the generic 'object' dtype and list their names.
s = dataset.dtypes.eq('object')
object_cols = s[s].index.tolist()
print('Categorial columns :', object_cols)

### Cleaning

Missing values

In [None]:
# Fraction of missing values in each column.
dataset.isna().mean()

Duplicated values

In [None]:
# Number of rows that repeat an earlier row exactly.
dataset.duplicated(keep='first').sum()

### Descriptive analysis

#### Quantitative data representations

In [None]:
# One boxplot per quantitative variable, laid out on a 3x3 grid.
quantitative_columns = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

plt.figure(figsize=(40, 40))

for position, column in enumerate(quantitative_columns, start=1):
    plt.subplot(3, 3, position)
    # The variable is passed as x= by keyword: recent seaborn releases only
    # accept `data` positionally, so a bare column name is no longer valid.
    sns.boxplot(x=column, data=dataset)

plt.tight_layout()

plt.show()

The boxplots reveal two outliers in ejection_fraction.

In [None]:
# Boolean mask of the rows whose ejection_fraction is 70 or more, then display them.
outliers = dataset['ejection_fraction'].ge(70)
dataset.loc[outliers]

In [None]:
# Keep only the rows that were not flagged by the outlier mask.
dataset = dataset.loc[~outliers]

In [None]:
# Redraw the boxplots of every quantitative variable on the filtered dataset.
quantitative_columns = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

plt.figure(figsize=(40, 40))

for position, column in enumerate(quantitative_columns, start=1):
    plt.subplot(3, 3, position)
    # The variable is passed as x= by keyword: recent seaborn releases only
    # accept `data` positionally, so a bare column name is no longer valid.
    sns.boxplot(x=column, data=dataset)

plt.tight_layout()

plt.show()

#### Categorical data

In [None]:
# Count of patients per category, split by DEATH_EVENT, for each binary variable.
categorical_columns = ['sex', 'anaemia', 'high_blood_pressure', 'smoking', 'diabetes']

plt.figure(figsize=(15, 15))

for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(2, 3, position)
    # The variable is passed as x= by keyword: recent seaborn releases only
    # accept `data` positionally, so a bare column name is no longer valid.
    sns.countplot(x=column, hue='DEATH_EVENT', data=dataset)
    plt.legend()

plt.tight_layout()

plt.show()

### Bivariate analysis

#### Categorical data and correlations

In [None]:
# Contingency table of sex versus DEATH_EVENT, with a 'Total' row and column.
X, Y = 'sex', 'DEATH_EVENT'
cont = pd.pivot_table(
    dataset[[X, Y]],
    index=X,
    columns=Y,
    aggfunc=len,
    margins=True,
    margins_name='Total',
)

In [None]:
# Expected counts under independence: (row totals x column totals) / sample size.
n = len(dataset)
tx = cont.loc[:, ["Total"]]
ty = cont.loc[["Total"], :]
indep = (tx @ ty) / n

# Contribution of each cell to the chi-squared statistic, then its share of the total.
c = cont.fillna(0)
measure = (c - indep).pow(2).div(indep)
xi_n = measure.sum().sum()
table = measure.div(xi_n)

# Colour encodes each cell's share of xi_n; annotations show the observed counts.
# The trailing 'Total' row and column are left out of the plot.
plt.figure(figsize=(10, 10))
sns.heatmap(table.iloc[:-1, :-1], annot=c.iloc[:-1, :-1])
plt.title('Contingency table')
plt.xlabel('DEATH_EVENT')
plt.ylabel('sex')
plt.yticks(rotation=360)

plt.show()

In [None]:
print('xi_n : ', xi_n)

# Run the independence test on the observed counts only. The 'Total' row and
# column must be stripped first: leaving them in does not change the statistic
# but inflates the degrees of freedom, which makes the p-value wrong.
# Empty cells are counted as 0. correction=False keeps the statistic directly
# comparable with the hand-computed xi_n printed above.
observed = cont.fillna(0).iloc[:-1, :-1]
st_chi2, st_p, st_dof, st_exp = st.chi2_contingency(observed, correction=False)
print('chi-squared :', st_chi2)
print('p-value :', st_p)

In [None]:
# Contingency table of smoking versus DEATH_EVENT, with a 'Total' row and column.
X, Y = 'smoking', 'DEATH_EVENT'
cont = pd.pivot_table(
    dataset[[X, Y]],
    index=X,
    columns=Y,
    aggfunc=len,
    margins=True,
    margins_name='Total',
)

In [None]:
# Expected counts under independence: (row totals x column totals) / sample size.
n = len(dataset)
tx = cont.loc[:, ["Total"]]
ty = cont.loc[["Total"], :]
indep = (tx @ ty) / n

# Contribution of each cell to the chi-squared statistic, then its share of the total.
c = cont.fillna(0)
measure = (c - indep).pow(2).div(indep)
xi_n = measure.sum().sum()
table = measure.div(xi_n)

# Colour encodes each cell's share of xi_n; annotations show the observed counts.
# The trailing 'Total' row and column are left out of the plot.
plt.figure(figsize=(10, 10))
sns.heatmap(table.iloc[:-1, :-1], annot=c.iloc[:-1, :-1])
plt.title('Contingency table')
plt.xlabel('DEATH_EVENT')
plt.ylabel('smoking')
plt.yticks(rotation=360)

plt.show()

In [None]:
print('xi_n : ', xi_n)

# Run the independence test on the observed counts only. The 'Total' row and
# column must be stripped first: leaving them in does not change the statistic
# but inflates the degrees of freedom, which makes the p-value wrong.
# Empty cells are counted as 0. correction=False keeps the statistic directly
# comparable with the hand-computed xi_n printed above.
observed = cont.fillna(0).iloc[:-1, :-1]
st_chi2, st_p, st_dof, st_exp = st.chi2_contingency(observed, correction=False)
print('chi-squared :', st_chi2)
print('p-value :', st_p)

In [None]:
# Contingency table of anaemia versus DEATH_EVENT, with a 'Total' row and column.
X, Y = 'anaemia', 'DEATH_EVENT'
cont = pd.pivot_table(
    dataset[[X, Y]],
    index=X,
    columns=Y,
    aggfunc=len,
    margins=True,
    margins_name='Total',
)

In [None]:
# Expected counts under independence: (row totals x column totals) / sample size.
n = len(dataset)
tx = cont.loc[:, ["Total"]]
ty = cont.loc[["Total"], :]
indep = (tx @ ty) / n

# Contribution of each cell to the chi-squared statistic, then its share of the total.
c = cont.fillna(0)
measure = (c - indep).pow(2).div(indep)
xi_n = measure.sum().sum()
table = measure.div(xi_n)

# Colour encodes each cell's share of xi_n; annotations show the observed counts.
# The trailing 'Total' row and column are left out of the plot.
plt.figure(figsize=(10, 10))
sns.heatmap(table.iloc[:-1, :-1], annot=c.iloc[:-1, :-1])
plt.title('Contingency table')
plt.xlabel('DEATH_EVENT')
plt.ylabel('anaemia')
plt.yticks(rotation=360)

plt.show()

In [None]:
print('xi_n : ', xi_n)

# Run the independence test on the observed counts only. The 'Total' row and
# column must be stripped first: leaving them in does not change the statistic
# but inflates the degrees of freedom, which makes the p-value wrong.
# Empty cells are counted as 0. correction=False keeps the statistic directly
# comparable with the hand-computed xi_n printed above.
observed = cont.fillna(0).iloc[:-1, :-1]
st_chi2, st_p, st_dof, st_exp = st.chi2_contingency(observed, correction=False)
print('chi-squared :', st_chi2)
print('p-value :', st_p)

In [None]:
# Contingency table of high_blood_pressure versus DEATH_EVENT, with a 'Total' row and column.
X, Y = 'high_blood_pressure', 'DEATH_EVENT'
cont = pd.pivot_table(
    dataset[[X, Y]],
    index=X,
    columns=Y,
    aggfunc=len,
    margins=True,
    margins_name='Total',
)

In [None]:
# Expected counts under independence: (row totals x column totals) / sample size.
n = len(dataset)
tx = cont.loc[:, ["Total"]]
ty = cont.loc[["Total"], :]
indep = (tx @ ty) / n

# Contribution of each cell to the chi-squared statistic, then its share of the total.
c = cont.fillna(0)
measure = (c - indep).pow(2).div(indep)
xi_n = measure.sum().sum()
table = measure.div(xi_n)

# Colour encodes each cell's share of xi_n; annotations show the observed counts.
# The trailing 'Total' row and column are left out of the plot.
plt.figure(figsize=(10, 10))
sns.heatmap(table.iloc[:-1, :-1], annot=c.iloc[:-1, :-1])
plt.title('Contingency table')
plt.xlabel('DEATH_EVENT')
plt.ylabel('high_blood_pressure')
plt.yticks(rotation=360)

plt.show()

In [None]:
print('xi_n : ', xi_n)

# Run the independence test on the observed counts only. The 'Total' row and
# column must be stripped first: leaving them in does not change the statistic
# but inflates the degrees of freedom, which makes the p-value wrong.
# Empty cells are counted as 0. correction=False keeps the statistic directly
# comparable with the hand-computed xi_n printed above.
observed = cont.fillna(0).iloc[:-1, :-1]
st_chi2, st_p, st_dof, st_exp = st.chi2_contingency(observed, correction=False)
print('chi-squared :', st_chi2)
print('p-value :', st_p)

#### Correlation of quantitative data

In [None]:
# Distribution of creatinine_phosphokinase for each DEATH_EVENT value.
plt.figure(figsize=(10, 10))
sns.boxplot(data=dataset, x='DEATH_EVENT', y='creatinine_phosphokinase', ax=plt.gca())
plt.show()

In [None]:
# X is the qualitative grouping variable, Y the quantitative one.
X, Y = dataset['DEATH_EVENT'], dataset['creatinine_phosphokinase']

def eta_squared(x, y):
    """Return the correlation ratio (eta squared) of ``y`` grouped by ``x``.

    The value is the between-class sum of squares divided by the total sum of
    squares of ``y``.

    Args:
        x: pandas Series holding the qualitative (grouping) variable.
        y: pandas Series holding the quantitative variable, index-aligned
            with ``x``.

    Returns:
        The ratio as a float, or NaN when ``y`` has no variance (including the
        empty case), since the ratio is undefined there.
    """
    grand_mean = y.mean()

    # Total sum of squares, computed in one vectorized pass instead of a
    # Python-level loop over every observation.
    total_ss = np.sum(np.square(np.asarray(y, dtype=float) - grand_mean))
    if total_ss == 0:
        return float('nan')

    # Between-class sum of squares: class size times the squared distance of
    # the class mean from the overall mean.
    between_ss = 0.0
    for level in x.unique():
        members = y[x == level]
        between_ss += len(members) * (members.mean() - grand_mean) ** 2

    return between_ss / total_ss
    
# Report the correlation ratio for the pair selected above.
print('eta-squared :', eta_squared(x=X, y=Y))

In [None]:
# Distribution of platelets for each DEATH_EVENT value.
plt.figure(figsize=(10, 10))
sns.boxplot(data=dataset, x='DEATH_EVENT', y='platelets', ax=plt.gca())
plt.show()

In [None]:
# Correlation ratio between DEATH_EVENT (qualitative) and platelets.
X, Y = dataset['DEATH_EVENT'], dataset['platelets']

print('eta-squared :', eta_squared(x=X, y=Y))

In [None]:
# Distribution of ejection_fraction for each DEATH_EVENT value.
plt.figure(figsize=(10, 10))
sns.boxplot(data=dataset, x='DEATH_EVENT', y='ejection_fraction', ax=plt.gca())
plt.show()

In [None]:
# Correlation ratio between DEATH_EVENT (qualitative) and ejection_fraction.
X, Y = dataset['DEATH_EVENT'], dataset['ejection_fraction']

print('eta-squared :', eta_squared(x=X, y=Y))

In [None]:
# Distribution of serum_creatinine for each DEATH_EVENT value.
plt.figure(figsize=(10, 10))
sns.boxplot(data=dataset, x='DEATH_EVENT', y='serum_creatinine', ax=plt.gca())
plt.show()

In [None]:
# Correlation ratio between DEATH_EVENT (qualitative) and serum_creatinine.
X, Y = dataset['DEATH_EVENT'], dataset['serum_creatinine']

print('eta-squared :', eta_squared(x=X, y=Y))

In [None]:
# Distribution of serum_sodium for each DEATH_EVENT value.
plt.figure(figsize=(10, 10))
sns.boxplot(data=dataset, x='DEATH_EVENT', y='serum_sodium', ax=plt.gca())
plt.show()

In [None]:
# Correlation ratio between DEATH_EVENT (qualitative) and serum_sodium.
X, Y = dataset['DEATH_EVENT'], dataset['serum_sodium']

print('eta-squared :', eta_squared(x=X, y=Y))

In [None]:
# Distribution of time for each DEATH_EVENT value.
plt.figure(figsize=(10, 10))
sns.boxplot(data=dataset, x='DEATH_EVENT', y='time', ax=plt.gca())
plt.show()

In [None]:
# Correlation ratio between DEATH_EVENT (qualitative) and time.
X, Y = dataset['DEATH_EVENT'], dataset['time']

print('eta-squared :', eta_squared(x=X, y=Y))

#### 1.3.3. Global correlation matrix

In [None]:
# Pairwise Pearson correlations between all columns, drawn as an annotated heatmap.
corrMatrix = dataset.corr(method='pearson')

plt.figure(figsize=(15, 15))
sns.heatmap(
    corrMatrix,
    annot=True,
    cmap="coolwarm",
    linewidths=.5,
    square=True,
)
plt.show()

## Machine learning and modelling

In [None]:
# Model inputs (five selected columns) and the binary target.
X = dataset.loc[:, ['age', 'ejection_fraction', 'serum_creatinine', 'serum_sodium', 'time']]
y = dataset.loc[:, 'DEATH_EVENT']

In [None]:
# Hold out 20% of the rows for evaluation; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

### 2.1. Random Forest

In [None]:
# Fit a random forest with trees capped at depth 2, then score it on the held-out split.
rf = RandomForestClassifier(max_depth=2, random_state=42)
y_pred = rf.fit(X_train, y_train).predict(X_test)

accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy : ', accuracy * 100, '%')

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

class_labels = ["Heart Not Failed", "Heart Failed"]
tick_positions = range(2)

plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Reds)
plt.title("Random Forest Model - Confusion Matrix")
plt.xticks(tick_positions, class_labels, fontsize=16)
plt.yticks(tick_positions, class_labels, fontsize=16)
plt.show()

In [None]:
# Rank the features by the importance the fitted random forest assigns to them.
importances = rf.feature_importances_
indices = np.argsort(importances)

# Importances are indexed by the columns the model was trained on, not by the
# columns of the full dataset, so the bar labels must come from X_train.
feature_names = X_train.columns

plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

### 2.2. Gradient Boosting

In [None]:
# Fit gradient boosting with trees capped at depth 2, then score it on the held-out split.
gbc = GradientBoostingClassifier(max_depth=2, random_state=42)
y_pred = gbc.fit(X_train, y_train).predict(X_test)

accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy : ', accuracy * 100, '%')

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

class_labels = ["Heart Not Failed", "Heart Failed"]
tick_positions = range(2)

plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Reds)
plt.title("Gradient Boosting Model - Confusion Matrix")
plt.xticks(tick_positions, class_labels, fontsize=16)
plt.yticks(tick_positions, class_labels, fontsize=16)
plt.show()

In [None]:
# Rank the features by the importance the fitted gradient boosting model assigns to them.
importances = gbc.feature_importances_
indices = np.argsort(importances)

# Importances are indexed by the columns the model was trained on, not by the
# columns of the full dataset, so the bar labels must come from X_train.
feature_names = X_train.columns

plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

### 2.3. AdaBoost

In [None]:
# Fit AdaBoost with 100 estimators, then score it on the held-out split.
adc = AdaBoostClassifier(n_estimators=100, random_state=42)
y_pred = adc.fit(X_train, y_train).predict(X_test)

accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', accuracy * 100, '%')

In [None]:
# Confusion matrix of the latest predictions against the held-out labels.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

class_labels = ["Heart Not Failed", "Heart Failed"]
tick_positions = range(2)

plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Reds)
plt.title("AdaBoost Model - Confusion Matrix")
plt.xticks(tick_positions, class_labels, fontsize=16)
plt.yticks(tick_positions, class_labels, fontsize=16)
plt.show()

In [None]:
# Rank the features by the importance the fitted AdaBoost model assigns to them.
importances = adc.feature_importances_
indices = np.argsort(importances)

# Importances are indexed by the columns the model was trained on, not by the
# columns of the full dataset, so the bar labels must come from X_train.
feature_names = X_train.columns

plt.figure(1)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

**Conclusions :**
- Random Forest is the method with the highest accuracy
- ejection_fraction and creatinine_phosphokinase are the most important features

References : 
- https://archive.ics.uci.edu/ml/datasets/Heart+failure+clinical+records
- https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0181001
- https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-020-1023-5
- https://www.fedecardio.org/Les-gestes-qui-sauvent/L-arret-cardiaque/larret-cardiaque-un-bilan-alarmant
- https://www.who.int/fr/news-room/fact-sheets/detail/the-top-10-causes-of-death
