In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative location of the heart-failure CSV under the ../input directory.
url = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
# Parse the CSV into the DataFrame that every later cell works on.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first ten rows of the loaded data.
df.head(n=10)

In [None]:
# Preview the last ten rows of the loaded data.
df.tail(n=10)

In [None]:
# (number of rows, number of columns) of the DataFrame.
df.shape

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Print a concise summary of the DataFrame (column dtypes and non-null counts).
df.info()

# Cleaning the data

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

In [None]:
# Box plot of every column on one shared axis, to eyeball outliers
plt.figure(figsize=(20, 15))
plt.boxplot(x=df, labels=df.columns)
plt.show()

Creatinine phosphokinase column

In [None]:
# Summary statistics of the creatinine_phosphokinase column.
df['creatinine_phosphokinase'].describe()

In [None]:
# Box plot of creatinine_phosphokinase as loaded (before capping).
plt.boxplot(x=df.creatinine_phosphokinase, labels=['Creatinine Phosphokinase'])
plt.title(label='Outliers values')

In [None]:
# Capping outliers: values beyond the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
# are pulled back to the nearest fence. No rows are removed.
q1 = df['creatinine_phosphokinase'].quantile(0.25)
q3 = df['creatinine_phosphokinase'].quantile(0.75)
iqr = q3 - q1
toprange = q3 + iqr * 1.5
botrange = q1 - iqr * 1.5
# Vectorised clip instead of a per-row .loc loop. Assigning the whole column
# lets the dtype widen to float when a fence is fractional, rather than writing
# floats element-wise into a column that may be integer-typed.
df['creatinine_phosphokinase'] = df['creatinine_phosphokinase'].clip(lower=botrange, upper=toprange)

In [None]:
# Box plot of creatinine_phosphokinase after the capping step.
plt.boxplot(x=df.creatinine_phosphokinase, labels=['Creatinine Phosphokinase'])
plt.title(label='Cleaning values')

Ejection fraction column

In [None]:
# Summary statistics of the ejection_fraction column.
df['ejection_fraction'].describe()

In [None]:
# Box plot of ejection_fraction as loaded (before capping).
plt.boxplot(x=df.ejection_fraction, labels=['Ejection fraction'])
plt.title(label='Outliers values')

In [None]:
# Capping outliers: values beyond the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
# are pulled back to the nearest fence. No rows are removed.
q1 = df['ejection_fraction'].quantile(0.25)
q3 = df['ejection_fraction'].quantile(0.75)
iqr = q3 - q1
toprange = q3 + iqr * 1.5
botrange = q1 - iqr * 1.5
# Vectorised clip instead of a per-row .loc loop. Assigning the whole column
# lets the dtype widen to float when a fence is fractional, rather than writing
# floats element-wise into a column that may be integer-typed.
df['ejection_fraction'] = df['ejection_fraction'].clip(lower=botrange, upper=toprange)

In [None]:
# Box plot of ejection_fraction after the capping step.
plt.boxplot(x=df.ejection_fraction, labels=['Ejection fraction'])
plt.title(label='Cleaning values')

Platelets column

In [None]:
# Summary statistics of the platelets column.
df['platelets'].describe()

In [None]:
# Box plot of platelets as loaded (before capping).
plt.boxplot(x=df.platelets, labels=['Platelets'])
plt.title(label='Outliers values')

In [None]:
# Capping outliers: values beyond the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
# are pulled back to the nearest fence. No rows are removed.
q1 = df['platelets'].quantile(0.25)
q3 = df['platelets'].quantile(0.75)
iqr = q3 - q1
toprange = q3 + iqr * 1.5
botrange = q1 - iqr * 1.5
# Vectorised clip instead of a per-row .loc loop. Assigning the whole column
# also avoids element-wise writes of a float fence into the existing column.
df['platelets'] = df['platelets'].clip(lower=botrange, upper=toprange)

In [None]:
# Box plot of platelets after the capping step.
plt.boxplot(x=df.platelets, labels=['Platelets'])
plt.title(label='Cleaning values')

Serum creatinine column

In [None]:
# Summary statistics of the serum_creatinine column.
df['serum_creatinine'].describe()

In [None]:
# Box plot of serum_creatinine as loaded (before capping).
plt.boxplot(x=df.serum_creatinine, labels=['Serum creatinine'])
plt.title(label='Outliers values')

In [None]:
# Capping outliers: values beyond the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
# are pulled back to the nearest fence. No rows are removed.
q1 = df['serum_creatinine'].quantile(0.25)
q3 = df['serum_creatinine'].quantile(0.75)
iqr = q3 - q1
toprange = q3 + iqr * 1.5
botrange = q1 - iqr * 1.5
# Vectorised clip instead of a per-row .loc loop. Assigning the whole column
# also avoids element-wise writes of a float fence into the existing column.
df['serum_creatinine'] = df['serum_creatinine'].clip(lower=botrange, upper=toprange)

In [None]:
# Box plot of serum_creatinine after the capping step.
plt.boxplot(x=df.serum_creatinine, labels=['Serum creatinine'])
plt.title(label='Cleaning values')

Serum sodium column

In [None]:
# Summary statistics of the serum_sodium column.
df['serum_sodium'].describe()

In [None]:
# Box plot of serum_sodium as loaded (before capping).
plt.boxplot(x=df.serum_sodium, labels=['Serum sodium'])
plt.title(label='Outliers values')

In [None]:
# Capping outliers: values beyond the Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR)
# are pulled back to the nearest fence. No rows are removed.
q1 = df['serum_sodium'].quantile(0.25)
q3 = df['serum_sodium'].quantile(0.75)
iqr = q3 - q1
toprange = q3 + iqr * 1.5
botrange = q1 - iqr * 1.5
# Vectorised clip instead of a per-row .loc loop. Assigning the whole column
# lets the dtype widen to float when a fence is fractional, rather than writing
# floats element-wise into a column that may be integer-typed.
df['serum_sodium'] = df['serum_sodium'].clip(lower=botrange, upper=toprange)

In [None]:
# Box plot of serum_sodium after the capping step.
plt.boxplot(x=df.serum_sodium, labels=['Serum sodium'])
plt.title(label='Cleaning values')

Data After Cleaning Outliers

In [None]:
# Box plot of every column on one shared axis, after the capping steps
plt.figure(figsize=(20, 15))
plt.boxplot(x=df, labels=df.columns)
plt.show()

Unnecessary Data

In [None]:
# Drop the 'time' column, which this notebook treats as unnecessary.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns='time', errors='ignore')

# Exploring the data

In [None]:
# First ten rows of the cleaned data.
df.head(n=10)

In [None]:
# Last ten rows of the cleaned data.
df.tail(n=10)

In [None]:
# Print the DataFrame summary again (column dtypes and non-null counts) after cleaning.
df.info()

In [None]:
# Summary statistics for the five columns whose outliers were capped earlier.
df[['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium']].describe()

Anaemia is a boolean column (0: non-anaemic patient, 1: anaemic patient)

In [None]:
# Number of patients in each anaemia category.
df['anaemia'].value_counts()

In [None]:
# Age summary statistics within each anaemia category
df.groupby(by='anaemia')['age'].describe()

In [None]:
# Distinct ages observed within each anaemia category
df.groupby(by='anaemia')['age'].unique()

In [None]:
# Histogram of the anaemia flag.
plt.figure(figsize=(10, 7))
plt.hist(x=df['anaemia'], histtype='bar')
plt.title(label='Anaemic Patients')
plt.show()

Diabetes is a boolean column (0: non-diabetic patient, 1: diabetic patient)

In [None]:
# Number of patients in each diabetes category.
df['diabetes'].value_counts()

In [None]:
# Age summary statistics within each diabetes category
df.groupby(by='diabetes')['age'].describe()

In [None]:
# Distinct ages observed within each diabetes category
df.groupby(by='diabetes')['age'].unique()

In [None]:
# Anaemia counts within each diabetes category
df.groupby(by='diabetes')['anaemia'].value_counts()

In [None]:
# Histogram of the diabetes flag.
plt.figure(figsize=(10, 7))
plt.hist(x=df['diabetes'], histtype='bar')
plt.title(label='Diabetic Patients')
plt.show()

High blood pressure is a boolean column (0: patient without hypertension, 1: patient with hypertension)

In [None]:
# Number of patients in each high_blood_pressure category.
df['high_blood_pressure'].value_counts()

In [None]:
# Age summary statistics within each hypertension category
df.groupby(by='high_blood_pressure')['age'].describe()

In [None]:
# Distinct ages observed within each hypertension category
df.groupby(by='high_blood_pressure')['age'].unique()

In [None]:
# Anaemia counts within each hypertension category
df.groupby(by='high_blood_pressure')['anaemia'].value_counts()

In [None]:
# Diabetes counts within each hypertension category
df.groupby(by='high_blood_pressure')['diabetes'].value_counts()

In [None]:
# Histogram of the high_blood_pressure flag.
plt.figure(figsize=(10, 7))
plt.hist(x=df['high_blood_pressure'], histtype='bar')
plt.title(label='Blood Pressure Patients')
plt.show()

Sex is a boolean column (0: female, 1: male)

In [None]:
# Number of patients of each sex.
df['sex'].value_counts()

In [None]:
# Age summary statistics for each sex
df.groupby(by='sex')['age'].describe()

In [None]:
# Distinct ages observed for each sex
df.groupby(by='sex')['age'].unique()

In [None]:
# Anaemia counts for each sex
df.groupby(by='sex')['anaemia'].value_counts()

In [None]:
# Diabetes counts for each sex
df.groupby(by='sex')['diabetes'].value_counts()

In [None]:
# Hypertension counts for each sex
df.groupby(by='sex')['high_blood_pressure'].value_counts()

In [None]:
# Histogram of the sex flag.
plt.figure(figsize=(10, 7))
plt.hist(x=df['sex'], histtype='bar')
plt.title(label='Gender of Patients')
plt.show()

Smoking is a boolean column (0: non-smoker, 1: smoker)

In [None]:
# Number of patients in each smoking category.
df['smoking'].value_counts()

In [None]:
# Age summary statistics within each smoking category
df.groupby(by='smoking')['age'].describe()

In [None]:
# Distinct ages observed within each smoking category
df.groupby(by='smoking')['age'].unique()

In [None]:
# Anaemia counts within each smoking category
df.groupby(by='smoking')['anaemia'].value_counts()

In [None]:
# Diabetes counts within each smoking category
df.groupby(by='smoking')['diabetes'].value_counts()

In [None]:
# Hypertension counts within each smoking category
df.groupby(by='smoking')['high_blood_pressure'].value_counts()

In [None]:
# Smoking counts for each sex
df.groupby(by='sex')['smoking'].value_counts()

In [None]:
# Histogram of the smoking flag.
plt.figure(figsize=(10, 7))
plt.hist(x=df['smoking'], histtype='bar')
plt.title(label='Smoker Patients')
plt.show()

Correlation of age and sex with anaemia, diabetes, high blood pressure and smoking

In [None]:
# Age: pairwise correlation with the four condition flags, drawn as an annotated heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(data=df[['age', 'anaemia', 'diabetes', 'high_blood_pressure', 'smoking']].corr(), annot=True)
plt.title(label='Correlation between age and anaemia, diabetes, high blood pressure and smoking')
plt.show()

In [None]:
# Sex: pairwise correlation with the four condition flags, drawn as an annotated heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(data=df[['sex', 'anaemia', 'diabetes', 'high_blood_pressure', 'smoking']].corr(), annot=True)
plt.title(label='Correlation between sex and anaemia, diabetes, high blood pressure and smoking', loc='left')
plt.show()

# Death cases

DEATH_EVENT is a boolean column (0: patient alive, 1: patient dead)

In [None]:
# Number of patients in each DEATH_EVENT category.
df['DEATH_EVENT'].value_counts()

In [None]:
# Age summary statistics within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['age'].describe()

In [None]:
# Pie chart of the mean age in each DEATH_EVENT group; the legend lists the mean values.
plt.figure(figsize=(10, 7))
death_age = df.groupby(by='DEATH_EVENT')['age'].mean()
myexplode = [0.1, 0]
plt.pie(
    x=death_age,
    explode=myexplode,
    labels=['Alive', 'Dead'],
    autopct='%.1f%%',
    shadow=True,
)
plt.title(label='Average Ages In Death Cases')
plt.legend(death_age, loc='upper right')
plt.show()

In [None]:
# Distinct ages observed within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['age'].unique()

In [None]:
# Anaemia counts within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['anaemia'].value_counts()

In [None]:
# Pie chart of patient counts for every (DEATH_EVENT, anaemia) combination.
plt.figure(figsize=(10, 7))
# value_counts() orders each group by descending frequency, so sort by the
# (DEATH_EVENT, anaemia) index to guarantee the wedges arrive in the fixed
# (0,0), (0,1), (1,0), (1,1) order that the hard-coded labels assume.
death_anae = df.groupby('DEATH_EVENT')['anaemia'].value_counts().sort_index()
myexplode = [0.1, 0, 0.2, 0]
plt.pie(death_anae, autopct='%.1f%%', labels=['Alive-non anaemic', 'Alive-anaemic', 'Dead-non anaemic', 'Dead-anaemic'], explode=myexplode, shadow=True)
plt.title('Anaemic Patients In Death Cases')
# The legend lists the raw counts, in the same order as the wedges.
plt.legend(death_anae, loc='upper left')
plt.show()

In [None]:
# Diabetes counts within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['diabetes'].value_counts()

In [None]:
# Pie chart of patient counts for every (DEATH_EVENT, diabetes) combination.
plt.figure(figsize=(10, 7))
# value_counts() orders each group by descending frequency, so sort by the
# (DEATH_EVENT, diabetes) index to guarantee the wedges arrive in the fixed
# (0,0), (0,1), (1,0), (1,1) order that the hard-coded labels assume.
death_diab = df.groupby('DEATH_EVENT')['diabetes'].value_counts().sort_index()
myexplode = [0.1, 0, 0.2, 0]
plt.pie(death_diab, autopct='%.1f%%', labels=['Alive-non diabetic', 'Alive-diabetic', 'Dead-non diabetic', 'Dead-diabetic'], explode=myexplode, shadow=True)
plt.title('Diabetic Patients In Death Cases')
# The legend lists the raw counts, in the same order as the wedges.
plt.legend(death_diab, loc='upper left')
plt.show()

In [None]:
# Hypertension counts within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['high_blood_pressure'].value_counts()

In [None]:
# Pie chart of patient counts for every (DEATH_EVENT, high_blood_pressure) combination.
plt.figure(figsize=(10, 7))
# value_counts() orders each group by descending frequency, so sort by the
# (DEATH_EVENT, high_blood_pressure) index to guarantee the wedges arrive in
# the fixed (0,0), (0,1), (1,0), (1,1) order that the hard-coded labels assume.
death_hbp = df.groupby('DEATH_EVENT')['high_blood_pressure'].value_counts().sort_index()
myexplode = [0.1, 0, 0.2, 0]
plt.pie(death_hbp, autopct='%.1f%%', labels=['Alive-normal press', 'Alive-high press', 'Dead-normal press', 'Dead-high press'], explode=myexplode, shadow=True)
plt.title('Blood Pressure Patients In Death Cases')
# The legend lists the raw counts, in the same order as the wedges.
plt.legend(death_hbp, loc='upper left')
plt.show()

In [None]:
# Smoking counts within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['smoking'].value_counts()

In [None]:
# Pie chart of patient counts for every (DEATH_EVENT, smoking) combination.
plt.figure(figsize=(10, 7))
# value_counts() orders each group by descending frequency, so sort by the
# (DEATH_EVENT, smoking) index to guarantee the wedges arrive in the fixed
# (0,0), (0,1), (1,0), (1,1) order that the hard-coded labels assume.
death_smok = df.groupby('DEATH_EVENT')['smoking'].value_counts().sort_index()
myexplode = [0.1, 0, 0.2, 0]
plt.pie(death_smok, autopct='%.1f%%', labels=['Alive-non smoker', 'Alive-smoker', 'Dead-non smoker', 'Dead-smoker'], explode=myexplode, shadow=True)
plt.title('Smoker Patients In Death Cases')
# The legend lists the raw counts, in the same order as the wedges.
plt.legend(death_smok, loc='upper left')
plt.show()

In [None]:
# Sex counts within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['sex'].value_counts()

In [None]:
# Pie chart of patient counts for every (DEATH_EVENT, sex) combination.
plt.figure(figsize=(10, 7))
# value_counts() orders each group by descending frequency, so whenever males
# (sex == 1) outnumber females in a group the unsorted result comes out as
# male-then-female and the hard-coded labels below land on the wrong wedges.
# Sorting by the (DEATH_EVENT, sex) index pins the order to
# (0,0), (0,1), (1,0), (1,1) so each label matches its wedge.
death_sex = df.groupby('DEATH_EVENT')['sex'].value_counts().sort_index()
myexplode = [0.1, 0, 0.2, 0]
plt.pie(death_sex, autopct='%.1f%%', labels=['Alive-female', 'Alive-male', 'Dead-female', 'Dead-male'], explode=myexplode, shadow=True)
plt.title('Gender of Patients In Death Cases')
# The legend lists the raw counts, in the same order as the wedges.
plt.legend(death_sex, loc='upper left')
plt.show()

In [None]:
# Mean creatinine_phosphokinase within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['creatinine_phosphokinase'].mean()

In [None]:
# Pie chart of the mean creatinine_phosphokinase per DEATH_EVENT group; the legend lists the means.
plt.figure(figsize=(10, 7))
death_cpk = df.groupby(by='DEATH_EVENT')['creatinine_phosphokinase'].mean()
myexplode = [0.07, 0]
plt.pie(
    x=death_cpk,
    explode=myexplode,
    labels=['Alive-avg cpk', 'Dead-avg cpk'],
    autopct='%.1f%%',
    shadow=True,
)
plt.title(label='Average values of Creatinine Phosphokinase In Death Cases')
plt.legend(death_cpk, loc='upper right')
plt.show()

In [None]:
# Mean ejection_fraction within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['ejection_fraction'].mean()

In [None]:
# Pie chart of the mean ejection_fraction per DEATH_EVENT group; the legend lists the means.
plt.figure(figsize=(10, 7))
death_ef = df.groupby('DEATH_EVENT')['ejection_fraction'].mean()
myexplode = [0.07, 0]
# Ejection fraction is the percentage of blood pumped out per contraction, not
# an enzyme, so the wedge labels say "EF" rather than "ef enzyme".
plt.pie(death_ef, autopct='%.1f%%', labels=['Alive-avg EF', 'Dead-avg EF'], explode=myexplode, shadow=True)
plt.title('Average values of Ejection Fraction In Death Cases')
plt.legend(death_ef, loc='upper right')
plt.show()

In [None]:
# Mean platelets within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['platelets'].mean()

In [None]:
# Pie chart of the mean platelets per DEATH_EVENT group; the legend lists the means.
plt.figure(figsize=(10, 7))
death_plts = df.groupby(by='DEATH_EVENT')['platelets'].mean()
myexplode = [0.07, 0]
plt.pie(
    x=death_plts,
    explode=myexplode,
    labels=['Alive-avg plts', 'Dead-avg plts'],
    autopct='%.1f%%',
    shadow=True,
)
plt.title(label='Average values of Platelets In Death Cases')
plt.legend(death_plts, loc='upper right')
plt.show()

In [None]:
# Mean serum_creatinine within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['serum_creatinine'].mean()

In [None]:
# Pie chart of the mean serum_creatinine per DEATH_EVENT group; the legend lists the means.
plt.figure(figsize=(10, 7))
death_SC = df.groupby(by='DEATH_EVENT')['serum_creatinine'].mean()
myexplode = [0.1, 0]
plt.pie(
    x=death_SC,
    explode=myexplode,
    labels=['Alive-avg Serum Creatinine', 'Dead-avg Serum Creatinine'],
    autopct='%.1f%%',
    shadow=True,
)
plt.title(label='Average values of Serum Creatinine In Death Cases')
plt.legend(death_SC, loc='upper left')
plt.show()

In [None]:
# Mean serum_sodium within each DEATH_EVENT category
df.groupby(by='DEATH_EVENT')['serum_sodium'].mean()

In [None]:
# Pie chart of the mean serum_sodium per DEATH_EVENT group; the legend lists the means.
plt.figure(figsize=(10, 7))
death_SS = df.groupby(by='DEATH_EVENT')['serum_sodium'].mean()
myexplode = [0.1, 0]
plt.pie(
    x=death_SS,
    explode=myexplode,
    labels=['Alive-avg Serum Sodium', 'Dead-avg Serum Sodium'],
    autopct='%.1f%%',
    shadow=True,
)
plt.legend(death_SS, loc='upper right')
plt.title(label='Average values of Serum Sodium In Death Cases')
plt.show()


Correlation between Death cases and other factors

In [None]:
# Annotated heatmap of the pairwise correlations between all remaining columns.
plt.figure(figsize=(20, 15))
sns.heatmap(data=df.corr(), annot=True)
plt.title(label='Correlation between Death Events and other data')
plt.show()