In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records into a DataFrame and preview the first rows.
heart_data = pd.read_csv(
    '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)

heart_data.head()

In [None]:
heart_data.shape

In [None]:
heart_data.columns

In [None]:
import matplotlib.pyplot as plt;
import seaborn as sns;

%matplotlib inline

plt.rcParams['figure.figsize'] = (16, 6);
sns.set_style('dark');

In [None]:
heart_data.dtypes

All columns are numeric

# Check distribution
- age
- platelets
- serum_creatinine
- time
- ejection_fraction
- serum_sodium

In [None]:
# Shared styling constants used by the plotting cells.
title_params = dict(fontsize=24)  # passed as fontdict to Axes.set_title
label_size = 16                   # font size for axis labels
title_padding = 10                # gap between a title and its axes

In [None]:
# Continuous columns to plot, mapped to the title shown above each histogram.
colnames_continuous = dict(
    age='Age',
    platelets='Platelets',
    serum_creatinine='Serum Creatinine',
    time='Time',
    ejection_fraction='Ejection Fraction',
    serum_sodium='Serum Sodium',
)

fig, axes = plt.subplots(3, 2, figsize=(28, 16))
fig.tight_layout(h_pad=10)

# axes.flat visits the 3x2 grid row by row, so each column gets the next
# panel from left to right, top to bottom.
for ax, (key, val) in zip(axes.flat, colnames_continuous.items()):
    sns.histplot(x=key, data=heart_data, ax=ax)
    ax.set_title(val, fontdict=title_params, pad=title_padding)
    ax.set_xlabel(key, fontsize=label_size)
    ax.set_ylabel('count', fontsize=label_size)

## Observations
- Age is somewhat right skewed
- Platelets seem to be symmetrical with a few outliers
- Serum Creatinine is right skewed
- Time has no particular distribution
- Ejection Fraction is imbalanced
- Serum Sodium is left skewed

# Remaining columns analysis

In [None]:
# Binary columns to inspect, mapped to a human-readable label.
colnames = dict(
    anaemia='Anaemia',
    diabetes='Diabetes',
    high_blood_pressure='High Blood Pressure',
    sex='Gender',
    smoking='Smoker',
)

# Show the distinct values stored in each of these columns.
for key in colnames:
    val = colnames[key]
    print(val, '-', heart_data[key].unique())

In [None]:
# Lookup tables turning the 0/1 codes into readable labels.
gender = {0: 'Female', 1: 'Male'}
other = {0: 'No', 1: 'Yes'}

# Use replace() rather than map(): map() turns every value that is not a key
# of the lookup into NaN, so running this cell a second time (when the columns
# already hold labels) would wipe the data. replace() leaves unmatched values
# untouched, which makes the cell safe to re-run.
for key in colnames:
    labels = gender if key == 'sex' else other
    heart_data[key] = heart_data[key].replace(labels)

In [None]:
heart_data.head()

## Observations
- All remaining variables are binary

# Check deaths in relation to the binary variables
- The x axis shows whether the person died of heart disease (DEATH_EVENT)
- The legend (hue) shows whether the person had the underlying condition or habit
  in question, e.g. anaemia or smoking

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(24, 24))
fig.tight_layout(h_pad=10)

# Flatten the grid once so panels can be consumed in row-major order.
flat_axes = axes.flatten()

# One count plot per binary column: DEATH_EVENT on the x axis, split by the column.
for ax, (key, val) in zip(flat_axes, colnames.items()):
    sns.countplot(x='DEATH_EVENT', hue=key, data=heart_data, ax=ax)
    ax.set_title(val, fontdict=title_params, pad=title_padding)
    ax.set_xlabel('DEATH EVENT', fontsize=label_size)
    ax.set_ylabel('count', fontsize=label_size)

# Remove every panel that did not receive a plot. Slicing past the number of
# plotted columns is safe when the grid is exactly full (empty slice) and also
# cleans up when more than one panel is left over.
for unused_ax in flat_axes[len(colnames):]:
    fig.delaxes(unused_ax)

# Age analysis

In [None]:
heart_data['age'].describe()

## Observations
- most persons are around the age of 60
- the mean and median have the same value
- possible to group persons by age: <= 51, 51 < age <= 60, and > 60

### Assign agegroups

In [None]:
def agegroup(age):
    """Bucket an age into one of three groups.

    Returns 0 for ages up to and including 51, 1 for ages above 51 up to and
    including 60, and 2 for everything else.
    """
    if age <= 51:
        return 0
    if age <= 60:
        return 1
    return 2

# Derive the age bucket for every row and preview the new column.
age_buckets = heart_data['age'].apply(agegroup)
heart_data['agegroup'] = age_buckets

heart_data['agegroup'].head()

### check deaths by agegroup

In [None]:
heart_data[['DEATH_EVENT', 'agegroup']].groupby('agegroup').mean().sort_values(by='DEATH_EVENT', ascending=False)

In [None]:
sns.countplot(x='DEATH_EVENT', hue='agegroup', data=heart_data);

## Observations
- Most people aged above 60 died of heart attack

# Group different columns and look for information
- group by gender and check death rate

In [None]:
heart_data[['DEATH_EVENT', 'sex']].groupby('sex').mean().sort_values(by='DEATH_EVENT', ascending=False)

## Observations
- around the same percentage of males and females died of heart attacks
- try grouping with males and females that smoked

In [None]:
heart_data[['DEATH_EVENT', 'sex', 'smoking', 'agegroup']].groupby(['sex', 'smoking', 'agegroup']).mean()

## Observations
- almost all females aged between 40-60 that smoked died of heart disease
- all males aged less than 51 and most males aged between 51-60 that smoked died of heart disease
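- this suggests an association between smoking and death from heart disease (the subgroups are small, so this is not proof of causation)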

# Normalize the non binary variables
- divide percentage values by 100
- convert non percentage values to percentage and divide by 100

#### Serum Creatinine
The reference interval is 0.6–1.3 mg/dL (53–115 μmol/L). Measuring serum creatinine is a simple test, and it is the most commonly used indicator of renal function 

Males produce approximately 150 μmol to 200 μmol of creatinine per kilogram of body weight per 24 h while females produce approximately 100 μmol/kg/24 h to 150 μmol/kg/24 h. In normal circumstances, all this daily creatinine production is excreted in the urine.

[Source: https://en.wikipedia.org/wiki/Creatinine](https://en.wikipedia.org/wiki/Creatinine)

### Analyze and group serum_creatinine

In [None]:
heart_data['serum_creatinine'].describe()

### group serum creatinine (scp) into low, medium, and high levels
- okay levels [0.6, 1.3] for both men and women
- below and above that is abnormal

In [None]:
def scp_level(csp):
    """Classify a serum creatinine value against the [0.6, 1.3] interval.

    Returns 0 when the value is below 0.6, 1 when it lies in [0.6, 1.3],
    and 2 otherwise.
    """
    if csp < 0.6:
        return 0
    return 1 if csp <= 1.3 else 2

# Label every row with its serum creatinine level (0 / 1 / 2).
heart_data['scp_level'] = heart_data['serum_creatinine'].apply(scp_level)

### group by sex, scp levels, age groups and check relation to death

In [None]:
heart_data[['sex', 'scp_level', 'agegroup', 'DEATH_EVENT']].groupby(['sex', 'agegroup', 'scp_level']).mean()

## Observations
- clearly, high levels of serum creatinine contribute to heart disease

In [None]:
heart_data.columns

# Normalize creatinine phosphokinase
- express each person's value relative to the midpoint of the reference range
- Reference range: [60, 400] IU/L, so the midpoint is (60 + 400) / 2 = 230
- for instance, a measured value of 75 is normalized to 75 / 230 ≈ 0.33
- values inside the reference range map to approximately [0.3, 1.73]

[Source: https://en.wikipedia.org/wiki/Creatine_kinase](https://en.wikipedia.org/wiki/Creatine_kinase)

In [None]:
def csp_normalizer(csp, ref_low=60, ref_high=400):
    """Express a creatinine phosphokinase value relative to the reference midpoint.

    The value is divided by the midpoint of the reference range
    (``(ref_low + ref_high) / 2``, i.e. 230 for the default [60, 400] range),
    so the range itself maps to roughly [0.26, 1.74]. This matches the
    ``75 / 230`` example and the [0.3, 1.73] cut-offs used in this notebook.

    The previous formula averaged ``csp / 60`` and ``csp / 400``, which is
    ``csp / ~104`` and put values inside the reference range above the 1.73
    "high" cut-off.

    Args:
        csp: measured creatinine phosphokinase value (scalar or array-like).
        ref_low: lower bound of the reference range.
        ref_high: upper bound of the reference range.

    Returns:
        ``csp`` divided by the reference-range midpoint.
    """
    ref_midpoint = (ref_low + ref_high) / 2
    return csp / ref_midpoint

# Store the normalized creatinine phosphokinase next to the raw column and
# show both side by side for a quick sanity check.
heart_data['csp_normalized'] = heart_data['creatinine_phosphokinase'].apply(csp_normalizer)

heart_data[['creatinine_phosphokinase', 'csp_normalized']].head()

In [None]:
heart_data['csp_normalized'].describe()

In [None]:
def csp_level(csp):
    """Classify a normalized creatinine phosphokinase value.

    Returns 0 when the value is below 0.3, 1 when it lies in [0.3, 1.73],
    and 2 otherwise.
    """
    if csp < 0.3:
        return 0
    if csp <= 1.73:
        return 1
    return 2

# Label every row with its creatinine phosphokinase level (0 / 1 / 2).
# The classifier defined in this cell is csp_level; the previous call used
# the undefined name csp_group and raised NameError on a fresh kernel.
heart_data['csp_level'] = heart_data['csp_normalized'].map(csp_level)

heart_data['csp_level'].head()

### group by sex, csp levels, age groups and check relation to death

In [None]:
heart_data[['sex', 'csp_level', 'agegroup', 'DEATH_EVENT']].groupby(['sex', 'agegroup', 'csp_level']).mean()

## Observations
- some correlation between csp levels and death
- fewer deaths among persons whose csp levels are in the average range

# Ejection Fraction
- Percentage of blood leaving the heart at each contraction
- Normal rate for an adult: >= 50%
- low: [30, 49]%
- very low: < 30%

# Normalize Ejection Fraction
- divide by 100%

In [None]:
# Convert the ejection fraction from a percentage to a 0-1 fraction and
# preview it next to the original column.
heart_data['ef_normalized'] = heart_data['ejection_fraction'].div(100)

heart_data[['ejection_fraction', 'ef_normalized']].head()

In [None]:
def ef_level(ef):
    """Classify an ejection fraction given as a 0-1 fraction.

    Follows the bands stated in this notebook: very low is below 30%,
    low is 30% up to (but not including) 50%, normal is 50% and above.

    Returns:
        0 for very low (ef < 0.3), 1 for low (0.3 <= ef < 0.5),
        2 otherwise.
    """
    # Strict '<' so that exactly 30% lands in the "low" band [30, 49]%;
    # the earlier '<=' pushed it into "very low".
    if ef < 0.3:
        return 0
    if ef < 0.5:
        return 1
    return 2

# Label every row with its ejection fraction level (0 / 1 / 2).
heart_data['ef_level'] = heart_data['ef_normalized'].apply(ef_level)

heart_data['ef_level'].head()

### check heart failure by ef level

In [None]:
heart_data[['DEATH_EVENT', 'sex', 'ef_level']].groupby(['sex', 'ef_level']).mean()

## Observations
- most heart failure caused by extremely low ejection fractions

# Number of platelets in blood per microliter
- The number of platelets varies across individuals. The normal physiologic range is 200,000 to 500,000 per microliter of blood. Since they contain receptors for thrombopoietin (the protein that facilitates the maturation of megakaryocytes and release of platelets), a higher number of platelets binds more of the protein. Consequently, there is stimulation for more production of thrombopoietin in the liver and kidneys. This is the basis for the production of more thrombopoietin and, as a result, more platelets in the bloodstream during the blood clotting process.

[Source: https://en.wikipedia.org/wiki/Platelet](https://en.wikipedia.org/wiki/Platelet)

# Normalize blood platelets
- (x - mean) / std

In [None]:
def platelets_normalizer(platelets, mean, std):
    """Standardize a platelet count: subtract ``mean`` and divide by ``std``."""
    deviation = platelets - mean
    return deviation / std

# Column statistics used for standardization.
platelets_mean = heart_data['platelets'].mean()
platelets_std = heart_data['platelets'].std()

# The normalizer is plain arithmetic, so it can be applied to the whole
# column at once; the per-element results are the same as mapping row by row.
heart_data['platelets_normalized'] = platelets_normalizer(
    heart_data['platelets'], platelets_mean, platelets_std
)

heart_data[['platelets', 'platelets_normalized']].head()

In [None]:
heart_data['platelets_normalized'].describe()

In [None]:
def platelets_level(platelets):
    """Classify a raw platelet count against the [200_000, 500_000] interval.

    Returns 0 below 200_000, 1 inside [200_000, 500_000], and 2 otherwise.
    """
    if platelets < 200_000:
        return 0
    return 1 if platelets <= 500_000 else 2

# Label every row with its platelet level (0 / 1 / 2), using the raw counts.
heart_data['platelets_level'] = heart_data['platelets'].apply(platelets_level)
heart_data['platelets_level'].head()

### group by platelets level and check for heart failure

In [None]:
heart_data[['sex', 'platelets_level', 'DEATH_EVENT']].groupby(['sex', 'platelets_level']).mean()

## Observations
- average platelet levels imply a lower chance of heart failure

# Serum Sodium
- Level of serum sodium in the blood mEq/L
- The minimum physiological requirement for sodium is between 115 and 500 milligrams per day depending on sweating due to physical activity, and whether the person is adapted to the climate.
- Normal levels [115, 500]

[Source: https://en.wikipedia.org/wiki/Sodium_in_biology](https://en.wikipedia.org/wiki/Sodium_in_biology)

# Normalize Serum Sodium
- (x - mean) / std

In [None]:
# "na" is the chemical symbol for sodium.
def serum_na_normalizer(na, mean, std):
    """Standardize a serum sodium value: subtract ``mean`` and divide by ``std``."""
    centered = na - mean
    return centered / std

# Column statistics used for standardization.
serum_na_mean = heart_data['serum_sodium'].mean()
serum_na_std = heart_data['serum_sodium'].std()

# Standardize row by row; the extra positional arguments are forwarded to
# the normalizer after each value.
heart_data['serum_na_normalized'] = heart_data['serum_sodium'].apply(
    serum_na_normalizer, args=(serum_na_mean, serum_na_std)
)

heart_data[['serum_sodium', 'serum_na_normalized']].head()

### group serum sodium into low, average, and high

In [None]:
def serum_na_level(na, low=115, high=500):
    """Classify a serum sodium value against a [low, high] interval.

    Args:
        na: serum sodium value to classify.
        low: inclusive lower bound of the "normal" band (default 115).
        high: inclusive upper bound of the "normal" band (default 500).

    Returns:
        0 when ``na`` is below ``low``, 1 when it lies in [low, high],
        and 2 otherwise.

    NOTE(review): the default bounds are the figures this notebook quotes for
    the daily dietary sodium requirement (mg/day), while the column is
    described as a blood concentration in mEq/L. Clinical serum sodium
    reference intervals are commonly quoted as roughly 135-145 mEq/L. Confirm
    the intended cut-offs and pass them via ``low``/``high`` if they differ.
    """
    if na < low:
        return 0
    if na <= high:
        return 1
    return 2

# Label every row with its serum sodium level (0 / 1 / 2), using the raw values.
heart_data['serum_na_level'] = heart_data['serum_sodium'].apply(serum_na_level)
heart_data[['serum_sodium', 'serum_na_level']].head()

In [None]:
heart_data['serum_na_level'].value_counts() / heart_data.shape[0] * 100

### check if serum na level causes heart failure

In [None]:
heart_data[['sex', 'serum_na_level', 'DEATH_EVENT']].groupby(['sex', 'serum_na_level']).mean()

### Observations
- Almost everyone has a normal serum sodium level
- Almost 32% of deaths in both males and females caused by serum levels

# convert categorical variables to integers

In [None]:
# Collect the names of all object-typed columns (the ones holding text labels).
categorical_columns = heart_data.select_dtypes(include='object').columns
print(categorical_columns)

In [None]:
# Lookup tables turning the text labels back into 0/1 codes.
gender = {'Male': 1, 'Female': 0}
other = {'Yes': 1, 'No': 0}

# 'sex' uses the gender lookup; every other categorical column is Yes/No.
for cc in categorical_columns:
    codes = gender if cc == 'sex' else other
    heart_data[cc] = heart_data[cc].map(codes)

heart_data.head()

# RandomForestClassifier model for predicting heart failure

In [None]:
heart_data.columns

In [None]:
list(colnames_continuous.keys())

In [None]:
# Columns excluded from the feature matrix: the raw continuous columns plus
# the normalized helper columns (only the derived *_level columns are kept).
cols_to_drop = [
    *colnames_continuous,
    'creatinine_phosphokinase',
    'csp_normalized',
    'ef_normalized',
    'platelets_normalized',
    'serum_na_normalized',
]

In [None]:
# Feature matrix: everything except the dropped columns and the target.
feature_frame = heart_data.drop(columns=cols_to_drop + ['DEATH_EVENT'])
X = feature_frame.to_numpy()
# Target vector: the death indicator.
y = heart_data['DEATH_EVENT'].to_numpy()

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier;

In [None]:
# Scoring a random forest on the rows it was trained on mostly measures
# memorisation, so hold out a stratified 20% test split and report accuracy
# on that instead. Fixed seeds make the split and the forest reproducible
# under Restart & Run All.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)