In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as ms
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import plotly.express as px


## Loading the dataset

Let's load the data and take a quick look at what the dataset looks like.

In [None]:
# Read the heart-failure clinical records CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Partition the records by outcome (0 = survived, 1 = died) for the visualizations.
outcome = df['DEATH_EVENT']

survivals = df.loc[outcome.eq(0)]
deaths = df.loc[outcome.eq(1)]

## Checking the data for missing values and cleaning up (if required)
I generally prefer to check the data for missing values as the first step. Turns out this data doesn't have any missing values.

In [None]:
# Report the sample count, then the number of missing values in each column,
# framed by divider lines.
divider = '==============================='
print("No. of Samples", df.shape[0])
print(divider)
print(df.isna().sum())
print(divider)

## Overview of the data

Columns **anaemia**, **diabetes**, **high_blood_pressure** and **smoking** are boolean type

Rest of the columns except **sex** are numeric.

**sex** is a categorical column with only 2 values (Female: 0 and Male: 1, per the dataset's documentation). Since there are only 2 categories, we do not need one-hot encoding and can leave the values as they are.

**DEATH_EVENT** is our label column

## Feature Selection

Lets find out what features are relevant for predicting heart failure.

We can do this in two ways:
* Plotting a correlation wrt death event and visualizing each feature wrt Death Event and see if there is any correlation
* Using feature importance in ensemble techniques

Correlation only works in case of non-categorical data. So it needs additional effort for checking relation between categorical data and label.

We will use a hybrid approach: Backward Elimination (BE) and Random Forest feature importance to find the optimum number of features required for prediction. To make sure the selection process is going in the right direction, we'll also plot some of the features against the death event.


### Backward Elimination

We use p-values to filter out redundant features. 

In [None]:
# Backward elimination: repeatedly fit an OLS model on the remaining features and
# drop the feature with the largest p-value, until every remaining feature is
# significant at P_VALUE_THRESHOLD (or no features are left).
P_VALUE_THRESHOLD = 0.05

X = df.drop('DEATH_EVENT', axis=1)
y = df['DEATH_EVENT']
columns = list(X.columns)
while columns:
    # sm.OLS does not fit an intercept on its own, so add the constant column explicitly.
    X_with_const = sm.add_constant(X[columns])
    model = sm.OLS(y, X_with_const).fit()
    # Pick the feature p-values by label: the intercept is never a removal candidate,
    # and the values stay aligned with `columns` even if no constant column was added.
    p = model.pvalues[columns]
    if p.max() > P_VALUE_THRESHOLD:
        columns.remove(p.idxmax())
    else:
        break
selected_features_BE = columns
print(selected_features_BE)

# Gives us following list ['age', 'ejection_fraction', 'serum_creatinine', 'time']

### Using feature importance in ensemble techniques

We will first find the optimum number of features required for prediction based on prediction accuracy and then find the actual list of features.

From the plot we can select the features with the highest importance.

In [None]:
# Rank the features by random-forest importance.
# The forest is fitted on the training split only (the split was previously computed
# but ignored), and it is seeded so the ranking is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model = RandomForestClassifier(criterion='entropy', random_state=0)
model.fit(X_train, y_train)

# One row per feature, ordered from least to most important for the bar chart.
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    .sort_values('importance')
    .reset_index(drop=True)
)

fig = px.bar(feature_importance, x='feature', y='importance')
fig.show()

Backward Elimination suggests only 4 features **'age', 'ejection_fraction', 'serum_creatinine', 'time'**

Random Forest suggests 3 additional features: **serum_sodium, platelets, creatinine_phosphokinase**

We can safely ignore the rest.

## Verifying features through visualizing wrt Death Event

We will use plotly to plot distribution plots and check if we find any pattern between individual features and death event

### Age

Let's plot the age distribution, categorised by survival event.

We will also plot the survival rate for each age group to see if age acts as a factor along with other features.

**From the plots we see age can be a deciding factor along with other features. Death rate is higher in higher age groups**

In [None]:
# Age: per-outcome histograms (left panel) and survival percentage per age bin (right panel).
bins = list(range(40, 100, 5))

survivors_grp = df.loc[df['DEATH_EVENT'] == 0, 'age']
deaths_grp = df.loc[df['DEATH_EVENT'] == 1, 'age']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Age vs Survival Event', 'Survival Rate vs Age']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', survivors_grp.values),
    ('Deaths', '#d059e6', deaths_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per age bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts.
# NOTE(review): pd.cut bins are right-closed by default, so a value equal to the first
# edge or above the last edge falls in no bin and is left out of the rate — confirm the
# edges cover the column's range.
groups = df.groupby([pd.cut(df['age'], bins), 'DEATH_EVENT'], observed=False)
survival_rate = groups.size().unstack()
survival_rate['Survival Rate'] = survival_rate[0] * 100 / (survival_rate[1] + survival_rate[0])
survival_rate['Age Groups'] = survival_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
survival_rate = survival_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=survival_rate['Age Groups'],
        y=survival_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()

### Ejection Fraction

Let's plot the ejection fraction distribution, categorised by survival event.

We will also plot the survival rate as ejection fraction increases.

**From the plots we see Survival rate increases as Ejection Fraction increases. It can be a deciding factor along with other features.**

In [None]:
# Ejection fraction: per-outcome histograms (left panel) and survival percentage per
# ejection-fraction bin (right panel).
bins = list(range(14, 80, 5))

survivors_grp = df.loc[df['DEATH_EVENT'] == 0, 'ejection_fraction']
deaths_grp = df.loc[df['DEATH_EVENT'] == 1, 'ejection_fraction']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Ejection Fraction vs Survival Event', 'Survival Rate vs Ejection Fraction']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', survivors_grp.values),
    ('Deaths', '#d059e6', deaths_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per ejection-fraction bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts.
# NOTE(review): pd.cut bins are right-closed by default, so a value equal to the first
# edge or above the last edge falls in no bin and is left out of the rate — confirm the
# edges cover the column's range.
groups = df.groupby([pd.cut(df['ejection_fraction'], bins), 'DEATH_EVENT'], observed=False)
survival_rate = groups.size().unstack()
survival_rate['Survival Rate'] = survival_rate[0] * 100 / (survival_rate[1] + survival_rate[0])
survival_rate['Ejection Fraction Groups'] = survival_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
survival_rate = survival_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=survival_rate['Ejection Fraction Groups'],
        y=survival_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()


### Serum Creatinine

Let's plot the serum creatinine distribution, categorised by survival event.

We will also plot the survival rate wrt creatinine levels to see if it acts as a factor along with other features.

**From the plots we can see Survival rate decreases as Serum Creatinine increases to a certain point but again increases for higher values. It can be a deciding factor along with other features.**

In [None]:
# Serum creatinine: per-outcome histograms (left panel) and survival percentage per
# serum-creatinine bin (right panel).
bins = list(np.arange(0.0, 10, 0.5))

survivors_grp = df.loc[df['DEATH_EVENT'] == 0, 'serum_creatinine']
deaths_grp = df.loc[df['DEATH_EVENT'] == 1, 'serum_creatinine']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Serum Creatinine vs Survival Event', 'Survival Rate vs Serum Creatinine']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', survivors_grp.values),
    ('Deaths', '#d059e6', deaths_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per serum-creatinine bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts.
# NOTE(review): pd.cut bins are right-closed by default, so a value equal to the first
# edge or above the last edge falls in no bin and is left out of the rate — confirm the
# edges cover the column's range.
groups = df.groupby([pd.cut(df['serum_creatinine'], bins), 'DEATH_EVENT'], observed=False)
survival_rate = groups.size().unstack()
survival_rate['Survival Rate'] = survival_rate[0] * 100 / (survival_rate[1] + survival_rate[0])
survival_rate['Serum Creatinine Groups'] = survival_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
survival_rate = survival_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=survival_rate['Serum Creatinine Groups'],
        y=survival_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()


### Time

Let's plot the distribution of the follow-up time, categorised by survival event.

We will also plot the survival rate wrt time to see if it acts as a factor along with other features.

**From the plots we can see Survival rate is higher for higher time values. It can be a deciding factor along with other features.**

In [None]:
# Time: per-outcome histograms (left panel) and survival percentage per time bin (right panel).
bins = list(range(0, 300, 20))

survivors_grp = df.loc[df['DEATH_EVENT'] == 0, 'time']
deaths_grp = df.loc[df['DEATH_EVENT'] == 1, 'time']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Time vs Survival Event', 'Survival Rate vs Time']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', survivors_grp.values),
    ('Deaths', '#d059e6', deaths_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per time bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts.
# NOTE(review): pd.cut bins are right-closed by default, so a value equal to the first
# edge or above the last edge (280 here) falls in no bin and is left out of the rate —
# confirm the edges cover the column's range.
groups = df.groupby([pd.cut(df['time'], bins), 'DEATH_EVENT'], observed=False)
survival_rate = groups.size().unstack()
survival_rate['Survival Rate'] = survival_rate[0] * 100 / (survival_rate[1] + survival_rate[0])
survival_rate['Time Groups'] = survival_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
survival_rate = survival_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=survival_rate['Time Groups'],
        y=survival_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()


### Creatinine Phosphokinase

Let's plot the creatinine phosphokinase distribution, categorised by survival event.

We will also plot the survival rate with respect to creatinine phosphokinase levels to see if it acts as a factor along with other features.

**From the plots we can see that survival rates are approximately the same across all creatinine phosphokinase levels. It may not be able to help in prediction.**

**We can ignore this feature**

In [None]:
# Creatinine phosphokinase: per-outcome histograms (left panel) and survival percentage
# per bin (right panel).
bins = list(range(23, 6000, 500))

sur_grp = survivals['creatinine_phosphokinase']
# The "Deaths" trace must read from `deaths`; reading `survivals` here would draw the
# survivors' histogram twice.
dth_grp = deaths['creatinine_phosphokinase']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Creatinine vs Survival Event', 'Survival Rate vs Creatinine']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', sur_grp.values),
    ('Deaths', '#d059e6', dth_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per creatinine-phosphokinase bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts.
# NOTE(review): pd.cut bins are right-closed by default, so a value equal to the first
# edge (23) or above the last edge (5523) falls in no bin and is left out of the rate —
# confirm the edges cover the column's range.
grps = df.groupby([pd.cut(df['creatinine_phosphokinase'], bins), 'DEATH_EVENT'], observed=False)
sur_rate = grps.size().unstack()
sur_rate['Survival Rate'] = sur_rate[0] * 100 / (sur_rate[1] + sur_rate[0])
sur_rate['Creatinine'] = sur_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
sur_rate = sur_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=sur_rate['Creatinine'],
        y=sur_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()


### Serum Sodium

Let's plot the serum sodium distribution, categorised by survival event.

We will also plot the survival rate wrt Serum Sodium levels to see if it acts as a factor along with other features.

**From the plots we can see survival rates for all the serum sodium levels are approximately same. It may not be able to help in prediction.**

**We can ignore this feature**

In [None]:
# Serum sodium: per-outcome histograms (left panel) and survival percentage per
# serum-sodium bin (right panel).
sodium = df['serum_sodium']
# One-unit-wide bins derived from the observed range, so every patient lands in a bin.
# The first edge sits one unit below the minimum because pd.cut bins are right-closed
# and would otherwise drop the smallest value.
# NOTE(review): the previous fixed edges (110..129) end below the usual serum sodium
# range (~135-145 mEq/L), which presumably left most rows unbinned — confirm.
bins = list(range(int(np.floor(sodium.min())) - 1, int(np.ceil(sodium.max())) + 1, 1))

sur_grp = survivals['serum_sodium']
# The "Deaths" trace must read from `deaths`; reading `survivals` here would draw the
# survivors' histogram twice.
dth_grp = deaths['serum_sodium']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Serum Sodium vs Survival Event', 'Survival Rate vs Serum Sodium']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', sur_grp.values),
    ('Deaths', '#d059e6', dth_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per serum-sodium bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts
# (their rate is 0/0 = NaN, so no bar is drawn for them).
grps = df.groupby([pd.cut(sodium, bins), 'DEATH_EVENT'], observed=False)
sur_rate = grps.size().unstack()
sur_rate['Survival Rate'] = sur_rate[0] * 100 / (sur_rate[1] + sur_rate[0])
sur_rate['Serum Sodium'] = sur_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
sur_rate = sur_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=sur_rate['Serum Sodium'],
        y=sur_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()


### Platelets

Let's plot the platelets distribution, categorised by survival event.

We will also plot the survival rate wrt Platelets levels to see if it acts as a factor along with other features.

**From the plots we can see there is no clear pattern for survival rate wrt platelets levels. It may not be able to help in prediction.**

**We can ignore this feature**

In [None]:
# Platelets: per-outcome histograms (left panel) and survival percentage per platelet
# bin (right panel).
bins = list(range(2500, 850000, 20000))

sur_grp = survivals['platelets']
# The "Deaths" trace must read from `deaths`; reading `survivals` here would draw the
# survivors' histogram twice.
dth_grp = deaths['platelets']

fig = ms.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Platelets vs Survival Event', 'Survival Rate vs Platelets']
)

# Overlay one histogram per outcome in the left-hand panel.
for trace_name, trace_color, trace_values in (
    ('Survivals', '#595ee6', sur_grp.values),
    ('Deaths', '#d059e6', dth_grp.values),
):
    fig.add_trace(
        go.Histogram(x=trace_values, marker_color=trace_color, name=trace_name),
        row=1,
        col=1
    )

# Survival percentage per platelet bin.
# `observed=False` is spelled out because the implicit default for categorical groupers
# is deprecated in recent pandas; it keeps empty bins in the table with zero counts.
# NOTE(review): pd.cut bins are right-closed by default, so a value equal to the first
# edge or above the last edge (842500) falls in no bin and is left out of the rate —
# confirm the edges cover the column's range.
grps = df.groupby([pd.cut(df['platelets'], bins), 'DEATH_EVENT'], observed=False)
sur_rate = grps.size().unstack()
sur_rate['Survival Rate'] = sur_rate[0] * 100 / (sur_rate[1] + sur_rate[0])
sur_rate['Platelets'] = sur_rate.index.astype(str).tolist()
# Replace the interval index with a plain positional one named 'idx'.
sur_rate = sur_rate.reset_index(drop=True).rename_axis('idx')

fig.add_trace(
    go.Bar(
        x=sur_rate['Platelets'],
        y=sur_rate['Survival Rate'],
        name='Survival %'
    ),
    row=1,
    col=2
)

fig.update_layout(bargap=0.25, bargroupgap=0.2)

fig.show()

## Selected Features

Based on the models and visualizations, we can select following features for prediction

**'age',  'ejection_fraction',  'serum_creatinine' and 'time'**

## Train - Test Split

In [None]:
# Hold out 20% of the rows for evaluation, keeping only the four selected features.
# The fixed seed makes the split — and therefore the reported accuracy — reproducible.
selected_features = ['age', 'ejection_fraction', 'serum_creatinine', 'time']
x_train, x_test, y_train, y_test = train_test_split(
    X[selected_features], y, test_size=0.2, random_state=0
)

## Random Forest Classifier

In [None]:
# Train a 1000-tree random forest on the selected features and report its accuracy on
# the held-out split. The forest is seeded so repeated runs give the same score.
# max_features=4 equals the number of input columns, so every split considers all of them.
rfc = RandomForestClassifier(
    criterion='entropy',
    max_features=4,
    n_estimators=1000,
    random_state=0
)

rfc.fit(x_train, y_train)

rfc_predict = rfc.predict(x_test)

# Fraction of held-out rows whose predicted label matches the true label.
acc_score = accuracy_score(y_test, rfc_predict)

print("Accuracy Score: {}%".format(acc_score*100))