In [None]:
import numpy as np
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the heart-failure clinical records into a DataFrame.
df = pd.read_csv(
    '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

**EDA**

In [None]:
# Bar chart of each column's correlation with DEATH_EVENT, sorted ascending.
(
    df.corr()['DEATH_EVENT']
    .drop('DEATH_EVENT')
    .sort_values()
    .plot.bar()
)

In [None]:
# Distinct values present in the `time` column.
df['time'].unique()

Since this is a time-to-event dataset, there are two target columns: time and DEATH_EVENT.

DEATH_EVENT encodes whether the patient died (1) or whether they were censored (0). Censoring means that the scientists lost contact with the patient.

time captures the time of the event, that is, the time at which the patient died or was censored.

Using the time column as a feature is wrong. When we deploy our model, no end user will be able to provide the value of time, since they cannot know at what time in the future the patient will die or be censored!

In [None]:
# `time` is only known once the outcome has occurred, so it must not be a feature.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because `time` has already been removed from df.
df = df.drop(columns='time', errors='ignore')

In [None]:
# Correlation of each remaining column with DEATH_EVENT, sorted ascending.
(
    df.corr()
    .loc[:, 'DEATH_EVENT']
    .drop(labels='DEATH_EVENT')
    .sort_values()
)

ejection_fraction, age and serum_creatinine have a very high correlation with DEATH_EVENT. Let us explore them further.

In [None]:
# Distinct values present in the `serum_creatinine` column.
df['serum_creatinine'].unique()

In [None]:
# Age distribution split by outcome.
# displot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(figsize=...) has no effect on it and only leaves an
# empty extra figure behind. Size the plot through height/aspect instead
# (height 4 in, aspect 3 -> roughly 12 in wide).
sns.displot(data=df, x='age', hue='DEATH_EVENT', height=4, aspect=3)

In [None]:
# Compare serum_creatinine between the two DEATH_EVENT groups.
plt.figure(figsize=(4, 8))
sns.boxplot(x='DEATH_EVENT', y='serum_creatinine', data=df)

serum_creatinine levels are higher for those who die from Heart Failure

In [None]:
# Compare age between the two DEATH_EVENT groups.
plt.figure(figsize=(12, 4))
sns.boxplot(x='DEATH_EVENT', y='age', data=df)

Age also seems to be higher for those who die from Heart Failure

In [None]:
# Distinct values present in the `ejection_fraction` column.
df['ejection_fraction'].unique()

In [None]:
# Compare ejection_fraction between the two DEATH_EVENT groups.
plt.figure(figsize=(12, 4))
sns.boxplot(x='DEATH_EVENT', y='ejection_fraction', data=df)

When less blood leaves the heart with each contraction, the ejection fraction is lower, which is associated with heart failure.

In [None]:
# List the columns currently in df.
df.columns

In [None]:
# Compare platelets between the two DEATH_EVENT groups.
plt.figure(figsize=(12, 4))
sns.boxplot(x='DEATH_EVENT', y='platelets', data=df)

In [None]:
# Compare creatinine_phosphokinase between the two DEATH_EVENT groups.
plt.figure(figsize=(4, 6))
sns.boxplot(x='DEATH_EVENT', y='creatinine_phosphokinase', data=df)

Not much variation

In [None]:
# Preview the first rows of df again before looking at pairwise correlations.
df.head()

In [None]:
# Annotated correlation heatmap; the DEATH_EVENT row is dropped, its column is kept.
plt.figure(figsize=(12, 4))
sns.heatmap(df.corr().drop(index='DEATH_EVENT'), annot=True, cmap='coolwarm')

No features are strongly correlated with each other, so we can move on to data cleaning and preprocessing.

**Data PreProcessing**

In [None]:
# Count missing values in each column.
df.isnull().sum()

There are no Null values, we are good to go.

**Categorical Features**

There seem to be quite a few categorical features that need to be transformed so that sklearn will be able to understand them.

In [None]:
# Preview the first rows to inspect how the categorical columns are stored.
df.head()

In [None]:
# List the columns that will be available to the model.
df.columns

**Model Creation and Evaluation**

Let's start by using a Random Forest classifier.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scaler instance that is fitted on the training split and reused for the test split.
scaler = MinMaxScaler()

In [None]:
# Feature matrix: every column except the label.
X = df.drop(columns=['DEATH_EVENT'])
# Label vector.
y = df['DEATH_EVENT']

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, then rescale that split.
X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# Apply the scaler fitted on the training split to the test split (no refit).
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 600-tree random forest. random_state is fixed so the fitted model, and the
# metrics reported below, are reproducible across Restart & Run All; 42 matches
# the seed used for the train/test split.
rfc = RandomForestClassifier(n_estimators=600, random_state=42)

In [None]:
# Train the forest on the scaled training features.
rfc.fit(X_train,y_train)

**Predictions and Evaluation**

Let's predict on the X_test features and evaluate the predictions against y_test.

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Predicted DEATH_EVENT label for every row of the test split.
predictions = rfc.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the held-out test split.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

The performance is very poor, so I have decided to remove the features that are only weakly correlated with death.

In [None]:
# Absolute correlation of every feature with DEATH_EVENT, weakest first.
df.corr()['DEATH_EVENT'].drop('DEATH_EVENT').abs().sort_values()

Sex and diabetes are hardly correlated with death, so I will remove these less-correlated features (together with smoking) and retrain the model.

In [None]:
# Reduced feature matrix: drop the label plus diabetes, sex and smoking.
X = df.drop(columns=['diabetes', 'sex', 'DEATH_EVENT', 'smoking'])
# Label vector (unchanged).
y = df['DEATH_EVENT']

In [None]:
# Re-split the reduced feature set: 33% held out, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Fresh 600-tree random forest for the reduced feature set. random_state is
# fixed so that this run's metrics are reproducible and any change in score
# comes from the feature change, not from random variation between forests.
rfc = RandomForestClassifier(n_estimators=600, random_state=42)

In [None]:
# Train the forest on the reduced training features (this split is not rescaled).
rfc.fit(X_train,y_train)

In [None]:
# Predicted DEATH_EVENT label for every row of the reduced test split.
predictions = rfc.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the reduced-feature model.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

We have got a decent score here. Thanks!