In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records into the working DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)

In [None]:
# Preview the first five rows of the freshly loaded data.
data.head(n=5)

# Data Exploration

### Age

In [None]:
# Materialise the current row index as a regular 'index' column
# (drop=False keeps the old index values instead of discarding them).
data = data.reset_index(drop=False)

In [None]:
# Number of patients per distinct age value, drawn as a line plot.
# size() counts the rows in each group directly, so this cell does not depend
# on a helper 'index' column having been added to the frame beforehand.
plt.figure(figsize=(12, 6))
data.groupby('age').size().plot()

### Anaemia

In [None]:
# Anaemia: decrease of red blood cells or hemoglobin.
# Count how many patients fall under each value of the flag.
data['anaemia'].value_counts()

### Creatinine Phosphokinase

In [None]:
# Level of the CPK enzyme in the blood (mcg/L).
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# kdeplot draws the same density curve that distplot(hist=False) produced.
plt.figure(figsize=(12, 6))
sns.kdeplot(x=data['creatinine_phosphokinase'])

### Diabetes

In [None]:
# Whether the patient has diabetes: count patients per flag value.
data['diabetes'].value_counts()

### Ejection Fraction

In [None]:
# Percentage of blood leaving the heart at each contraction.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# kdeplot draws the same density curve that distplot(hist=False) produced.
plt.figure(figsize=(12, 6))
sns.kdeplot(x=data['ejection_fraction'])

### High Blood Pressure

In [None]:
# Whether the patient has hypertension: count patients per flag value.
data['high_blood_pressure'].value_counts()

### Platelets

In [None]:
# Platelets in the blood (kiloplatelets/mL).
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# kdeplot draws the same density curve that distplot(hist=False) produced.
plt.figure(figsize=(12, 6))
sns.kdeplot(x=data['platelets'])

### Serum Creatinine

In [None]:
# Level of serum creatinine in the blood (mg/dL).
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# kdeplot draws the same density curve that distplot(hist=False) produced.
plt.figure(figsize=(12, 6))
sns.kdeplot(x=data['serum_creatinine'])

### Serum Sodium

In [None]:
# Level of serum sodium in the blood (mEq/L).
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# kdeplot draws the same density curve that distplot(hist=False) produced.
plt.figure(figsize=(12, 6))
sns.kdeplot(x=data['serum_sodium'])

### Sex

In [None]:
# Sex of the patient (woman or man): count patients per coded value.
data['sex'].value_counts()

### Smoking

In [None]:
# Whether the patient smokes or not: count patients per flag value.
data['smoking'].value_counts()

### Time

In [None]:
# Follow-up period.
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# kdeplot draws the same density curve that distplot(hist=False) produced.
plt.figure(figsize=(12, 6))
sns.kdeplot(x=data['time'])

In [None]:
# Whether the patient died during the follow-up period: count patients per outcome.
data['DEATH_EVENT'].value_counts()

# Feature Analysis

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# No missing values

### Sex x Death

In [None]:
# Bar plot of the mean DEATH_EVENT for each value of sex.
sns.catplot(data=data, x="sex", y="DEATH_EVENT", kind="bar")

In [None]:
# Sex has no visible impact on the death rate

### Smoking x Death

In [None]:
# Bar plot of the mean DEATH_EVENT for each value of smoking.
sns.catplot(data=data, x="smoking", y="DEATH_EVENT", kind="bar")

In [None]:
# Same as sex: smoking has no visible impact on the death rate

### Anaemia x Death

In [None]:
# Bar plot of the mean DEATH_EVENT for each value of anaemia.
sns.catplot(data=data, x="anaemia", y="DEATH_EVENT", kind="bar")

In [None]:
# Having anaemia has more impact on the death event

### Diabetes x Death

In [None]:
# Bar plot of the mean DEATH_EVENT for each value of diabetes.
sns.catplot(data=data, x="diabetes", y="DEATH_EVENT", kind="bar")

In [None]:
# Diabetes has no impact

### High blood pressure x Death

In [None]:
# Bar plot of the mean DEATH_EVENT for each value of high_blood_pressure.
sns.catplot(data=data, x="high_blood_pressure", y="DEATH_EVENT", kind="bar")

In [None]:
# Hypertension seems to have an impact

### Age x Death

In [None]:
# Age distribution drawn separately for each DEATH_EVENT value (one facet row per outcome).
# - FacetGrid's `size` argument was renamed to `height`; current seaborn rejects `size`.
# - Only the 'age' column is mapped: the second positional column used to land in
#   distplot's `bins` parameter, which was never the intent.
# - histplot(stat='density', kde=True) replaces the deprecated distplot.
# - No legend is added because there is no hue variable for it to describe.
grid = sns.FacetGrid(data, row='DEATH_EVENT', height=3, aspect=1.6)
grid.map(sns.histplot, 'age', stat='density', kde=True)

In [None]:
# Mean age of the patients within each DEATH_EVENT outcome.
data['age'].groupby(data['DEATH_EVENT']).mean()

### Time x Death

In [None]:
# Follow-up time distribution drawn separately for each DEATH_EVENT value (one facet row per outcome).
# - FacetGrid's `size` argument was renamed to `height`; current seaborn rejects `size`.
# - Only the 'time' column is mapped: the second positional column used to land in
#   distplot's `bins` parameter, which was never the intent.
# - histplot(stat='density', kde=True) replaces the deprecated distplot.
# - No legend is added because there is no hue variable for it to describe.
grid = sns.FacetGrid(data, row='DEATH_EVENT', height=3, aspect=1.6)
grid.map(sns.histplot, 'time', stat='density', kde=True)

In [None]:
# Less follow-up time: higher probability of death

# Data Prediction

In [None]:
# Check how the ages spread over five equal-width bins.
# histplot(stat='density', kde=True) replaces the deprecated sns.distplot while
# keeping a density-scaled histogram with a KDE curve on top.
sns.histplot(x=data['age'], bins=5, stat='density', kde=True)

In [None]:
# Cut age into five equal-width intervals, then show the mean DEATH_EVENT
# of each interval, ordered by interval.
data['ageBin'] = pd.cut(data['age'], bins=5)
(
    data[['ageBin', 'DEATH_EVENT']]
    .groupby('ageBin', as_index=False)
    .mean()
    .sort_values('ageBin', ascending=True)
)

In [None]:
# The death rate is above 50% for patients older than 70

In [None]:
# Recode raw age into ordinal bands (right edge inclusive):
#   <=50 -> 0, (50, 60] -> 1, (60, 70] -> 2, (70, 80] -> 3, (80, 90] -> 4, >90 -> 5
# The cast keeps the column's existing dtype.
# NOTE(review): 'age' is overwritten in place, so running this cell again on
# already-coded values maps every row to 0; reload the data before re-running.
data['age'] = pd.cut(
    data['age'],
    bins=[-np.inf, 50, 60, 70, 80, 90, np.inf],
    right=True,
    labels=False,
).astype(data['age'].dtype)

In [None]:
# Keep only the listed feature columns plus the DEATH_EVENT target.
data = data.loc[
    :,
    [
        'age',
        'anaemia',
        'creatinine_phosphokinase',
        'diabetes',
        'ejection_fraction',
        'high_blood_pressure',
        'platelets',
        'serum_creatinine',
        'serum_sodium',
        'sex',
        'smoking',
        'time',
        'DEATH_EVENT',
    ],
]

In [None]:
# Preview the first five rows of the reduced frame.
data.head(n=5)

# Creating the model

In [None]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrix: every column except the target. Target vector: DEATH_EVENT.
X = data.drop(columns=["DEATH_EVENT"])
Y = data.loc[:, "DEATH_EVENT"]

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Logistic Regression
#
# The feature columns are recorded in very different units (platelet counts,
# mg/dL concentrations, 0/1 flags). Fitted on the raw values with only 100
# iterations, the solver can stop before it converges. Standardising the
# features inside a pipeline and raising the iteration cap avoids that, and the
# pipeline applies the scaling learned on the training rows to the test rows.
log = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log.fit(X_train, Y_train)
Y_pred_log = log.predict(X_test)

# accuracy_score expects (y_true, y_pred).
acc_log = accuracy_score(Y_test, Y_pred_log)
acc_log

In [None]:
# Random Forest
#
# random_state is fixed so that the fitted forests, and therefore the reported
# accuracy, are the same on every run.
rf = RandomForestClassifier(random_state=0)

# Search the best params with 10-fold cross-validation on the training set.
grid = {'n_estimators': [100, 200], 'max_depth': [2, 5, 10]}

clf_rf = GridSearchCV(rf, grid, cv=10)
clf_rf.fit(X_train, Y_train)

# predict() uses the best estimator found by the search.
Y_pred_rf = clf_rf.predict(X_test)

# Get the accuracy score; accuracy_score expects (y_true, y_pred).
acc_rf = accuracy_score(Y_test, Y_pred_rf)
print(acc_rf)

# Conclusion

In [None]:
# Best model is the random forest