# Dataset Description

Columns:

* age: the age of the patients
* anaemia: whether patient has anaemia or not (decrease in red blood cells)
* creatinine_phosphokinase: Level of CPK enzyme in blood (mcg/L)
* diabetes: whether patient has diabetes or not
* ejection_fraction: How much blood the left ventricle pumps out with each contraction
* high_blood_pressure: whether patient has high blood pressure or not
* platelets: No. of platelets within blood (responsible for blood clotting)
* serum_creatinine: Level of creatinine in blood (estimates how well kidneys are filtering)
* serum_sodium: Level of sodium in blood
* DEATH_EVENT: whether patient died or not

In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve,roc_curve, roc_auc_score



In [None]:
# Load the heart-failure clinical records into a DataFrame
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

In [None]:
# Preview the first five rows
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Print column names, non-null counts and dtypes
data.info()

# Data Visualization

In [None]:
# One histogram per numeric column, drawn on a 15x15-inch figure
data.hist(figsize=(15,15))

In [None]:
# Share of each DEATH_EVENT value in the dataset
death_counts = data['DEATH_EVENT'].value_counts()
death_counts.plot.pie(
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    shadow=True,
    figsize=(10, 10),
)
plt.title('the % of deaths')

In [None]:
# Share of each sex value in the dataset
sex_counts = data['sex'].value_counts()
sex_counts.plot.pie(
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    shadow=True,
    figsize=(10, 10),
)

In [None]:
# Number of survivals and deaths for each sex
plt.figure(figsize=(10,10))
ax = sns.countplot(x='sex',hue = 'DEATH_EVENT',data=data)
# seaborn orders the numeric hue levels 0, 1, so the first legend entry is
# DEATH_EVENT == 0. Assuming the usual coding (0 = survived, 1 = died) the
# labels must be given as 'No Death' first, then 'Death'.
# Reuse seaborn's own legend handles so the labels attach to the hue colours.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Share of patients with and without anaemia
anaemia_counts = data['anaemia'].value_counts()
anaemia_counts.plot.pie(
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    shadow=True,
    figsize=(10, 10),
)

In [None]:
# Number of survivals and deaths for each anaemia value
ax = sns.countplot(x='anaemia',hue='DEATH_EVENT',data=data)
# Hue levels are ordered 0, 1; assuming 0 = survived and 1 = died, the labels
# must be 'No Death' first. Reuse seaborn's handles so colours match labels.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Share of smokers and non-smokers in the dataset
smoking_counts = data['smoking'].value_counts()
smoking_counts.plot.pie(
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    shadow=True,
    figsize=(10, 10),
)

In [None]:
# Number of survivals and deaths for each smoking value
ax = sns.countplot(x='smoking',hue='DEATH_EVENT',data=data)
# Hue levels are ordered 0, 1; assuming 0 = survived and 1 = died, the labels
# must be 'No Death' first. Reuse seaborn's handles so colours match labels.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Share of patients with and without high blood pressure
hbp_counts = data['high_blood_pressure'].value_counts()
hbp_counts.plot.pie(
    explode=[0.05, 0.05],
    autopct='%1.1f%%',
    shadow=True,
    figsize=(10, 10),
)

In [None]:
# Number of survivals and deaths for each high_blood_pressure value
ax = sns.countplot(x='high_blood_pressure',hue='DEATH_EVENT',data=data)
# Hue levels are ordered 0, 1; assuming 0 = survived and 1 = died, the labels
# must be 'No Death' first. Reuse seaborn's handles so colours match labels.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Mean of the smoking flag (the share of smokers, if it is coded 0/1) for each
# sex, split by death outcome. barplot shows the mean of y, not a count.
ax = sns.barplot(x='sex',y='smoking',hue='DEATH_EVENT',data=data)
# Hue levels are ordered 0, 1; assuming 0 = survived and 1 = died, the labels
# must be 'No Death' first. Taking the handles from the axes keeps the legend
# on the coloured bars rather than on the error-bar lines.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Mean of the anaemia flag (the share of anaemic patients, if it is coded 0/1)
# for each sex, split by death outcome. barplot shows the mean of y, not a count.
ax = sns.barplot(x='sex',y='anaemia',hue='DEATH_EVENT',data=data)
# Hue levels are ordered 0, 1; assuming 0 = survived and 1 = died, the labels
# must be 'No Death' first. Taking the handles from the axes keeps the legend
# on the coloured bars rather than on the error-bar lines.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Mean of the high_blood_pressure flag (the share of hypertensive patients, if
# it is coded 0/1) for each sex, split by death outcome.
# barplot shows the mean of y, not a count.
ax = sns.barplot(x='sex',y='high_blood_pressure',hue='DEATH_EVENT',data=data)
# Hue levels are ordered 0, 1; assuming 0 = survived and 1 = died, the labels
# must be 'No Death' first. Taking the handles from the axes keeps the legend
# on the coloured bars rather than on the error-bar lines.
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles, ['No Death','Death'])

In [None]:
# Heatmap of the pairwise correlations between all columns
corr_matrix = data.corr()
sns.heatmap(corr_matrix)

In [None]:
# Correlation of every column with DEATH_EVENT; the final entry is left out
# (DEATH_EVENT itself, when it is the last column)
target_corr = data.corr()['DEATH_EVENT']
target_corr.iloc[:-1]

In [None]:
#We can see that most of the columns have little to no correlation with DEATH_EVENT.
#We will drop such columns, as they are not very useful to us for prediction.

In [None]:
# Columns judged too weakly correlated with DEATH_EVENT to keep
low_corr_columns = [
    'diabetes',
    'sex',
    'smoking',
    'high_blood_pressure',
    'creatinine_phosphokinase',
    'anaemia',
    'platelets',
]
data = data.drop(columns=low_corr_columns)

In [None]:
# Box plot of ejection_fraction to look for outliers
sns.boxplot(x=data['ejection_fraction'])

In [None]:
#We can see that there are 2 outliers, at 70 and 80.
#Since there are only 2, we will go ahead and remove them.

In [None]:
# Keep only the rows whose ejection_fraction is strictly below 70
below_threshold = data['ejection_fraction'].lt(70)
data = data[below_threshold]

In [None]:
# Preview the first two rows after removing the outliers
data.head(n=2)

In [None]:
#Standardize all columns except target column (DEATH_EVENT)
ssc = StandardScaler()
# Every column but the last one (the target) is a feature
feature_cols = data.columns[:-1]
# `data` comes from a boolean-mask selection, so take an explicit copy before
# assigning into it; this avoids pandas' SettingWithCopyWarning.
data = data.copy()
# Replace the feature columns wholesale instead of writing through iloc:
# the scaled values are floats, and an in-place iloc write into an
# integer-typed column triggers dtype warnings in recent pandas versions.
# NOTE: the scaler is fitted on the full dataset, i.e. before the train/test
# split, so test-set statistics leak into the scaling.
data[feature_cols] = ssc.fit_transform(data[feature_cols])

In [None]:
# Preview the first five rows after standardization
data.head(n=5)

In [None]:
#split x and y sets
# Features: every column except the last one, as a 2-D NumPy array
x = data.iloc[:,:-1].values
# Target as a 1-D array. A one-column DataFrame (shape (n, 1)) makes the
# scikit-learn estimators emit a DataConversionWarning on fit.
y = data['DEATH_EVENT'].values

In [None]:
# Shapes of the feature matrix and the target, one per line
print(x.shape, y.shape, sep='\n')

In [None]:
# 70/30 train/test split with a fixed seed
split_parts = train_test_split(x, y, train_size=0.7, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Shapes of the four split arrays, one per line
print(
    x_train.shape,
    x_test.shape,
    y_train.shape,
    y_test.shape,
    sep='\n',
)

# Logistic Regression

In [None]:
# Fit a logistic regression with default settings (fit returns the estimator),
# then predict the held-out test set
lgr = LogisticRegression().fit(x_train, y_train)
log_pred = lgr.predict(x_test)

In [None]:
# Test-set confusion matrix for logistic regression; normalize='true' scales
# each row (true class) to sum to 1
plot_confusion_matrix(lgr,x_test, y_test, normalize = 'true')

In [None]:
# Per-class precision, recall and F1 for logistic regression on the test set
log_report = classification_report(y_test, log_pred)
print(log_report)

# Bernoulli Naive Bayes

In [None]:
# Fit a Bernoulli naive Bayes model with default settings and predict the test set.
# NOTE(review): BernoulliNB thresholds each feature at binarize=0.0 by default,
# so the standardized inputs are reduced to above/below-mean flags -- confirm
# this is intended.
bnb = BernoulliNB().fit(x_train, y_train)
bnb_pred = bnb.predict(x_test)

In [None]:
# Per-class precision, recall and F1 for Bernoulli naive Bayes on the test set
bnb_report = classification_report(y_test, bnb_pred)
print(bnb_report)

In [None]:
# Test-set confusion matrix for Bernoulli naive Bayes; normalize='true' scales
# each row (true class) to sum to 1
plot_confusion_matrix(bnb,x_test,y_test, normalize = 'true')

# Decision Tree Classifier

In [None]:
# Fit a decision tree and predict the test set.
# A fixed random_state makes repeated runs produce the same tree and metrics;
# without it the tree can differ from run to run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)

In [None]:
# Per-class precision, recall and F1 for the decision tree on the test set
dtc_report = classification_report(y_test, dtc_pred)
print(dtc_report)

In [None]:
# Test-set confusion matrix for the decision tree; normalize='true' scales
# each row (true class) to sum to 1
plot_confusion_matrix(dtc, x_test, y_test, normalize = 'true')

# Random Forest Classifier

In [None]:
# Fit a random forest and predict the test set.
# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the reported metrics, reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train,y_train)
rfc_pred = rfc.predict(x_test)

In [None]:
# Per-class precision, recall and F1 for the random forest on the test set
rfc_report = classification_report(y_test, rfc_pred)
print(rfc_report)

In [None]:
# Test-set confusion matrix for the random forest; normalize='true' scales
# each row (true class) to sum to 1
plot_confusion_matrix(rfc, x_test, y_test, normalize = 'true')

# ROC AUC CURVE

In [None]:
# One shared axes so the four ROC curves can be compared on a single figure
fig, ax = plt.subplots(figsize = (15,10))

# Draw the test-set ROC curve of each fitted model onto the shared axes
plot_roc_curve(dtc,x_test,y_test, ax=ax)
plot_roc_curve(rfc,x_test,y_test, ax=ax)
plot_roc_curve(bnb,x_test,y_test, ax=ax)
plot_roc_curve(lgr,x_test,y_test, ax=ax)

In [None]:
#Judging by the Area Under the Curve (AUC), we can tell how well a classifier is predicting.
#Values above 0.5 indicate that the model is predicting fairly well.
#0.5 means the model is randomly guessing, while values below 0.5 show that the model is doing a poor job at prediction.
#Looking at this graph, we can say that Random Forest is predicting very well, with the highest AUC score of 0.9.
#Decision Tree has the lowest AUC score, though it is not too bad either at 0.79.