In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Apply the FiveThirtyEight matplotlib style to every figure drawn below.
plt.style.use("fivethirtyeight")

# Load the heart-failure clinical records from the Kaggle input directory.
HF_data = pd.read_csv(
    "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

In [None]:
# Preview the first five rows of the loaded records.
HF_data.head(5)

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
HF_data.info()
# no null values in the dataset

In [None]:
# Store age as an integer column; any fractional part would be dropped by the cast.
HF_data["age"] = HF_data["age"].astype(int)

In [None]:
# Count the distinct values held by each column.
HF_data.nunique()

# Seeing the impact of each column on DEATH_EVENT column

In [None]:
# Two panels instead of one shared axes, so the density curve and the bars no longer
# overlap: left shows how the 0/1 anaemia flag is distributed, right shows the mean
# DEATH_EVENT for each anaemia group.
plt.figure(figsize = (16,6))

plt.subplot(1,2,1)
# `fill` replaces the deprecated `shade` argument; the Axes is not wrapped in print()
# because that only echoed its repr.
sns.kdeplot(x = HF_data["anaemia"], fill = True)
# a decent amount of people does not have a lower quantity of blood cells.
plt.title("Distribution of anaemia")

plt.subplot(1,2,2)
sns.barplot(x = HF_data["anaemia"],y = HF_data["DEATH_EVENT"],saturation  = 0.1)
# as expected, the share of deaths is higher among patients who have anaemia.
plt.title("Anaemia VS Death_event")

In [None]:
# Box plot of patient age. The call is not wrapped in print(): printing the returned
# Axes only emitted its repr next to the figure.
sns.boxplot(x = HF_data["age"])
# majority of ages in the data are in between 50-70
plt.title("Range of age in the dataset")

In [None]:
# Mean age for each DEATH_EVENT outcome.
age_ax = sns.barplot(data = HF_data, x = 'DEATH_EVENT', y = 'age')
# seems like the average age does not play any vital role in Heart Failure
age_ax.set_title("Age VS Death_event")

In [None]:
# Density of creatinine_phosphokinase; `fill` replaces the deprecated `shade` argument.
sns.kdeplot(x = HF_data["creatinine_phosphokinase"],fill = True)
# most of the creatinine_phosphokinase is below 2000

In [None]:
# One point per patient: creatinine_phosphokinase level grouped by DEATH_EVENT outcome.
cpk_ax = sns.stripplot(data = HF_data, x = "DEATH_EVENT", y = "creatinine_phosphokinase")
# higher creatinine_phosphokinase means a high level of stress. but it does not leave any major impact on heart failure
cpk_ax.set_title("Creatinine Phosphokinase VS Death_event")

In [None]:
# Select patients who had diabetes AND died. Each comparison is parenthesised because
# `&` binds tighter than `==`; the unparenthesised form `a & b == 1` is evaluated as
# `(a & b) == 1`, which only gives the intended rows while both columns are strictly 0/1.
diabetic_deaths = (HF_data["diabetes"] == 1) & (HF_data["DEATH_EVENT"] == 1)
HF_data.loc[diabetic_deaths, "DEATH_EVENT"].value_counts()
# out of 299 entries, 40 died who had diabetes.

In [None]:
# Swarm of ejection_fraction values per DEATH_EVENT outcome, coloured by outcome.
ef_ax = sns.swarmplot(
    data = HF_data,
    x = "DEATH_EVENT",
    y = "ejection_fraction",
    hue = "DEATH_EVENT",
    s = 4,
)
# when the ejection fraction falls below 30 death event increases
ef_ax.set_title("Ejection fraction VS Death_event")

In [None]:
# Rows for patients who had high blood pressure AND died. The comparisons are
# parenthesised because `&` binds tighter than `==`, so the intent does not depend on
# both columns being strictly 0/1.
hbp_deaths = (HF_data["high_blood_pressure"] == 1) & (HF_data["DEATH_EVENT"] == 1)
mask1 = HF_data[hbp_deaths]
# Number of deaths among the selected rows.
mask1["DEATH_EVENT"].sum()

In [None]:
# Left panel: density of platelet counts. Right panel: platelet count per DEATH_EVENT outcome.
platelet_fig = plt.figure(figsize = (16,6))

platelet_dist_ax = platelet_fig.add_subplot(1, 2, 1)
sns.kdeplot(x = HF_data["platelets"], ax = platelet_dist_ax)

platelet_death_ax = platelet_fig.add_subplot(1, 2, 2)
sns.scatterplot(
    data = HF_data,
    x = "DEATH_EVENT",
    y = "platelets",
    hue = "DEATH_EVENT",
    ax = platelet_death_ax,
)
# majority of entries have normal number of platelets (between 150,000 - 400,000) and
# entries with platelets < 200,000 have greater risk of heart failure.
platelet_death_ax.set_title("Platelets VS Death_event")

In [None]:
# Left panel: density of serum creatinine. Right panel: serum creatinine per DEATH_EVENT outcome.
creatinine_fig = plt.figure(figsize = (16,6))

creatinine_dist_ax = creatinine_fig.add_subplot(1, 2, 1)
sns.kdeplot(x = HF_data["serum_creatinine"], ax = creatinine_dist_ax)
# average values of serum creatinine is <= 1 for normal behaviour of body

creatinine_death_ax = creatinine_fig.add_subplot(1, 2, 2)
sns.scatterplot(
    data = HF_data,
    x = "DEATH_EVENT",
    y = "serum_creatinine",
    hue = "DEATH_EVENT",
    ax = creatinine_death_ax,
)
# increased values of serum creatinine leads to heart failure
creatinine_death_ax.set_title("Serum Creatinine VS Death_event")

In [None]:
# Left panel: density of serum sodium. Right panel: mean serum sodium per DEATH_EVENT outcome.
sodium_fig = plt.figure(figsize = (16,6))

sodium_dist_ax = sodium_fig.add_subplot(1, 2, 1)
sns.kdeplot(x = HF_data["serum_sodium"], ax = sodium_dist_ax)

sodium_death_ax = sodium_fig.add_subplot(1, 2, 2)
sns.barplot(data = HF_data, x = "DEATH_EVENT", y = "serum_sodium", ax = sodium_death_ax)
# Less quantity of serum sodium leads to heart failure but it does not affect our model much,
# since most of the deaths are from normal serum sodium quantity
sodium_death_ax.set_title("Serum Sodium VS Death_event")

In [None]:
print(HF_data["sex"].value_counts())
# 1 - Male , 0 - Female

# Work on a copy so the numeric HF_data used for modelling keeps its 0/1 encoding.
copy = HF_data.copy()

# Replace each 0/1 flag with a readable label. Series.map builds a new string column,
# which avoids writing strings into an integer column through .loc.
# Any value other than 0 or 1 would become NaN.
copy['sex'] = copy['sex'].map({1: "Male", 0: "Female"})
copy['smoking'] = copy['smoking'].map({1: "Yes", 0: "No"})

# DEATH_EVENT is labelled from its own values. Labelling it through the smoking masks
# would make the column mirror the smoking status instead of the outcome.
copy['DEATH_EVENT'] = copy['DEATH_EVENT'].map({1: "Yes", 0: "No"})

copy.head()

In [None]:
# Tally every (sex, DEATH_EVENT) combination, then flatten the tally into a
# three-column table with one row per combination.
sex_death_tally = copy[['sex','DEATH_EVENT']].value_counts()
sex_count = sex_death_tally.reset_index()
sex_count.columns  = ['sex','Death_event','count']
sex_count

In [None]:
# Tally every (smoking, DEATH_EVENT) combination as a flat three-column table.
smoker_count = (
    copy[['smoking','DEATH_EVENT']]
    .value_counts()
    .reset_index()
)
smoker_count.columns = ['smoking','Death_event','count']
smoker_count

In [None]:
# Tally every (smoking, sex, DEATH_EVENT) combination as a flat four-column table.
smoke_sex_tally = copy[['smoking','sex','DEATH_EVENT']].value_counts()
smoke_sex_count = smoke_sex_tally.reset_index()
smoke_sex_count.columns = ['smoking','sex','Death_event','count']
smoke_sex_count
# Among males, smokers and non-smokers do not differ much in death count, but
# among females the difference is large. It might be that the majority of females do not
# smoke, in which case this result will not be of much use.

In [None]:
# Three panels on a 2x2 grid built from the count tables above.
plt.figure(figsize = (16,10))

# Top-left: counts per Death_event value, split by sex.
plt.subplot(2,2,1)
sns.barplot(data = sex_count, x = 'Death_event', y = 'count', hue = 'sex')

# Top-right: counts per Death_event value, split by smoking status.
plt.subplot(2,2,2)
sns.barplot(data = smoker_count, x = 'Death_event', y = 'count', hue = 'smoking')

# Bottom-left: counts per Death_event value from the smoking/sex table.
# NOTE(review): no hue is passed here, so rows sharing a Death_event value are
# aggregated into a single bar -- confirm that is intended.
plt.subplot(2,2,3)
sns.barplot(data = smoke_sex_count, x = 'Death_event', y = 'count')

# The time column cannot be used as a feature. It contains the time when the patient died, which is of no use when predicting heart failure.

# **Machine learning part(Feature scaling,Training and Fitting the model)**

In [None]:
from sklearn.model_selection import train_test_split

# Predictors used by every model below, and the binary target.
features = HF_data[['anaemia','ejection_fraction','platelets','serum_creatinine']]
result = HF_data['DEATH_EVENT']
print(features.head(3))

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    result,
    train_size = 0.7,
    random_state = 1,
)

In [None]:
# Renumber the training rows from 0 and discard the old row labels in a single step.
X_train = X_train.reset_index(drop = True)
X_train.head()

In [None]:
# Renumber the test rows from 0 and discard the old row labels in a single step.
X_test = X_test.reset_index(drop = True)
X_test.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor except the first column (the 0/1 anaemia flag in the
# feature list selected earlier), which is left as-is.
scaled_cols = list(X_train.columns[1:])

sc = StandardScaler()
X_train_fs = X_train.copy()
X_test_fs = X_test.copy()

# Learn the mean and standard deviation from the training split only ...
X_train_fs[scaled_cols] = sc.fit_transform(X_train[scaled_cols])
# ... and apply those same statistics to the test split. Re-fitting on the test data
# would scale the two splits differently and leak test-set information.
X_test_fs[scaled_cols] = sc.transform(X_test[scaled_cols])

# **Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Fit a random forest on the unscaled training features and predict the test split.
Classifier = RandomForestClassifier(random_state = 0,max_leaf_nodes=100,max_depth=5,n_estimators = 100)
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_test)

# Evaluate the predictions. The F1 value is stored under `f1` so that the imported
# f1_score function is not overwritten by its own result.
cm = confusion_matrix(y_test,y_pred)
score = accuracy_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
print(cm)
print("Accuracy Score:{}".format(score))
print("f1 Score:{}".format(f1))

# Side-by-side class counts: true test labels (left) and predicted labels (right).
plt.figure(figsize = (10,5))
plt.subplot(121)
sns.countplot(x = y_test)
plt.title("Countplot of y_test")
plt.subplot(122)
sns.countplot(x = y_pred)
plt.title("Countplot of y_pred")
plt.xlabel("DEATH_EVENT")

# **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Fit a single decision tree on the unscaled training features and predict the test split.
Classifier_1 = DecisionTreeClassifier(random_state = 1,max_leaf_nodes = 100,max_depth=5)
Classifier_1.fit(X_train,y_train)
y_pred_1 = Classifier_1.predict(X_test)

# Evaluate the predictions. The F1 value is stored under `f1` so that the imported
# f1_score function is not overwritten by its own result.
cm = confusion_matrix(y_test,y_pred_1)
score = accuracy_score(y_test,y_pred_1)
f1 = f1_score(y_test,y_pred_1)
print(cm)
print("Accuracy Score:{}".format(score))
print("f1 Score:{}".format(f1))

# Side-by-side class counts: true test labels (left) and predicted labels (right).
plt.figure(figsize = (10,5))
plt.subplot(121)
sns.countplot(x = y_test)
plt.title("Countplot of y_test")
plt.subplot(122)
sns.countplot(x = y_pred_1)
plt.title("Countplot of y_pred_1")
plt.xlabel("DEATH_EVENT")



# Decision Tree Classifier performs poorly in comparison to Random Forest Classifier

# **K - Nearest Neighbor**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# K-NN ranks neighbours by distance, so it is trained on the standardised features.
# On the raw features the platelet counts (hundreds of thousands) would dominate the
# distance and the other predictors would barely influence the result.
Classifier_2 = KNeighborsClassifier(weights = 'distance')
Classifier_2.fit(X_train_fs,y_train)
y_pred_2 = Classifier_2.predict(X_test_fs)

# Evaluate the predictions. The F1 value is stored under `f1` so that the imported
# f1_score function is not overwritten by its own result.
cm = confusion_matrix(y_test,y_pred_2)
score = accuracy_score(y_test,y_pred_2)
f1 = f1_score(y_test,y_pred_2)
print(cm)
print("Accuracy Score: {}".format(score))
print("f1 Score:{}".format(f1))

# Side-by-side class counts: true test labels (left) and predicted labels (right).
plt.figure(figsize = (10,5))
plt.subplot(121)
sns.countplot(x = y_test)
plt.title("Countplot of y_test")
plt.subplot(122)
sns.countplot(x = y_pred_2)
plt.title("Countplot of y_pred_2")
plt.xlabel("DEATH_EVENT")

# Since the dataset is small and most DEATH_EVENT values are '0', setting n_neighbors > 10 will result in predicting
# DEATH_EVENT as '0' most of the time.

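# The K-NN count plot matches the test set almost perfectly even though the accuracy score is low. I don't know what to say in this case; a likely explanation is that a count plot only compares how many samples fall in each class, not whether each individual prediction is correct, so matching class totals do not imply high accuracy.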

# **Support Vector Machine**

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Fit a sigmoid-kernel SVM on the standardised features and predict the test split.
Classifier_3 = SVC(kernel='sigmoid')
Classifier_3.fit(X_train_fs,y_train)
y_pred_3 = Classifier_3.predict(X_test_fs)

# Evaluate the predictions. The F1 value is stored under `f1` so that the imported
# f1_score function is not overwritten by its own result.
cm = confusion_matrix(y_test,y_pred_3)
score = accuracy_score(y_test,y_pred_3)
f1 = f1_score(y_test,y_pred_3)
print(cm)
print("Accuracy Score: {}".format(score))
print("f1 Score:{}".format(f1))

# Side-by-side class counts: true test labels (left) and predicted labels (right).
plt.figure(figsize = (10,5))
plt.subplot(121)
sns.countplot(x = y_test)
plt.title("Countplot of y_test")
plt.subplot(122)
sns.countplot(x = y_pred_3)
plt.title("Countplot of y_pred_3")
plt.xlabel("DEATH_EVENT")

# **Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Fit logistic regression on the standardised features and predict the test split.
Classifier_4 = LogisticRegression(random_state = 1)
Classifier_4.fit(X_train_fs,y_train)
y_pred_4 = Classifier_4.predict(X_test_fs)

# Evaluate the predictions. The F1 value is stored under `f1` so that the imported
# f1_score function is not overwritten by its own result.
cm = confusion_matrix(y_test,y_pred_4)
score = accuracy_score(y_test,y_pred_4)
f1 = f1_score(y_test,y_pred_4)
print(cm)
print("Accuracy Score:{}".format(score))
print("f1 Score:{}".format(f1))

# Side-by-side class counts: true test labels (left) and predicted labels (right).
plt.figure(figsize = (10,5))
plt.subplot(121)
sns.countplot(x = y_test)
plt.title("Countplot of y_test")
plt.subplot(122)
sns.countplot(x = y_pred_4)
plt.title("Countplot of y_pred_4")
plt.xlabel("DEATH_EVENT")

# **Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Fit Gaussian naive Bayes on the standardised features and predict the test split.
Classifier_5 = GaussianNB()
Classifier_5.fit(X_train_fs,y_train)
y_pred_5 = Classifier_5.predict(X_test_fs)

# Evaluate the predictions. The F1 value is stored under `f1` so that the imported
# f1_score function is not overwritten by its own result.
cm = confusion_matrix(y_test,y_pred_5)
score = accuracy_score(y_test,y_pred_5)
f1 = f1_score(y_test,y_pred_5)
print(cm)
print("Accuracy Score:{}".format(score))
print("f1 Score:{}".format(f1))

# Side-by-side class counts: true test labels (left) and predicted labels (right).
plt.figure(figsize = (10,5))
plt.subplot(121)
sns.countplot(x = y_test)
plt.title("Countplot of y_test")
plt.subplot(122)
sns.countplot(x = y_pred_5)
plt.title("Countplot of y_pred_5")
plt.xlabel("DEATH_EVENT")