   # Heart failure prediction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
pip install seaborn==0.11.0

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [None]:
# Load the heart-failure clinical records from the Kaggle input directory.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

A brief about the data
* Age: The age of the patient
* Anaemia: The presence of Anaemia. 1 if present, 0 if absent
* creatinine_phosphokinase: The level of creatinine phosphokinase of the patient
* ejection_fraction: Measurement of the heart's ejection fraction
* High_blood_pressure: The presence of high blood pressure (hypertension). 1 if present, 0 if absent
* platelets: Platelet count of the patient
* serum_creatinine: level of creatinine in the blood
* serum_sodium: level of sodium in the blood
* sex: Male or Female
* smoking: If the patient is a smoker. 1 if yes, 0 if no.
* Death event: The target variable, 1 if it resulted in death, 0 if there was no death.

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the numeric columns.
df.describe()

In [None]:
# Total number of missing cells across the whole frame (per-column counts, then summed).
df.isna().sum().sum()

No null values in this dataset, which is good.

In [None]:
# Class balance of the target variable.
df['DEATH_EVENT'].value_counts()

For the sake of visualizations, we will convert the 0 and 1 values to 'NO' and 'YES' respectively. Following other notebooks, I've taken 0 to mean Female and 1 to mean Male in the `sex` column.

In [None]:

# Label the sex codes for plotting: 1 -> 'Male', 0 -> 'Female'.
# Series.replace swaps both codes in one pass and returns a new column, rather than
# writing strings into the existing numeric column through .loc (an incompatible-dtype
# assignment that newer pandas versions deprecate). Re-running the cell is a no-op.
df['sex'] = df['sex'].replace({1: 'Male', 0: 'Female'})



In [None]:
def change_cat(coln, data=None):
    """Relabel a 0/1 indicator column as 'NO'/'YES' for plotting.

    Parameters
    ----------
    coln : str
        Name of the column to relabel.
    data : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df`` so existing
        ``change_cat('col')`` calls keep working.

    The column is replaced in place on the frame. Values other than 1 and 0
    are left untouched.
    """
    target = df if data is None else data
    # replace() builds a new column in one pass, which avoids assigning strings
    # into a numeric column through .loc (deprecated incompatible-dtype setitem).
    target[coln] = target[coln].replace({1: 'YES', 0: 'NO'})

def reverse_cat(coln):
    """Map a 'YES'/'NO' column of the notebook-level ``df`` back to 1/0.

    This undoes ``change_cat`` so the column can be used as integer values
    for prediction. Values other than 'YES' and 'NO' are left untouched.
    """
    for label, code in (('YES', 1), ('NO', 0)):
        df.loc[df[coln] == label, coln] = code

In [None]:
# Relabel every binary indicator (and the target) as 'YES'/'NO' for the plots below.
for binary_col in ['high_blood_pressure', 'DEATH_EVENT', 'anaemia', 'smoking', 'diabetes']:
    change_cat(binary_col)

In [None]:
# Number of patients per outcome.
sns.countplot(data=df, x='DEATH_EVENT')

In [None]:
# Age distribution as a plain count histogram. histplot replaces distplot, which is
# deprecated as of seaborn 0.11, and matches the histplot calls used elsewhere here.
sns.histplot(data=df, x='age', bins=40, kde=False)

In [None]:
# Anaemia status, split by outcome.
sns.countplot(data=df, x='anaemia', hue='DEATH_EVENT')

In [None]:
# Diabetes status, split by outcome.
sns.countplot(data=df, x='diabetes', hue='DEATH_EVENT')

In [None]:
# Smoking status, split by outcome.
sns.countplot(data=df, x='smoking', hue='DEATH_EVENT')

In [None]:
# Sex, split by outcome.
sns.countplot(data=df, x='sex', hue='DEATH_EVENT')

In [None]:
#Histograms of the continuous features, split by death event
numeric_features = ['creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# One panel per feature: six features fill a 2x3 grid exactly.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, col in zip(axes.flat, numeric_features):
    sns.histplot(data=df, x=col, hue='DEATH_EVENT', ax=ax)
# The layout only needs to be computed once, after every panel has been drawn.
fig.tight_layout()

In [None]:
#Histograms of the continuous features, split by high blood pressure
numeric_features = ['creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# One panel per feature: six features fill a 2x3 grid exactly.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, col in zip(axes.flat, numeric_features):
    sns.histplot(data=df, x=col, hue='high_blood_pressure', ax=ax)
# The layout only needs to be computed once, after every panel has been drawn.
fig.tight_layout()

In [None]:
#Histograms of the continuous features, split by smoking
numeric_features = ['creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# One panel per feature: six features fill a 2x3 grid exactly.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, col in zip(axes.flat, numeric_features):
    sns.histplot(data=df, x=col, hue='smoking', ax=ax)
# The layout only needs to be computed once, after every panel has been drawn.
fig.tight_layout()

In [None]:
#Histograms of the continuous features, split by anaemia
numeric_features = ['creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# One panel per feature: six features fill a 2x3 grid exactly.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, col in zip(axes.flat, numeric_features):
    sns.histplot(data=df, x=col, hue='anaemia', ax=ax)
# The layout only needs to be computed once, after every panel has been drawn.
fig.tight_layout()

In [None]:
#Box plots of the continuous features, to show their spread and outliers
numeric_features = ['creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# One panel per feature: six features fill a 2x3 grid exactly.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
for ax, col in zip(axes.flat, numeric_features):
    sns.boxplot(x=df[col], ax=ax)
# The layout only needs to be computed once, after every panel has been drawn.
fig.tight_layout()

In [None]:
#Correlation heatmap
# Several columns hold text labels at this point ('YES'/'NO', 'Male'/'Female').
# Older pandas silently dropped them from corr(); pandas >= 2.0 raises instead,
# so select the numeric columns explicitly to get the same matrix on every version.
c = df.select_dtypes(include='number').corr()
sns.heatmap(c, annot=True)

In [None]:
#Back to the original 1/0 convention
for binary_col in ['high_blood_pressure', 'DEATH_EVENT', 'anaemia', 'smoking', 'diabetes']:
    reverse_cat(binary_col)

As seen from the histograms, two features, creatinine phosphokinase and serum creatinine, have right-skewed distributions; applying a log transformation brings them closer to normality.

In [None]:
#Log transformation of creatinine phosphokinase and serum creatinine
log_sources = {
    'log_creatinine_phosphokinase': 'creatinine_phosphokinase',
    'log_serum_creatinine': 'serum_creatinine',
}
for new_col, source_col in log_sources.items():
    df[new_col] = np.log(df[source_col])

In [None]:
# Feature matrix (continuous features, with the two log-transformed columns) and integer target
feature_columns = [
    'ejection_fraction',
    'platelets',
    'serum_sodium',
    'time',
    'log_creatinine_phosphokinase',
    'log_serum_creatinine',
]
X = df[feature_columns]
y = df['DEATH_EVENT'].astype('int')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Logistic regression: fit on the training split, evaluate on the held-out split.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = lr.predict(X_test)
LR_acc = accuracy_score(y_test, predictions)

# Classification report first, then the confusion matrix and overall accuracy
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions), '\n')
print('Accuracy:', LR_acc)


In [None]:
# Decision tree: grid-search the split criterion (seed fixed through the grid),
# then evaluate the search's predictions on the held-out split.
dt = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'random_state': [0],
}
dt1 = GridSearchCV(dt, param_grid=params).fit(X_train, y_train)
dtpredictions = dt1.predict(X_test)
DT_acc = accuracy_score(y_test, dtpredictions)

# Confusion matrix, classification report, then overall accuracy
print(confusion_matrix(y_test, dtpredictions))
print(classification_report(y_test, dtpredictions), '\n')
print('Accuracy:', DT_acc)

In [None]:
# Gaussian Naive Bayes: fit on the training split, evaluate on the held-out split.
NB = GaussianNB().fit(X_train, y_train)
NBpredictions = NB.predict(X_test)
NB_acc = accuracy_score(y_test, NBpredictions)

# Confusion matrix, classification report, then overall accuracy
print(confusion_matrix(y_test, NBpredictions))
print(classification_report(y_test, NBpredictions), '\n')
print('Accuracy:', NB_acc)

In [None]:
# Support vector classifier: grid-search the kernel (seed fixed through the grid),
# then evaluate the search's predictions on the held-out split.
svc = SVC()
params = {
    'kernel': ['linear', 'rbf'],
    'random_state': [0],
}
svc1 = GridSearchCV(svc, param_grid=params).fit(X_train, y_train)
svc_predictions = svc1.predict(X_test)
SVC_acc = accuracy_score(y_test, svc_predictions)

# Confusion matrix, classification report, then overall accuracy
print(confusion_matrix(y_test, svc_predictions))
print(classification_report(y_test, svc_predictions), '\n')
print('Accuracy:', SVC_acc)

In [None]:
#Random Forest
# Seed the forest like the other models in this notebook; without a fixed
# random_state the reported accuracy changes from run to run.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# Confusion matrix, classification report, then overall accuracy
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred), '\n')
print('Accuracy:', rf_acc)

In [None]:
#Comparison of various models
# One row per model with its test-set accuracy, shown best-first.
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Naive Bayes', 'SVC', 'Decision Tree', 'Random Forest Classifier'],
    'Score': [LR_acc, NB_acc, SVC_acc, DT_acc, rf_acc],
})
models.sort_values(by='Score', ascending=False, ignore_index=True)


### We see that the Random Forest Classifier has performed the best, with 91% accuracy.