In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import scipy as sp
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input directory.
csv_path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first few rows to sanity-check that the CSV loaded as expected
data.head()

We will first perform an exploratory data analysis of the heart failure clinical records dataset.

In [None]:
# Check data types: list the dtype pandas assigned to each column
data.dtypes

In [None]:
# Check for duplicate rows. This cell only reports them; nothing is dropped here.
# `data.duplicated()` flags every row that repeats an earlier row.
duplicates = data[data.duplicated()]
# Print the number of duplicated rows (not the (rows, columns) shape tuple),
# so the printed value matches its label.
print('no of duplicate rows:', len(duplicates))
# The original analysis found no duplicate rows, so nothing is removed.

In [None]:
# Report how many missing/null values each column contains
null_counts = data.isnull().sum()
print('Number of null values')
print(null_counts)
# The original analysis found no null values

In [None]:
# Remove every row that contains at least one missing value,
# then display the number of non-null entries per column.
data = data.dropna(axis=0, how='any')
data.count()

Now that we have checked for null/missing values and duplicate rows,
we will detect outliers and remove them so that the models are not affected by extremely high or low values.

In [None]:
# Box plot of creatinine_phosphokinase to spot extreme values visually
sns.boxplot(data=data, x='creatinine_phosphokinase')

In [None]:
# Box plot of ejection_fraction to spot extreme values visually
sns.boxplot(data=data, x='ejection_fraction')

In [None]:
# Box plot of platelets to spot extreme values visually
sns.boxplot(data=data, x='platelets')

In [None]:
# Box plot of serum_creatinine to spot extreme values visually
sns.boxplot(data=data, x='serum_creatinine')


In [None]:
# Box plot of serum_sodium to spot extreme values visually
sns.boxplot(data=data, x='serum_sodium')


In [None]:
# Box plot of time to spot extreme values visually
sns.boxplot(data=data, x='time')

We will use the IQR method to find the outliers, i.e. we remove values that fall more than 1.5 × IQR below Q1 or more than 1.5 × IQR above Q3.

In [None]:
# Per-column first and third quartiles, computed in a single quantile call
quartiles = data.quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
# Interquartile range of every column
IQR = Q3 - Q1

# Display the lower outlier fence (Q1 - 1.5*IQR) for each column
Q1 - 1.5*IQR

In [None]:
# For each feature visualised in the box plots above, keep only the rows whose
# value lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (both ends inclusive).
# The fences come from Q1/Q3/IQR computed earlier, so they do not shift as rows
# are removed column by column.
outlier_columns = [
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]
for column in outlier_columns:
    lower_fence = Q1[column] - 1.5*IQR[column]
    upper_fence = Q3[column] + 1.5*IQR[column]
    data = data[data[column].between(lower_fence, upper_fence)]

In [None]:
# after eliminating outliers we get this shape
# (number of rows, number of columns) remaining once the IQR filters have been applied
data.shape

In [None]:
# Visualise the pairwise correlations between all variables as an annotated heatmap.
fig, ax = plt.subplots(figsize=(20, 10))
correlations = data.corr()
sns.heatmap(correlations, cmap='BrBG', annot=True, ax=ax)

1. DEATH_EVENT mainly depends on serum creatinine and age.
2. None of the features are highly correlated with one another.

Now we will create the Machine learning models


In [None]:
# Create train and test set
# Features are every column except the last one; the target is DEATH_EVENT.
# NOTE(review): this assumes DEATH_EVENT is the last column of the frame - confirm.
x = data.iloc[:, :-1]
y = data['DEATH_EVENT']
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Fit a logistic-regression classifier with default settings on the scaled
# training data, then evaluate it on the held-out test split.
logmodel = LogisticRegression().fit(X_train, y_train)
log_predictions = logmodel.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_test, log_predictions))
print('Accuracy LR:', accuracy_score(y_test, log_predictions))

In [None]:
# KNN
# Use the elbow method to find the best K: fit one classifier per candidate K
# and record its misclassification rate on the test split.
# NOTE(review): K is being selected on the test split, so the reported test
# accuracy for the chosen K is optimistic; consider a validation split or
# cross-validation instead.
from sklearn.neighbors import KNeighborsClassifier

# K may not exceed the number of training samples (scikit-learn rejects
# n_neighbors > n_samples_fit), so cap the sweep at the training-set size
# instead of assuming at least 149 training rows are left after outlier removal.
max_k = min(149, len(X_train))

# error_rate[j] holds the test error for K = j + 1 (one entry per K evaluated)
error_rate = []
for i in range(1, max_k + 1):
    print('runnin: ',i)
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Fraction of test samples that were misclassified for this K
    error_rate.append(np.mean(pred_i != y_test))
    print('KNN',accuracy_score(y_test,pred_i))

In [None]:
# Plot the test error rate against K (the "elbow" plot).
fig, ax = plt.subplots(figsize=(10, 6))
# One K value per recorded error rate, starting at K = 1
k_values = range(1, len(error_rate) + 1)
ax.plot(
    k_values,
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Error Rate vs K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error Rate')
print('When K is between around 4 to 11 the error rate is the lowest, so we will pick value 4 as the best K')

In [None]:
# Train the final KNN classifier with the K chosen from the elbow plot (K = 4)
# and evaluate it on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_test, knn_pred))
print('Accuracy KNN:', accuracy_score(y_test, knn_pred))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples and random feature subsets),
# so fix the seed: otherwise the metrics printed below change on every run and
# the model comparison cannot be reproduced. 42 matches the seed used for the
# train/test split.
rfc = RandomForestClassifier(n_estimators=300, random_state=42)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_test,rfc_pred))
print('Accuracy RFC',accuracy_score(y_test,rfc_pred))

In [None]:
# SVM
from sklearn.svm import SVC

# Baseline support-vector classifier with default hyper-parameters,
# evaluated on the held-out test split.
svm = SVC().fit(X_train, y_train)
svm_pred = svm.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_test, svm_pred))
print('SVM', accuracy_score(y_test, svm_pred))

In [None]:
# Grid search: exhaustively cross-validate every (C, gamma) combination for the SVM
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}
# verbose=3 logs the score of every individual fit
grid = GridSearchCV(estimator=SVC(probability=True), param_grid=param_grid, verbose=3)
grid.fit(X_train, y_train)

In [None]:
# Use the best gamma and C values found by the grid search for the SVM
from sklearn.svm import SVC

# `grid.best_params_` holds the winning {'C': ..., 'gamma': ...} combination from
# the GridSearchCV cell. Pass it to the estimator so this model really is the
# tuned one; building SVC with defaults would just repeat the baseline SVM.
svm = SVC(probability=True, **grid.best_params_)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

# Per-class precision/recall/F1 followed by the overall accuracy
print(classification_report(y_test,svm_pred))
print('SVM',accuracy_score(y_test,svm_pred))

Out of all the models, I consider the Random Forest model the best because, in this situation, beyond having the highest accuracy, what matters most is accurately detecting death events. That is, if a person died, does the model actually flag that event as a death (DEATH_EVENT = 1) rather than misclassifying it as DEATH_EVENT = False? I therefore give priority to the recall metric. Among all the models tested, the Random Forest model has the highest recall value, and therefore it is the best among all these models.