In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame.
csv_path = "/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
dataset = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded data.
dataset.head()

In [None]:
# Show the column labels of the dataset.
dataset.columns

In [None]:
# Input columns: every column except the last one.
dataset.iloc[:,:-1]

In [None]:
# Output column: the DEATH_EVENT label that the classifiers below predict.
dataset['DEATH_EVENT']

In [None]:
# Column names grouped by how they are treated.
# Columns treated as categorical (this group includes the DEATH_EVENT target).
categorical_columns = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking', 'DEATH_EVENT']
# Columns treated as numerical. There must be no trailing comma after the closing
# bracket: a trailing comma would turn this into a one-element tuple holding the list.
numerical_columns = ['age', 'creatinine_phosphokinase', 'ejection_fraction',
                     'platelets', 'serum_creatinine', 'serum_sodium', 'time']

In [None]:
# Pairwise correlation matrix of the dataset's columns.
dataset.corr()

Significant correlation with DEATH_EVENT observed for the following features:
* Age: 0.253729
* Serum creatinine: 0.294278 
* Serum sodium: -0.195204
* Time:  -0.526964
* Ejection Fraction: -0.268603 

In [None]:
import seaborn as sns
# Bar chart of the number of rows in each DEATH_EVENT class.
sns.countplot(x="DEATH_EVENT", data=dataset)

The number of deaths is roughly half the number of surviving patients, i.e. about 100 deaths against about 200 alive.
The classes therefore need to be balanced, either by oversampling the minority class (deaths) or by undersampling the majority class (alive).

Here, we use the SMOTE technique to oversample the minority class.


In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE oversampler; random_state is fixed so the resampling is repeatable.
oversampling_func = SMOTE(random_state=42)

In [None]:
# The oversampler takes features and target separately, so split the dataset into X and y.
target_column = "DEATH_EVENT"
y = dataset[[target_column]]  # target, kept as a one-column DataFrame
X = dataset.iloc[:, :-1]      # features: every column except the last one

In [None]:
# Report the number of rows before any resampling is applied.
rows_before = len(X)
print("Size of dataset before oversampling: " + str(rows_before))

SMOTE is used to oversample X and y; the resampled data are saved into the new data frames X_smote and y_smote.

In [None]:
# Resample the full feature/target set with SMOTE and keep the resampled outputs.
# NOTE(review): this runs before the train/test split further down, so synthetic
# rows can end up in the test set. Consider splitting first and resampling only
# the training portion.
X_smote, y_smote = oversampling_func.fit_resample(X, y)

In [None]:
# X_smote and y_smote are combined column-wise to create the countplot below.
resampled_parts = [X_smote, y_smote]
smote_dataset = pd.concat(resampled_parts, axis="columns")

In [None]:
# Class counts of DEATH_EVENT in the resampled dataset.
sns.countplot(x="DEATH_EVENT", data=smote_dataset)

In [None]:
# Report the number of rows after SMOTE resampling.
rows_after = len(X_smote)
print('Size of SMOTE dataset: ' + str(rows_after))

In [None]:
# Pairwise correlation matrix of the resampled dataset's columns.
smote_dataset.corr()

Significant correlation with DEATH_EVENT observed for the following features:

* Age
* Serum creatinine
* Serum sodium
* Time
* Ejection Fraction

Only the significant features observed in the correlation are captured here. 

In [None]:
# Model inputs are limited to the selected feature columns; the target is DEATH_EVENT.
features = [
    "age",
    "serum_creatinine",
    "serum_sodium",
    "time",
    "ejection_fraction",
]
output_data = smote_dataset["DEATH_EVENT"]
input_data = smote_dataset[features]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=1,
)

Since features such as age, serum_sodium and serum_creatinine are on different scales, the data is standardized using StandardScaler.

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test features.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train[0:5]  # First five rows of the training data after scaling

In [None]:
X_test[0:5]  # First five rows of the testing data after scaling

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Random Forest Classifier: train on the scaled training split and score on the test split.
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(max_depth=15, random_state=42)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred)  # computed once and reused for the report
print("Accuracy of Random Forest Classifier: " + str(rf_acc))

In [None]:
# Iterating through various K Nearest Neighbours Classifiers (k = 1..40) and
# keeping the k with the highest accuracy on the test split.
# NOTE(review): k is selected using X_test/y_test, so knn_acc is an optimistic
# estimate. A separate validation split or cross-validation would avoid this.
from sklearn.neighbors import KNeighborsClassifier

acc_list = []            # accuracy for each k, in order k = 1..40
max_val = 0              # best accuracy seen so far
no_of_neighbours = None  # k that produced max_val; stays None if no accuracy exceeds 0
for neighbours in range(1, 41):
    knn_clf = KNeighborsClassifier(n_neighbors=neighbours)
    knn_clf.fit(X_train, y_train)
    y_pred = knn_clf.predict(X_test)
    accuracy_val = accuracy_score(y_test, y_pred)
    # Store the accuracy as a float; int() would truncate every value below 1.0 to 0.
    acc_list.append(float(accuracy_val))
    print(str(neighbours)+": "+str(accuracy_val))
    # Strict comparison: on ties the smallest k is kept.
    if accuracy_val > max_val:
        no_of_neighbours = neighbours
        max_val = accuracy_val

# After the loop knn_clf is the k=40 model, not the best one; only the best
# score and its k are carried forward.
print("Optimal number of neighbours: "+str(no_of_neighbours)+" with accuracy "+str(max_val))
knn_acc = max_val

In [None]:
# Support Vector Machine classifier: train on the scaled training split and score on the test split.
from sklearn.svm import SVC

svc_clf = SVC(random_state=42)
svc_clf.fit(X_train, y_train)

y_pred = svc_clf.predict(X_test)
svc_acc = accuracy_score(y_test, y_pred)  # computed once and reused for the report
print("Accuracy of SVM classifier " + str(svc_acc))

In [None]:
# Collect the three classifier accuracies (this rebinds the name acc_list used in the KNN cell).
acc_list=[svc_acc,knn_acc,rf_acc]

In [None]:
# Summarise the results: the best accuracy first, then one line per classifier.
summary_lines = [
    ("* SVM Classifier: ", svc_acc),
    ("* KNN Classifier: ", knn_acc),
    ("* Random Forest Classifier: ", rf_acc),
]
best_accuracy = max(acc_list)
print("Maximum Accuracy achieved: " + str(best_accuracy))
print("Accuracy of various classifiers:")
for prefix, value in summary_lines:
    print(prefix + str(value))