In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt, seaborn as sns 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)
        
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# plotting and modelling calls further down - consider narrowing it.
warnings.filterwarnings('ignore')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the clinical records CSV into `data`, then keep a second frame without the
# DEATH_EVENT column (`vis_data`) that the plotting cells use.
data = pd.read_csv(
    '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
vis_data = data.drop(columns='DEATH_EVENT')

In [None]:
# Summary statistics for every column, with the 1st and 99th percentiles added to
# the default quartiles. A notebook cell only renders its last expression, so the
# former stand-alone `data.quantile([0.01, 0.99])` call was computed and then
# discarded; folding the percentiles into `describe` shows everything in one table.
data.describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])
# Author's observation from this table: 'creatinine_phosphokinase', 'ejection_fraction',
# 'platelets' and 'serum_creatinine' do not look well-behaved and are investigated below.

In [None]:
# Print the dtype and non-null count of every column in `data`.
data.info()
# Author's observation from the output: the data has no non-numerical feature.

In [None]:
# Box plots of the four features singled out for a closer look, one per panel of a 2x2 grid.
plt.style.use('dark_background')
fig, axes = plt.subplots(2, 2, figsize = (20, 10))

boxplot_cols = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine']
# `axes.flat` walks the grid row by row, so the panels keep their original positions.
for ax, col in zip(axes.flat, boxplot_cols):
    # The variable is passed as `y` so the boxes are drawn vertically. The previous
    # `x=..., orient='v'` combination is contradictory: depending on the seaborn
    # version the orientation request is either honoured or ignored with a warning.
    sns.boxplot(y=col, data=vis_data, ax=ax)

# Author's observations: 'ejection_fraction' does not have many outliers, so it can be
# ignored for this analysis. All the other features do.
# Outliers will be dealt with during RobustScaling.

In [None]:
# Distribution plots for the same four features, laid out on a 2x2 grid.
plt.style.use('dark_background')
fig, axes = plt.subplots(2, 2, figsize = (15, 10))

distplot_cols = ['creatinine_phosphokinase', "ejection_fraction", "platelets", "serum_creatinine"]
# `axes.ravel()` yields the panels row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for col, panel in zip(distplot_cols, axes.ravel()):
    ax = sns.distplot(vis_data[col], ax=panel)

# Author's reading of the plots: ejection_fraction, although it has few outliers, is not
# normally distributed, while the other three features are. A log transform is tried
# next to see whether it brings ejection_fraction closer to a Gaussian shape.

In [None]:
plt.style.use('dark_background')
plt.figure(figsize = (20, 8))
# ejection_fraction treatment: take the log of the *untransformed* values held in
# `vis_data`. Reading from `vis_data` instead of `data` keeps this cell idempotent:
# re-running it no longer applies the log a second time to already-logged values.
log_ejection_fraction = np.log(vis_data['ejection_fraction'])
sns.distplot(log_ejection_fraction)
# not exactly Gaussian, but will do
# `np.log` on a Series already returns a Series, so no extra `pd.Series` wrapper is needed.
data['ejection_fraction'] = log_ejection_fraction

In [None]:
# The features not plotted so far still need the same distplot / boxplot treatment.
already_examined = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine']
remaining_cols = vis_data.drop(columns=already_examined)
# TBD

In [None]:
# Percentage of missing entries per column, rounded to two decimals.
missing_counts = data.isnull().sum()
n_rows = len(data.index)
(missing_counts / n_rows * 100).round(2)
# Author's observation from the output: no missing values in the data either.

In [None]:
# Check for data imbalance with a count plot of every feature on a 4x3 grid.
plt.style.use('dark_background')
fig, axes = plt.subplots(4, 3, figsize = (20, 20))

countplot_cols = [
    'age', 'anaemia', 'creatinine_phosphokinase',
    'diabetes', 'ejection_fraction', 'high_blood_pressure',
    'platelets', 'serum_creatinine', 'serum_sodium',
    'sex', 'smoking', 'time',
]
# `axes.flat` walks the grid row by row, so each feature lands on the same panel as before.
for ax, col in zip(axes.flat, countplot_cols):
    # Pass the column as `x=` explicitly: newer seaborn releases bind the first
    # positional argument to `data`, not `x`, so a positional Series is no longer
    # guaranteed to be counted per value.
    sns.countplot(x=vis_data[col], ax=ax)

plt.tight_layout()

# Author's observation: platelets and creatinine_phosphokinase seem to have one dominant
# value. The rest of the features are okay.

In [None]:
# Share of rows taken by the single most frequent value of each column.
# `value_counts` sorts by frequency (most frequent first) and `normalize=True`
# divides by the total count, so `.iloc[0]` is the dominant value's share.
# Both columns are returned in one Series because a cell only renders its last
# expression - previously the 'platelets' figure was computed but never shown.
dominant_share = data[['platelets', 'creatinine_phosphokinase']].apply(
    lambda col: col.value_counts(normalize=True).iloc[0]
)
dominant_share
# Author's observation: the dominant values occur just 8% and 15% of the total 300 times,
# respectively, so this cannot be considered imbalance.

In [None]:
# Annotated heat map of the pairwise correlations between all columns of `data`.
correlations = data.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlations, annot=True, cmap='YlGnBu_r')
# Author's observation: the highest absolute correlation is only about 0.5, so no
# strong correlation is present anywhere.

In [None]:
# The binary categorical features were identified in the countplot analysis:
# anaemia, diabetes, high_blood_pressure, sex, smoking. One-hot encode them.
binaries = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']
# Cast a *copy* of the binary columns to str so `get_dummies` encodes all of them;
# `data` itself is no longer mutated in place before being rebound.
dummies = pd.get_dummies(data[binaries].astype(str))
# Drop the original columns by name. Passing the sub-frame `data[binaries]` as the
# labels only worked because iterating a DataFrame happens to yield its column names.
data = pd.concat([data.drop(columns=binaries), dummies], axis = 1)
# NOTE: this cell rebinds `data` without the original binary columns, so re-running it
# requires re-running the cells that load and transform `data` first.

In [None]:
# Hold out 25% of the rows for testing, then scale the features with RobustScaler.
# The scaler is fitted on the training rows only and reused for the test rows.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

X = data.drop(columns='DEATH_EVENT')
y = data['DEATH_EVENT']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=1
)

scale = RobustScaler().fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [None]:
# Modelling starts from here on

In [None]:
# Parallel lists: model names and their test scores, filled in by the cells below.
model, score = [], []

In [None]:
# For Hyper-parameter Tuning the model
from sklearn.model_selection import GridSearchCV

# For checking Model Performance
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve

import warnings
# Ignore all warnings. NOTE(review): `warnings` is already imported and a blanket
# 'ignore' filter is already installed in the first cell, so this looks redundant.
warnings.simplefilter(action="ignore")

from sklearn.model_selection import StratifiedKFold

1. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Exhaustive search over KNN hyper-parameters, scored by accuracy on 3 stratified folds.
cv_method = StratifiedKFold(n_splits=3)
param = {
    'n_neighbors': (1, 3, 5, 7),
    'metric': ('euclidean', 'manhattan', 'chebyshev', 'minkowski'),
    'p': (1, 2),
}
neigh = KNeighborsClassifier()
clf = GridSearchCV(estimator=neigh, param_grid=param, cv=cv_method, scoring="accuracy")
clf.fit(X_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search currently bound to `clf`.
# `clf` is rebound by each model's search cell, so this is the KNN result only when
# run directly after the KNN search.
clf.best_params_

In [None]:
# Fit KNN with hard-coded hyper-parameters (presumably copied from an earlier run of
# the grid search - confirm against `clf.best_params_`) and show its test accuracy.
knn = KNeighborsClassifier(metric='chebyshev', n_neighbors=5, p=1).fit(X_train, y_train)
knn.score(X_test, y_test)


In [None]:
# Record the model's variable name and its test score at matching positions of the
# two result lists.
model += ['knn']
score += [knn.score(X_test, y_test)]

2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exhaustive search over random-forest hyper-parameters, scored by accuracy on
# 3 stratified folds. The forest is seeded (same seed as `train_test_split`) so the
# search - and therefore `clf.best_params_` - is reproducible from run to run;
# an unseeded forest can pick a different "best" combination on every execution.
rdf = RandomForestClassifier(random_state=1)
cv_method = StratifiedKFold(n_splits=3)
param = {
    'criterion': ('gini', 'entropy'),
    'min_samples_split': (2, 6, 20),
    'min_samples_leaf': (1, 4, 16),
    'n_estimators': (100, 150, 200, 250),
}
clf = GridSearchCV(rdf, param, cv=cv_method, scoring="accuracy")
clf.fit(X_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search currently bound to `clf`.
# `clf` is rebound by each model's search cell, so this is the random-forest result
# only when run directly after the random-forest search.
clf.best_params_

In [None]:
# Fit the random forest with hard-coded hyper-parameters (presumably copied from an
# earlier run of the grid search - confirm against `clf.best_params_`).
# `random_state` is fixed (same seed as `train_test_split`) so the reported test
# accuracy is reproducible; without it the score changes on every run.
random_forest = RandomForestClassifier(
    criterion='entropy',
    min_samples_leaf=1,
    min_samples_split=20,
    n_estimators=250,
    random_state=1,
)
random_forest.fit(X_train, y_train)
random_forest.score(X_test, y_test)

In [None]:
# Record the model's variable name and its test score at matching positions of the
# two result lists.
model += ['random_forest']
score += [random_forest.score(X_test, y_test)]

3. Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Exhaustive search over SVC hyper-parameters, scored by accuracy on 3 stratified folds.
cv_method = StratifiedKFold(n_splits=3)
param = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'sigmoid', 'linear'],
}
clf = GridSearchCV(estimator=SVC(), param_grid=param, cv=cv_method, scoring="accuracy")
clf.fit(X_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search currently bound to `clf`.
# `clf` is rebound by each model's search cell, so this is the SVC result only when
# run directly after the SVC search.
clf.best_params_

In [None]:
# Fit an RBF-kernel SVC with hard-coded hyper-parameters (presumably copied from an
# earlier run of the grid search - confirm against `clf.best_params_`) and show its
# test accuracy.
svc = SVC(kernel='rbf', C=100, gamma=0.01).fit(X_train, y_train)
svc.score(X_test, y_test)

In [None]:
# Record the model's variable name and its test score at matching positions of the
# two result lists.
model += ['svc']
score += [svc.score(X_test, y_test)]

In [None]:
# Bar chart comparing the recorded test score of every model.
plt.bar(model, score)
# Axis labels must be descriptive strings. Passing the `model` / `score` lists
# themselves (as before) makes matplotlib print their repr along the axes.
plt.xlabel('Model')
plt.ylabel('Test-set accuracy')
plt.title("Comparision of model with score")
plt.show()

In [None]:
# Classification reports for the applied models.

for name, test_score in zip(model, score):
    print("Classification Report for ", name, " with score ", f'{test_score*100:0.2f}', "%")
    # Each entry of `model` is the name of the notebook variable holding the fitted
    # estimator. Look it up in the global namespace instead of `eval`-ing the string,
    # which would execute any expression that ended up in the list.
    fitted_model = globals()[name]
    pred = fitted_model.predict(X_test)
    print(classification_report(y_test, pred))

