In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
import warnings
# Silence every warning for the rest of the session. This also hides the
# deprecation notices raised by the plotting and ML libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the heart-failure clinical records into the working DataFrame.
csv_path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)

# Exploratory data analysis

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for every column.
df.describe()

This looks like a very clean dataset<br>
All the values are numeric<br>
There are no null values to impute<br>
The platelets column needs scaling before applying ML algorithms, but let's first find out whether that feature actually matters<br>
Let's proceed to visualization<br>

In [None]:
# Pairwise correlation matrix of all columns, drawn as a heatmap.
fig, ax = plt.subplots(figsize=(20, 15))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, ax=ax)

**Conclusions from the heatmap :**
* Death event looks to be highly correlated with serum_creatinine and age
* Survival event looks to be highly correlated with time, ejection_fraction, serum_sodium
* Sex and smoking have the least correlation with DEATH_EVENT, we can consider dropping these features

In [None]:
# Correlation of every column with the target, largest value first.
target_corr = df.corr()['DEATH_EVENT']
target_corr.sort_values(ascending=False)

# Distplot of all the features
To understand how all the features are distributed

In [None]:
# One distribution plot per column for the first 12 columns, laid out on a 4x4 grid.
plt.figure(figsize=(20, 25))
for position, column in enumerate(df.columns[:12], start=1):
    plt.subplot(4, 4, position)
    sns.distplot(df[column])
    plt.xlabel(column, fontsize=12)
plt.show()

**Inference from the distplot** :<br>
There is a notable skew in certain features like platelets, creatinine_phosphokinase <br>
We can overcome this using log transformation but will skip the same as these features have less impact on survival

In [None]:
# (rows, columns) of the raw dataset.
df.shape

# Scaling the features using Standard Scaler<br>
ML algorithms are very sensitive to differences in the scales of the various features<br>
In this particular case the platelets feature has a huge magnitude when compared to other features<br>
This can considerably offset the accuracy of our results<br>
Hence scaling is necessary<br>

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (zero mean, unit variance) and re-attach the
# unscaled target so the result has the same layout as `df`.
# NOTE(review): the scaler is fitted on all rows, including those that later end
# up in the test split; consider fitting it on the training split only.
ss = StandardScaler()
y = df['DEATH_EVENT']
features = df.drop(columns='DEATH_EVENT')
# Take column labels and row index from the frame that was actually scaled,
# rather than assuming the target is the last column and the index is 0..n-1.
df_scaled = pd.DataFrame(
    data=ss.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
df_scaled = pd.concat([df_scaled, y], axis=1)

In [None]:
# Display the scaled features together with the unscaled target column.
df_scaled

# Plot Boxplots to find outliers

In [None]:
# Box plot of every scaled column side by side, to expose outliers.
fig, ax = plt.subplots(figsize=(15, 10))
box_style = dict(width=0.5, fliersize=3)
sns.boxplot(data=df_scaled, ax=ax, **box_style)
plt.show()

# Treating Outliers with Principal Component Analysis<br>
Principal component analysis can be used to find the features that explain the most of the variance(95-100%) in the dataset.<br>
Most of the times we do not need all the features in the input dataset to explain the variance.<br>

Hence this can be used to :<br>
1. Drop unwanted features <br>
2. Drop unwanted rows that act as outliers<br>

The built-in sklearn PCA does not give us the top features (the best features to use) or the location of the outliers in the dataset<br>
There is a cool library called pca that does both, so we will use it.

In [None]:
pip install pca

In [None]:
from pca import pca

# Fit the PCA model on the scaled features only (target removed).
# All 12 components / features are kept because no column is meant to be eliminated here.
pca_input = df_scaled.drop(columns='DEATH_EVENT')
model = pca(n_components=12, n_feat=12)
df_scaled_x = model.fit_transform(pca_input)


In [None]:
# Inspect the 'topfeat' entry of the result returned by the PCA model.
df_scaled_x['topfeat']

In [None]:
# Keep only the rows whose `y_bool_spe` flag is set in the PCA outlier table.
outliers = df_scaled_x['outliers']
spe_flagged = outliers['y_bool_spe'] == True
outliers_ = outliers[spe_flagged]

In [None]:
# Move the row labels into a regular 'index' column by rebinding the name to a fresh frame.
outliers_ = outliers_.reset_index()

In [None]:
# Row labels of the flagged samples, taken from the 'index' column created by reset_index.
outliers_index = outliers_['index']

In [None]:
outliers_index  # row labels of the samples flagged as outliers

Let us plot the pca model<br>
All the points outside the green zone are the outliers

In [None]:
# Biplot of the fitted PCA model with both outlier overlays (SPE and Hotelling T2) switched on.
plt.figure(figsize=(20, 10))
biplot_options = dict(legend=True, SPE=True, hotellingt2=True)
model.biplot(**biplot_options)

Let us drop the outliers

In [None]:
# Remove the rows flagged by the PCA model; df_scaled itself is left untouched.
df_new = df_scaled.drop(index=outliers_index)

In [None]:
# Display the scaled frame after the PCA-flagged rows have been dropped.
df_new

Let us see if there are some more outliers in the dataset after PCA

In [None]:
# Same box plot as before, now on the frame with the PCA-flagged rows removed.
fig, ax = plt.subplots(figsize=(15, 10))
box_style = dict(width=0.5, fliersize=3)
sns.boxplot(data=df_new, ax=ax, **box_style)
plt.show()

Looks like there are some more outliers and we have to clean them up

In [None]:
# Trim the remaining extreme values by quantile.
# Start from df_new (the frame with the PCA-flagged rows already removed) so the
# earlier outlier removal is kept; starting again from df_scaled would silently
# bring those rows back.
# Each threshold is computed on the frame left by the previous filter.
df_exp = df_new[df_new['platelets'] < df_new['platelets'].quantile(0.95)]
df_exp = df_exp[df_exp['platelets'] > df_exp['platelets'].quantile(.1)]
df_exp = df_exp[df_exp['serum_sodium'] > df_exp['serum_sodium'].quantile(.1)]
df_exp = df_exp[df_exp['serum_creatinine'] < df_exp['serum_creatinine'].quantile(0.9)]
df_exp = df_exp[df_exp['creatinine_phosphokinase'] < df_exp['creatinine_phosphokinase'].quantile(0.91)]

# Re-draw the box plot to check what is left after trimming.
fig, ax = plt.subplots(figsize = (15, 10))
sns.boxplot(data = df_exp, width = 0.5, ax = ax, fliersize = 3)
plt.show()

We have cleaned up most of the outliers, do not want to drop more samples as it may reduce the amount of data input<br>
to ML algorithms for classification

In [None]:
# (rows, columns) remaining after outlier trimming.
df_exp.shape

In [None]:
# Class balance of the target after outlier trimming.
df_exp['DEATH_EVENT'].value_counts()

# Test train split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split, stratified on the target so both parts keep the same class ratio.
features_exp = df_exp.drop(columns='DEATH_EVENT')
target_exp = df_exp['DEATH_EVENT']
X_train, X_test, y_train, y_test = train_test_split(
    features_exp,
    target_exp,
    test_size=0.3,
    random_state=42,
    stratify=target_exp,
)

In [None]:
# Class balance of the training labels.
y_train.value_counts()

In [None]:
# Bar chart of the class counts in the training labels.
# The series is passed by keyword: recent seaborn releases no longer accept the
# data vector as a positional argument.
sns.countplot(x=y_train)

The output labels are imbalanced, and we have to overcome this via oversampling in order to avoid model bias towards the majority class

# Oversampling with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split to 150 rows per class.
# The seed is fixed so the synthetic rows, and every score computed from them,
# are the same on each run.
sm = SMOTE(sampling_strategy={0:150,1:150}, random_state=42)
# From here on X / y are the resampled training features and labels
# (y no longer refers to the full target column).
X,y = sm.fit_resample(X_train,y_train)

# Choosing the best model based on F1-score

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, balanced_accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

# Candidate classifiers, all with their default hyper-parameters.
models=[("XGboost", XGBClassifier()),
        ("Stochastic Gradient Descent", SGDClassifier()),
        ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
        ("Decision Tree", DecisionTreeClassifier()),
        ("Random Forest", RandomForestClassifier()),
        ("Extra Trees", ExtraTreesClassifier()),
        ("Gradient Boosting", GradientBoostingClassifier()),
        ("KNeighbors", KNeighborsClassifier()),
        ("SVM", SVC()),
        ("Naive Bayes", GaussianNB()),
        ("Cat Boost", CatBoostClassifier(verbose=False)),
        ("Ada Boost", AdaBoostClassifier())]

# An unshuffled KFold cuts X / y into contiguous slices, so any ordering left by
# the resampling step can yield folds dominated by a single class and distort F1.
# Shuffled, stratified folds keep both classes in every fold; the seed keeps the
# folds identical for every model and every run.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One record per model. The loop variable is called `estimator` so it does not
# overwrite the fitted PCA object stored in `model` by an earlier cell.
score_rows = []
for name, estimator in models:
    results = cross_val_score(estimator, X, y, cv=cv, scoring='f1')
    # The column is called 'variance' for continuity, but the value is the
    # standard deviation of the per-fold F1 scores.
    score_rows.append({'names': name, 'f1-score': results.mean(), 'variance': results.std()})
    print('Model name : {}, F1 score : {},  variance : {}'.format(name,results.mean(),results.std()))

# Summary table indexed by model name, with columns 'f1-score' and 'variance'.
df_f1 = pd.DataFrame(score_rows, columns=['names', 'f1-score', 'variance']).set_index('names')

In [None]:
# Rank the models from best to worst mean F1.
df_f1 = df_f1.sort_values(by='f1-score', ascending=False)

In [None]:
# Horizontal bar chart of the mean cross-validated F1 score per model.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=df_f1['f1-score'], y=df_f1.index, ax=ax)

We will use **Cat boost , Extra trees, Random forests** as they seem to have the best F1-scores

# Feature Selection
Let us use extra trees to do some feature selection and eliminate unwanted features to boost model accuracy

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# Fit extra trees on the resampled training data and use its impurity-based
# importances to rank the features. The seed keeps the ranking reproducible.
ext = ExtraTreesClassifier(criterion='entropy', max_depth=8, random_state=42)
ext.fit(X,y)
pred_ext = ext.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision / recall are swapped in the report.
print(confusion_matrix(y_test, pred_ext))
print(classification_report(y_test, pred_ext))
sns.barplot(x=ext.feature_importances_,y=X_test.columns)

The most important features are **age, ejection_fraction, serum_creatinine,time**<br>
We will be dropping all the other features

In [None]:
# Feature names currently present in the resampled training frame.
X.columns

In [None]:
# Features judged unimportant in the extra-trees ranking above.
dropped_features = ['anaemia', 'creatinine_phosphokinase', 'diabetes',
                    'high_blood_pressure', 'platelets', 'serum_sodium', 'sex', 'smoking']

# Rebind instead of dropping in place: the cell can be re-run without a KeyError
# (errors='ignore' skips columns that are already gone) and the frames returned
# by the split / resampling steps are not mutated behind the reader's back.
X = X.drop(columns=dropped_features, errors='ignore')
X_test = X_test.drop(columns=dropped_features, errors='ignore')

In [None]:
# Sanity check: shapes of the train / test features and labels after feature selection.
print(*(part.shape for part in (X, X_test, y, y_test)))

# Hyper parameter tuning for Extra trees classifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the extra-trees hyper-parameters, scored by F1 with 3-fold CV.
# "auto" is left out of max_features: for classifiers it is an alias of "sqrt"
# (so it only repeated a third of the fits) and recent scikit-learn releases reject it.
params_ext = [{'criterion' : ["gini", "entropy"],
              'min_samples_split': [2,4,6,8],
              'max_depth': [2,4,6,8],
              'max_features' : ["sqrt", "log2"],
              'n_estimators': [100,200,400,600],
          }]
# Seeded so that two runs of the search compare the same forests.
classifier_ = ExtraTreesClassifier(random_state=42)
# n_jobs=-1 uses every available core instead of a fixed count of 150 workers.
grid_search_ext = GridSearchCV(classifier_,params_ext,cv=3,n_jobs=-1,scoring='f1',verbose=10)
grid_search_ext.fit(X,y)
print(grid_search_ext.best_params_)

In [None]:
# Estimator refitted by the grid search with the best parameter combination.
grid_search_ext.best_estimator_

# Prediction using Extra trees

In [None]:
# Build the final extra-trees model from the parameters the grid search actually
# selected, rather than from values copied by hand from an earlier run.
# The seed keeps the fitted forest, and therefore the predictions, reproducible.
ext = ExtraTreesClassifier(random_state=42, **grid_search_ext.best_params_)
ext.fit(X,y)
pred_ext = ext.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

# scikit-learn metrics expect (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision / recall are swapped in the report.
print(confusion_matrix(y_test, pred_ext))
print(classification_report(y_test, pred_ext))

# Hyper parameter tuning for random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exhaustive search over the random-forest hyper-parameters, scored by F1 with 3-fold CV.
# "auto" is left out of max_features: for classifiers it is an alias of "sqrt"
# (so it only repeated a third of the fits) and recent scikit-learn releases reject it.
params_rfc = [{'criterion' : ["gini", "entropy"],
              'min_samples_split': [2,4,6,8],
              'max_depth': [2,4,6,8],
              'max_features' : ["sqrt", "log2"],
              'n_estimators': [100,200,400,600],
          }]
# Seeded so that two runs of the search compare the same forests.
classifier_ = RandomForestClassifier(random_state=42)
# n_jobs=-1 uses every available core instead of a fixed count of 150 workers.
grid_search_rfc = GridSearchCV(classifier_,params_rfc,cv=3,n_jobs=-1,scoring='f1',verbose=10)
grid_search_rfc.fit(X,y)
print(grid_search_rfc.best_params_)

In [None]:
# Estimator refitted by the grid search with the best parameter combination.
grid_search_rfc.best_estimator_

In [None]:
# Build the final random forest from the parameters the grid search actually
# selected, rather than from values copied by hand from an earlier run.
# The seed keeps the fitted forest, and therefore the predictions, reproducible.
rfc = RandomForestClassifier(random_state=42, **grid_search_rfc.best_params_)
rfc.fit(X,y)

# Prediction using random forests

In [None]:
# Predict the held-out test rows with the fitted random forest.
pred_rfc = rfc.predict(X_test)

In [None]:
# scikit-learn metrics expect (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision / recall are swapped in the report.
print(confusion_matrix(y_test, pred_rfc))
print(classification_report(y_test, pred_rfc))

# Training the Cat Boost Classifier

In [None]:
# Train CatBoost on the resampled training data, with per-iteration logging turned off.
cat = CatBoostClassifier(verbose=False)
cat.fit(X,y)

# Prediction using cat boost

In [None]:
# Predict the held-out test rows with the fitted CatBoost model.
pred_cat = cat.predict(X_test)

In [None]:
# scikit-learn metrics expect (y_true, y_pred). With the arguments reversed the
# confusion matrix is transposed and precision / recall are swapped in the report.
print(confusion_matrix(y_test, pred_cat))
print(classification_report(y_test, pred_cat))

# Conclusion

So the best model to use would be **Extra trees and cat boost** , because they both have an **F1-score of 0.82** <br>
And also an **accuracy of 0.87**

**Please upvote if this notebook was helpful and if you liked it<br>
Comments for improvements are welcome**