In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sn # data visualization
from matplotlib import pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Motivation
#### Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worldwide.
#### Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.

#### Most cardiovascular diseases can be prevented by addressing behavioural risk factors such as tobacco use, unhealthy diet and obesity, physical inactivity and harmful use of alcohol using population-wide strategies.

#### People with cardiovascular disease or who are at high cardiovascular risk (due to the presence of one or more risk factors such as hypertension, diabetes, hyperlipidaemia or already established disease) need early detection and management wherein a machine learning model can be of great help.

# Import Data

In [None]:
# Load the heart-failure clinical records into the raw working frame.
df_raw = pd.read_csv(
    "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

# Data Exploration

In [None]:
# List the column names available in the raw dataset.
df_raw.columns

In [None]:
# Preview the first rows of the raw records.
df_raw.head()

In [None]:
# This dataset is clean enough that it needs no manipulation such as vectorization,
# but it does need rescaling because some of the columns hold very large values.
df_raw.info()

In [None]:
# Histogram of every column in the raw data.
df_raw.hist(figsize=(15,15))

In [None]:
# Count how many records fall into each outcome class.
death_flags = df_raw['DEATH_EVENT']
print('Number of positive samples:', (death_flags == 1).sum())
print('Number of negative samples:', (death_flags == 0).sum())

In [None]:
# Rows with NaN values can be removed or imputed; per the original notes there are few enough
# that removing them is fine. Later cells read df_raw again, so it must stay NaN-free.
df_raw = df_raw.dropna()
# Split the raw data by the DEATH_EVENT label. Boolean masks are used instead of unpacking a
# groupby so the code does not depend on the number or order of groups.
neg_samples = df_raw[df_raw['DEATH_EVENT'] == 0]
pos_samples = df_raw[df_raw['DEATH_EVENT'] == 1]
# Take an equally sized sample from each class to reduce the bias of the models built below.
# The size is the smaller class count: that keeps as much data as possible while staying
# balanced, and cannot request more rows than a class holds (a fixed 96 would raise then).
n_per_class = min(len(neg_samples), len(pos_samples))
# A fixed random_state makes the sample, and everything derived from it, reproducible.
df_filtered = pd.concat([
    neg_samples.sample(n_per_class, random_state=42),
    pos_samples.sample(n_per_class, random_state=42),
])

In [None]:
# Visualize the new sample, which is balanced around the categorical label we want to fit on.
df_filtered.hist(figsize=(15,15))

In [None]:
# Age by outcome: subtle spatial differences, with age increasing the likelihood of heart disease.
sn.boxplot(x='DEATH_EVENT', y='age', data=df_filtered)

In [None]:
# Serum sodium by outcome: another visualization to justify a decision tree or a support vector machine.
sn.boxplot(x='DEATH_EVENT', y='serum_sodium', data=df_filtered)

In [None]:
# Platelets by outcome: this feature does not seem to help distinguish the categorical label.
sn.boxplot(x='DEATH_EVENT', y='platelets', data=df_filtered)

In [None]:
# Correlation matrix: features that correlate well with the categorical label are candidates for the model.
df_filtered.corr()

In [None]:
# After visualizing the data and looking at each feature's correlation with the label,
# we can choose the features that are likely to help our model.
FEATURES = ['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium']
# Alternative: the full feature set.
# FEATURES = ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
#        'ejection_fraction', 'high_blood_pressure', 'platelets',
#        'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time']
# I tested adding a few features to improve model performance, and my Random Forest model seemed to do
# better with more features, so I used them all.
# NOTE(review): the active FEATURES list above is the limited six-feature set, not the full set
# described in this note -- confirm which set the reported results were produced with.
# I think the model was able to pick up on the subtle differences seen in the boxplots better than I could.
# Using the limited feature set costs roughly 10% in model performance compared with the full set.
# With more time I could pick features that are more accessible to people who cannot collect all of the
# data, for example in a developing country with limited infrastructure.
LABELS = ['DEATH_EVENT']

In [None]:
# Feature matrix and 1-D label vector taken from the balanced sample.
X = df_filtered[FEATURES].values
y = df_filtered[LABELS[0]].values
# Hold out 30% of the rows so the models can be scored on data they were not fitted on,
# which lets us check that they generalize.
# stratify=y keeps the class balance the same in both splits, and a fixed random_state makes
# the split (and every metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, stratify=y, random_state=42
)

# K-Means

In [None]:
# K-means is distance based, so the features are standardized first; otherwise the columns with
# the largest raw values would dominate the clustering. The same `scalar` instance is reused
# in the AdaBoost pipeline later in the notebook.
scalar = StandardScaler()
# n_init and random_state are pinned so the clustering is reproducible between runs.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
# The point of trying k-means is to understand whether the data is spatially separable and
# whether that separation is relevant to the label class.
# A Pipeline chains the steps so the scaler's output feeds straight into the clusterer.
pipe = Pipeline([('scalar', scalar), ('kmeans', kmeans)]).fit(X_train)

In [None]:
# K-means cluster ids are arbitrary (cluster 0 is not necessarily DEATH_EVENT == 0), so they
# must be aligned with the labels before they can be scored. The alignment uses the training
# split only: if the raw ids agree with y_train less than half the time, the ids are flipped.
flip_clusters = np.mean(pipe.predict(X_train) == y_train) < 0.5
kmeans_all = pipe.predict(X)
kmeans_test = pipe.predict(X_test)
if flip_clusters:
    kmeans_all = 1 - kmeans_all
    kmeans_test = 1 - kmeans_test
df_filtered['kMeansPrediction'] = kmeans_all
sn.scatterplot(data=df_filtered,x='age', y='serum_sodium',hue='kMeansPrediction')
print(classification_report(y_test, kmeans_test))
# After trying different features that were highly correlated with the label, this feature set
# gave the model a reasonable f1-score -- enough to justify moving forward with these variables.

# Boosting Random Forest with AdaBoost


In [None]:
# Hyper-parameter grid for the random forest.
# "auto" is intentionally absent from max_features: for classifiers it was only an alias of
# "sqrt", and current scikit-learn releases reject it.
params_clf = [{'criterion' : ["gini", "entropy"],
                'max_depth': [4,6,8,10,12],
                'max_features' : ["sqrt", "log2"],
                'n_estimators': [100,200,400,600,800],
          }]
# A fixed random_state makes the forests, and therefore the selected parameters, reproducible.
clf = RandomForestClassifier(random_state=42)
# GridSearchCV cross-validates every parameter combination and keeps the best one by f1.
# n_jobs=-1 uses whatever cores are available instead of assuming 25 workers.
grid_search_clf = GridSearchCV(clf,params_clf,cv=3,n_jobs=-1,scoring='f1',verbose=True)
# Tune on the training split only: searching on X, y would let the held-out test rows
# influence the chosen hyper-parameters and inflate the test metrics.
grid_search_clf.fit(X_train,y_train)
print(grid_search_clf.best_estimator_)

## Random Forest

In [None]:
# Fit the tuned forest on the training split, then label every sampled row for the plot below.
clf = grid_search_clf.best_estimator_
clf.fit(X_train, y_train)
df_filtered['RandomForestPrediction'] = clf.predict(X)
sn.scatterplot(x='age', y='serum_sodium', hue='RandomForestPrediction', data=df_filtered)
# sklearn's classification report gives a set of metrics for evaluating the model on the test split.
rf_test_predictions = clf.predict(X_test)
print(classification_report(y_test, rf_test_predictions))

## AdaBoost

In [None]:
# Candidate numbers of boosting rounds.
params_ada = [{'n_estimators': [50,150,250,350]}]
# Boost the tuned random forest; a fixed random_state keeps the boosting reproducible.
ada = AdaBoostClassifier(clf, random_state=42)
# Use GridSearchCV again to pick the best number of boosting rounds by f1.
# n_jobs=-1 uses whatever cores are available instead of assuming 25 workers.
grid_search_ada = GridSearchCV(ada,params_ada,cv=3,n_jobs=-1,scoring='f1',verbose=True)
# Tune on the training split only so the held-out test rows cannot leak into model selection.
grid_search_ada.fit(X_train,y_train)
print(grid_search_ada.best_params_)

In [None]:
# Take the tuned booster and train it, behind the scaler, on the training split.
ada = grid_search_ada.best_estimator_
boost_steps = [('scalar', scalar), ('adaboost', ada)]
boost_pipe = Pipeline(boost_steps)
boost_pipe.fit(X_train, y_train)
# Predict for every sampled row; the column is used by the visualizations further down.
df_filtered['AdaBoostPrediction'] = boost_pipe.predict(X)
sn.scatterplot(x='age', y='serum_sodium', hue='AdaBoostPrediction', data=df_filtered)
# Confusion matrix and classification report on the held-out test split.
boost_test_predictions = boost_pipe.predict(X_test)
print(confusion_matrix(y_test, boost_test_predictions))
print(classification_report(y_test, boost_test_predictions))

# ROC Metrics

In [None]:
# A ROC curve needs a continuous score per sample. Hard 0/1 predictions collapse the curve to
# a single operating point, so the predicted probability of the positive class is used instead.
test_scores = boost_pipe.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, test_scores)

In [None]:
# x and y must be passed by keyword: current seaborn releases raise a TypeError for positional
# vectors. estimator=None and sort=False draw the points exactly as roc_curve returned them
# instead of averaging the tpr values that share an fpr value.
sn.lineplot(x=fpr, y=tpr, estimator=None, sort=False)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(['ROC'])

In [None]:
print("Metrics on entire dataset: ")
# Score the pipeline on every row of df_raw. Plain arrays are passed because the pipeline was
# fitted on arrays, and the prediction is computed once and reused for both reports.
full_truth = df_raw[LABELS[0]].values
full_predictions = boost_pipe.predict(df_raw[FEATURES].values)
print(confusion_matrix(full_truth, full_predictions))
# classification_report expects (y_true, y_pred); swapping them exchanges precision and recall.
print(classification_report(full_truth, full_predictions))
# This score is somewhat optimistic: the rows the model was trained on are part of the data
# being scored. The held-out test rows and the rows left out of the balanced sample were
# never seen during training, so the overall score is still informative with that in mind.

# Error Visualization

In [None]:
# We want to see where our model makes mistakes; visualizing that can help us build a better model.
# The hue is mapped to explicit text labels so the legend is derived from the data itself.
# Overriding it with plt.legend([...]) assigns labels by position and can mislabel the points.
is_correct = df_filtered['AdaBoostPrediction'] == df_filtered['DEATH_EVENT']
error_view = df_filtered.assign(Outcome=is_correct.map({True: 'Correct', False: 'Not Correct'}))
sn.scatterplot(data=error_view, x='age', y='serum_sodium',
               hue='Outcome', hue_order=['Correct', 'Not Correct'])

In [None]:
# A second view of where the errors are made; a few feature pairs are enough to get a picture.
# The hue is mapped to explicit text labels so the legend is derived from the data itself.
# Overriding it with plt.legend([...]) assigns labels by position and can mislabel the points.
is_correct = df_filtered['AdaBoostPrediction'] == df_filtered['DEATH_EVENT']
error_view = df_filtered.assign(Outcome=is_correct.map({True: 'Correct', False: 'Not Correct'}))
sn.scatterplot(data=error_view, x='ejection_fraction', y='creatinine_phosphokinase',
               hue='Outcome', hue_order=['Correct', 'Not Correct'])

# Predicting Probability of Development / Risk Assessment

In [None]:
# With the predicted probability we can assign a risk to each patient based on their data
# and help the patient understand the severity of their case.
class_probabilities = boost_pipe.predict_proba(df_filtered[FEATURES])
# Column 1 holds the probability of the positive class (DEATH_EVENT == 1).
df_filtered['Risk Assessment'] = class_probabilities[:, 1]

In [None]:
# Show the features, true label, predicted label and predicted risk side by side for ten sampled rows.
df_filtered[FEATURES+LABELS+['AdaBoostPrediction','Risk Assessment']].head(10)

# Conclusion

#### I think the use of Random Forest and AdaBoost was fairly straightforward, considering that the label classes were well separated spatially and that many of the features correlated fairly well with the label. However, I would have wanted to spend more time removing unnecessary features to improve the accessibility of the model for, as discussed earlier, places with limited infrastructure and healthcare where getting those measurements would prove difficult.
#### We can see that a majority of the incorrect guesses are spatially relevant, so I think if I had more time I would have liked to play more with the features and try to engineer features that would help spatially separate the data for the model to leverage
#### I think in the future, when I have taken more statistics, I would like to offer a more robust analysis of the features and perhaps a hypothesis test for each feature to ensure that it is relevant to the model. That would help me understand exactly which features are not needed, reducing the data requirements and making the model more accessible.

In [None]:
# Importance that the fitted AdaBoost model assigns to each entry of FEATURES, in the same order.
ada.feature_importances_

In [None]:
# Print the name of every feature whose importance exceeds 0.04.
for feature_name, importance in zip(FEATURES, ada.feature_importances_):
    if importance > 0.04:
        print(feature_name)