In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import sklearn.metrics as m
import sklearn.tree as tree
import sklearn.ensemble as ensemble
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.neural_network as nn

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Relative location of the heart-failure clinical records CSV.
DATA_PATH = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Basic EDA

In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Peek at five random rows; the seed keeps the displayed rows the same on every re-run.
data.sample(5, random_state=42)

In [None]:
# Number of missing cells per column (no blank values in this dataset).
data.isna().sum()

In [None]:
# Class balance of the target: proportions first, then raw counts.
# About 32% of the data is positive class
death_event = data['DEATH_EVENT']
print(
    death_event.value_counts(normalize=True),
    death_event.value_counts(),
    sep='\n\n',
)

In [None]:
# Correlation of every column with the target, expressed as a percentage.
target_corr = data.corr()['DEATH_EVENT']
data_corr = target_corr.mul(100)
data_corr.sort_values()

# Splitting Data
* The data is split with stratification, so that BOTH the training and testing datasets contain the same proportion of death events (as closely as the split sizes allow)

In [None]:
# Features are every column except the target; the target is DEATH_EVENT.
X = data.drop(columns='DEATH_EVENT')
y = data['DEATH_EVENT'].copy()

# Stratified, seeded split with 200 rows for training and the remainder for testing.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    train_size=200,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Check that the class proportions match between the training and testing targets.
for split_target in (y_train, y_test):
    print(split_target.value_counts(normalize=True))

# Candidate Models & Ensemble (BASELINE)

* My approach will be to train individual classifier models as well as an ensemble model (voting classifier)
* I will assess the individual classifier models using their default hyperparameter values
* The voting classifier will be built from all the individual classifier models with their default hyperparameter values
* ALL features will be used for these baseline models

In [None]:
def make_baseline_estimators():
    """Return fresh, unfitted (name, estimator) pairs for the four baseline models.

    Every estimator uses default hyperparameters with random_state=42.
    """
    return [
        ('rf_clf', ensemble.RandomForestClassifier(random_state=42)),
        ('dt_clf', tree.DecisionTreeClassifier(random_state=42)),
        ('ext_clf', ensemble.ExtraTreesClassifier(random_state=42)),
        ('mlp_clf', nn.MLPClassifier(random_state=42)),
    ]


# Stand-alone classifiers, assessed individually.
rf_clf, dt_clf, ext_clf, mlp_clf = (model for _, model in make_baseline_estimators())

# The hard-voting ensemble gets its own separate instances of the same four models.
voting_classifier = ensemble.VotingClassifier(make_baseline_estimators(), voting='hard')

In [None]:
# Fit each individual baseline model on the training split.
for model in (rf_clf, dt_clf, ext_clf, mlp_clf):
    model.fit(X_train, y_train)

# Fit the ensemble last so the cell still ends on its fit() call.
voting_classifier.fit(X_train, y_train)

In [None]:
# Models to compare: the four baseline classifiers plus the hard-voting ensemble.
estimators = [rf_clf, dt_clf, ext_clf, mlp_clf, voting_classifier]

# 4-fold cross-validation repeated 10 times with a fixed seed; later cells reuse `cv`.
cv = ms.RepeatedKFold(n_splits=4, n_repeats=10, random_state=42)

for estimator in estimators:
    # cross_validate scores both metrics from one fit per split, instead of running
    # the whole repeated CV once per metric. The splitter and the estimators are
    # seeded, so the scores match what two cross_val_score calls would produce.
    cv_scores = ms.cross_validate(
        estimator, X_train, y_train,
        cv=cv, n_jobs=-1, scoring=('accuracy', 'f1'),
    )
    cv_accuracy = cv_scores['test_accuracy']
    cv_f1 = cv_scores['test_f1']

    print(estimator.__class__.__name__)
    print(f'Avg Accuracy: {np.mean(cv_accuracy) * 100}')
    print(f'Std Accuracy: {np.std(cv_accuracy) * 100}')
    print(f'Avg F1: {np.mean(cv_f1) * 100}')
    print(f'Std F1: {np.std(cv_f1) * 100}')
    print()

In [None]:
# MLPClassifier (with no tuning) performs the worst.
# Drop MLPClassifier from the ensemble model and retrain it.

new_voting_classifier = ensemble.VotingClassifier([
                                            ('rf_clf', ensemble.RandomForestClassifier(random_state=42)),
                                            ('dt_clf', tree.DecisionTreeClassifier(random_state=42)),
                                            ('ext_clf', ensemble.ExtraTreesClassifier(random_state=42)),
                                            ], voting='hard')


# Fit on the whole training split; cross-validation below works on clones,
# so it leaves this fitted instance untouched.
new_voting_classifier.fit(X_train, y_train)

# One repeated-CV pass yields both metrics (one fit per split instead of two).
nvc_scores = ms.cross_validate(
    new_voting_classifier, X_train, y_train,
    cv=cv, n_jobs=-1, scoring=('accuracy', 'f1'),
)
nvc_accuracy_score = nvc_scores['test_accuracy']
nvc_f1_score = nvc_scores['test_f1']


# Mean and standard deviation (in %) of accuracy, then of F1.
print(np.mean(nvc_accuracy_score) * 100)
print(np.std(nvc_accuracy_score) * 100)
print(np.mean(nvc_f1_score) * 100)
print(np.std(nvc_f1_score) * 100)

# Commentary

* As it stands, with no hyperparameter tuning, RandomForestClassifier performs the best
    * Its F1 score stands at 73.6%, with accuracy at 84.5%
  
 
* The new voting classifier performs 'much better', with lower bias and variance, once MLPClassifier is dropped from its ensemble
    * Accuracy improves from 78.6% to 83.2%
    * F1 score improves from 56.2% to 71.0%
    * Notably, the new voting classifier has a slightly lower standard deviation compared to the best model, RandomForestClassifier


|                  	| RandomForestClassifier (%) 	| New Voting Classifier (%) 	| Old Voting Classifier (%) 	|
|------------------	|----------------------------	|---------------------------	|---------------------------	|
| Average Accuracy 	| 84.5                       	| 83.2                      	| 78.6                      	|
| Std Accuracy     	| 3.8                        	| 3.7                       	| 5.5                       	|
| Average F1       	| 73.6                       	| 71.0                      	| 56.2                      	|
| Std F1           	| 6.2                        	| 6.0                       	| 11.4                      	|


In [None]:
# Let's change the voting method to soft to see if there is any gain in performance

new_soft_voting_classifier = ensemble.VotingClassifier([
                                            ('rf_clf', ensemble.RandomForestClassifier(random_state=42)),
                                            ('dt_clf', tree.DecisionTreeClassifier(random_state=42)),
                                            ('ext_clf', ensemble.ExtraTreesClassifier(random_state=42)),
                                            ], voting='soft')

new_soft_voting_classifier.fit(X_train, y_train)

# One repeated-CV pass yields both metrics (one fit per split instead of two).
nsvc_scores = ms.cross_validate(
    new_soft_voting_classifier, X_train, y_train,
    cv=cv, n_jobs=-1, scoring=('accuracy', 'f1'),
)
nsvc_accuracy_score = nsvc_scores['test_accuracy']
nsvc_f1_score = nsvc_scores['test_f1']

# Mean and standard deviation (in %) of accuracy, then of F1.
print(np.mean(nsvc_accuracy_score) * 100)
print(np.std(nsvc_accuracy_score) * 100)
print(np.mean(nsvc_f1_score) * 100)
print(np.std(nsvc_f1_score) * 100)

# Seems that hard voting is the way to go

# Performance on Testing Data (BASELINE MODELS)

In [None]:
# Score each fitted baseline model, and the MLP-free voting ensemble, on the held-out test split.
estimators = [rf_clf, dt_clf, ext_clf, mlp_clf, new_voting_classifier]

for fitted_model in estimators:
    test_score = fitted_model.score(X_test, y_test) * 100
    # Model name, score in %, then a blank separator line.
    print(type(fitted_model).__name__, test_score, '', sep='\n')

# Candidate Models & Ensemble (Feature Selection)

* Let's select the features that have an absolute correlation of at least 20% with the target

In [None]:
# Re-display each column's correlation with the target (in %) to pick features from.
data_corr = 100 * data.corr().loc[:, 'DEATH_EVENT']
data_corr.sort_values()

In [None]:
# Restrict the feature matrix to the four selected columns.
# NOTE(review): the correlation ranking used to pick these was computed on the full
# dataset, including rows that land in the test split — confirm this is acceptable.
selected_features = ['time', 'ejection_fraction', 'age', 'serum_creatinine']
X = data[selected_features].copy()
y = data['DEATH_EVENT'].copy()

# Same stratified, seeded 200-row training split as the baseline section.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    train_size=200,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Fresh tree-based models (the MLP is not used in this section).
rf_clf = ensemble.RandomForestClassifier(random_state=42)
dt_clf = tree.DecisionTreeClassifier(random_state=42)
ext_clf = ensemble.ExtraTreesClassifier(random_state=42)

# The hard-voting ensemble is built from its own separate instances.
ensemble_members = [
    ('rf_clf', ensemble.RandomForestClassifier(random_state=42)),
    ('dt_clf', tree.DecisionTreeClassifier(random_state=42)),
    ('ext_clf', ensemble.ExtraTreesClassifier(random_state=42)),
]
voting_classifier = ensemble.VotingClassifier(ensemble_members, voting='hard')

# Fit everything on the reduced-feature training split.
for model in (rf_clf, dt_clf, ext_clf, voting_classifier):
    model.fit(X_train, y_train)


# Models to compare on the reduced feature set.
estimators = [rf_clf, dt_clf, ext_clf, voting_classifier]

# 4-fold cross-validation repeated 10 times with a fixed seed.
cv = ms.RepeatedKFold(n_splits=4, n_repeats=10, random_state=42)

for estimator in estimators:
    # cross_validate scores both metrics from one fit per split, instead of running
    # the whole repeated CV once per metric. The splitter and the estimators are
    # seeded, so the scores match what two cross_val_score calls would produce.
    cv_scores = ms.cross_validate(
        estimator, X_train, y_train,
        cv=cv, n_jobs=-1, scoring=('accuracy', 'f1'),
    )
    cv_accuracy = cv_scores['test_accuracy']
    cv_f1 = cv_scores['test_f1']

    print(estimator.__class__.__name__)
    print(f'Avg Accuracy: {np.mean(cv_accuracy) * 100}')
    print(f'Std Accuracy: {np.std(cv_accuracy) * 100}')
    print(f'Avg F1: {np.mean(cv_f1) * 100}')
    print(f'Std F1: {np.std(cv_f1) * 100}')
    print()

In [None]:
# Score every model in `estimators` on the held-out test split.
for fitted_model in estimators:
    test_score = fitted_model.score(X_test, y_test) * 100
    # Model name, score in %, then a blank separator line.
    print(type(fitted_model).__name__, test_score, '', sep='\n')

# Conclusion

* Interestingly, the ExtraTreesClassifier performs the best on the training data (with feature selection), with the highest F1 score of 76.4%, though the standard deviation of its F1 score is also the highest
* On the testing data, the Voting Classifier and the Random Forest model perform the same
* All the models perform slightly better with feature selection
* The reduced feature dataset also results in a slight increase in the scores' standard deviation