In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the input directory, then echo each one
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import more relevant libraries and modules
import scipy
from scipy import stats
from scipy.stats.mstats import winsorize 
from sklearn.utils import resample
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the clinical records CSV into a dataframe
data_path = "/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
heart_failure = pd.read_csv(data_path)

# Preview the first 5 rows
heart_failure.head(5)

# EDA


In [None]:
# Display information about the dataset.
# DataFrame.info() prints its summary directly, so the cell needs no explicit print/display;
# the output is what the row/column counts and the missing-value check below are read from.

heart_failure.info()

This dataset consists of 299 rows and 13 columns, with only numerically valued entries. Furthermore, there don't appear to be any missing values that we would need to address

In [None]:
# Obtain a basic statistical description for the dataset.
# describe() is the last expression of the cell, so its summary table is rendered as the cell output.
heart_failure.describe()

Based off of the standard deviation, $75^{\text{th}}-$percentile, and the maximum value, the following variables have outliers that we will need to address:

1. `creatinine_phosphokinase`
2. `ejection_fraction`
3. `platelets`
4. `serum_creatinine`

There are two possible treatments: winsorization or removal by z-score. For sake of completeness of the dataset, we will opt to winsorize the data since it will limit the extreme values. But first, we will generate plots for the data.

In [None]:
# Histogram (11 bins) of patient age with a kernel-density overlay
age_fig, age_ax = plt.subplots(figsize = (10,5))
sns.histplot(heart_failure['age'], bins = 11, kde = True, ax = age_ax)
age_ax.set_title("Distribution of Age")

# Save before showing: the figure is written to disk, then rendered in the notebook
age_fig.savefig('AgeHistogram.pdf', dpi = 300)
plt.show()

In [None]:
# Plot one pie chart per binary health indicator, showing the share of each class
health_indicators = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

# Create the figure once, up front, instead of resizing it from inside every pie call
fig = plt.figure(figsize = (10, 5))

for position, var in enumerate(health_indicators, start = 1):

    ax = fig.add_subplot(2, 3, position)
    class_shares = heart_failure[var].value_counts(normalize = True)
    class_shares.plot.pie(ax = ax, autopct = '%1.1f%%')

    # Name each pie after its variable. The automatic y-label comes from the name of the
    # value_counts() result, which is not the variable name in every pandas version,
    # so it is cleared and replaced with an explicit title.
    ax.set_title(var)
    ax.set_ylabel('')

fig.tight_layout()
fig.savefig('PieCharts.pdf', dpi = 300)
plt.show()

In [None]:
# Treat every column with more than 10 distinct values as a continuous feature
continuous = [col for col in heart_failure.columns if heart_failure[col].nunique() > 10]

# Draw one histogram per continuous feature on a 2 x 4 grid
plt.figure(figsize = (10,6))

for position, col in enumerate(continuous, start = 1):
    plt.subplot(2, 4, position)
    plt.hist(heart_failure[col])
    plt.title("Histogram: \n{}".format(col))

plt.tight_layout()
plt.show()

It is evident that none of these features is normally distributed; each one exhibits noticeable skew or otherwise departs from a bell-shaped curve.

In [None]:
# Pairwise correlation coefficients between all columns of the dataset
corr_matrix = heart_failure.corr()

f, ax = plt.subplots(figsize=(15,10))

# Annotated, square-celled heatmap drawn on the axes created above (the current axes)
heatmap_options = dict(square = True, cmap = 'YlGnBu', annot = True, annot_kws = {"size": 12})
heatmap = sns.heatmap(corr_matrix, **heatmap_options)

# Use the column names as tick labels: horizontal on the y-axis, rotated on the x-axis
column_labels = corr_matrix.columns
ax.set_yticklabels(column_labels, rotation = 0)
ax.set_xticklabels(column_labels, rotation = 80)

# NOTE(review): this style change runs after the heatmap has been drawn, so it presumably
# only influences figures created later in the notebook — confirm that is the intent
sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

Let's explore the relationships between variables whose correlation coefficient $c$ satisfies $|c| \geq 0.1$. This entails masking the correlation matrix above and redisplaying the output.

In [None]:
# Keep only the coefficients whose absolute value is at least 0.1; every other cell becomes NaN
new_matrix = corr_matrix.where(corr_matrix.abs() >= 0.1)

g, ax = plt.subplots(figsize = (15,10))

# Annotated heatmap of the masked matrix (masked cells are left blank)
masked_heatmap_options = dict(square = True, cmap = 'coolwarm', annot = True, annot_kws = {"size": 12})
new_heatmap = sns.heatmap(new_matrix, **masked_heatmap_options)

# Use the column names as tick labels: horizontal on the y-axis, rotated on the x-axis
masked_labels = new_matrix.columns
ax.set_yticklabels(masked_labels, rotation = 0)
ax.set_xticklabels(masked_labels, rotation = 80)

# NOTE(review): as in the previous heatmap cell, this is applied after drawing — confirm intent
sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

Now, we can plot barplots for the relationship between the continuous variables and the following categorical variables: `anaemia`, `diabetes`, `high_blood_pressure`, `sex`, `smoking`.

In [None]:
# Plot barplots of every continuous variable against every categorical variable
categorical = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking']

# Exclude the time variable from the list continuous.
# Rebuilding the list (instead of calling .remove) keeps this cell safe to re-run:
# a second .remove('time') would raise ValueError because 'time' is already gone.
continuous = [col for col in continuous if col != 'time']

# Size the grid from the two lists so it stays correct if either list changes:
# one row per continuous variable, one column per categorical variable
n_rows, n_cols = len(continuous), len(categorical)
fig, axes = plt.subplots(n_rows, n_cols, figsize = (15,15), squeeze = False)

for row, var1 in enumerate(continuous):

    for column, var2 in enumerate(categorical):

        sns.barplot(x = var2, y = var1, data = heart_failure, ax = axes[row, column])

fig.tight_layout()
plt.show()

The most intriguing difference here is between `creatinine_phosphokinase` versus `anaemia`, `sex`, and `high_blood_pressure`. Otherwise, there weren't too many other notable differences.

In [None]:
# Scatter plots for the pairs of continuous variables whose correlation
# coefficient is greater than 0.1, drawn side by side in a single row
scatter_pairs = [
    ('ejection_fraction', 'serum_creatinine'),
    ('ejection_fraction', 'serum_sodium'),
    ('serum_creatinine', 'serum_sodium'),
]

plt.figure(figsize = (10, 5))

for position, (x_var, y_var) in enumerate(scatter_pairs, start = 1):
    plt.subplot(1, 3, position)
    sns.scatterplot(x = x_var, y = y_var, data = heart_failure)

plt.tight_layout()
plt.savefig('Scatterplots.pdf', dpi = 300)
plt.show()

In [None]:
# Winsorize the outlier-heavy variables on a copy, so the raw dataframe stays untouched
heart = heart_failure.copy()

# Fraction of the upper tail to clip for each variable (the lower tail is never clipped):
# 0.1 caps values at the 90th percentile, 0.05 at the 95th percentile
upper_tail_fraction = {
    'creatinine_phosphokinase': 0.1,
    'ejection_fraction': 0.05,
    'platelets': 0.1,
    'serum_creatinine': 0.1,
}

# Store each result in a new `winsorized_<name>` column alongside the original
for column, fraction in upper_tail_fraction.items():
    heart['winsorized_' + column] = winsorize(heart[column], [0, fraction])

In [None]:
# Summary statistics of the four winsorized columns, to check whether outliers remain
winsorized_columns = [
    'winsorized_' + name
    for name in ('creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine')
]
heart[winsorized_columns].describe()

This is much more workable than previously, so we can proceed further to the feature engineering phase. First, we'll examine the number of unique entries per variable to get a better sense of the dataset.


In [None]:
# Count the distinct values of every variable, skipping the raw columns that now
# have winsorized counterparts
raw_columns = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine']
unique_counts = heart.drop(columns = raw_columns).nunique()

for column, n_unique in unique_counts.items():
    print("Number of unique values for {}: ".format(column), n_unique)

The following variables are binary: `anaemia`, `diabetes`, `high_blood_pressure`, `sex`,`smoking`, and our target variable `DEATH_EVENT`. This implies that we will be utilizing classification models to make predictions. In particular, we will most likely utilize support vector machines and boosting models. There is a possibility of training a random forest classifier, although this would require additional feature engineering to convert the continuous variables into a suitable form for the random forest classifier.

In [None]:
# Check the dataset for class imbalance (or balance):
# value_counts() tallies how many rows carry each value of the target column DEATH_EVENT

heart['DEATH_EVENT'].value_counts()

As the above demonstrates, the dataset exhibits class imbalance, which we will need to address. There are two approaches we can utilize:

1. We oversample the minority class by resampling its rows with replacement (duplicating existing records rather than generating new ones) until it matches the size of the majority class

2. We undersample the majority class by removing data from the majority class to match the minority class

Of the two approaches, we will opt for the former, as our dataset is relatively small; removal of data may affect the performance of any model we build

In [None]:
# Oversample the minority class (DEATH_EVENT == 1) up to the size of the majority class
Ones = heart.loc[heart['DEATH_EVENT'] == 1]
Zeroes = heart.loc[heart['DEATH_EVENT'] == 0]

# Draw with replacement until there are as many ones as zeroes; the seed fixes the draw
Ones_upsample = resample(Ones,
                         n_samples = Zeroes.shape[0],
                         replace = True,
                         random_state = 31)

# NOTE(review): from here on `heart` contains repeated copies of minority-class rows
heart = pd.concat([Zeroes, Ones_upsample])

In [None]:
# Re-examine the `DEATH_EVENT` column: tally the rows per class again, now on the resampled dataframe
heart.DEATH_EVENT.value_counts()

Now the zero and one classes contain the same number of rows. We can now train our classification models, compare them to select the most accurate base model, and then tune that model's hyperparameters to increase accuracy and, hopefully, minimize false negatives, as this is a heart failure classification (and risk prediction) problem.

In [None]:
# Import sklearn models and performance metrics

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Drop the original columns that we winsorized previously.
# errors = 'ignore' makes the cell safe to re-run: once the columns are gone, a second
# execution is a no-op instead of raising KeyError.
raw_columns = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine']

heart = heart.drop(columns = raw_columns, errors = 'ignore')

What we can do prior to training our models is to compute a correlation matrix to determine which, if any, features are highly correlated with the target variable and, if so, if that correlation is statistically significant.

Based on the correlation matrix above, the features most correlated with the target variable are `time`, `winsorized_serum_creatinine`, `age`, `winsorized_ejection_fraction`, and `serum_sodium`. We can now perform non-parametric tests for statistical significance, since our features are not normally distributed. In particular, we will perform the Kruskal-Wallis test.

In [None]:
# Perform Kruskal-Wallis tests: for each feature, compare its distribution among patients
# who survived (DEATH_EVENT == 0) with its distribution among patients who died (DEATH_EVENT == 1)

from scipy.stats import kruskal

corr_vars = ['age', 'serum_sodium', 'time', 'winsorized_ejection_fraction', 'winsorized_serum_creatinine']

# The samples handed to the test must be the feature's values split by outcome group.
# Passing the feature and the 0/1 target column as the two samples would only test whether
# the feature's values differ from a column of zeroes and ones, which is always the case.
survived = heart[heart['DEATH_EVENT'] == 0]
died = heart[heart['DEATH_EVENT'] == 1]

# NOTE(review): `heart` holds resampled (repeated) rows at this point, which makes the
# p-values smaller than they would be on the original records — interpret with care

for var in corr_vars:

    stat, p_value = kruskal(survived[var], died[var])

    # Report every feature, flagging whether it clears the 0.05 threshold, so that
    # non-significant results are visible instead of silently skipped
    verdict = "significant" if p_value < 0.05 else "not significant"
    print("P-value for {} and Death_Event:\n".format(var), p_value, "({} at the 0.05 level)".format(verdict))
    print('\n')
In each case, since the p-value is much smaller than 0.01, we reject the null hypothesis that the feature shows no statistically significant difference with respect to the target variable.

The first model we'll train is a K Nearest-Neighbors model, but first, we must split the data into training and test sets.

In [None]:
# Split the data into training and test sets, with a test-set size of 30 percent

X = heart.drop('DEATH_EVENT', axis = 1)
Y = heart.DEATH_EVENT

# The oversampled dataframe contains repeated copies of minority-class rows, and the copies
# keep the index label of the record they were drawn from. Splitting row by row would put
# copies of the same record in both the training and the test set, so the test scores would
# partly measure memorisation. Instead, split the distinct index labels and assign every
# copy of a record to the side its label landed on.
is_first_copy = ~heart.index.duplicated()
record_ids = heart.index[is_first_copy].to_numpy()
record_labels = Y[is_first_copy].to_numpy()

train_ids, test_ids = train_test_split(record_ids, test_size = 0.3, random_state = 131,
                                       stratify = record_labels)

# Training set: every copy of the training records, so the oversampling still applies
in_train = heart.index.isin(train_ids)

# Test set: exactly one copy of each test record, so no record is counted twice.
# The test set therefore keeps the class ratio of the distinct records, not a forced 50/50.
in_test = heart.index.isin(test_ids) & is_first_copy

X_train, y_train = X[in_train], Y[in_train]
X_test, y_test = X[in_test], Y[in_test]

In [None]:
# Fit the model.
# K-nearest-neighbours ranks neighbours by distance, so features measured on large scales
# would dominate features measured on small ones. Standardising inside a pipeline puts all
# features on a comparable scale, and the scaler is fitted on the training data only.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 5))
knn.fit(X_train, y_train)

# Make predictions with the model (the pipeline applies the fitted scaling to the test set)
y_pred_knn = knn.predict(X_test)

# Print accuracy, precision, and recall scores for the model
acc_knn = accuracy_score(y_test, y_pred_knn)
pre_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
print(" Accuracy score: {} \n".format(acc_knn),
      "Precision score: {} \n".format(pre_knn),
      "Recall score: {}".format(recall_knn))

This isn't necessarily a great model, but we have a baseline that we can compare to other models going forward. Next, we will build a baseline Random Forest Classifier.

In [None]:
# Instantiate a Random Forest classifier.
# A fixed random_state makes the forest, and therefore every score and plot derived from it,
# identical from one run of the notebook to the next.
rfc = ensemble.RandomForestClassifier(random_state = 131)
rfc.fit(X_train, y_train)

# Print the cross-validation (5-fold) score on the training set
print(cross_val_score(rfc, X_train, y_train, cv = 5))

# Make predictions and then print our performance metrics
y_pred_rfc = rfc.predict(X_test)

acc_rfc = accuracy_score(y_test, y_pred_rfc)
pre_rfc = precision_score(y_test, y_pred_rfc)
recall_rfc = recall_score(y_test, y_pred_rfc)
print(" Accuracy score: {} \n".format(acc_rfc),
      "Precision score: {} \n".format(pre_rfc),
      "Recall score: {}".format(recall_rfc))

This is a great start to a baseline Random forest model that we can tune even further to improve performance. Next, we can move on to building an SVM classifier

In [None]:
# Build an SVM classifier.
# The default SVC kernel is distance-based, so unscaled features with large numeric ranges
# dominate the decision function. Standardising inside a pipeline fixes that, and because the
# scaler lives in the pipeline it is re-fitted on each cross-validation training fold.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svc = make_pipeline(StandardScaler(), SVC())
svc.fit(X_train, y_train)

# Print the cross-validation (5-fold) score of the training set
print(cross_val_score(svc, X_train, y_train, cv = 5))

# Make predictions and then print performance metrics
y_pred_svc = svc.predict(X_test)

acc_svc = accuracy_score(y_test, y_pred_svc)
pre_svc = precision_score(y_test, y_pred_svc)
recall_svc = recall_score(y_test, y_pred_svc)
print(" Accuracy score: {} \n".format(acc_svc),
      "Precision score: {} \n".format(pre_svc),
      "Recall score: {}".format(recall_svc))

Well, this is somewhat disappointing. We'll proceed to building a boosting classifier.

In [None]:
# Build and fit a Gradient Boosting classifier with default parameters as a baseline model.
# Only random_state is set, so that the scores below are reproducible across runs.
gbc = ensemble.GradientBoostingClassifier(random_state = 131)
gbc.fit(X_train, y_train)

# Print cross-validation (5-fold) score for the training set
print(cross_val_score(gbc, X_train, y_train, cv = 5))

# Make predictions and then print performance metrics
y_pred_gb = gbc.predict(X_test)

acc_gb = accuracy_score(y_test, y_pred_gb)
pre_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)
print(" Accuracy score: {} \n".format(acc_gb),
      "Precision score: {} \n".format(pre_gb),
      "Recall score: {}".format(recall_gb))

Similar to the random forest classifier we trained earlier, this too is a great baseline model whose hyperparameters we can tune to improve upon it. Going forward, we can compare the random forest and the gradient boosting classifiers after tuning their respective hyperparameters. An argument can be made that we should go forward solely with the random forest classifier for this problem: since it pertains to heart failure classification, we would prefer higher precision and recall scores, to maximize the percentage of correct positive classifications and minimize the percentage of false negative classifications. On this note, we can print out the confusion matrices to better evaluate which model minimizes false negative classifications.

In [None]:
# Compute the confusion matrix of each baseline model on the test set
confusion_knn = confusion_matrix(y_test, y_pred_knn)
confusion_rf = confusion_matrix(y_test, y_pred_rfc)
confusion_svc = confusion_matrix(y_test, y_pred_svc)
confusion_gb = confusion_matrix(y_test, y_pred_gb)

# Panels in display order (left to right, top to bottom) on a 2 x 2 grid
panels = [
    ("KNN", confusion_knn),
    ("Random Forest", confusion_rf),
    ("Gradient Boosting", confusion_gb),
    ("Support Vector Machine", confusion_svc),
]

plt.figure(figsize = (15,10))

for position, (model_name, matrix) in enumerate(panels, start = 1):
    plt.subplot(2, 2, position)
    sns.heatmap(matrix, annot = True, cmap = "YlGnBu")
    plt.title("Confusion Matrix: \n{}".format(model_name))

plt.tight_layout()
plt.savefig('ConfusionMatrices.pdf', dpi = 300)
plt.show()

Based upon the above confusion matrices, the two methods are fairly similar; however, the random forest classifier does possess fewer false negative classifications than the gradient boosting classifier. Hence, we will proceed with the former method for our model and tune its hyperparameters.

In [None]:
from yellowbrick.model_selection import feature_importances

# Display the feature importances of the two fitted tree ensembles, one panel each:
# random forest on top, gradient boosting underneath
fig, (ax_rf, ax_gb) = plt.subplots(2, 1, figsize = (10, 10))

# show = False stops the quick method from rendering the figure as soon as it is called.
# With the default (show = True) each call displays its chart immediately, so the two
# panels never end up together and the savefig below does not capture both charts.
viz = feature_importances(rfc, X_train, y_train, ax = ax_rf, show = False)
viz2 = feature_importances(gbc, X_train, y_train, ax = ax_gb, show = False)

fig.tight_layout()
fig.savefig('Feature_Importances.pdf', dpi = 300)
plt.show()

The five most important features for the Random Forest and Gradient Boosting classifiers were the following, in respective order:

RF: `time`, `winsorized_serum_creatinine`, `winsorized_ejection_fraction`, `winsorized_platelets`, and `age`

GB: `time`, `winsorized_serum_creatinine`, `winsorized_ejection_fraction`, `winsorized_platelets`, and `winsorized_creatinine_phosphokinase`

The importance of the remaining features decays to zero much more quickly for the Gradient Boosting classifier, indicating that if we opted to proceed with a boosting model, we could reduce the dimensionality further.

In [None]:
# Use a Random GridSearch with k-fold cross-validation to search for the best hyperparameters

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Set up the parameter grid.
# max_features: 'auto' is not accepted by current scikit-learn releases, and in the releases
# that did accept it, it meant the same as 'sqrt' for classifiers, so the old grid only ever
# tried one setting. 'sqrt' versus None (use every feature) gives two distinct options.

params = {"n_estimators": [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)],
         "max_features": ['sqrt', None],
         "max_depth": [int(x) for x in np.linspace(start = 10, stop = 110, num = 11)],
         "min_samples_split": [2, 5, 10],
         "min_samples_leaf": [1, 2, 4],
         "bootstrap": [True, False],
         }

# The full grid holds 10 * 2 * 11 * 3 * 3 * 2 = 3960 settings; the randomized search
# below samples 100 of them. The base model is seeded so the search is reproducible.
rf = ensemble.RandomForestClassifier(random_state = 43)

# Instantiate a randomized GridSearch model with 5-fold cross-validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = params, n_iter = 100, cv = 5, verbose = 2, random_state = 43,  n_jobs =-1)

# Fit the random search model to the training set and output the best parameters
rf_random.fit(X_train, y_train)
rf_random.best_params_

With these hyperparameter values in hand, we can use a more concentrated Grid Search.

In [None]:
# Create a parameter grid based upon the results from the randomized search.
# NOTE(review): these values are hard-coded from an earlier run of the randomized search;
# re-check them against rf_random.best_params_ whenever that search is re-run.

param_grid = {'bootstrap': [False],
             'max_features': [2, 3],
             'max_depth': [110, 120, 130, 140, 150],
             'min_samples_leaf': [1, 2, 4],
             'min_samples_split': [2, 3],
             'n_estimators': [100, 200, 300, 1000]
             }

# Create our base random forest model, seeded so that the selected hyperparameters
# and the resulting scores are the same on every run
rf = ensemble.RandomForestClassifier(random_state = 43)

# Instantiate the Grid Search model: exhaustive search over param_grid with 5-fold
# cross-validation; error_score = 'raise' surfaces any failing fit instead of hiding it
GridSearch = GridSearchCV(estimator = rf, param_grid = param_grid,
                         cv = 5, n_jobs = -1, verbose = 2, error_score = 'raise')

In [None]:
# Fit the Grid Search to the data and determine the best values for the hyperparameters.
# The search is run on the training set only; best_params_ is the last expression of the
# cell, so the winning parameter combination is rendered as the cell output.
GridSearch.fit(X_train, y_train)
GridSearch.best_params_

In [None]:
# Predict on the test set with the fitted Grid Search object
y_pred_gridcv = GridSearch.predict(X_test)

# Score the predictions against the true test labels
acc_gridcv = accuracy_score(y_test, y_pred_gridcv)
pre_gridcv = precision_score(y_test, y_pred_gridcv)
recall_gridcv = recall_score(y_test, y_pred_gridcv)

# Print the three metrics, one per line
metric_lines = [
    " Accuracy score: {} \n".format(acc_gridcv),
    "Precision score: {} \n".format(pre_gridcv),
    "Recall score: {}".format(recall_gridcv),
]
print(*metric_lines)

In [None]:
# Side-by-side confusion matrices: baseline random forest (left) versus the
# Grid Search model (right), both evaluated on the test set
comparison_panels = [
    ("Confusion Matrix: Original Model", confusion_rf),
    ("Confusion Matrix: Grid Search", confusion_matrix(y_test, y_pred_gridcv)),
]

plt.figure(figsize = (15, 10))

for position, (panel_title, matrix) in enumerate(comparison_panels, start = 1):
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot = True, cmap = 'YlGnBu')
    plt.title(panel_title)

plt.tight_layout()
plt.savefig('OGvsGS.pdf', dpi = 300)
plt.show()