In [None]:
import pandas as pd
import csv
import re
from pandas import read_csv
import datetime
import numpy as np

##For Analysis
import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE, RFECV
from sklearn.feature_selection import SelectFromModel
import shap

##
## ===> Visualization <===
##
import seaborn as sns
# Apply seaborn's default theme with the 'whitegrid' axes style in one call.
sns.set(style='whitegrid')
import matplotlib.pyplot as plt

# Raise pandas' display limits (line width, number of columns, number of rows).
for _option_name, _option_value in (
    ('display.width', 1000),
    ('display.max_columns', 300),
    ('display.max_rows', 1000),
):
    pd.set_option(_option_name, _option_value)

In [None]:
#Dataset taken from https://www.kaggle.com/andrewmvd/heart-failure-clinical-data

In [None]:
#Importing Data: read the heart-failure clinical records CSV into a DataFrame.
heart_failure_dataset = pd.read_csv("../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

#DataFrame.info() writes its summary to stdout and returns None, so it is called
#directly; wrapping it in print() would emit an extra "None" line.
heart_failure_dataset.info() #299 Observations

In [None]:
#Preview the first five rows of the raw dataset.
heart_failure_dataset.head(n=5)

In [None]:
#Separate the response (kept as a one-column DataFrame) from the predictors.
y = heart_failure_dataset[['DEATH_EVENT']]
X = heart_failure_dataset.drop(columns=['DEATH_EVENT'])

##Hold out 20% of the rows for testing and keep 80% for training.
##stratify=y preserves the DEATH_EVENT class proportions in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    train_size=0.8,
    random_state=2021,
    stratify=y,
)

In [None]:
#General information for the training predictors (DataFrame.info() prints the summary to stdout).
X_train.info() #239 observations in the training set.

In [None]:
##Histograms of all numerical variables of interest (training data only).
num_variables = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]
#One 10-bin histogram per column, arranged on a 3x5 grid of axes.
X_train.loc[:, num_variables].hist(bins=10, figsize=(20, 15), layout=(3, 5))

In [None]:
#Summary statistics for the numerical training columns listed in num_variables.
X_train.loc[:, num_variables].describe()

In [None]:
#Put the training predictors and the training response side by side for exploration.
combined_traindata = pd.concat([X_train, y_train], axis='columns')

In [None]:
#Cast the indicator columns 'anaemia', 'diabetes', 'high_blood_pressure', 'sex' and 'smoking'
#to pandas' categorical dtype so they are summarised as categories rather than numbers.
for _indicator_column in ('anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'):
    combined_traindata[_indicator_column] = combined_traindata[_indicator_column].astype('category')

In [None]:
#Summary statistics for the categorical variables.
cat_variables = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]
combined_traindata.loc[:, cat_variables].describe()

In [None]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#Baseline gradient boosting classifier trained on every predictor.
base_gb_model1 = GradientBoostingClassifier(n_estimators=100, random_state=2021)
#fit() returns the estimator itself; the response is flattened to 1-D as the estimator expects.
base_gb_model1 = base_gb_model1.fit(X_train, y_train.values.ravel())

In [None]:
#Cross-Validation Accuracy Score (10 folds; cross_val_score fits a fresh clone per fold).
base_gb_model1_cvs = cross_val_score(base_gb_model1, X_train, np.ravel(y_train), cv=10)

#A notebook cell only displays its last expression, so min/mean/max are gathered
#into a single Series instead of being evaluated on separate lines and discarded.
pd.Series(base_gb_model1_cvs, name='cv_accuracy').agg(['min', 'mean', 'max']) #recorded mean: 0.7952898550724637

In [None]:
#Accuracy of the baseline model on the held-out test set.
accuracy_score(y_test, base_gb_model1.predict(X_test)) #0.83

In [None]:
#Predicted class labels for the test set from the baseline model.
y_pred = base_gb_model1.predict(X=X_test)

In [None]:
#Per-class precision / recall / F1 summary for the baseline GB classification model.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
#Confusion matrix for the baseline GB classification model.
#Layout (rows = actual, columns = predicted):
#  upper-left  = True Negatives (TN)    upper-right = False Positives (FP)
#  lower-left  = False Negatives (FN)   lower-right = True Positives (TP)
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
#Determining AUC score for the GB Classification Model.
#ROC AUC measures how well the model ranks positives above negatives, so it needs
#continuous scores. Passing hard 0/1 predictions collapses the ROC curve to a single
#operating point; the predicted probability of the positive class is used instead.
#Column 1 of predict_proba corresponds to classes_[1], the larger class label.
#NOTE: 0.8074454428754815 was the value previously recorded from hard label predictions.
roc_auc_score(np.ravel(y_test), base_gb_model1.predict_proba(X_test)[:, 1])

In [None]:
#F1 score (positive class only) for the baseline GB classification model.
#F1 ranges from 0.0 (poor) to 1.0 (best).
f1_score(y_true=y_test, y_pred=y_pred, average='binary') #0.7368421052631579

In [None]:
#Numerical importance of predictors (impurity-based feature_importances_ of the baseline model).
initial_feature_list = [*X_train.columns]
rfr_importance = [*base_gb_model1.feature_importances_]

#Pair each predictor with its importance rounded to two decimals, then order the
#pairs from most to least important (ties keep their original column order).
var_importance_merge = sorted(
    ((feature_name, round(feature_score, 2))
     for feature_name, feature_score in zip(initial_feature_list, rfr_importance)),
    key=lambda pair: pair[1],
    reverse=True,
)

print(var_importance_merge)

In [None]:
#Tabulate the (predictor, importance) pairs so they can be plotted.
df_importance = pd.DataFrame(data=var_importance_merge, columns=['PREDICTOR', 'IMPORTANCE_LEVEL'])

#Horizontal bar chart of predictors ranked by importance.
sns.catplot(
    data=df_importance,
    x="IMPORTANCE_LEVEL",
    y='PREDICTOR',
    kind="bar",
    height=14,
)

In [None]:
#Feature selection with 'SelectFromModel', driven by the baseline GB model's importances.
gb_model_fs = SelectFromModel(estimator=base_gb_model1)
#fit() returns the selector itself.
gb_model_fs = gb_model_fs.fit(X_train, y_train.values.ravel())

In [None]:
#Package example reference: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html
#get_support() yields a boolean mask over the columns; use it to pick the retained names.
_selected_mask = gb_model_fs.get_support()
gb_feature_names = X_train.columns[_selected_mask]
print(gb_feature_names)

In [None]:
#Restrict the training and test predictors to the columns used for model building.
#NOTE(review): presumably these are the columns reported by SelectFromModel above — verify
#against gb_feature_names if the data or seed changes.
_reduced_columns = ['ejection_fraction', 'serum_creatinine', 'time']
X_train_reduced = X_train.filter(items=_reduced_columns)
X_test_reduced = X_test.filter(items=_reduced_columns)

In [None]:
#Checking the reduced predictors for multicollinearity via variance inflation factors.
from statsmodels.stats.outliers_influence import variance_inflation_factor

#An intercept column is added for the VIF computation only; it is not used for modelling.
X_train_constant_vif = sm.add_constant(X_train_reduced)

#One VIF per design-matrix column (position 0 is the added constant).
vif = []
for _column_position in range(X_train_constant_vif.shape[1]):
    vif.append(variance_inflation_factor(X_train_constant_vif.values, _column_position))

#Drop the constant's entry and show one VIF per predictor.
#Multicollinearity is interpreted as high when VIF > 5.
pd.DataFrame({'vif': vif[1:]}, index=X_train_reduced.columns).T

In [None]:
#Gradient boosting classifier trained on the reduced predictor set.
base_gb_model2 = GradientBoostingClassifier(n_estimators=100, random_state=2021)
#fit() returns the estimator itself; the response is flattened to 1-D as the estimator expects.
base_gb_model2 = base_gb_model2.fit(X_train_reduced, y_train.values.ravel())

In [None]:
#Cross-Validation Accuracy Score (10 folds; cross_val_score fits a fresh clone per fold).
base_gb_model2_cvs = cross_val_score(base_gb_model2, X_train_reduced, np.ravel(y_train), cv=10)

#A notebook cell only displays its last expression, so min/mean/max are gathered
#into a single Series instead of being evaluated on separate lines and discarded.
pd.Series(base_gb_model2_cvs, name='cv_accuracy').agg(['min', 'mean', 'max']) #recorded mean: 0.8454710144927535

In [None]:
#Accuracy of the reduced-feature model on the held-out test set.
accuracy_score(y_test, base_gb_model2.predict(X_test_reduced)) #0.83

In [None]:
#Predicted class labels for the test set from the reduced-feature model.
y_pred2 = base_gb_model2.predict(X=X_test_reduced)

In [None]:
#Per-class precision / recall / F1 summary for the reduced-feature GB classification model.
print(classification_report(y_true=y_test, y_pred=y_pred2))

In [None]:
#Confusion matrix for the reduced-feature GB classification model.
#Layout (rows = actual, columns = predicted):
#  upper-left  = True Negatives (TN)    upper-right = False Positives (FP)
#  lower-left  = False Negatives (FN)   lower-right = True Positives (TP)
confusion_matrix(y_true=y_test, y_pred=y_pred2)

In [None]:
#Determining AUC score for the GB Classification model.
#ROC AUC measures how well the model ranks positives above negatives, so it needs
#continuous scores. Passing hard 0/1 predictions collapses the ROC curve to a single
#operating point; the predicted probability of the positive class is used instead.
#Column 1 of predict_proba corresponds to classes_[1], the larger class label.
#NOTE: 0.8215661103979461 was the value previously recorded from hard label predictions.
roc_auc_score(np.ravel(y_test), base_gb_model2.predict_proba(X_test_reduced)[:, 1])

In [None]:
#F1 score (positive class only) for the reduced-feature GB classification model.
f1_score(y_true=y_test, y_pred=y_pred2, average='binary') #0.7500000000000001

In [None]:
##Hyperparameter tuning with GridSearchCV
##Parameters searched: max_depth, n_estimators, learning_rate.
#Code referenced as guide for tuning procedure: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
param_grid = dict(
    max_depth=[6, 8, 10],
    n_estimators=[60, 80, 100],
    learning_rate=[0.10, 0.15, 0.20],
)

#Unfitted estimator that the search clones for every parameter combination.
gb_cm = GradientBoostingClassifier(random_state=2021)

#Exhaustive search over param_grid with 10-fold CV, using all available cores.
gb_gscv = GridSearchCV(
    estimator=gb_cm,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
##Run the grid search on the reduced training predictors (response flattened to 1-D).
gb_gscv.fit(X=X_train_reduced, y=np.ravel(y_train))

In [None]:
##Extracting the best parameter combination found by the grid search (GSCV).
gb_gscv.best_params_ #{'learning_rate': 0.15, 'max_depth': 6, 'n_estimators': 80}

In [None]:
##Re-fitting the gradient boosting classification model with the hypertuned parameters.
#best_params_ is unpacked directly so the model always matches the search result;
#previously n_estimators was hard-coded to 100 although the recorded best value was 80.
#NOTE: metrics recorded in later cells were produced with n_estimators=100 and should be re-recorded.
base_gb_model3 = GradientBoostingClassifier(random_state=2021, **gb_gscv.best_params_).fit(X_train_reduced, np.ravel(y_train))

In [None]:
#Cross-Validation Accuracy Score (10 folds; cross_val_score fits a fresh clone per fold).
base_gb_model3_cvs = cross_val_score(base_gb_model3, X_train_reduced, np.ravel(y_train), cv=10)

#A notebook cell only displays its last expression, so min/mean/max are gathered
#into a single Series instead of being evaluated on separate lines and discarded.
pd.Series(base_gb_model3_cvs, name='cv_accuracy').agg(['min', 'mean', 'max']) #recorded mean: 0.8414855072463767

In [None]:
#Predicted class labels for the test set from the tuned model.
y_pred3 = base_gb_model3.predict(X=X_test_reduced)

In [None]:
#Accuracy of the tuned model on the held-out test set.
accuracy_score(y_test, base_gb_model3.predict(X_test_reduced)) #0.85

In [None]:
#Per-class precision / recall / F1 summary for the tuned GB classification model.
print(classification_report(y_true=y_test, y_pred=y_pred3))

In [None]:
#Confusion matrix for the tuned GB classification model.
#Layout (rows = actual, columns = predicted):
#  upper-left  = True Negatives (TN)    upper-right = False Positives (FP)
#  lower-left  = False Negatives (FN)   lower-right = True Positives (TP)
confusion_matrix(y_true=y_test, y_pred=y_pred3)

In [None]:
#Determining AUC score for GB classification model.
#ROC AUC measures how well the model ranks positives above negatives, so it needs
#continuous scores. Passing hard 0/1 predictions collapses the ROC curve to a single
#operating point; the predicted probability of the positive class is used instead.
#Column 1 of predict_proba corresponds to classes_[1], the larger class label.
#NOTE: 0.8478818998716303 was the value previously recorded from hard label predictions.
roc_auc_score(np.ravel(y_test), base_gb_model3.predict_proba(X_test_reduced)[:, 1])

In [None]:
#F1 score (positive class only) for the tuned GB classification model.
#F1 ranges from 0.0 (poor) to 1.0 (best).
f1_score(y_true=y_test, y_pred=y_pred3, average='binary') #0.7804878048780488

In [None]:
#Numerical importance of predictors (impurity-based feature_importances_ of the tuned model).
final_feature_list = [*X_train_reduced.columns]
final_rfr_importance = [*base_gb_model3.feature_importances_]

#Pair each predictor with its importance rounded to two decimals, then order the
#pairs from most to least important (ties keep their original column order).
var_importance_merge_final = sorted(
    ((feature_name, round(feature_score, 2))
     for feature_name, feature_score in zip(final_feature_list, final_rfr_importance)),
    key=lambda pair: pair[1],
    reverse=True,
)

print(var_importance_merge_final)

In [None]:
#Tabulate the final (predictor, importance) pairs so they can be plotted.
df_importance_final = pd.DataFrame(data=var_importance_merge_final, columns=['PREDICTOR', 'IMPORTANCE_LEVEL'])

#Horizontal bar chart of the retained predictors ranked by importance.
sns.catplot(
    data=df_importance_final,
    x="IMPORTANCE_LEVEL",
    y='PREDICTOR',
    kind="bar",
    height=14,
)