## SHAP ANALYSIS

In [None]:
import pandas as pd
import numpy as np
import impyute as impy

from collections import Counter

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, f1_score, roc_curve, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict

from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

import shap
# Load SHAP's JavaScript support so the interactive plots used below
# (e.g. shap.force_plot) can render inside the notebook.
shap.initjs()

In [None]:
# Load the raw table. Later cells read its 'hour' and 'label' columns.
X = pd.read_csv(filepath_or_buffer='/path/X.csv')

In [None]:
# Prepare data: keep every fourth row starting at positional index 3,
# order the rows by 'hour', and keep only the columns that have at least
# 1800 non-missing values.
X = (
    X.iloc[3::4, :]
    .sort_values(by='hour')
    .dropna(axis=1, thresh=1800)
)

In [None]:
# Split off the target; pop() also removes the 'label' column from X.
y = X.pop('label')

# Random undersampling, printing the per-class counts before and after.
print('Original dataset shape %s' % Counter(y))
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))

# Put the resampled label back as the last column of the feature table.
# NOTE(review): from here on 'label' is one of the columns that gets
# imputed, scaled, clustered and fed to the classifier - confirm that
# this is intended.
X_res = X_res.assign(label=y_res)

In [None]:
# Imputation: run impyute's median imputation on the underlying array
# of the resampled table.
Xv = impy.median(X_res.values)

# Convert back to a DataFrame with the same column labels as X_res
# (the row index is a fresh default index).
km_X = pd.DataFrame(Xv, columns=X_res.columns)

# Forward-fill whatever is still missing after the median imputation.
# The result has to be assigned back: DataFrame.fillna/ffill return a new
# frame, so calling it as a bare expression left km_X unchanged.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
km_X = km_X.ffill()

# Keep the table as the cell's displayed output.
km_X

In [None]:
# Standardise the imputed features before KMeans.
# NOTE: this rebinds X from the raw DataFrame to the scaler's output.
scaler = StandardScaler()
X = scaler.fit_transform(km_X)

In [None]:
# KMeans with k=2 on the imputed, standardised data.

k = 2
# A fixed seed makes the centroid initialisation, and therefore the
# inertia and silhouette printed below, reproducible between runs.
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X)

# Within-cluster sum of squares, then the mean silhouette coefficient.
print(kmeans.inertia_)
print(silhouette_score(X, kmeans.labels_))

In [None]:
# Prepare the classes for the random forest: the KMeans cluster
# assignment of each row becomes the classification target.
# NOTE: this rebinds y from the original 'label' series to cluster ids.

# Seeded so that which cluster is called 0 and which is called 1 stays
# the same between runs; the class-indexed SHAP plots further down
# depend on that numbering.
kmeanModel = KMeans(n_clusters=2, random_state=42)
y = kmeanModel.fit(X).labels_

In [None]:
# Random Forest classifier wrapped in a single-step pipeline, plus the
# hyper-parameter space for a randomized search.

model_rf = RandomForestClassifier(random_state=1)
rf_pipeline = Pipeline([
    ('model_rf', model_rf),
])

# Keys use the '<step name>__<parameter>' form so they reach the
# estimator inside the pipeline.
rf_param = {
    'model_rf__max_depth': [2, 3, 5, 7, 9],
    'model_rf__max_features': ['sqrt', 'log2'],
    'model_rf__n_estimators': [10, 100, 500],
}

# The search samples a subset of the combinations above; seeding it (with
# the same seed as the estimator) makes the sampled candidates, and so
# the reported best configuration, reproducible.
rf_random = RandomizedSearchCV(
    rf_pipeline,
    rf_param,
    scoring='f1',
    random_state=1,
)

In [None]:
# Run the randomized search and record the F1 score of the refitted
# best estimator.
rf_best_f1_score = float('-inf')
rf_best_parameters = {}

# Random Forest
search = rf_random
result = search.fit(km_X, y)
best_model = result.best_estimator_

# NOTE(review): predictions are made on the same rows the search was
# fitted on, so f1score is a training-data score, whereas the value
# printed below (best_score_) is the search's cross-validated score.
yhat = best_model.predict(km_X)
f1score = f1_score(y, yhat)

# With the -inf starting value and a single search this comparison
# always succeeds; it looks like a leftover from a loop over several
# outer folds - confirm.
if f1score >= rf_best_f1_score:
    rf_best_f1_score, rf_best_parameters = f1score, result.best_params_

rf_outer_results = [f1score]
print('Random Forrest test: est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))

In [None]:
# Fit the final classifier on the full imputed table.
# NOTE(review): the hyper-parameters are written out by hand here rather
# than read from rf_best_parameters - presumably copied from an earlier
# search run; confirm they still match.
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=500,
    max_depth=9,
    max_features='log2',
)
clf.fit(km_X, y)

In [None]:
# ROC curve, step 1: per-row class probabilities from 3-fold
# cross-validation.
y_probas_forest = cross_val_predict(
    estimator=clf,
    X=km_X,
    y=y,
    cv=3,
    method='predict_proba',
)

In [None]:
# ROC curve, step 2: use the second probability column (class 1) as the
# score and compute the curve's points and thresholds.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(
    y_true=y,
    y_score=y_scores_forest,
)

In [None]:
# Area under the ROC curve, computed from the cross-validated class-1
# scores (y_scores_forest) so that it matches the ROC curve above.
# Scoring clf.predict_proba(km_X) instead would evaluate the model on
# the very rows it was fitted on and overstate the AUC.
roc_auc_score(y, y_scores_forest)

In [None]:
# Force plot for all observations, for class index 1.

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(km_X)

# Older SHAP releases return one (rows, features) array per class in a
# list; newer releases return a single (rows, features, classes) array,
# on which `shap_values[1]` would select row 1 rather than class 1.
# Normalise to the per-class list so the class indexing used in this
# notebook works with either return form.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, i] for i in range(shap_values.shape[2])]

shap.force_plot(explainer.expected_value[1], shap_values[1], km_X)

In [None]:
# SHAP

#shap_interaction_values = explainer.shap_interaction_values(km_X)

In [None]:
# Summary plot of the SHAP values for class index 1.
# NOTE(review): the original heading here was "sepsis +", but clf is
# trained on the KMeans cluster ids, so index 1 is cluster 1 - confirm
# which cluster corresponds to sepsis-positive before reading it that way.
shap.summary_plot(shap_values[1], features=km_X)

In [None]:
# Summary plot of the SHAP values for class index 0.
# NOTE(review): the original heading here was "sepsis -", but clf is
# trained on the KMeans cluster ids, so index 0 is cluster 0 - confirm
# which cluster corresponds to sepsis-negative before reading it that way.
shap.summary_plot(shap_values[0], features=km_X)