# Calculate SHAP values from the DeepIMPACT Models

Shubhayu Bhattacharyay
<br>
Ari Ercole

## I. Initialization

 ### Import necessary packages

In [1]:
# Fundamental methods
import os
import sys
import json
import time
import glob
import random
import warnings
import itertools
import numpy as np
import pandas as pd
import pickle as cp
import seaborn as sns
from scipy import stats
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.display import clear_output
warnings.filterwarnings(action="ignore")

# TensorFlow/Keras methods (neural network methods)
import tensorflow as tf
tf.compat.v1.enable_eager_execution()
import tensorflow.python.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Input, Activation, Dense, Dropout, Conv2D, Flatten, LSTM, Permute, Reshape, AlphaDropout, BatchNormalization

# Keras Tuner methods
import kerastuner as kt
from kerastuner.tuners import RandomSearch, Hyperband

# Import scikit-learn stratified splitting and label/feature preprocessing methods
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelBinarizer, PowerTransformer, label_binarize

# Import SHAP modules for SHAP assessment
import shap

# Load custom functions
%run -i 'functions/ordinal_encoding.py'
%run -i 'functions/multiclass_metrics.py'

## II. Implementation of SHAP DeepExplainer Algorithm

In [None]:
# Compute SHAP values for the optimal deepMN and deepOR models of every repeat/fold
# partition and pool them into one CSV file per model type.

# Load optimal deepMN and deepOR model configurations per each repeat
opt_tune_idx_deepMN = pd.read_csv('../metrics/deepMN_optimal_repeatedCV_aurocs.csv')
opt_tune_idx_deepOR = pd.read_csv('../metrics/deepOR_optimal_repeatedCV_aurocs.csv')

# GOSE classes used to binarize the outcome labels for stratified sampling
GOSE_CLASSES = [1,3,4,5,6,7,8]

# Columns of the normalized dataframes that are not model predictors
NON_PREDICTOR_COLUMNS = ['entity_id','PatientType', 'GCS','GOSE']

# Tuning indices at or above this value use the SMOTEd training set as SHAP background
# NOTE(review): presumably these tuning configurations were trained on SMOTEd data - confirm against the tuning grid
SMOTE_TUNE_IDX_THRESHOLD = 118

# Number of training observations used as the SHAP background and number of testing observations explained, per fold
N_BACKGROUND_SAMPLES = 100
N_EXPLAINED_SAMPLES = 30

# Directory in which the pooled SHAP values are saved
SHAP_OUTPUT_DIR = '../repeated_cv/compiled_shap_values'


def _draw_stratified_subset(feature_matrix, binarized_labels, n_samples):
    """Return a random, outcome-stratified subset of `n_samples` rows of `feature_matrix`.

    No random seed is set, so a different subset is drawn on every call.
    """
    sss = StratifiedShuffleSplit(n_splits=1, train_size=n_samples)
    subset_index, _ = next(sss.split(feature_matrix, binarized_labels))
    return feature_matrix[subset_index]


def _shap_values_to_frame(shap_values, predictor_names, repeat_name, fold_name, tune_idx):
    """Stack the per-output-node SHAP arrays of one fold into a single long dataframe.

    Each element of `shap_values` becomes a set of rows (one column per predictor) tagged
    with the repeat name, fold name, tuning index, and 1-based output node number.
    """
    node_frames = []
    for node_number, node_shap in enumerate(shap_values, start=1):
        curr_shap = pd.DataFrame(node_shap, columns=predictor_names)
        curr_shap['repeat.name'] = repeat_name
        curr_shap['fold.name'] = fold_name
        curr_shap['tune.idx'] = tune_idx
        curr_shap['node'] = node_number
        node_frames.append(curr_shap)
    if not node_frames:
        # Nothing was explained: return an empty frame that still carries the expected columns
        return pd.DataFrame(columns=[*predictor_names, 'repeat.name', 'fold.name', 'tune.idx', 'node'])
    # Concatenate once instead of appending inside the loop (DataFrame.append was removed in pandas 2.0)
    return pd.concat(node_frames, ignore_index=True)


# Initialize empty lists to store the SHAP value dataframes of each fold
fold_shap_frames_deepMN = []
fold_shap_frames_deepOR = []

# Initialize dynamic display messages for status updates
repeat_status = display('',display_id=True)
fold_status = display('',display_id=True)

# Iterate across repeat directories
for curr_repeat_name in opt_tune_idx_deepMN['repeat.name'].unique():
    curr_opt_tune_idx_deepMN = int(opt_tune_idx_deepMN.tune_idx[opt_tune_idx_deepMN['repeat.name'] == curr_repeat_name].values[0])
    curr_opt_tune_idx_deepOR = int(opt_tune_idx_deepOR.tune_idx[opt_tune_idx_deepOR['repeat.name'] == curr_repeat_name].values[0])

    # Update repeat status message
    repeat_status.update(curr_repeat_name + ' started.')

    # Derive list of fold subdirectories in current repeat directory (sorted for a reproducible row order)
    fold_dirs = sorted(glob.glob(os.path.join('../repeated_cv',curr_repeat_name,'Fold*/')))

    # Loop through current list of fold directories
    for curr_fold_dir in fold_dirs:

        # Derive the fold name from the directory itself rather than from fixed character
        # positions, so repeat names of any length and multi-digit folds are handled
        curr_fold_name = os.path.basename(os.path.normpath(curr_fold_dir))

        # Update fold status message
        fold_status.update(curr_fold_name + ' started.')

        # Load the optimal models of the current repeat from the current fold directory
        deepMN = load_model(os.path.join(curr_fold_dir,'trained_models','deepMN','deepMN_tuning_'+str(curr_opt_tune_idx_deepMN).zfill(3)+'.h5'),custom_objects={'f1_score_m': f1_score_m})
        deepOR = load_model(os.path.join(curr_fold_dir,'trained_models','deepOR','deepOR_tuning_'+str(curr_opt_tune_idx_deepOR).zfill(3)+'.h5'),custom_objects={'f1_score_m': f1_score_m})

        # Load normalized training set, binarize outcome labels, and extract predictor matrix
        curr_norm_training_set = pd.read_csv(os.path.join(curr_fold_dir,'norm_train_dataframe.csv'))
        curr_norm_training_deepMN_labels = label_binarize(curr_norm_training_set.GOSE.values,classes=GOSE_CLASSES)
        curr_norm_training_predictors = curr_norm_training_set.drop(columns=NON_PREDICTOR_COLUMNS)
        curr_predictor_names = curr_norm_training_predictors.columns
        curr_norm_training_matrix = curr_norm_training_predictors.values

        # Load SMOTEd normalized training set, binarize outcome labels, and extract predictor matrix
        # NOTE(review): assumes the SMOTEd file holds only the predictors (in the same order) plus GOSE - confirm
        curr_smote_norm_training_set = pd.read_csv(os.path.join(curr_fold_dir,'smote_norm_train_dataframe.csv'))
        curr_smote_norm_training_deepMN_labels = label_binarize(curr_smote_norm_training_set.GOSE.values,classes=GOSE_CLASSES)
        curr_smote_norm_training_matrix = curr_smote_norm_training_set.drop(columns=['GOSE']).values

        # Extract random, stratified background subsets from the plain and SMOTEd training sets
        X_norm_train = _draw_stratified_subset(curr_norm_training_matrix, curr_norm_training_deepMN_labels, N_BACKGROUND_SAMPLES)
        X_smote_norm_train = _draw_stratified_subset(curr_smote_norm_training_matrix, curr_smote_norm_training_deepMN_labels, N_BACKGROUND_SAMPLES)

        # Establish SHAP deep explainer objects, using the SMOTEd background subset for tuning
        # indices at or above the threshold and the plain background subset otherwise
        background_deepMN = X_smote_norm_train if curr_opt_tune_idx_deepMN >= SMOTE_TUNE_IDX_THRESHOLD else X_norm_train
        background_deepOR = X_smote_norm_train if curr_opt_tune_idx_deepOR >= SMOTE_TUNE_IDX_THRESHOLD else X_norm_train
        exp_deepMN = shap.DeepExplainer(deepMN, background_deepMN)
        exp_deepOR = shap.DeepExplainer(deepOR, background_deepOR)

        # Load testing set from current fold and create formatted labels
        curr_norm_testing_set = pd.read_csv(os.path.join(curr_fold_dir,'norm_test_dataframe.csv'))
        curr_norm_testing_labels = label_binarize(curr_norm_testing_set.GOSE.values,classes=GOSE_CLASSES)
        curr_norm_testing_matrix = curr_norm_testing_set.drop(columns=NON_PREDICTOR_COLUMNS).values

        # Extract random, stratified testing subset for SHAP value explanation
        X_test = _draw_stratified_subset(curr_norm_testing_matrix, curr_norm_testing_labels, N_EXPLAINED_SAMPLES)

        # Use trained SHAP explainers to explain predictors in new test set
        shap_values_deepMN = exp_deepMN.shap_values(X_test)
        shap_values_deepOR = exp_deepOR.shap_values(X_test)

        # Format SHAP values of the current fold and store them for pooling
        curr_fold_shap_values_deepMN = _shap_values_to_frame(shap_values_deepMN, curr_predictor_names, curr_repeat_name, curr_fold_name, curr_opt_tune_idx_deepMN)
        curr_fold_shap_values_deepOR = _shap_values_to_frame(shap_values_deepOR, curr_predictor_names, curr_repeat_name, curr_fold_name, curr_opt_tune_idx_deepOR)
        fold_shap_frames_deepMN.append(curr_fold_shap_values_deepMN)
        fold_shap_frames_deepOR.append(curr_fold_shap_values_deepOR)

        # Update fold status message
        fold_status.update(curr_fold_name + ' completed.')

    # Update repeat status message
    repeat_status.update(curr_repeat_name + ' completed.')

# Pool SHAP values across all repeats and folds (fall back to an empty placeholder if no fold was processed)
if fold_shap_frames_deepMN:
    pooled_shap_values_deepMN = pd.concat(fold_shap_frames_deepMN, ignore_index=True)
else:
    pooled_shap_values_deepMN = pd.DataFrame(np.empty((0,14)))

if fold_shap_frames_deepOR:
    pooled_shap_values_deepOR = pd.concat(fold_shap_frames_deepOR, ignore_index=True)
else:
    pooled_shap_values_deepOR = pd.DataFrame(np.empty((0,14)))

# Make sure the output directory exists so the results are not lost after the full computation
os.makedirs(SHAP_OUTPUT_DIR, exist_ok=True)

pooled_shap_values_deepMN.to_csv('../repeated_cv/compiled_shap_values/deepMN_shap_values.csv',index = False)
pooled_shap_values_deepOR.to_csv('../repeated_cv/compiled_shap_values/deepOR_shap_values.csv',index = False)