# XGBoost

Tryingt o get SHAP interactions to work

## Import libraries

In [1]:
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import scipy

from xgboost import XGBClassifier
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

import json

from dataclasses import dataclass

import seaborn as sns

from sklearn.metrics import roc_auc_score

from sklearn.metrics import confusion_matrix

from matplotlib.lines import Line2D

# add histograms to dependency plots
import matplotlib.gridspec as gridspec

import pickle
import shap

from os.path import exists

import math

import importlib
# Import local package
from utils import waterfall
# Force package to be reloaded
importlib.reload(waterfall);

import time

Report the time duration to run notebook

In [2]:
start_time = time.time()

## Set up paths and filenames

In [3]:
@dataclass(frozen=True)
class Paths:
    '''Singleton object for storing paths to data and database.'''

  #  data_path: str = '../'
  #  data_filename: str = 'SAMueL ssnap extract v2.csv'
  #  data_save_path: str = './'
  #  data_save_filename: str = 'reformatted_data.csv'
  #  database_filename: str = 'samuel.db'
  #  notebook: str = '01'
  #  kfold_folder: str = 'data/kfold_5fold/'

    data_read_path: str = '../data/'
    data_read_filename: str = '02_reformatted_data_ml_230612.csv'
 #   data_save_path: str = './kfold_5fold'
#    data_save_filename: str = 'train.csv'
    notebook: str = '230707'
    model_notebook: str = '230623_'
    model_text: str = 'xgb_all_data_5_features'

paths = Paths()

## Import data

Data has previously been split into 5 stratified k-fold splits.

In [4]:
filename = paths.data_read_path + paths.data_read_filename
data = pd.read_csv(filename)

In [5]:
class_names = data['discharge_disability'].unique()
class_names = np.sort(class_names)
n_classes = len(class_names)

Get list of features

In [6]:
features = list(data)
print(f"There are {len(features)} features")

There are 56 features


Want to use onset to thrombolysis time in the model. Define function to calculate the feature.

In [7]:
def calculate_onset_to_thrombolysis(row):
    # Set default value of onset to thrombolysis of -100 (no thrombolysis given)
    onset_to_thrombolysis = -100
    # Set value if thrombolysis given
    if  row['scan_to_thrombolysis_time'] != -100:
        onset_to_thrombolysis = (row['onset_to_arrival_time'] + 
        row['arrival_to_scan_time'] + row['scan_to_thrombolysis_time'])
    return onset_to_thrombolysis

In [8]:
# Calculate onset to thgrombolysis (but set to -100 if no thrombolysis given)
data['onset_to_thrombolysis_time'] = data.apply(calculate_onset_to_thrombolysis, axis=1)
data.drop(['scan_to_thrombolysis_time', 'arrival_to_scan_time',
        'onset_to_arrival_time'], axis=1, inplace=True)

### Include key features

In [9]:
selected_features = ['prior_disability','stroke_severity','stroke_team',
                     'onset_to_thrombolysis_time','age']
selected_features.append('discharge_disability')
data = data[selected_features]

## One hot the categorical features

Convert some categorical features to one hot encoded features.

Define a function

In [10]:
def convert_feature_to_one_hot(df, feature_name, prefix):
    """
    df [dataframe]: training or test dataset
    feature_name [str]: feature to convert to ont hot encoding
    prefix [str]: string to use on new feature
    """

    # One hot encode a feature
    df_feature = pd.get_dummies(
        df[feature_name], prefix = prefix)
    df = pd.concat([df, df_feature], axis=1)
    df.drop(feature_name, axis=1, inplace=True)

    return(df)

Set up two lists for the one hot encoding. 

A list of the feature names that are categorical and to be converted using one hot encoding.
A list of the prefixes to use for these features.

In [11]:
features_to_one_hot = ["stroke_team"]
list_prefix = ["team"]

In [12]:
# Take copy of "stroke_team" column first, to use in the histogram plot later
data_stroke_team = data["stroke_team"]

For each feature in the list, convert to one hot encoded

In [13]:
for feature, prefix in zip(features_to_one_hot, list_prefix):
    data = convert_feature_to_one_hot(data, feature, prefix)

Get X and y

In [14]:
X_data = data.drop('discharge_disability', axis=1)
y_data = data['discharge_disability']

Get list of features in dataset, post one hot encoding.

In [15]:
features_ohe = list(X_data)

## Fit XGBoost model

Train model with all data

In [16]:
filename = f"{paths.model_notebook}{paths.model_text}.p"

# Check if exists
file_exists = exists(filename)

if file_exists:
    # load model
    with open(filename, 'rb') as filehandler:
        model = pickle.load(filehandler)
else:        

    # Define model
    model = XGBClassifier(verbosity = 0, seed=42)#, learning_rate=0.5)

    # Fit model
    model.fit(X_data, y_data)

    # Save model
    with open(filename, 'wb') as filehandler:
        pickle.dump(model, filehandler)

# Get predicted probabilities
y_probs = model.predict_proba(X_data)
y_pred = model.predict(X_data)

# Calculate error
y_error = y_data - y_pred

Show accuracy (identity)

In [17]:
accuracy = np.mean(y_error==0)
print (f'Accuracy: {accuracy:0.2f}')

error_within_one = np.mean(np.abs(y_error)<=1)
print (f'Error within 1: {error_within_one:0.2f}')

Accuracy: 0.46
Error within 1: 0.76


## SHAP values
SHAP values give the contribution that each feature has on the models prediction, per instance. A SHAP value is returned for each feature, for each instance.

We will use the shap library: https://shap.readthedocs.io/en/latest/index.html

'Raw' SHAP values from XGBoost model are log odds ratios. A SHAP value is returned for each feature, for each instance, for each model (one per k-fold)

## Get SHAP values
TreeExplainer is a fast and exact method to estimate SHAP values for tree models and ensembles of trees. Using this we can calculate the SHAP values.

Either load from pickle (if file exists), or calculate.

In [47]:
filename = (f'{paths.model_notebook}{paths.model_text}_shap_values_extended.p')
# Check if exists
file_exists = exists(filename)

if file_exists:

    # Load shap values
    with open(filename, 'rb') as filehandler:
        shap_values_extended = pickle.load(filehandler)
        shap_values = shap_values_extended.values

    # Load explainer
    explainer_filename = (f'{paths.model_notebook}{paths.model_text}_shap_explainer.p')
    with open(explainer_filename, 'rb') as filehandler:
        explainer = pickle.load(filehandler)
else:

    # Set up explainer using the model and feature values from training set
    explainer = shap.TreeExplainer(model, X_data)

    # Get (and store) Shapley values along with base and feature values
    shap_values_extended = explainer(X_data)

    # Shap values exist for each classification in a Tree
    # We are interested in 1=give thrombolysis (not 0=not give thrombolysis)
    shap_values = shap_values_extended.values

    explainer_filename = (f'{paths.model_notebook}{paths.model_text}_shap_explainer.p')

    # Save explainer using pickle
    with open(explainer_filename, 'wb') as filehandler:
        pickle.dump(explainer, filehandler)
        
    # Save shap values extendedr using pickle
    with open(filename, 'wb') as filehandler:
        pickle.dump(shap_values_extended, filehandler)

In [41]:
X_data.iloc[0]

prior_disability                   1.0
stroke_severity                    0.0
onset_to_thrombolysis_time      -100.0
age                               72.5
team_Addenbrooke's Hospital        0.0
                                 ...  
team_Worthing Hospital             0.0
team_Wycombe General Hospital      0.0
team_Yeovil District Hospital      0.0
team_York Hospital                 0.0
team_Ysbyty Gwynedd                0.0
Name: 0, Length: 122, dtype: float64

In [42]:
shap_values[0]

array([[-6.48659848e-01,  7.21264280e-01,  1.38934925e-01,
        -1.42322016e-01, -2.09933316e-01, -1.92726538e-01,
        -1.86848325e-01],
       [ 8.18358007e-01,  5.05328184e-01,  1.93646350e-01,
        -3.42065903e-01, -7.11279787e-01, -6.94785213e-01,
        -7.89308796e-01],
       [-8.20140136e-02, -4.61606249e-02, -2.52895175e-02,
         1.77101422e-02,  2.94222948e-02, -6.90431937e-02,
        -8.19621345e-02],
       [ 2.03090894e-01,  9.16062462e-02,  6.06858705e-02,
        -1.26671845e-01, -1.13759702e-01, -1.23999722e-01,
        -5.43234618e-01],
       [-2.57833198e-03, -9.91304303e-03,  0.00000000e+00,
         2.22565535e-03,  0.00000000e+00, -1.25147454e-02,
        -1.22882620e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.0000000

In [48]:
sv_extended_0 = explainer(X_data.iloc[0])
sv_extended_0.values

array([[-6.48659848e-01,  7.21264280e-01,  1.38934925e-01,
        -1.42322016e-01, -2.09933316e-01, -1.92726538e-01,
        -1.86848325e-01],
       [ 8.18358007e-01,  5.05328184e-01,  1.93646350e-01,
        -3.42065903e-01, -7.11279787e-01, -6.94785213e-01,
        -7.89308796e-01],
       [-8.20140136e-02, -4.61606249e-02, -2.52895175e-02,
         1.77101422e-02,  2.94222948e-02, -6.90431937e-02,
        -8.19621345e-02],
       [ 2.03090894e-01,  9.16062462e-02,  6.06858705e-02,
        -1.26671845e-01, -1.13759702e-01, -1.23999722e-01,
        -5.43234618e-01],
       [-2.57833198e-03, -9.91304303e-03,  0.00000000e+00,
         2.22565535e-03,  0.00000000e+00, -1.25147454e-02,
        -1.22882620e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.0000000

In [44]:
sv_extended_0.data

prior_disability                   1.0
stroke_severity                    0.0
onset_to_thrombolysis_time      -100.0
age                               72.5
team_Addenbrooke's Hospital        0.0
                                 ...  
team_Worthing Hospital             0.0
team_Wycombe General Hospital      0.0
team_Yeovil District Hospital      0.0
team_York Hospital                 0.0
team_Ysbyty Gwynedd                0.0
Name: 0, Length: 122, dtype: float64

In [45]:
# Get SHAP interaction values
shap_interactions = explainer.shap_interaction_values(X_data.iloc[0])

FEATURE_DEPENDENCE::independent does not support interactions!


In [63]:
explainer_0 = shap.TreeExplainer(model)#,feature_dependence='independent')

In [64]:
explainer_0

<shap.explainers._tree.Tree at 0x7f38603b4340>

In [51]:
shap_values_extended_0 = explainer_0(X_data)#.iloc[0])#,feature_dependence='independent')# Get (and store) Shapley values along with base and feature values


In [65]:
shap_values_extended_0.shape

(156847, 122, 7)

In [66]:
# Get SHAP interaction values
shap_interactions_0 = explainer_0.shap_interaction_values(X_data)

XGBoostError: std::bad_alloc

In [67]:
# Get SHAP interaction values
shap_interactions_0 = explainer_0.shap_interaction_values(X_data.iloc[0])

XGBoostError: [10:35:04] ../src/c_api/c_api_utils.h:121: Check failed: std::accumulate(shape.cbegin(), shape.cend(), static_cast<bst_ulong>(1), std::multiplies<>{}) == chunksize * rows (3416 vs. 12920166) : 
Stack trace:
  [bt] (0) /home/kerry/miniconda3/envs/sam10/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x1395a3) [0x7f38893395a3]
  [bt] (1) /home/kerry/miniconda3/envs/sam10/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x16b6c0) [0x7f388936b6c0]
  [bt] (2) /home/kerry/miniconda3/envs/sam10/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterPredictFromDMatrix+0x458) [0x7f3889341b08]
  [bt] (3) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/../../libffi.so.8(+0xa052) [0x7f38ed47a052]
  [bt] (4) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/../../libffi.so.8(+0x8925) [0x7f38ed478925]
  [bt] (5) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f38ed47906e]
  [bt] (6) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x91e7) [0x7f38ed5051e7]
  [bt] (7) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x869b) [0x7f38ed50469b]
  [bt] (8) /home/kerry/miniconda3/envs/sam10/bin/python(_PyObject_MakeTpCall+0x25b) [0x4f631b]



In [68]:
shap_interactions_0

[array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 arr

In [70]:
explainer_1 = shap.TreeExplainer(model, feature_perturbation = 'tree_path_dependent')
#feature_dependence='independent')
#feature_perturbation = “interventional” (default) or “tree_path_dependent”

In [71]:
shap_values_extended_1 = explainer_1(X_data)#.iloc[0])#,feature_dependence='independent')

In [72]:
shap_values_extended_1.shape

(156847, 122, 7)

In [73]:
shap_values_extended_1.values

array([[[-1.0816586e+00,  5.1269364e-01,  1.3275680e-01, ...,
         -4.1671115e-01, -5.8298862e-01, -1.8233849e-01],
        [ 5.8814341e-01,  2.8126255e-01,  6.3276939e-02, ...,
         -7.6380491e-01, -1.2825589e+00, -1.3687001e+00],
        [-6.7595296e-02, -2.6532521e-02, -1.8656641e-02, ...,
          3.9283261e-02,  1.7640313e-02, -5.7050850e-02],
        ...,
        [ 1.1974848e-03,  6.8621116e-04, -3.8467338e-03, ...,
          3.3241377e-04, -1.5765021e-04, -1.8904471e-03],
        [-2.8421269e-05, -3.9391471e-03, -7.1662487e-03, ...,
          4.2563767e-04, -9.1875132e-05,  3.9559929e-03],
        [-6.3054892e-03, -1.4162227e-04,  3.8378309e-03, ...,
          7.1978732e-04, -1.9129448e-05,  4.2708358e-05]],

       [[-9.9576747e-01,  3.2591254e-01,  1.6363731e-01, ...,
         -2.5166118e-01, -5.0957918e-01, -8.5134380e-02],
        [-4.7764072e-01, -3.5583368e-01, -8.7512754e-02, ...,
          5.4710072e-01,  5.4524839e-01,  6.0138077e-01],
        [ 1.8609518e-01, 

In [74]:
# Get SHAP interaction values
shap_interactions_1 = explainer_1.shap_interaction_values(X_data.iloc[0])

XGBoostError: [10:39:41] ../src/c_api/c_api_utils.h:121: Check failed: std::accumulate(shape.cbegin(), shape.cend(), static_cast<bst_ulong>(1), std::multiplies<>{}) == chunksize * rows (3416 vs. 12920166) : 
Stack trace:
  [bt] (0) /home/kerry/miniconda3/envs/sam10/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x1395a3) [0x7f38893395a3]
  [bt] (1) /home/kerry/miniconda3/envs/sam10/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x16b6c0) [0x7f388936b6c0]
  [bt] (2) /home/kerry/miniconda3/envs/sam10/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGBoosterPredictFromDMatrix+0x458) [0x7f3889341b08]
  [bt] (3) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/../../libffi.so.8(+0xa052) [0x7f38ed47a052]
  [bt] (4) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/../../libffi.so.8(+0x8925) [0x7f38ed478925]
  [bt] (5) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7f38ed47906e]
  [bt] (6) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x91e7) [0x7f38ed5051e7]
  [bt] (7) /home/kerry/miniconda3/envs/sam10/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x869b) [0x7f38ed50469b]
  [bt] (8) /home/kerry/miniconda3/envs/sam10/bin/python(_PyObject_MakeTpCall+0x25b) [0x4f631b]



In [75]:
# Get SHAP interaction values
shap_interactions_1 = explainer_1.shap_interaction_values(X_data)

XGBoostError: std::bad_alloc