# XGBoost Shapley Values
This notebook shows an example of producing Shapley values to locally explain predictions.

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.datasets import load_boston

In [2]:
import pygbmexpl.xgb.parser as xgb_parser
import pygbmexpl.xgb.explainer as xgb_explainer

# Build example xgboost model

## Set up data

In [3]:
# Seed NumPy's global random state so the random 'y' and 'z' draws below are reproducible.
np.random.seed(100)

# The six random 'y' values are drawn before the ten random 'z' values;
# keeping that order keeps the seeded values the same.
x_column = [206] * 5 + [194] + [6] * 4
y_column = list(np.random.randint(100, 400, 6)) + [299, 299, 301, 301]
z_column = list(np.random.randint(100, 400, 10))

# Ten-row feature frame with columns 'x', 'y' and 'z'.
X_train = pd.DataFrame({'x': x_column, 'y': y_column, 'z': z_column})
X_train

Unnamed: 0,x,y,z
0,206,108,114
1,206,380,390
2,206,179,340
3,206,153,380
4,206,166,243
5,194,326,328
6,6,299,158
7,6,299,237
8,6,301,193
9,6,301,186


In [4]:
# Ten target values held in a Series named 't'.
target_train = pd.Series([10] * 5 + [20] + [50] * 2 + [30] * 2, name='t')
target_train

0    10
1    10
2    10
3    10
4    10
5    20
6    50
7    50
8    30
9    30
Name: t, dtype: int64

## Create xgboost matrix

In [5]:
# Wrap the training features and target in an xgboost DMatrix.
xgb_data = xgb.DMatrix(data=X_train, label=target_train)

# Set the base margin to 0 for every row (one entry per training row).
n_rows = len(X_train)
xgb_data.set_base_margin([0] * n_rows)

  if getattr(data, 'base', None) is not None and \


## Build model

In [6]:
# Booster settings: squared-error regression objective, trees of depth at
# most 2, subsample and colsample_bytree of 1, eta of 1, and lambda, gamma
# and alpha all set to 0.
booster_params = {
    'objective': 'reg:squarederror',
    'max_depth': 2,
    'subsample': 1,
    'colsample_bytree': 1,
    'eta': 1,
    'lambda': 0,
    'gamma': 0,
    'alpha': 0,
}

# Train for a single boosting round.
model = xgb.train(params=booster_params, dtrain=xgb_data, num_boost_round=1)

# Get tree node predictions
Here we pass the xgboost booster object to the parser, which parses the model and adds the node predictions in one step.

In [7]:
# Parse the trained booster into a table of tree nodes and add the node
# predictions in the same call.
tree_df = xgb_parser.extract_model_predictions(model)

In [8]:
# Display the parsed table: one row per tree node.
tree_df

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf,gain,cover,node_type,H,G,weight
0,0,0,0,1.0,2.0,1.0,x,100.0,,1926.6665,10,internal,10,-230.0,23.0
1,0,1,1,3.0,4.0,3.0,y,300.0,,400.0,4,internal,4,-160.0,40.0
2,0,2,1,5.0,6.0,5.0,x,200.0,,83.333313,6,internal,6,-70.0,11.666667
3,0,3,2,,,,,,50.0,,2,leaf,2,-100.0,50.0
4,0,4,2,,,,,,30.0,,2,leaf,2,-60.0,30.0
5,0,5,2,,,,,,20.0,,1,leaf,1,-20.0,20.0
6,0,6,2,,,,,,10.0,,5,leaf,5,-50.0,10.0


# Get shapley values for single row

In [9]:
# The single observation to explain, with values labelled by feature name.
row_to_explain = pd.Series([150, 75, 200], index=['x', 'y', 'z'])
row_to_explain

x    150
y     75
z    200
dtype: int64

In [10]:
# Compute Shapley values for the single row from the parsed tree table.
# The result has a 'bias' column plus one contribution column per feature.
xgb_explainer.shapley_values(tree_df, row_to_explain)

  'This algorithm is likely to run very slow, it gives the same results but is not the more efficient treeSHAP algorithm.'
100%|██████████| 6/6 [00:00<00:00, 414.62it/s]


Unnamed: 0,bias,x,y,z
0,23.0,-5.0,2.0,0.0


With the `return_permutations` argument set to `True`, the function returns the contributions for each permutation of the features.

In [11]:
# Same inputs, but with return_permutations=True the result has one row of
# contributions per permutation of the features.
xgb_explainer.shapley_values(tree_df, row_to_explain, return_permutations = True)

100%|██████████| 6/6 [00:00<00:00, 398.72it/s]


Unnamed: 0,bias,permutation,tree,x,y,z
0,23.0,"['x', 'y', 'z']",0,-3.0,0.0,0.0
1,23.0,"['x', 'z', 'y']",0,-3.0,0.0,0.0
2,23.0,"['y', 'x', 'z']",0,-7.0,4.0,0.0
3,23.0,"['y', 'z', 'x']",0,-7.0,4.0,0.0
4,23.0,"['z', 'x', 'y']",0,-3.0,0.0,0.0
5,23.0,"['z', 'y', 'x']",0,-7.0,4.0,0.0
