# Shapley values for an xgboost model

In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np

In [2]:
import tabular_trees

# Build an example xgboost model

## Set up data

In [3]:
# Seed NumPy's legacy global RNG so the random 'y' and 'z' columns are reproducible.
np.random.seed(100)

# Toy feature frame with 10 rows:
#   x - hand-picked values (206 x5, 194 x1, 6 x4)
#   y - six random draws in [100, 400) followed by four fixed values around 300
#   z - ten random draws in [100, 400)
# Keyword arguments are evaluated left to right, so 'y' is drawn from the
# global stream before 'z'; keep this order to reproduce the same numbers.
explanatory_variables = pd.DataFrame(
    dict(
        x=[206] * 5 + [194] + [6] * 4,
        y=[*np.random.randint(100, 400, 6), 299, 299, 301, 301],
        z=[*np.random.randint(100, 400, 10)],
    )
)
explanatory_variables

Unnamed: 0,x,y,z
0,206,108,114
1,206,380,390
2,206,179,340
3,206,153,380
4,206,166,243
5,194,326,328
6,6,299,158
7,6,299,237
8,6,301,193
9,6,301,186


In [4]:
# Target values, one per row of the feature frame, written as (value, repeat count)
# pairs: 10 x5, 20 x1, 50 x2, 30 x2.
response = pd.Series(
    [
        value
        for value, repeats in [(10, 5), (20, 1), (50, 2), (30, 2)]
        for _ in range(repeats)
    ]
)
response

0    10
1    10
2    10
3    10
4    10
5    20
6    50
7    50
8    30
9    30
dtype: int64

## Create xgboost matrix

In [5]:
# Wrap the features and target in xgboost's DMatrix container.
# base_margin supplies one starting margin per row; it is set to 0 for every
# row here (one entry per row of the feature frame).
xgb_data = xgb.DMatrix(
    explanatory_variables,
    label=response,
    base_margin=[0] * len(explanatory_variables),
)

## Build model

In [6]:
# Hyperparameters for a deliberately simple, hand-checkable model:
# squared-error regression, trees of depth at most 2, the 'exact' tree method,
# a learning rate (eta) of 1, all rows and columns used for every tree, and the
# lambda / alpha / gamma penalties all set to 0.
xgb_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'exact',
    'max_depth': 2,
    'eta': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'lambda': 0,
    'alpha': 0,
    'gamma': 0,
}

# A single boosting round, so the fitted model contains exactly one tree.
model = xgb.train(xgb_params, xgb_data, num_boost_round=1)

# Get tree data

In [7]:
# Export the fitted booster's tree structure with tabular_trees, then convert
# the xgboost-specific export into the tabular form that is displayed and
# passed to calculate_shapley_values in the cells below.
model_trees_xgboost = tabular_trees.trees.export_tree_data(model)
model_trees = model_trees_xgboost.to_tabular_trees()

In [8]:
# Show the tree in tabular form: one row per node (internal splits and leaves).
model_trees.trees

Unnamed: 0,tree,node,left_child,right_child,missing,feature,split_condition,prediction,leaf,count
0,0,0-0,0-1,0-2,0-1,x,100.0,23.0,0,10.0
1,0,0-1,0-3,0-4,0-3,y,300.0,40.0,0,4.0
2,0,0-2,0-5,0-6,0-5,x,200.0,11.666667,0,6.0
3,0,0-3,,,,Leaf,,50.0,1,2.0
4,0,0-4,,,,Leaf,,30.0,1,2.0
5,0,0-5,,,,Leaf,,20.0,1,1.0
6,0,0-6,,,,Leaf,,10.0,1,5.0


# Get Shapley values for a single row

In [9]:
# The single observation to explain, indexed by the model's feature names.
row_to_explain = pd.Series([150, 75, 200], index=['x', 'y', 'z'])
row_to_explain

x    150
y     75
z    200
dtype: int64

In [10]:
# Compute Shapley values for the single row from the tabular tree data.
# The progress bar in the saved output counts 6 steps, matching the 3! = 6
# orderings of the features x, y and z.
# NOTE(review): the saved output also echoes this line as the source of a
# warning, but the warning text itself is not preserved -- worth re-running to check.
results = tabular_trees.calculate_shapley_values(model_trees, row_to_explain)

  results = tabular_trees.calculate_shapley_values(model_trees, row_to_explain)
100%|██████████| 6/6 [00:00<00:00, 248.65it/s]


In [11]:
# Summary of the explanation: a bias term plus one contribution per feature.
# In the saved output each feature's value equals the mean of its column in
# results.permutations, and bias + x + y + z = 23 - 5 + 2 + 0 = 20.
results.summary

Unnamed: 0,bias,x,y,z
0,23.0,-5.0,2.0,0.0


In [12]:
# Per-permutation detail: one row for each ordering of the features, with the
# contribution attributed to each feature under that ordering.
results.permutations

Unnamed: 0,bias,permutation,tree,x,y,z
0,23.0,"['x', 'y', 'z']",0,-3.0,0.0,0.0
1,23.0,"['x', 'z', 'y']",0,-3.0,0.0,0.0
2,23.0,"['y', 'x', 'z']",0,-7.0,4.0,0.0
3,23.0,"['y', 'z', 'x']",0,-7.0,4.0,0.0
4,23.0,"['z', 'x', 'y']",0,-3.0,0.0,0.0
5,23.0,"['z', 'y', 'x']",0,-7.0,4.0,0.0
