# Compare Local Explanation Methods
This notebook compares the following methods for explaining local (per-record) predictions from xgboost models:
- shapley values, from shap package - https://github.com/slundberg/shap
- shapley values, directly from xgboost - https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.predict
- prediction decomposition method - implemented in eli5 (https://eli5.readthedocs.io/en/latest/index.html) and described at http://blog.datadive.net/interpreting-random-forests/

Additionally, pygbmexpl (https://github.com/richardangell/pygbmexpl/tree/develop) is used to inspect the path each record takes through the trees.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Compare-Local-Explanation-Methods" data-toc-modified-id="Compare-Local-Explanation-Methods-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Compare Local Explanation Methods</a></span></li><li><span><a href="#Load-data" data-toc-modified-id="Load-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Build-model" data-toc-modified-id="Build-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Build model</a></span></li><li><span><a href="#Set-row-to-consider" data-toc-modified-id="Set-row-to-consider-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Set row to consider</a></span></li><li><span><a href="#Get-prediction-from-model" data-toc-modified-id="Get-prediction-from-model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Get prediction from model</a></span></li><li><span><a href="#Decompose-prediction-with-eli5" data-toc-modified-id="Decompose-prediction-with-eli5-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Decompose prediction with eli5</a></span></li><li><span><a href="#Parse-model-with-pygbmexpl" data-toc-modified-id="Parse-model-with-pygbmexpl-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Parse model with pygbmexpl</a></span></li><li><span><a href="#Decompose-prediction-with-pygbmexpl" data-toc-modified-id="Decompose-prediction-with-pygbmexpl-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Decompose prediction with pygbmexpl</a></span></li><li><span><a href="#Explain-prediction-shap-built-into-xgboost" data-toc-modified-id="Explain-prediction-shap-built-into-xgboost-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Explain prediction shap built into xgboost</a></span></li><li><span><a href="#Explain-prediction-with-shap-library" data-toc-modified-id="Explain-prediction-with-shap-library-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Explain prediction with shap library</a></span></li></ul></div>

In [1]:
import xgboost as xgb
import pandas as pd
import shap
from eli5 import show_prediction

The sklearn.metrics.scorer module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
The sklearn.feature_selection.base module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.feature_selection. Anything that cannot be imported from sklearn.feature_selection is now part of the private API.


In [2]:
import pygbmexpl.xgb.parser as xgb_parser
import pygbmexpl.xgb.explainer as xgb_explainer

In [3]:
import data_prep

# Load data

In [4]:
# Load the Boston housing data through the local data_prep helper.
# Later cells use this as a pandas DataFrame that includes a 'target' column,
# which is dropped whenever only the features are needed.
data_df = data_prep.get_boston_df()

In [5]:
# The same data in the form handed to xgb.train and model.predict below
# (presumably an xgboost DMatrix - confirm in data_prep).
data_xgb = data_prep.get_boston_xgb()

# Build model

In [6]:
# Train a deliberately small booster (5 rounds of depth-3 trees) so that the
# individual trees and decision paths can be inspected by hand further down.
# 'verbosity': 0 replaces the deprecated 'silent': 1 flag; both suppress
# xgboost's training log output.
model = xgb.train(
    params = {
        'verbosity': 0,
        'max_depth': 3
    },
    dtrain = data_xgb,
    num_boost_round = 5
)

In [7]:
# Write the trained trees to dump_raw.json (relative to the working directory)
# in JSON format; with_stats = True includes the per-node statistics in the dump.
model.dump_model('dump_raw.json', with_stats = True, dump_format = 'json')

# Set row to consider

In [10]:
# Index of the record whose prediction is explained in the cells below.
row = 0

# Get prediction from model

In [12]:
# Score the whole dataset and display the model's prediction for the chosen row.
# This is the value that each explanation method below should sum to.
model.predict(data_xgb)[row]

22.020464

In [11]:
# Feature values of the chosen row, with the 'target' entry removed.
data_df.loc[row].drop('target')

CRIM         0.00632
ZN          18.00000
INDUS        2.31000
CHAS         0.00000
NOX          0.53800
RM           6.57500
AGE         65.20000
DIS          4.09000
RAD          1.00000
TAX        296.00000
PTRATIO     15.30000
B          396.90000
LSTAT        4.98000
Name: 0, dtype: float64

# Decompose prediction with eli5

In [13]:
# eli5 breakdown of the prediction for the chosen row (features only, 'target'
# dropped). show_feature_values = True adds the row's feature values alongside
# each contribution in the rendered table.
show_prediction(model, data_df.loc[row].drop('target'), show_feature_values = True)

Contribution?,Feature,Value
18.608,<BIAS>,1.0
6.434,LSTAT,4.98
0.063,NOX,0.538
-0.685,DIS,4.09
-2.399,RM,6.575


# Parse model with pygbmexpl

In [15]:
# Parse the booster into a tabular form with pygbmexpl. Judging by the output
# below, the table has one row per tree node (internal nodes and leaves).
model_df = xgb_parser.extract_model_predictions(model)
# (rows, columns) of the parsed table
model_df.shape

(71, 15)

In [17]:
# Show only the nodes that belong to the first tree (tree id 0).
model_df[model_df['tree'].eq(0)]

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf,gain,cover,node_type,H,G,weight
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,,18223.4688,506.0,internal,506.0,-3359.478957,6.639286
1,0,1,1,3.0,4.0,3.0,RM,6.941,,6826.89062,212.0,internal,212.0,-1848.510661,8.71939
2,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,,2368.78906,294.0,internal,294.0,-1510.968295,5.139348
3,0,3,2,7.0,8.0,7.0,DIS,1.48495,,525.820312,142.0,internal,142.0,-1062.806345,7.484552
4,0,4,2,9.0,10.0,9.0,RM,7.437,,675.375,70.0,internal,70.0,-785.704316,11.224347
5,0,5,2,11.0,12.0,11.0,B,116.024994,,106.074219,150.0,internal,150.0,-903.771299,6.025142
6,0,6,2,13.0,14.0,13.0,NOX,0.603,,624.634766,144.0,internal,144.0,-607.196996,4.216646
7,0,7,3,,,,,,12.0,,4.0,leaf,4.0,-48.0,12.0
8,0,8,3,,,,,,7.353669,,138.0,leaf,138.0,-1014.806345,7.353669
9,0,9,3,,,,,,9.823172,,40.0,leaf,40.0,-392.926865,9.823172


# Decompose prediction with pygbmexpl

In [18]:
# Decompose the prediction for the chosen row using the parsed model table.
# The result (inspected in the next cells) lists the nodes visited in each tree
# together with 'contributing_var' and 'contribution' columns.
prediction_path = xgb_explainer.decompose_prediction(model_df, data_df.loc[row].drop('target'))

`item` has been deprecated and will be removed in a future version
Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


`item` has been deprecated and will be removed in a future version
`item` has been deprecated and will be removed in a future version
`item` has been deprecated and will be removed in a future version
`item` has been deprecated and will be removed in a future version
`item` has been deprecated and will be removed in a future version
`item` has been deprecated and will be removed in a future version


In [20]:
# Path taken by the chosen row through the first tree (tree id 0).
prediction_path[prediction_path['tree'].eq(0)]

Unnamed: 0,tree,nodeid,yes,no,missing,split,split_condition,cover,weight,node_type,H,G,value,contributing_var,contribution
0,0,0,1.0,2.0,1.0,LSTAT,9.725,506.0,6.639286,internal,506.0,-3359.478957,4.98,base,6.639286
1,0,1,3.0,4.0,3.0,RM,6.941,212.0,8.71939,internal,212.0,-1848.510661,6.575,LSTAT,2.080103
3,0,3,7.0,8.0,7.0,DIS,1.48495,142.0,7.484552,internal,142.0,-1062.806345,4.09,RM,-1.234838
8,0,8,,,,,,138.0,7.353669,leaf,138.0,-1014.806345,,DIS,-0.130883


In [21]:
# Total contribution attributed to each variable, summed over every tree.
prediction_path['contribution'].groupby(prediction_path['contributing_var']).sum()

contributing_var
DIS      -0.685034
LSTAT     6.433818
NOX       0.062679
RM       -2.398669
base     18.607671
Name: contribution, dtype: float64

In [30]:
# Number of path nodes attributed to each variable across all trees;
# dropna = False means any missing labels would be counted as well.
prediction_path['contributing_var'].value_counts(dropna = False)

LSTAT    6
base     5
DIS      4
RM       4
NOX      1
Name: contributing_var, dtype: int64

The per-variable contributions match those produced by eli5 above (pygbmexpl's `base` corresponds to eli5's `<BIAS>`).

# Explain prediction shap built into xgboost

In [22]:
# With pred_contribs = True, xgboost returns one contribution per feature plus
# a final bias term for every row. Build the column labels explicitly (feature
# names followed by '<BIAS>') instead of reusing data_df.columns and renaming
# 'target', which only works while 'target' happens to be the last column.
# NOTE(review): assumes the feature columns of data_df are in the same order as
# the features in data_xgb - confirm in data_prep.
contrib_columns = list(data_df.drop('target', axis = 1).columns) + ['<BIAS>']
pd.DataFrame(
    model.predict(data_xgb, pred_contribs = True),
    columns = contrib_columns
).iloc[row]

CRIM        0.135195
ZN          0.000000
INDUS       0.000000
CHAS        0.000000
NOX         0.346123
RM         -1.558380
AGE         0.000000
DIS        -0.433603
RAD         0.000000
TAX         0.000000
PTRATIO     0.131869
B           0.017024
LSTAT       4.774564
<BIAS>     18.607672
Name: 0, dtype: float32

# Explain prediction with shap library

In [23]:
# No background dataset is supplied, so ask for the path-dependent algorithm
# explicitly rather than relying on shap's fallback (which prints a notice and
# switches to "tree_path_dependent" on its own).
explainer = shap.TreeExplainer(model, feature_perturbation = 'tree_path_dependent')

Setting feature_perturbation = "tree_path_dependent" because no background data was given.


In [24]:
# SHAP values for every record, computed on the feature columns only.
shap_values_np = explainer.shap_values(data_df.drop(columns = 'target'))

In [28]:
# Show the SHAP values for the row chosen earlier in the notebook (previously a
# hard-coded 0, which ignored the `row` setting). Column names come from the
# same feature frame that was passed to the explainer, rather than from a
# positional [:-1] slice that assumes 'target' is the last column.
feature_names = data_df.drop('target', axis = 1).columns
shap_values_df = pd.DataFrame(
    [shap_values_np[row, :]],
    columns = feature_names,
    index = [row]
)
shap_values_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.135195,0.0,0.0,0.0,0.346123,-1.55838,0.0,-0.433603,0.0,0.0,0.131869,0.017024,4.774564
