# XGBoost Model Parser
This notebook shows an example of parsing an xgboost model with pygbmexpl.

In [1]:
import os
from pprint import pprint

import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_boston

In [2]:
import pygbmexpl.xgb.parser as xgb_parser

# Build example xgboost model

## Load data from sklearn

In [3]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this notebook presumably needs scikit-learn < 1.2 — confirm the pinned version.
boston = load_boston()

## Create xgboost matrix

In [4]:
# Wrap the Boston arrays in an xgboost DMatrix, keeping the column names so that
# model dumps refer to features by name (e.g. LSTAT, RM).
# The names are passed as a plain list so DMatrix receives a standard Python
# sequence whatever container type the dataset loader returns.
xgb_data = xgb.DMatrix(
    data=boston['data'],
    label=boston['target'],
    feature_names=list(boston['feature_names']),
)

## Build model

In [5]:
# Train a small booster: 10 rounds of depth-3 trees, other parameters left at defaults.
# 'verbosity': 0 silences training logs; it replaces the deprecated 'silent' flag.
model = xgb.train(
    params={
        'verbosity': 0,
        'max_depth': 3,
    },
    dtrain=xgb_data,
    num_boost_round=10,
)

## Export model to text and json files

In [6]:
# Make sure the target directory exists so the dumps succeed on a fresh checkout.
os.makedirs('outputs', exist_ok=True)

# Plain-text dumps, with and without the per-node gain/cover statistics.
model.dump_model('outputs/dump_raw.txt', with_stats=True)
model.dump_model('outputs/dump_raw_no_stats.txt', with_stats=False)

In [7]:
# JSON dumps of the same model, again with and without node statistics.
model.dump_model(
    'outputs/dump_raw.json',
    with_stats=True,
    dump_format='json',
)
model.dump_model(
    'outputs/dump_raw_no_stats.json',
    with_stats=False,
    dump_format='json',
)

# Parse model dump

## Import text file dumps

In [8]:
# Each call returns a pair: the parsed tree table and the raw lines read from the file.
tree_df1, dump_raw1 = xgb_parser.read_dump_text('outputs/dump_raw.txt')
tree_df2, dump_raw2 = xgb_parser.read_dump_text('outputs/dump_raw_no_stats.txt')

## Import json file dumps

In [9]:
# Same pair of results for the JSON dumps; here the raw part is a list with one
# nested dict per tree.
tree_df3, dump_raw3 = xgb_parser.read_dump_json('outputs/dump_raw.json')
tree_df4, dump_raw4 = xgb_parser.read_dump_json('outputs/dump_raw_no_stats.json')

## Raw lines read from files

### Text file dump with stats

In [10]:
# First 20 raw lines: all of tree 0 followed by the start of tree 1.
dump_raw1[:20]

['booster[0]:\n',
 '0:[LSTAT<9.72500038] yes=1,no=2,missing=1,gain=18247.6094,cover=506\n',
 '\t1:[RM<6.94099998] yes=3,no=4,missing=3,gain=6860.23438,cover=212\n',
 '\t\t3:[DIS<1.48494995] yes=7,no=8,missing=7,gain=564.898438,cover=142\n',
 '\t\t\t7:leaf=11.8800001,cover=4\n',
 '\t\t\t8:leaf=7.20474863,cover=138\n',
 '\t\t4:[RM<7.43700027] yes=9,no=10,missing=9,gain=713.554688,cover=70\n',
 '\t\t\t9:leaf=9.67683029,cover=40\n',
 '\t\t\t10:leaf=12.9474201,cover=30\n',
 '\t2:[LSTAT<16.0849991] yes=5,no=6,missing=5,gain=2385.59375,cover=294\n',
 '\t\t5:[B<116.024994] yes=11,no=12,missing=11,gain=118.414062,cover=150\n',
 '\t\t\t11:leaf=3.54750013,cover=7\n',
 '\t\t\t12:leaf=5.99104166,cover=143\n',
 '\t\t6:[NOX<0.603000045] yes=13,no=14,missing=13,gain=639.9375,cover=144\n',
 '\t\t\t13:leaf=5.06040001,cover=49\n',
 '\t\t\t14:leaf=3.55718756,cover=95\n',
 'booster[1]:\n',
 '0:[RM<6.8375001] yes=1,no=2,missing=1,gain=9723.03125,cover=506\n',
 '\t1:[LSTAT<14.3999996] yes=3,no=4,missing=3,ga

### Text file dump without stats

In [11]:
# Same slice from the dump written without stats: nodes carry no gain/cover fields.
dump_raw2[:20]

['booster[0]:\n',
 '0:[LSTAT<9.72500038] yes=1,no=2,missing=1\n',
 '\t1:[RM<6.94099998] yes=3,no=4,missing=3\n',
 '\t\t3:[DIS<1.48494995] yes=7,no=8,missing=7\n',
 '\t\t\t7:leaf=11.8800001\n',
 '\t\t\t8:leaf=7.20474863\n',
 '\t\t4:[RM<7.43700027] yes=9,no=10,missing=9\n',
 '\t\t\t9:leaf=9.67683029\n',
 '\t\t\t10:leaf=12.9474201\n',
 '\t2:[LSTAT<16.0849991] yes=5,no=6,missing=5\n',
 '\t\t5:[B<116.024994] yes=11,no=12,missing=11\n',
 '\t\t\t11:leaf=3.54750013\n',
 '\t\t\t12:leaf=5.99104166\n',
 '\t\t6:[NOX<0.603000045] yes=13,no=14,missing=13\n',
 '\t\t\t13:leaf=5.06040001\n',
 '\t\t\t14:leaf=3.55718756\n',
 'booster[1]:\n',
 '0:[RM<6.8375001] yes=1,no=2,missing=1\n',
 '\t1:[LSTAT<14.3999996] yes=3,no=4,missing=3\n',
 '\t\t3:[DIS<1.38485003] yes=7,no=8,missing=7\n']

### Json dump with stats

In [12]:
# Number of entries in the raw JSON dump: one per boosting round (num_boost_round = 10).
len(dump_raw3)

10

In [13]:
# Pretty-print the nested dict for the first tree; child nodes sit under 'children'.
pprint(dump_raw3[0])

{'children': [{'children': [{'children': [{'cover': 4,
                                           'leaf': 11.8800001,
                                           'nodeid': 7},
                                          {'cover': 138,
                                           'leaf': 7.20474863,
                                           'nodeid': 8}],
                             'cover': 142,
                             'depth': 2,
                             'gain': 564.898438,
                             'missing': 7,
                             'no': 8,
                             'nodeid': 3,
                             'split': 'DIS',
                             'split_condition': 1.48494995,
                             'yes': 7},
                            {'children': [{'cover': 40,
                                           'leaf': 9.67683029,
                                           'nodeid': 9},
                                          {'cover': 30,
              

### Json dump without stats

In [14]:
# Count the tree entries in the JSON dump written without stats.
len(dump_raw4)

10

In [15]:
# First tree again, this time without the 'gain' and 'cover' keys.
pprint(dump_raw4[0])

{'children': [{'children': [{'children': [{'leaf': 11.8800001, 'nodeid': 7},
                                          {'leaf': 7.20474863, 'nodeid': 8}],
                             'depth': 2,
                             'missing': 7,
                             'no': 8,
                             'nodeid': 3,
                             'split': 'DIS',
                             'split_condition': 1.48494995,
                             'yes': 7},
                            {'children': [{'leaf': 9.67683029, 'nodeid': 9},
                                          {'leaf': 12.9474201, 'nodeid': 10}],
                             'depth': 2,
                             'missing': 9,
                             'no': 10,
                             'nodeid': 4,
                             'split': 'RM',
                             'split_condition': 7.43700027,
                             'yes': 9}],
               'depth': 1,
               'missing': 3,
              

## View tabular tree structure 

### Tables parsed from text file dumps

In [16]:
# Rows for the first tree (tree index 0), parsed from the text dump with stats.
tree_df1[tree_df1['tree'].eq(0)]

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf,gain,cover
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,,18247.6094,506
1,0,1,1,3.0,4.0,3.0,RM,6.941,,6860.23438,212
2,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,,2385.59375,294
3,0,3,2,7.0,8.0,7.0,DIS,1.48495,,564.898438,142
4,0,4,2,9.0,10.0,9.0,RM,7.437,,713.554688,70
5,0,5,2,11.0,12.0,11.0,B,116.024994,,118.414062,150
6,0,6,2,13.0,14.0,13.0,NOX,0.603,,639.9375,144
7,0,7,3,,,,,,11.88,,4
8,0,8,3,,,,,,7.204749,,138
9,0,9,3,,,,,,9.67683,,40


In [17]:
# Rows for the first tree (tree index 0), parsed from the text dump without stats.
tree_df2[tree_df2['tree'].eq(0)]

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,
1,0,1,1,3.0,4.0,3.0,RM,6.941,
2,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,
3,0,3,2,7.0,8.0,7.0,DIS,1.48495,
4,0,4,2,9.0,10.0,9.0,RM,7.437,
5,0,5,2,11.0,12.0,11.0,B,116.024994,
6,0,6,2,13.0,14.0,13.0,NOX,0.603,
7,0,7,3,,,,,,11.88
8,0,8,3,,,,,,7.204749
9,0,9,3,,,,,,9.67683


### Tables parsed from json file dumps

In [18]:
# Rows for the first tree (tree index 0), parsed from the JSON dump with stats.
tree_df3[tree_df3['tree'].eq(0)]

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf,gain,cover
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,,18247.6094,506
1,0,1,1,3.0,4.0,3.0,RM,6.941,,6860.23438,212
2,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,,2385.59375,294
3,0,3,2,7.0,8.0,7.0,DIS,1.48495,,564.898438,142
4,0,4,2,9.0,10.0,9.0,RM,7.437,,713.554688,70
5,0,5,2,11.0,12.0,11.0,B,116.024994,,118.414062,150
6,0,6,2,13.0,14.0,13.0,NOX,0.603,,639.9375,144
7,0,7,3,,,,,,11.88,,4
8,0,8,3,,,,,,7.204749,,138
9,0,9,3,,,,,,9.67683,,40


In [19]:
# Rows for the first tree (tree index 0), parsed from the JSON dump without stats.
tree_df4[tree_df4['tree'].eq(0)]

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,
1,0,1,1,3.0,4.0,3.0,RM,6.941,
2,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,
3,0,3,2,7.0,8.0,7.0,DIS,1.48495,
4,0,4,2,9.0,10.0,9.0,RM,7.437,
5,0,5,2,11.0,12.0,11.0,B,116.024994,
6,0,6,2,13.0,14.0,13.0,NOX,0.603,
7,0,7,3,,,,,,11.88
8,0,8,3,,,,,,7.204749
9,0,9,3,,,,,,9.67683


### Check results are equal between text file and json file outputs

In [20]:
# Raises AssertionError if the text-parsed and JSON-parsed tables (with stats) differ;
# a silent cell means they match.
pd.testing.assert_frame_equal(tree_df1, tree_df3)

In [21]:
# Same check for the tables parsed from the dumps without stats.
pd.testing.assert_frame_equal(tree_df2, tree_df4)

## Other useful tree information

### Total number of nodes in all trees

In [22]:
# The table has one row per node, so its length is the total node count across all trees.
len(tree_df3)

144

### Number of nodes per tree

In [23]:
# size() counts rows per group, i.e. nodes (internal and leaf) in each tree.
tree_df3.groupby('tree').size()

tree
0    15
1    13
2    13
3    15
4    15
5    15
6    15
7    15
8    13
9    15
dtype: int64

### Number of terminal nodes per tree

In [24]:
# Leaf rows are those with no split variable, so count the null 'split' entries per tree.
tree_df3[tree_df3['split'].isna()].groupby('tree').size()

tree
0    8
1    7
2    7
3    8
4    8
5    8
6    8
7    8
8    7
9    8
dtype: int64

### Number of times each variable is used in the model

In [25]:
# value_counts() drops NaN by default, so leaf rows (which have no split variable)
# are not counted; only internal nodes contribute.
tree_df3['split'].value_counts()

RM         14
LSTAT      12
DIS        11
NOX         9
CRIM        6
PTRATIO     6
B           3
TAX         3
AGE         2
CHAS        1
Name: split, dtype: int64

# Get tree node predictions
Here we pass the xgboost booster object directly: the model is parsed and node predictions are added in one step.

In [26]:
# Takes the trained booster itself rather than a dump file path; the resulting table
# has the parsed columns plus node_type, H, G and weight.
tree_df = xgb_parser.extract_model_predictions(model)

In [27]:
# Rows for tree 0; in the rows shown, weight equals -G / H and leaf rows have
# weight equal to leaf.
tree_df[tree_df['tree'].eq(0)]

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf,gain,cover,node_type,H,G,weight
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,,18247.6094,506,internal,506,-3284.715003,6.491532
1,0,1,1,3.0,4.0,3.0,RM,6.941,,6860.23438,212,internal,212,-1817.271126,8.572034
2,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,,2385.59375,294,internal,294,-1467.443877,4.991306
3,0,3,2,7.0,8.0,7.0,DIS,1.48495,,564.898438,142,internal,142,-1041.775311,7.336446
4,0,4,2,9.0,10.0,9.0,RM,7.437,,713.554688,70,internal,70,-775.495815,11.078512
5,0,5,2,11.0,12.0,11.0,B,116.024994,,118.414062,150,internal,150,-881.551458,5.87701
6,0,6,2,13.0,14.0,13.0,NOX,0.603,,639.9375,144,internal,144,-585.892419,4.068697
7,0,7,3,,,,,,11.88,,4,leaf,4,-47.52,11.88
8,0,8,3,,,,,,7.204749,,138,leaf,138,-994.255311,7.204749
9,0,9,3,,,,,,9.67683,,40,leaf,40,-387.073212,9.67683
