# XGBoost Model Parser
This notebook shows an example of parsing an xgboost model with pygbmexpl.

In [1]:
import xgboost as xgb
from sklearn.datasets import load_boston

In [2]:
import pygbmexpl.xgb.parser as xgb_parser

# Build example xgboost model

## Load data from sklearn

In [3]:
# Load the Boston housing data; the result is indexed below by the keys
# 'data', 'target' and 'feature_names'.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and its import) only runs on older scikit-learn releases.
boston = load_boston()

## Create xgboost matrix

In [4]:
# Wrap features and target in an xgboost DMatrix. The feature names are passed
# through so that the model dumps refer to columns by name (e.g. LSTAT, RM).
xgb_data = xgb.DMatrix(
    data=boston['data'],
    label=boston['target'],
    feature_names=boston['feature_names'],
)

## Build model

In [5]:
# Train a small booster: 10 boosting rounds, trees limited to depth 3.
# 'verbosity': 0 replaces the old 'silent': 1 flag, which has been deprecated
# in favour of 'verbosity'; recent XGBoost versions flag 'silent' as an
# unused parameter.
model = xgb.train(
    params={
        'verbosity': 0,
        'max_depth': 3,
    },
    dtrain=xgb_data,
    num_boost_round=10,
)

## Export model to text and json files

In [6]:
# Write two plain-text dumps to the working directory: one with the
# per-node statistics (gain/cover) and one without them.
model.dump_model('dump_raw.txt', with_stats=True)
model.dump_model('dump_raw_no_stats.txt', with_stats=False)

In [7]:
# Write the JSON-format dumps, again once with and once without the
# per-node statistics.
model.dump_model('dump_raw.json', with_stats=True, dump_format='json')
model.dump_model('dump_raw_no_stats.json', with_stats=False, dump_format='json')

# Parse model dump

## Import text and json files

In [8]:
# Parse the text dump that includes stats. Returns the raw file lines
# (dump_raw1) and a table with one row per tree node (tree_df1).
dump_raw1, tree_df1 = xgb_parser.read_dump_text('dump_raw.txt')

In [9]:
# Parse the text dump written without stats; the resulting node table has
# no gain or cover columns.
dump_raw2, tree_df2 = xgb_parser.read_dump_text('dump_raw_no_stats.txt')

In [10]:
# Parse the JSON dump that includes stats. dump_raw3 is a list of nested
# dicts (one per tree) and tree_df3 is the flattened node table.
dump_raw3, tree_df3 = xgb_parser.read_dump_json('dump_raw.json')

In [11]:
# Parse the JSON dump written without stats; nodes carry no gain or cover.
dump_raw4, tree_df4 = xgb_parser.read_dump_json('dump_raw_no_stats.json')

## Raw dump contents read from files

In [12]:
# First 20 lines of the text dump with stats: a 'booster[i]:' header per tree,
# then tab-indented node lines ending in gain/cover (or leaf/cover for leaves).
dump_raw1[:20]

['booster[0]:\n',
 '0:[LSTAT<9.72500038] yes=1,no=2,missing=1,gain=18247.6094,cover=506\n',
 '\t1:[RM<6.94099998] yes=3,no=4,missing=3,gain=6860.23438,cover=212\n',
 '\t\t3:[DIS<1.48494995] yes=7,no=8,missing=7,gain=564.898438,cover=142\n',
 '\t\t\t7:leaf=11.8800001,cover=4\n',
 '\t\t\t8:leaf=7.20474863,cover=138\n',
 '\t\t4:[RM<7.43700027] yes=9,no=10,missing=9,gain=713.554688,cover=70\n',
 '\t\t\t9:leaf=9.67683029,cover=40\n',
 '\t\t\t10:leaf=12.9474201,cover=30\n',
 '\t2:[LSTAT<16.0849991] yes=5,no=6,missing=5,gain=2385.59375,cover=294\n',
 '\t\t5:[B<116.024994] yes=11,no=12,missing=11,gain=118.414062,cover=150\n',
 '\t\t\t11:leaf=3.54750013,cover=7\n',
 '\t\t\t12:leaf=5.99104166,cover=143\n',
 '\t\t6:[NOX<0.603000045] yes=13,no=14,missing=13,gain=639.9375,cover=144\n',
 '\t\t\t13:leaf=5.06040001,cover=49\n',
 '\t\t\t14:leaf=3.55718756,cover=95\n',
 'booster[1]:\n',
 '0:[RM<6.8375001] yes=1,no=2,missing=1,gain=9723.03125,cover=506\n',
 '\t1:[LSTAT<14.3999996] yes=3,no=4,missing=3,ga

In [13]:
# First 20 lines of the no-stats text dump: node lines stop after
# yes/no/missing, and leaf lines hold only the leaf value.
dump_raw2[:20]

['booster[0]:\n',
 '0:[LSTAT<9.72500038] yes=1,no=2,missing=1\n',
 '\t1:[RM<6.94099998] yes=3,no=4,missing=3\n',
 '\t\t3:[DIS<1.48494995] yes=7,no=8,missing=7\n',
 '\t\t\t7:leaf=11.8800001\n',
 '\t\t\t8:leaf=7.20474863\n',
 '\t\t4:[RM<7.43700027] yes=9,no=10,missing=9\n',
 '\t\t\t9:leaf=9.67683029\n',
 '\t\t\t10:leaf=12.9474201\n',
 '\t2:[LSTAT<16.0849991] yes=5,no=6,missing=5\n',
 '\t\t5:[B<116.024994] yes=11,no=12,missing=11\n',
 '\t\t\t11:leaf=3.54750013\n',
 '\t\t\t12:leaf=5.99104166\n',
 '\t\t6:[NOX<0.603000045] yes=13,no=14,missing=13\n',
 '\t\t\t13:leaf=5.06040001\n',
 '\t\t\t14:leaf=3.55718756\n',
 'booster[1]:\n',
 '0:[RM<6.8375001] yes=1,no=2,missing=1\n',
 '\t1:[LSTAT<14.3999996] yes=3,no=4,missing=3\n',
 '\t\t3:[DIS<1.38485003] yes=7,no=8,missing=7\n']

In [14]:
# Number of top-level entries in the JSON dump; expected to match the
# 10 boosting rounds used for training (one entry per tree).
len(dump_raw3)

10

In [15]:
# Show only the first tree: a nested dict whose 'children' lists hold the sub-nodes.
dump_raw3[:1]

[{'nodeid': 0,
  'depth': 0,
  'split': 'LSTAT',
  'split_condition': 9.72500038,
  'yes': 1,
  'no': 2,
  'missing': 1,
  'gain': 18247.6094,
  'cover': 506,
  'children': [{'nodeid': 1,
    'depth': 1,
    'split': 'RM',
    'split_condition': 6.94099998,
    'yes': 3,
    'no': 4,
    'missing': 3,
    'gain': 6860.23438,
    'cover': 212,
    'children': [{'nodeid': 3,
      'depth': 2,
      'split': 'DIS',
      'split_condition': 1.48494995,
      'yes': 7,
      'no': 8,
      'missing': 7,
      'gain': 564.898438,
      'cover': 142,
      'children': [{'nodeid': 7, 'leaf': 11.8800001, 'cover': 4},
       {'nodeid': 8, 'leaf': 7.20474863, 'cover': 138}]},
     {'nodeid': 4,
      'depth': 2,
      'split': 'RM',
      'split_condition': 7.43700027,
      'yes': 9,
      'no': 10,
      'missing': 9,
      'gain': 713.554688,
      'cover': 70,
      'children': [{'nodeid': 9, 'leaf': 9.67683029, 'cover': 40},
       {'nodeid': 10, 'leaf': 12.9474201, 'cover': 30}]}]},
   {'no

In [16]:
# Number of top-level entries in the no-stats JSON dump; expected to match
# the 10 boosting rounds used for training (one entry per tree).
len(dump_raw4)

10

In [17]:
# First tree of the no-stats JSON dump; nodes have no 'gain' or 'cover' keys.
dump_raw4[:1]

[{'nodeid': 0,
  'depth': 0,
  'split': 'LSTAT',
  'split_condition': 9.72500038,
  'yes': 1,
  'no': 2,
  'missing': 1,
  'children': [{'nodeid': 1,
    'depth': 1,
    'split': 'RM',
    'split_condition': 6.94099998,
    'yes': 3,
    'no': 4,
    'missing': 3,
    'children': [{'nodeid': 3,
      'depth': 2,
      'split': 'DIS',
      'split_condition': 1.48494995,
      'yes': 7,
      'no': 8,
      'missing': 7,
      'children': [{'nodeid': 7, 'leaf': 11.8800001},
       {'nodeid': 8, 'leaf': 7.20474863}]},
     {'nodeid': 4,
      'depth': 2,
      'split': 'RM',
      'split_condition': 7.43700027,
      'yes': 9,
      'no': 10,
      'missing': 9,
      'children': [{'nodeid': 9, 'leaf': 9.67683029},
       {'nodeid': 10, 'leaf': 12.9474201}]}]},
   {'nodeid': 2,
    'depth': 1,
    'split': 'LSTAT',
    'split_condition': 16.0849991,
    'yes': 5,
    'no': 6,
    'missing': 5,
    'children': [{'nodeid': 5,
      'depth': 2,
      'split': 'B',
      'split_condition': 1

## Tree structure 

In [18]:
# Parsed text dump with stats: split rows carry yes/no/missing, split,
# split_condition, gain and cover; leaf rows carry only cover and leaf.
tree_df1.head(20)

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,gain,cover,leaf
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,18247.6094,506.0,
1,0,1,1,3.0,4.0,3.0,RM,6.941,6860.23438,212.0,
8,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,2385.59375,294.0,
2,0,3,2,7.0,8.0,7.0,DIS,1.48495,564.898438,142.0,
5,0,4,2,9.0,10.0,9.0,RM,7.437,713.554688,70.0,
9,0,5,2,11.0,12.0,11.0,B,116.024994,118.414062,150.0,
12,0,6,2,13.0,14.0,13.0,NOX,0.603,639.9375,144.0,
3,0,7,3,,,,,,,4.0,11.88
4,0,8,3,,,,,,,138.0,7.204749
6,0,9,3,,,,,,,40.0,9.67683


In [19]:
# Parsed no-stats text dump: there are no gain or cover columns, only the
# tree structure, split definitions and leaf values.
tree_df2.head(20)

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf
0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,
1,0,1,1,3.0,4.0,3.0,RM,6.941,
8,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,
2,0,3,2,7.0,8.0,7.0,DIS,1.48495,
5,0,4,2,9.0,10.0,9.0,RM,7.437,
9,0,5,2,11.0,12.0,11.0,B,116.024994,
12,0,6,2,13.0,14.0,13.0,NOX,0.603,
3,0,7,3,,,,,,11.88
4,0,8,3,,,,,,7.204749
6,0,9,3,,,,,,9.67683


In [20]:
# Parsed JSON dump with stats. Leaf rows have an empty depth here because
# the JSON leaf entries carry only nodeid, leaf and cover.
tree_df3.head(20)

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,gain,cover,leaf
0,0,0,0.0,1.0,2.0,1.0,LSTAT,9.725,18247.6094,506,
1,0,1,1.0,3.0,4.0,3.0,RM,6.941,6860.23438,212,
2,0,2,1.0,5.0,6.0,5.0,LSTAT,16.084999,2385.59375,294,
3,0,3,2.0,7.0,8.0,7.0,DIS,1.48495,564.898438,142,
4,0,4,2.0,9.0,10.0,9.0,RM,7.437,713.554688,70,
5,0,5,2.0,11.0,12.0,11.0,B,116.024994,118.414062,150,
6,0,6,2.0,13.0,14.0,13.0,NOX,0.603,639.9375,144,
7,0,7,,,,,,,,4,11.88
8,0,8,,,,,,,,138,7.204749
9,0,9,,,,,,,,40,9.67683


In [21]:
# Parsed no-stats JSON dump: no gain or cover columns, and leaf rows have
# an empty depth.
tree_df4.head(20)

Unnamed: 0,tree,nodeid,depth,yes,no,missing,split,split_condition,leaf
0,0,0,0.0,1.0,2.0,1.0,LSTAT,9.725,
1,0,1,1.0,3.0,4.0,3.0,RM,6.941,
2,0,2,1.0,5.0,6.0,5.0,LSTAT,16.084999,
3,0,3,2.0,7.0,8.0,7.0,DIS,1.48495,
4,0,4,2.0,9.0,10.0,9.0,RM,7.437,
5,0,5,2.0,11.0,12.0,11.0,B,116.024994,
6,0,6,2.0,13.0,14.0,13.0,NOX,0.603,
7,0,7,,,,,,,11.88
8,0,8,,,,,,,7.204749
9,0,9,,,,,,,9.67683


## Other useful tree information

Total number of nodes in all trees

In [22]:
# Each row is one node, so the row count is the node total across all trees.
len(tree_df3)

144

Number of nodes per tree

In [23]:
# Group the node rows by tree id and count the rows in each group.
tree_df3.groupby('tree').size()

tree
0    15
1    13
2    13
3    15
4    15
5    15
6    15
7    15
8    13
9    15
dtype: int64

Number of terminal nodes per tree

In [24]:
# Terminal nodes are the rows with no split feature; count them per tree.
tree_df3[tree_df3['split'].isna()].groupby('tree').size()

tree
0    8
1    7
2    7
3    8
4    8
5    8
6    8
7    8
8    7
9    8
dtype: int64

# Get tree node predictions

In [25]:
# Takes the trained model object (not a dump file path) and returns a node
# table that also holds a prediction (weight) for every node.
tree_df = xgb_parser.extract_model_predictions(model)

In [26]:
# Besides the parsed dump columns this table has weight, node_type, H and G.
# In the rows shown, H matches cover, weight equals -G/H, and for leaf rows
# weight equals the leaf value.
tree_df.head(20)

Unnamed: 0,index,tree,nodeid,depth,yes,no,missing,split,split_condition,gain,cover,leaf,weight,node_type,H,G
0,0,0,0,0,1.0,2.0,1.0,LSTAT,9.725,18247.6094,506.0,,6.491532,internal,506.0,-3284.715003
1,1,0,1,1,3.0,4.0,3.0,RM,6.941,6860.23438,212.0,,8.572034,internal,212.0,-1817.271126
2,8,0,2,1,5.0,6.0,5.0,LSTAT,16.084999,2385.59375,294.0,,4.991306,internal,294.0,-1467.443877
3,2,0,3,2,7.0,8.0,7.0,DIS,1.48495,564.898438,142.0,,7.336446,internal,142.0,-1041.775311
4,5,0,4,2,9.0,10.0,9.0,RM,7.437,713.554688,70.0,,11.078512,internal,70.0,-775.495815
5,9,0,5,2,11.0,12.0,11.0,B,116.024994,118.414062,150.0,,5.87701,internal,150.0,-881.551458
6,12,0,6,2,13.0,14.0,13.0,NOX,0.603,639.9375,144.0,,4.068697,internal,144.0,-585.892419
7,3,0,7,3,,,,,,,4.0,11.88,11.88,leaf,4.0,-47.52
8,4,0,8,3,,,,,,,138.0,7.204749,7.204749,leaf,138.0,-994.255311
9,6,0,9,3,,,,,,,40.0,9.67683,9.67683,leaf,40.0,-387.073212
