In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd

In [2]:
# Gamma regression (log link function) demo with xgboost.  The autoclaims
# dataset must be generated first by running gen_autoclaims.R located in
# xgboost/demo/data.

# rows [0, n_train) are the training set, rows [n_train, n_total) the test set;
# columns [0, label_col) are the features and column label_col is the label
n_train, n_total, label_col = 4741, 6773, 34

data = np.genfromtxt('/Users/richardangell/Projects/xgboost/demo/data/autoclaims.csv', delimiter=',')
dtrain = xgb.DMatrix(data[:n_train, :label_col], data[:n_train, label_col])
dtest = xgb.DMatrix(data[n_train:n_total, :label_col], data[n_train:n_total, label_col])

# gamma regression needs the 'reg:gamma' objective; the xgboost demo also
# suggests a base_score between 1 and 5 when the number of rounds is small
param = {
    'silent': 1,
    'objective': 'reg:gamma',
    'booster': 'gbtree',
    'max_depth': 6,
    'base_score': 3,
}

# data sets whose metric is reported after every boosting round
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 100

# training and evaluation
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()

# gamma deviance of the test-set predictions
deviance = 2 * np.sum((labels - preds) / preds - np.log(labels) + np.log(preds))
print('test deviance=%f' % deviance)

[0]	eval-gamma-nloglik:474.955	train-gamma-nloglik:452.463
[1]	eval-gamma-nloglik:352.75	train-gamma-nloglik:336.077
[2]	eval-gamma-nloglik:262.296	train-gamma-nloglik:249.934
[3]	eval-gamma-nloglik:195.364	train-gamma-nloglik:186.194
[4]	eval-gamma-nloglik:145.856	train-gamma-nloglik:139.052
[5]	eval-gamma-nloglik:109.256	train-gamma-nloglik:104.205
[6]	eval-gamma-nloglik:82.2192	train-gamma-nloglik:78.4658
[7]	eval-gamma-nloglik:62.2658	train-gamma-nloglik:59.474
[8]	eval-gamma-nloglik:47.5594	train-gamma-nloglik:45.4801
[9]	eval-gamma-nloglik:36.7394	train-gamma-nloglik:35.1879
[10]	eval-gamma-nloglik:28.7975	train-gamma-nloglik:27.637
[11]	eval-gamma-nloglik:22.9865	train-gamma-nloglik:22.1154
[12]	eval-gamma-nloglik:18.7522	train-gamma-nloglik:18.0955
[13]	eval-gamma-nloglik:15.6839	train-gamma-nloglik:15.1858
[14]	eval-gamma-nloglik:13.4628	train-gamma-nloglik:13.0951
[15]	eval-gamma-nloglik:11.8812	train-gamma-nloglik:11.6075
[16]	eval-gamma-nloglik:10.768	train-gamma-nloglik:10

In [3]:
# write the trained trees to a text file in the working directory; with_stats = True
# is what puts the gain / cover fields on each node line that read_dump parses below
bst.dump_model('model_dump_test.txt', with_stats = True)

In [4]:
def read_dump(file):
    """
    Reads an xgboost model dump .txt file and parses it into a tabular structure.

    The file is expected to contain one ``booster[n]:`` header line per tree, followed by
    one line per node of that tree, in one of two forms:

    * split node: ``<node>:[<feature><<threshold>] yes=<id>,no=<id>,missing=<id>,gain=<g>,cover=<c>``
    * leaf node:  ``<node>:leaf=<value>,cover=<c>``

    Blank lines are ignored and the last line does not need a trailing newline.

    :param file: path to the xgboost model dump .txt file
    :return: tuple ``(lines, lines_df)``; ``lines`` is the list of raw lines read from the
        file and ``lines_df`` is a pd.DataFrame with one row per node and the columns
        tree, node, yes, no, missing, split_var, split_point, quality, cover, sorted by
        tree and node. ``quality`` holds the gain for split nodes and the leaf value for
        leaf nodes. Fields that are absent for a node (e.g. yes/no on a leaf, or a
        gain/cover field missing from the line) are NaN.
    """
    with open(file) as f:

        lines = f.readlines()

    col_order = ['tree', 'node', 'yes', 'no', 'missing', 'split_var', 'split_point', 'quality', 'cover']

    tree_no = -1

    records = []

    for raw_line in lines:

        # strip() drops the leading tabs and the line terminator, if there is one
        node_str = raw_line.strip()

        if not node_str:

            continue

        # a 'booster[n]:' header starts a new tree
        if node_str.startswith('booster'):

            tree_no += 1

            continue

        # node number before the first ':', node description after it
        node_id, node_body = node_str.split(':', 1)

        record = {'tree': tree_no, 'node': int(node_id)}

        if node_body.startswith('leaf'):

            stats = node_body

        else:

            # '[feature<threshold]' and the key=value list are separated by the last space
            condition, stats = node_body.rsplit(' ', 1)

            # split on the last '<' so the threshold is always the final piece
            split_var, split_point = condition.strip('[]').rsplit('<', 1)

            record['split_var'] = split_var
            record['split_point'] = float(split_point)

        # look the statistics up by name rather than by position
        fields = dict(item.split('=', 1) for item in stats.split(','))

        if 'leaf' in fields:

            record['quality'] = float(fields['leaf'])

        else:

            # child node ids
            record['yes'] = int(fields['yes'])
            record['no'] = int(fields['no'])
            record['missing'] = int(fields['missing'])

            # note quality = gain
            record['quality'] = float(fields.get('gain', 'nan'))

        record['cover'] = float(fields.get('cover', 'nan'))

        records.append(record)

    # passing columns= fixes the column order and creates any column no record supplied
    # (e.g. yes/no/missing when every tree is a single leaf) instead of raising KeyError
    lines_df = pd.DataFrame(records, columns=col_order)

    lines_df = lines_df.sort_values(['tree', 'node'])

    return (lines, lines_df)

In [149]:
# parse the dump written earlier: lines keeps the raw text lines, model_structure has one row per node
lines, model_structure = read_dump('model_dump_test.txt')

In [150]:
model_structure.head(20)

Unnamed: 0,tree,node,yes,no,missing,split_var,split_point,quality,cover
0,0,0,,,,,,0.299507,2885251.0
1,1,0,,,,,,0.299335,2138500.0
2,2,0,,,,,,0.299103,1585294.0
3,3,0,,,,,,0.29879,1175469.0
4,4,0,,,,,,0.298368,871863.5
5,5,0,,,,,,0.297801,646947.2
6,6,0,,,,,,0.297038,480325.3
7,7,0,,,,,,0.296014,356889.2
8,8,0,,,,,,0.294641,265446.0
9,9,0,,,,,,0.292804,197703.9


In [151]:
# base score on the log (link) scale; the 3 repeats the 'base_score' entry of param used for training
base_score = np.log(3)

In [152]:
# initialise as float (0.0, not 0): the column is filled with floating point weights
# afterwards, and assigning floats into an integer column is deprecated in recent pandas
model_structure['weight'] = 0.0

In [153]:
# leaf rows (no split variable): weight on the log scale = base score + leaf value from the dump
is_leaf = model_structure.split_var.isnull()
model_structure.loc[is_leaf, 'weight'] = model_structure.loc[is_leaf, 'quality'] + base_score

In [154]:
# H is a straight copy of the node's cover from the dump
# NOTE(review): presumably cover is the sum of hessians of the rows in the node, hence the name H — confirm against the xgboost docs
model_structure['H'] = model_structure['cover']

In [155]:
# initialise as float (0.0, not 0): leaf G values assigned next are floats, and assigning
# floats into an integer column is deprecated in recent pandas
model_structure['G'] = 0.0

In [156]:
# leaf rows: G = -weight * H, i.e. weight = -G / H solved for G
leaf_rows = model_structure.split_var.isnull()
model_structure.loc[leaf_rows, 'G'] = -(
    model_structure.loc[leaf_rows, 'weight'] * model_structure.loc[leaf_rows, 'H']
)

In [157]:
# move the current row labels into an 'index' column and renumber the rows from 0
# (not idempotent: running this cell again would try to insert 'index' a second time)
model_structure = model_structure.reset_index()

In [158]:
# rows without a split point are leaves, every other row is an internal node
model_structure['node_type'] = model_structure.split_point.isnull().map({True: 'leaf', False: 'internal'})

In [159]:
model_structure.head(26)

Unnamed: 0,index,tree,node,yes,no,missing,split_var,split_point,quality,cover,weight,H,G,node_type
0,0,0,0,,,,,,0.299507,2885251.0,1.398119,2885251.0,-4033925.0,leaf
1,1,1,0,,,,,,0.299335,2138500.0,1.397947,2138500.0,-2989511.0,leaf
2,2,2,0,,,,,,0.299103,1585294.0,1.397715,1585294.0,-2215790.0,leaf
3,3,3,0,,,,,,0.29879,1175469.0,1.397402,1175469.0,-1642603.0,leaf
4,4,4,0,,,,,,0.298368,871863.5,1.396981,871863.5,-1217976.0,leaf
5,5,5,0,,,,,,0.297801,646947.2,1.396413,646947.2,-903405.7,leaf
6,6,6,0,,,,,,0.297038,480325.3,1.395651,480325.3,-670366.3,leaf
7,7,7,0,,,,,,0.296014,356889.2,1.394626,356889.2,-497727.0,leaf
8,8,8,0,,,,,,0.294641,265446.0,1.393253,265446.0,-369833.5,leaf
9,9,9,0,,,,,,0.292804,197703.9,1.391417,197703.9,-275088.6,leaf


In [160]:
#xx = model_structure.loc[model_structure.tree == 13].copy()
#xx

In [161]:
#aa= single_tree_fill(xx)

In [162]:
def single_tree_fill(df):
    """
    Propagates the leaf G values of a single tree up to all of their ancestors.

    Every leaf's G is added to the G of each node on the path from the leaf to the root,
    so afterwards the G of a non-leaf node is its starting value plus the G of every
    leaf beneath it. Leaves are processed in row order.

    :param df: pd.DataFrame with the nodes of ONE tree and the columns node, yes, no, G
        and node_type (leaves are marked 'leaf'). yes / no hold the child node ids of a
        split node and are null for leaves; the root must have node id 0.
    :return: copy of df with G updated for the non-leaf nodes; df itself is not modified
    :raises ValueError: if a node other than the root has no parent, if a node is listed
        as the child of two different nodes, or if the yes / no links form a cycle
    """
    df = df.copy()

    # child node id -> parent node id, built once so the walk up the tree does not have
    # to re-scan the whole frame at every step
    parent_of = {}

    for child_col in ('yes', 'no'):

        links = df.loc[df[child_col].notnull(), ['node', child_col]]

        for parent_node, child_node in zip(links['node'], links[child_col]):

            parent_node, child_node = int(parent_node), int(child_node)

            if parent_of.get(child_node, parent_node) != parent_node:

                raise ValueError('node %d is listed as a child of more than one node' % child_node)

            parent_of[child_node] = parent_node

    # node id -> row label, used to address the G cell of an ancestor
    row_of = {int(node): label for label, node in zip(df.index, df['node'])}

    leaf_df = df.loc[df.node_type == 'leaf']

    # loop through each leaf node
    for leaf_node, leaf_G in zip(leaf_df['node'], leaf_df['G']):

        current_node = int(leaf_node)

        steps = 0

        # traverse the tree bottom to top and propagate the G value upwards;
        # a leaf that is itself the root (node 0) has nothing to propagate
        while current_node > 0:

            if current_node not in parent_of:

                raise ValueError('node %d has no parent in the supplied tree' % current_node)

            steps += 1

            # a valid path visits each parent link at most once
            if steps > len(parent_of):

                raise ValueError('cycle detected while walking up from leaf %d' % int(leaf_node))

            current_node = parent_of[current_node]

            df.at[row_of[current_node], 'G'] += leaf_G

    return df

In [163]:
# highest tree id in the parsed dump; read_dump numbers trees from 0, so this is the tree count minus one
trees = model_structure.tree.max()
trees

99

In [164]:
# NOTE(review): leftover scaffolding. new_trees starts as the rows with tree == -1 (expected to be
# none, since read_dump numbers trees from 0), and the loop only selects each tree's rows and
# evaluates new_trees without changing it.
new_trees = model_structure.loc[model_structure.tree == -1]
for n in range(trees + 1):
    tree_df = model_structure.loc[model_structure.tree == n]
    new_trees

In [165]:
# propagate G separately within every tree id from 0 up to trees
new_trees = []
for n in range(trees + 1):
    rows_of_tree = model_structure.loc[model_structure.tree == n]
    new_trees.append(single_tree_fill(rows_of_tree))

In [166]:
# stack the per-tree frames back into one frame (row labels of model_structure are kept)
model_structure2 = pd.concat(new_trees, axis = 0)

In [167]:
model_structure.shape

(6766, 14)

In [168]:
model_structure2.shape

(6766, 14)

In [169]:
model_structure2.head(26)

Unnamed: 0,index,tree,node,yes,no,missing,split_var,split_point,quality,cover,weight,H,G,node_type
0,0,0,0,,,,,,0.299507,2885251.0,1.398119,2885251.0,-4033925.0,leaf
1,1,1,0,,,,,,0.299335,2138500.0,1.397947,2138500.0,-2989511.0,leaf
2,2,2,0,,,,,,0.299103,1585294.0,1.397715,1585294.0,-2215790.0,leaf
3,3,3,0,,,,,,0.29879,1175469.0,1.397402,1175469.0,-1642603.0,leaf
4,4,4,0,,,,,,0.298368,871863.5,1.396981,871863.5,-1217976.0,leaf
5,5,5,0,,,,,,0.297801,646947.2,1.396413,646947.2,-903405.7,leaf
6,6,6,0,,,,,,0.297038,480325.3,1.395651,480325.3,-670366.3,leaf
7,7,7,0,,,,,,0.296014,356889.2,1.394626,356889.2,-497727.0,leaf
8,8,8,0,,,,,,0.294641,265446.0,1.393253,265446.0,-369833.5,leaf
9,9,9,0,,,,,,0.292804,197703.9,1.391417,197703.9,-275088.6,leaf


In [170]:
(model_structure2.G == 0).sum()

0

In [171]:
# internal nodes: weight implied by the propagated sums, weight = -G / H
internal_rows = model_structure2.node_type == 'internal'
model_structure2.loc[internal_rows, 'weight'] = -(
    model_structure2.loc[internal_rows, 'G'] / model_structure2.loc[internal_rows, 'H']
)

In [172]:
model_structure2.head(26)

Unnamed: 0,index,tree,node,yes,no,missing,split_var,split_point,quality,cover,weight,H,G,node_type
0,0,0,0,,,,,,0.299507,2885251.0,1.398119,2885251.0,-4033925.0,leaf
1,1,1,0,,,,,,0.299335,2138500.0,1.397947,2138500.0,-2989511.0,leaf
2,2,2,0,,,,,,0.299103,1585294.0,1.397715,1585294.0,-2215790.0,leaf
3,3,3,0,,,,,,0.29879,1175469.0,1.397402,1175469.0,-1642603.0,leaf
4,4,4,0,,,,,,0.298368,871863.5,1.396981,871863.5,-1217976.0,leaf
5,5,5,0,,,,,,0.297801,646947.2,1.396413,646947.2,-903405.7,leaf
6,6,6,0,,,,,,0.297038,480325.3,1.395651,480325.3,-670366.3,leaf
7,7,7,0,,,,,,0.296014,356889.2,1.394626,356889.2,-497727.0,leaf
8,8,8,0,,,,,,0.294641,265446.0,1.393253,265446.0,-369833.5,leaf
9,9,9,0,,,,,,0.292804,197703.9,1.391417,197703.9,-275088.6,leaf


In [174]:
# test-set predictions with ntree_limit = 1, to compare against the weight of tree 0
# NOTE(review): ntree_limit is deprecated in later xgboost releases in favour of iteration_range — confirm against the installed version
preds2 = bst.predict(dtest, ntree_limit = 1)

In [175]:
preds2[:5]

array([4.0475802, 4.0475802, 4.0475802, 4.0475802, 4.0475802],
      dtype=float32)

In [173]:
# 1.398119 is the weight of tree 0's single leaf, copied by hand from the table above;
# exponentiating undoes the log link so the value can be compared with preds2
np.exp(1.398119)

4.0475793051862565

In [177]:
# test-set predictions with ntree_limit = 2
preds3 = bst.predict(dtest, ntree_limit = 2)

In [178]:
preds3[:5]

array([5.460029, 5.460029, 5.460029, 5.460029, 5.460029], dtype=float32)

In [187]:
# weights of trees 0 and 1 (hand-copied from the table); each weight already contains the
# base score log(3), so one log(3) is subtracted to count the base score only once
np.exp(1.398119+1.3979470-np.log(3))

5.4600268718660425

In [188]:
# test-set predictions with ntree_limit = 3
preds4 = bst.predict(dtest, ntree_limit = 3)

In [189]:
preds4[:5]

array([7.3636575, 7.3636575, 7.3636575, 7.3636575, 7.3636575],
      dtype=float32)

In [190]:
# same check for trees 0-2: three hand-copied weights, with log(3) subtracted twice so the
# base score is counted only once
np.exp(1.398119+1.3979470-np.log(3)+1.397715-np.log(3))

7.363655073111648

In [212]:
# test-set predictions with ntree_limit = 11
preds5 = bst.predict(dtest, ntree_limit = 11)

In [213]:
preds5[:5]

array([78.442825, 78.442825, 78.442825, 78.442825, 78.442825],
      dtype=float32)

In [193]:
dtest

<xgboost.core.DMatrix at 0x1a2288a0b8>

In [195]:
data[4741:4747, 29]

array([0., 0., 0., 0., 0., 0.])

In [211]:
# sum the weight of every row in trees 0-10 and remove the 10 surplus copies of the base score
# NOTE(review): this sums ALL nodes of those trees, so it only equals a prediction if each of
# trees 0-10 is a single leaf — the table above shows that for trees 0-9; confirm for tree 10
np.exp(model_structure2.loc[(model_structure2.tree <= 10), 'weight'].sum() - (10*np.log(3)))

78.4428162301013

In [216]:
# the same sum as in the np.exp(...) check, left on the log scale
model_structure2.loc[(model_structure2.tree <= 10), 'weight'].sum() - (10*np.log(3))

4.362369903668112

In [214]:
# test-set predictions with ntree_limit = 12
preds6 = bst.predict(dtest, ntree_limit = 12)

In [215]:
preds6[:5]

array([104.53664, 104.53664, 104.53664, 104.53664, 104.53664],
      dtype=float32)

In [217]:
1.385718-np.log(3)

0.2871057113318902

In [218]:
1.385780-np.log(3)

0.28716771133189023

In [221]:
# 4.3623... is the log-scale sum for trees 0-10 printed by an earlier cell; 0.2871... is the
# output of the `1.385780-np.log(3)` cell
# NOTE(review): 1.385780 is presumably the weight of the tree 11 leaf these rows fall in — its source is not shown here
np.exp(4.362369903668112+0.28716771133189023)

104.53663822696451

In [202]:
# counts rows whose row label is <= 11 and whose node id is below 2
# NOTE(review): this filters on the row label (index), whereas the next cell filters on tree — confirm which was intended
len(model_structure2.loc[(model_structure2.index <= 11) &(model_structure2.node < 2), 'weight'])

12

In [204]:
model_structure2.loc[(model_structure2.tree <= 11) &(model_structure2.node < 2)]

Unnamed: 0,index,tree,node,yes,no,missing,split_var,split_point,quality,cover,weight,H,G,node_type
0,0,0,0,,,,,,0.299507,2885250.75,1.398119,2885250.75,-4033925.0,leaf
1,1,1,0,,,,,,0.299335,2138500.5,1.397947,2138500.5,-2989511.0,leaf
2,2,2,0,,,,,,0.299103,1585294.38,1.397715,1585294.38,-2215790.0,leaf
3,3,3,0,,,,,,0.29879,1175469.25,1.397402,1175469.25,-1642603.0,leaf
4,4,4,0,,,,,,0.298368,871863.5,1.396981,871863.5,-1217976.0,leaf
5,5,5,0,,,,,,0.297801,646947.188,1.396413,646947.188,-903405.7,leaf
6,6,6,0,,,,,,0.297038,480325.344,1.395651,480325.344,-670366.3,leaf
7,7,7,0,,,,,,0.296014,356889.188,1.394626,356889.188,-497727.0,leaf
8,8,8,0,,,,,,0.294641,265446.031,1.393253,265446.031,-369833.5,leaf
9,9,9,0,,,,,,0.292804,197703.938,1.391417,197703.938,-275088.6,leaf


In [206]:
np.log(3)

1.0986122886681098

In [154]:
# lookup table of G per (tree, node), used to attach each child's G to its parent row by merging
# NOTE(review): the execution counts and outputs of this and the following cells look like they
# come from an earlier run of the notebook — re-run from a fresh kernel before relying on them
trees_G = model_structure[['tree', 'node', 'G']].copy()

In [155]:
# rename in place so the merged columns are recognisable as belonging to the 'yes' child
trees_G.columns = ['tree_yes', 'node_yes', 'G_yes']

In [156]:
trees_G.head()

Unnamed: 0,tree_yes,node_yes,G_yes
0,0,0,-2306778.0
1,1,0,-1709378.0
2,2,0,-1266813.0
3,3,0,-938952.8
4,4,0,-696068.2


In [157]:
# attach the G of each row's 'yes' child; rows without a yes child (leaves) get NaN in the new columns
# (overwrites model_structure, so running this cell twice merges the same columns in again)
model_structure = model_structure.merge(trees_G,
                                        how = 'left',
                                        left_on = ['tree', 'yes'],
                                        right_on = ['tree_yes', 'node_yes'])

In [158]:
model_structure.head(20)

Unnamed: 0,index,tree,node,yes,no,missing,split_var,split_point,quality,cover,weight,H,G,tree_yes,node_yes,G_yes
0,0,0,0,,,,,,0.299507,2885251.0,0.799507,2885251.0,-2306778.0,,,
1,1,1,0,,,,,,0.299335,2138500.0,0.799335,2138500.0,-1709378.0,,,
2,2,2,0,,,,,,0.299103,1585294.0,0.799103,1585294.0,-1266813.0,,,
3,3,3,0,,,,,,0.29879,1175469.0,0.79879,1175469.0,-938952.8,,,
4,4,4,0,,,,,,0.298368,871863.5,0.798368,871863.5,-696068.2,,,
5,5,5,0,,,,,,0.297801,646947.2,0.797801,646947.2,-516135.2,,,
6,6,6,0,,,,,,0.297038,480325.3,0.797038,480325.3,-382837.7,,,
7,7,7,0,,,,,,0.296014,356889.2,0.796014,356889.2,-284088.8,,,
8,8,8,0,,,,,,0.294641,265446.0,0.794641,265446.0,-210934.2,,,
9,9,9,0,,,,,,0.292804,197703.9,0.792804,197703.9,-156740.6,,,


In [159]:
# reuse the same lookup table for the 'no' child by renaming its columns in place again
trees_G.columns = ['tree_no', 'node_no', 'G_no']

In [160]:
# attach the G of each row's 'no' child; rows without a no child (leaves) get NaN in the new columns
model_structure = model_structure.merge(trees_G,
                                        how = 'left',
                                        left_on = ['tree', 'no'],
                                        right_on = ['tree_no', 'node_no'])

In [163]:
model_structure.head(26)

Unnamed: 0,index,tree,node,yes,no,missing,split_var,split_point,quality,cover,weight,H,G,tree_yes,node_yes,G_yes,tree_no,node_no,G_no
0,0,0,0,,,,,,0.299507,2885251.0,0.799507,2885251.0,-2306778.0,,,,,,
1,1,1,0,,,,,,0.299335,2138500.0,0.799335,2138500.0,-1709378.0,,,,,,
2,2,2,0,,,,,,0.299103,1585294.0,0.799103,1585294.0,-1266813.0,,,,,,
3,3,3,0,,,,,,0.29879,1175469.0,0.79879,1175469.0,-938952.8,,,,,,
4,4,4,0,,,,,,0.298368,871863.5,0.798368,871863.5,-696068.2,,,,,,
5,5,5,0,,,,,,0.297801,646947.2,0.797801,646947.2,-516135.2,,,,,,
6,6,6,0,,,,,,0.297038,480325.3,0.797038,480325.3,-382837.7,,,,,,
7,7,7,0,,,,,,0.296014,356889.2,0.796014,356889.2,-284088.8,,,,,,
8,8,8,0,,,,,,0.294641,265446.0,0.794641,265446.0,-210934.2,,,,,,
9,9,9,0,,,,,,0.292804,197703.9,0.792804,197703.9,-156740.6,,,,,,


In [None]:
  # NOTE(review): the two statements below use R data.table syntax (`:=`), not Python, and
  # xgb_trees is not defined anywhere in this notebook; the cell has never been executed.
  # Intended step, as written: for non-leaf rows set G = G_yes + G_no, then weight = -G / H.
  xgb_trees[Feature != 'Leaf', G := G_yes + G_no]
  
  xgb_trees[Feature != 'Leaf', weight := - G / H]