### Speed comparison of gradient boosting libraries for SHAP value calculation

Here we compare the speed of SHAP value calculation in CatBoost, LightGBM and XGBoost on GPU.

We used a Titan X Pascal GPU for training and evaluation.

We use the epsilon_normalized dataset from [here](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/).

In [1]:
import os
import numpy as np
import scipy
import pandas as pd
import copy
import tqdm
import datetime
from sklearn import datasets
import catboost
import xgboost as xgb
import lightgbm as lgb
import time

In [2]:
# Library versions used for this benchmark (displayed as the cell output).
catboost.__version__, lgb.__version__, xgb.__version__

('0.11.1', '2.2.3', '0.81')

In [3]:
# Load the train and test splits from LIBSVM-format files in the working directory.
train_data, train_target = datasets.load_svmlight_file("epsilon_normalized")
# Pin the test matrix to the training feature count: without `n_features` the
# width is inferred per file, so the two splits could silently disagree.
test_data, test_target = datasets.load_svmlight_file(
    "epsilon_normalized.t", n_features=train_data.shape[1]
)
train_data.shape, train_target.shape

((400000, 2000), (400000,))

### Parameters

In [4]:
num_iters = 1000  # number of boosting rounds passed to each library's train call
lr = 0.1  # stored as 'learning_rate' in the shared base parameters
max_bin = 128  # stored as 'max_bin' in the shared base parameters; also used in the results CSV name
gpu_device = '1'  # GPU id handed to each library's device parameter
random_state = 0  # stored as 'random_state' in the shared base parameters

We use only the first 10k samples from the test set.

In [5]:
# Number of leading test rows kept for the SHAP timing runs.
sep = 10000

# Remap label -1 to 0 in place (presumably the raw labels are {-1, +1} and the
# binary objectives configured below expect {0, 1} -- confirm against the dataset).
train_target[train_target == -1] = 0
test_target[test_target == -1] = 0

# Keep only the first `sep` test rows and their labels.
test_data = test_data[:sep]
test_target = test_target[:sep]

In [6]:
def preprocess_data(data, label=None, mode='train', boosting=None):
    """Wrap a feature matrix in the input container one library expects.

    Parameters
    ----------
    data : array-like or scipy sparse matrix
        Feature matrix. Sparse input of any format is converted to a dense
        ndarray before being handed to the library.
    label : array-like, optional
        Targets, forwarded to the library container.
    mode : str
        Only consulted for LightGBM: 'train' builds an ``lgb.Dataset``;
        any other value returns the dense matrix itself.
    boosting : {'xgboost', 'lightgbm', 'catboost'}
        Library to prepare the data for.

    Returns
    -------
    ``xgb.DMatrix``, ``lgb.Dataset``, ``catboost.Pool``, or the dense matrix
    (LightGBM with ``mode != 'train'``).

    Raises
    ------
    AssertionError
        If ``boosting`` is None.
    RuntimeError
        If ``boosting`` is not one of the known library names.
    """
    assert boosting is not None

    # issparse covers CSC/COO/... as well as CSR, so every sparse format is
    # densified instead of only csr_matrix.
    if scipy.sparse.issparse(data):
        data = data.toarray()

    if boosting == 'xgboost':
        return xgb.DMatrix(data, label)
    elif boosting == 'lightgbm':
        if mode == 'train':
            return lgb.Dataset(data, label)
        else:
            # The dense matrix is returned as-is; no Dataset wrapper is built.
            return data
    elif boosting == 'catboost':
        return catboost.Pool(data, label)
    else:
        raise RuntimeError("Unknown boosting library")

In [7]:
def create_parameters(base_params, boosting=None, **kwargs):
    """Return a copy of ``base_params`` extended with library-specific keys.

    Parameters
    ----------
    base_params : dict
        Settings shared by all libraries; it is copied, not modified.
    boosting : {'xgboost', 'lightgbm', 'catboost'}
        Library the parameters are built for.
    **kwargs
        ``depth`` is required for 'xgboost' (used as ``max_depth``) and for
        'lightgbm' (used as ``num_leaves = 2**depth``); the 'catboost'
        branch does not read it.

    Raises
    ------
    AssertionError
        If ``boosting`` is None or ``base_params`` is not a dict.
    RuntimeError
        If ``boosting`` is not one of the known library names.
    """
    assert boosting is not None
    assert isinstance(base_params, dict)

    # Shallow copy keeps the shared base dict untouched.
    settings = copy.copy(base_params)

    if boosting == 'xgboost':
        settings.update({
            'objective': 'binary:logistic',
            'max_depth': kwargs['depth'],
            'tree_method': 'gpu_hist',
            'gpu_id': gpu_device,
        })
        return settings

    if boosting == 'lightgbm':
        settings.update({
            'objective': 'binary',
            'device': "gpu",
            'gpu_device_id': gpu_device,
            'num_leaves': 2**kwargs['depth'],
        })
        return settings

    if boosting == 'catboost':
        settings.update({
            'objective': 'Logloss',
            'task_type': 'GPU',
            'devices': gpu_device,
            'bootstrap_type': 'Bernoulli',
            'logging_level': 'Silent',
        })
        return settings

    raise RuntimeError("Unknown boosting library")

In [8]:
def train(data, params, num_iters, boosting=None):
    """Train a model with the chosen library and return the fitted booster.

    ``data`` is the library's own training container, ``params`` its
    parameter dict and ``num_iters`` the number of boosting rounds.

    Raises AssertionError when ``boosting`` is None and RuntimeError when it
    is not 'xgboost', 'lightgbm' or 'catboost'.
    """
    assert boosting is not None

    # Reject unknown names up front; the branches below are the known ones.
    if boosting not in ('xgboost', 'lightgbm', 'catboost'):
        raise RuntimeError("Unknown boosting library")

    if boosting == 'xgboost':
        booster = xgb.train(params=params, dtrain=data, num_boost_round=num_iters)
    elif boosting == 'lightgbm':
        booster = lgb.train(params=params, train_set=data, num_boost_round=num_iters)
    else:
        booster = catboost.train(pool=data, params=params, num_boost_round=num_iters)
    return booster

In [9]:
def predict_shap(model, data, boosting=None):
    """Compute per-feature SHAP contributions with the library's own API.

    Parameters
    ----------
    model
        Trained booster of the library named by ``boosting``.
    data
        Input in the form that library's call accepts.
    boosting : {'xgboost', 'lightgbm', 'catboost'}
        Selects which library call is used.

    Returns
    -------
    Whatever the underlying library call returns.

    Raises
    ------
    AssertionError
        If ``boosting`` is None.
    RuntimeError
        If ``boosting`` is not one of the known library names.
    """
    assert boosting is not None
    if boosting == 'xgboost':
        return model.predict(data, pred_contribs=True)
    elif boosting == 'lightgbm':
        return model.predict(data, pred_contrib=True)
    elif boosting == 'catboost':
        return model.get_feature_importance(data, fstr_type='ShapValues')
    else:
        # Fail loudly like the sibling helpers instead of returning None.
        raise RuntimeError("Unknown boosting library")

In [10]:
def create_path(boosting, params):
    """Derive the model file name for a library and parameter set.

    The name is the library name followed by every ``key``/``value`` pair of
    ``params`` in sorted key order, joined with underscores. All dots are
    stripped from that stem before the ``.model`` suffix is appended.
    """
    tokens = [boosting]
    tokens.extend(
        str(part)
        for pair in sorted(params.items())
        for part in pair
    )
    # Dots are removed from the stem only; the suffix is added afterwards.
    stem = "_".join(tokens).replace(".", '')
    return stem + ".model"

In [11]:
def load_model(fname, boosting):
    """Load a previously saved model from ``fname`` for the given library.

    Parameters
    ----------
    fname : str
        Path of the saved model file.
    boosting : {'xgboost', 'lightgbm', 'catboost'}
        Library that produced the file.

    Returns
    -------
    The loaded booster object of the chosen library.

    Raises
    ------
    RuntimeError
        If ``boosting`` is not one of the known library names.
    """
    if boosting == "xgboost":
        # Passing model_file to the constructor already loads the model, so
        # no separate load_model() call is needed.
        bst = xgb.Booster(model_file=fname)
    elif boosting == "lightgbm":
        bst = lgb.Booster(model_file=fname)
    elif boosting == "catboost":
        bst = catboost.CatBoost()
        bst.load_model(fname)
    else:
        raise RuntimeError("Unknown boosting")
    return bst

In [12]:
# Settings shared by all three libraries; create_parameters() copies this dict
# and adds the library-specific keys on top.
base_params = {
    'learning_rate': lr,
    'max_bin': max_bin,
    'random_state': random_state
}

In [13]:
# Collected timings: one record per (library, depth) pair.
result = []

boosting_list = ['xgboost', 'catboost', 'lightgbm']
depth_list = [2, 4, 6, 8, 10]


for gb_type in boosting_list:

    print("{} is going".format(gb_type))
    # Build each library's input containers once and reuse them for every depth.
    train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)
    test_preprocessed = preprocess_data(test_data, test_target, mode='test', boosting=gb_type)

    for depth in tqdm.tqdm(depth_list):

        params = create_parameters(base_params, boosting=gb_type, depth=depth)
        # Set for every library, so depth is always part of the cache file
        # name and of the parameters passed to train().
        params['depth'] = depth
        fname = create_path(gb_type, params)
        if os.path.exists(fname):
            # Reuse a model saved by an earlier run instead of retraining.
            print("model exist")
            bst = load_model(fname, boosting=gb_type)
        else:
            print("model is training")
            # perf_counter is monotonic, so long runs are not skewed by
            # system clock adjustments the way datetime.now() can be.
            start_train = time.perf_counter()
            bst = train(train_preprocessed, params, num_iters=num_iters, boosting=gb_type)
            finish_train = time.perf_counter()
            # Training time in whole milliseconds (not stored in `result`).
            delta_train = int((finish_train - start_train) * 1000)
            bst.save_model(fname)

        start_time = time.perf_counter()
        preds = predict_shap(bst, test_preprocessed, boosting=gb_type)
        finish_time = time.perf_counter()
        # Shape check runs after the clock stops so it is not part of the
        # measured time: one column per feature plus one extra column.
        assert preds.shape == (test_data.shape[0], test_data.shape[1] + 1)

        # SHAP computation time in whole milliseconds.
        delta = int((finish_time - start_time) * 1000)

        current_res = {
            'boosting': gb_type,
            'depth': depth,
            'time': delta,
        }

        result.append(current_res)

    print("*" * 40)

xgboost is going


  0%|          | 0/5 [00:00<?, ?it/s]

model is training


 20%|██        | 1/5 [10:33<42:14, 633.56s/it]

model is training


 40%|████      | 2/5 [25:27<38:10, 763.55s/it]

model is training


 60%|██████    | 3/5 [44:29<29:39, 889.96s/it]

model is training


 80%|████████  | 4/5 [1:10:28<17:37, 1057.17s/it]

model is training


100%|██████████| 5/5 [2:00:04<00:00, 1440.85s/it]


****************************************
catboost is going


  0%|          | 0/5 [00:00<?, ?it/s]

model exist


 20%|██        | 1/5 [00:02<00:08,  2.19s/it]

model exist


 40%|████      | 2/5 [00:04<00:06,  2.14s/it]

model exist


 60%|██████    | 3/5 [00:06<00:04,  2.26s/it]

model exist


 80%|████████  | 4/5 [00:13<00:03,  3.50s/it]

model exist


100%|██████████| 5/5 [01:49<00:00, 21.81s/it]


****************************************
lightgbm is going


  0%|          | 0/5 [00:00<?, ?it/s]

model exist


 20%|██        | 1/5 [00:00<00:02,  1.91it/s]

model exist


 40%|████      | 2/5 [00:04<00:07,  2.40s/it]

model exist


 60%|██████    | 3/5 [00:46<00:30, 15.35s/it]

model exist


 80%|████████  | 4/5 [04:42<01:10, 70.72s/it]

model exist


100%|██████████| 5/5 [24:31<00:00, 294.34s/it]

****************************************





\* Some of the models had already been trained in an earlier run, so they were loaded from disk instead of being trained again.

In [14]:
# Build a table from the collected timing records and preview its first two rows.
result_df = pd.DataFrame(result)
result_df.head(2)

Unnamed: 0,boosting,depth,time
0,xgboost,2,495
1,xgboost,4,1996


In [15]:
# Save the timings to a CSV whose name includes max_bin; the index is not written.
result_df.to_csv("shap_benchmark_{}_max_bin.csv".format(max_bin), index=False)

In [16]:
# Reload the saved timings. The file name is derived from max_bin exactly as
# on save, so changing max_bin cannot make this cell read a stale file.
result_df = pd.read_csv("shap_benchmark_{}_max_bin.csv".format(max_bin))
# The 'time' column is divided by 1000 for display.
result_df['time'] = result_df['time'] / 1000.
# Rows: library; columns: tree depth; cells: the time value for that pair.
result_df.pivot_table(index="boosting", columns="depth")

Unnamed: 0_level_0,time,time,time,time,time
depth,2,4,6,8,10
boosting,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
catboost,2.145,2.079,2.515,7.19,95.03
lightgbm,0.502,4.241,41.21,236.708,1188.494
xgboost,0.495,1.996,12.864,74.94,298.284
