## Advanced options for multioutput handling

### Imports

In [1]:
import os
# Select the GPU this notebook runs on: order devices by PCI bus id and expose only device "0".
# NOTE(review): presumably these variables have to be set before the GPU-backed
# libraries below are imported, which is why they sit between the imports — confirm.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Make sure the directory the dataset is saved into exists (no error if it already does).
os.makedirs('../data', exist_ok=True)

import pandas as pd
from pandas import Series, DataFrame

from sklearn.model_selection import train_test_split

from py_boost import GradientBoosting, SketchBoost

# Strategies to deal with multiple outputs.
# Wildcard imports: the sketch classes used further down (e.g. RandomProjectionSketch)
# are not imported by name anywhere else, so they presumably come from these modules.
from py_boost.multioutput.sketching import *
from py_boost.multioutput.target_splitter import *

### Downloading data from OpenML

In [2]:
# Shell escape: download the dataset CSV from OpenML and save it as ../data/helena.csv.
!wget https://www.openml.org/data/get_csv/19335692/file1c556677f875.csv -O ../data/helena.csv

--2022-05-25 14:44:53--  https://www.openml.org/data/get_csv/19335692/file1c556677f875.csv
Resolving www.openml.org (www.openml.org)... 131.155.11.11
Connecting to www.openml.org (www.openml.org)|131.155.11.11|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://old.openml.org/data/get_csv/19335692/file1c556677f875.csv [following]
--2022-05-25 14:44:54--  https://old.openml.org/data/get_csv/19335692/file1c556677f875.csv
Resolving old.openml.org (old.openml.org)... 131.155.11.11
Connecting to old.openml.org (old.openml.org)|131.155.11.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘../data/helena.csv’

../data/helena.csv      [   <=>              ]  14.56M  29.6MB/s    in 0.5s    

2022-05-25 14:44:55 (29.6 MB/s) - ‘../data/helena.csv’ saved [15271704]



In [3]:
# Read the downloaded CSV into a DataFrame.
data = pd.read_csv('../data/helena.csv')

# Bare last expression so the notebook renders a preview of the frame.
data

Unnamed: 0,class,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27
0,41,0.005521,0.080556,0.110417,0.490822,0.586406,0.066414,0.092206,0.116352,0.379310,...,-0.342986,78.6894,17.237800,21.504200,14.43730,17.378000,9.61674,-0.609370,1.044830,1.481790
1,92,0.049398,0.147917,0.541667,0.542865,0.515608,0.105128,0.475550,0.049555,0.383460,...,2.639370,59.7879,5.393410,3.819610,11.49240,3.929470,5.91423,1.409210,4.749540,1.103820
2,24,0.548663,1.000000,1.000000,0.397029,0.627398,1.023440,1.004220,0.027381,0.451337,...,0.137427,58.1429,-3.365980,-0.037489,10.63470,2.660180,3.93377,-0.898220,2.137790,1.054470
3,29,0.023073,0.206250,0.238889,0.622998,0.764067,0.202599,0.177892,0.071232,0.531712,...,0.477009,55.4798,-1.051090,-4.755360,13.36710,2.852060,9.65162,0.224397,-0.220216,-0.273287
4,91,0.224427,0.433333,0.902083,0.814199,0.576879,0.344413,0.822975,0.026121,0.425875,...,0.521306,76.8475,-19.371700,32.270700,9.41442,4.343450,8.67710,-1.587580,1.117870,-0.545338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65191,88,0.007292,0.152083,0.061111,0.114431,0.406104,0.143170,0.053086,0.129365,0.215442,...,1.265300,53.2951,-1.416430,2.173900,13.66950,1.588520,2.02855,0.619052,0.622377,-0.363035
65192,77,0.411279,1.000000,0.430556,0.503805,0.207163,1.003740,0.412067,0.017673,0.044771,...,-2.842380,91.1178,-0.009207,-2.224830,1.30504,0.898489,1.80362,-2.726140,-0.184329,-0.476441
65193,71,0.999352,1.000000,1.000000,0.501360,0.501075,0.999384,0.999414,0.009636,0.000648,...,0.213472,84.4141,2.042450,13.849800,7.24428,1.443890,4.00495,-0.749115,1.025130,0.096257
65194,24,0.206175,0.383333,0.944444,0.749915,0.550936,0.292477,0.830921,0.033542,0.430515,...,0.879472,61.4110,17.354200,5.566660,16.22600,10.049400,6.04195,0.400956,0.375599,0.644575


In [4]:
# Count rows per label to see how many distinct classes there are and how balanced they are.
data['class'].value_counts()

78    4005
55    3063
40    2992
39    2623
38    2216
      ... 
56     121
75     121
32     119
34     116
10     111
Name: class, Length: 100, dtype: int64

Looks like we have 100 classes here ...

In [5]:
# Features: every column except the label, as a float32 array.
X = data.drop(columns='class').values.astype('float32')
# Targets: the integer class labels.
y = data['class'].values.astype('int32')

# Hold out 20% of the rows for evaluation. X and y are rebound to the
# training part, X_test / y_test receive the held-out part.
X, X_test, y, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

### Training a multiclass model

Training on a multiclass problem works the same way as for regression. By default py_boost builds multioutput trees to handle multioutput problems (a single tree outputs a vector of length 100 for a 100-class task).


In [6]:
%%time
# Baseline multiclass model without any sketching.
# The training log below reports ntrees as the max iteration count and es as
# the early stopping rounds; verbose=100 gives one log line per 100 iterations.
model = GradientBoosting(
    'crossentropy',
    ntrees=10000,
    lr=0.03,
    es=300,
    verbose=100,
    lambda_l2=1,
    subsample=1,
    colsample=1,
    min_data_in_leaf=10,
    max_bin=256,
    max_depth=6,
    use_hess=True,
)

# The held-out split is the evaluation set reported in the log.
# Kept as the last expression so the fitted model is echoed as the cell output.
model.fit(
    X, y,
    eval_sets=[{'X': X_test, 'y': y_test}],
)

[14:44:56] Stdout logging level is INFO.
[14:44:56] GDBT train starts. Max iter 10000, early stopping rounds 300
[14:44:56] Iter 0; Sample 0, Crossentropy = 4.28506738343478; 
[14:45:02] Iter 100; Sample 0, Crossentropy = 2.775789849192044; 
[14:45:08] Iter 200; Sample 0, Crossentropy = 2.6526719930327483; 
[14:45:15] Iter 300; Sample 0, Crossentropy = 2.6102726621840295; 
[14:45:22] Iter 400; Sample 0, Crossentropy = 2.589505190201448; 
[14:45:29] Iter 500; Sample 0, Crossentropy = 2.578001086482414; 
[14:45:36] Iter 600; Sample 0, Crossentropy = 2.571097266565488; 
[14:45:43] Iter 700; Sample 0, Crossentropy = 2.567045761144305; 
[14:45:50] Iter 800; Sample 0, Crossentropy = 2.564035213467976; 
[14:45:57] Iter 900; Sample 0, Crossentropy = 2.5623130097423057; 
[14:46:05] Iter 1000; Sample 0, Crossentropy = 2.560707451287711; 
[14:46:12] Iter 1100; Sample 0, Crossentropy = 2.5597066472006365; 
[14:46:19] Iter 1200; Sample 0, Crossentropy = 2.5593727068690812; 
[14:46:26] Iter 1300; Sa

<py_boost.gpu.boosting.GradientBoosting at 0x7f477977a750>

In [7]:
%%time
# Predict on the held-out split; the shape shows one row per test sample
# and one output column per class.
pred = model.predict(X_test)
pred.shape

CPU times: user 574 ms, sys: 39.4 ms, total: 613 ms
Wall time: 614 ms


(13040, 100)

### Sketching strategies to speedup training

The computational cost of training multioutput models increases drastically as the number of outputs grows. We implemented a few strategies to simplify the tree structure search via gradient matrix sketching:

* ***RandomSamplingSketch*** (recommended for use_hess=True)
* ***RandomProjectionSketch*** (recommended for use_hess=False)
* ***TopOutputsSketch***
* ***SVDSketch*** (needs RAPIDS (cuml) to be installed)

Let us check how it works.

In [8]:
%%time

# Sketching strategy used to simplify the tree structure search.
# Random projection is the one recommended for use_hess=False, which is how
# the model below is configured.
sketch = RandomProjectionSketch(1)
# Alternatives to swap in:
#   sketch = RandomSamplingSketch(10)
#   sketch = TopOutputsSketch(10)
#   sketch = SVDSketch(n_components=1)

# Same hyperparameters as the baseline, except use_hess=False and the sketch.
model = GradientBoosting(
    'crossentropy',
    ntrees=10000,
    lr=0.03,
    es=300,
    verbose=100,
    lambda_l2=1,
    subsample=1,
    colsample=1,
    min_data_in_leaf=10,
    max_bin=256,
    max_depth=6,
    use_hess=False,
    multioutput_sketch=sketch,
)

# Kept as the last expression so the fitted model is echoed as the cell output.
model.fit(
    X, y,
    eval_sets=[{'X': X_test, 'y': y_test}],
)

[14:46:47] Stdout logging level is INFO.
[14:46:47] GDBT train starts. Max iter 10000, early stopping rounds 300
[14:46:47] Iter 0; Sample 0, Crossentropy = 4.378003967540384; 
[14:46:48] Iter 100; Sample 0, Crossentropy = 2.884982323664597; 
[14:46:49] Iter 200; Sample 0, Crossentropy = 2.735828979785273; 
[14:46:50] Iter 300; Sample 0, Crossentropy = 2.675699308505761; 
[14:46:51] Iter 400; Sample 0, Crossentropy = 2.6414584678771402; 
[14:46:52] Iter 500; Sample 0, Crossentropy = 2.6170187307174535; 
[14:46:52] Iter 600; Sample 0, Crossentropy = 2.59952581837706; 
[14:46:53] Iter 700; Sample 0, Crossentropy = 2.5870286449443083; 
[14:46:54] Iter 800; Sample 0, Crossentropy = 2.577119629163826; 
[14:46:55] Iter 900; Sample 0, Crossentropy = 2.5703519150061833; 
[14:46:56] Iter 1000; Sample 0, Crossentropy = 2.564371190982428; 
[14:46:57] Iter 1100; Sample 0, Crossentropy = 2.5589528775269903; 
[14:46:58] Iter 1200; Sample 0, Crossentropy = 2.5552830004828304; 
[14:46:59] Iter 1300; S

<py_boost.gpu.boosting.GradientBoosting at 0x7f4779621ad0>

In [9]:
# Predict with the sketched model; the output shape is unchanged
# (one column per class), sketching only affects training.
pred = model.predict(X_test)
pred.shape

(13040, 100)

### SketchBoost

Alternatively, you can use the SketchBoost class with the built-in FilterSketch strategy. Just define the number of outputs to keep.

In [None]:
%%time
# SketchBoost takes the sketch configuration directly as keyword arguments.
# sketch_outputs is the number of outputs to keep; sketch_method='filter'
# selects the built-in filter strategy and sketch_params is forwarded to it.
model = SketchBoost(
    'crossentropy',
    ntrees=10000,
    lr=0.03,
    es=300,
    verbose=100,
    lambda_l2=1,
    subsample=1,
    colsample=1,
    min_data_in_leaf=10,
    max_bin=256,
    max_depth=6,
    sketch_method='filter',
    sketch_outputs=5,
    sketch_params={'ntrees': 1},
)

# Kept as the last expression so the fitted model is echoed as the cell output.
model.fit(
    X, y,
    eval_sets=[{'X': X_test, 'y': y_test}],
)

[14:47:13] Stdout logging level is INFO.
[14:47:13] GDBT train starts. Max iter 10000, early stopping rounds 300
[14:47:15] Iter 0; Sample 0, Crossentropy = 4.3961769645320175; 
[14:47:17] Iter 100; Sample 0, Crossentropy = 2.8582913990827743; 
[14:47:18] Iter 200; Sample 0, Crossentropy = 2.708252094790909; 
[14:47:20] Iter 300; Sample 0, Crossentropy = 2.6441740325347634; 
[14:47:21] Iter 400; Sample 0, Crossentropy = 2.6105068779184983; 
[14:47:22] Iter 500; Sample 0, Crossentropy = 2.588556652585321; 
[14:47:23] Iter 600; Sample 0, Crossentropy = 2.5745484313370657; 
[14:47:25] Iter 700; Sample 0, Crossentropy = 2.564501589591466; 
[14:47:26] Iter 800; Sample 0, Crossentropy = 2.5571412859938105; 
[14:47:27] Iter 900; Sample 0, Crossentropy = 2.551078857451761; 


In [None]:
# Predict on the held-out split with the SketchBoost model and show the output shape.
pred = model.predict(X_test)
pred.shape

#### We see a nice speed up and sometimes even a better accuracy!

#### These modifications allow us to train a model faster than CatBoost does (with a similar setup).

In [None]:
%%time
# CatBoost is only needed for this comparison, so it is imported in this cell:
# the rest of the notebook still runs when catboost is not installed.
from catboost import CatBoostClassifier

# GPU CatBoost model configured to mirror the py_boost runs above
# (same number of iterations, learning rate, depth, L2 term, bin count and
# minimum leaf size).
model = CatBoostClassifier(
    task_type='GPU',
    devices='0:0',
    grow_policy='Depthwise',
    bootstrap_type='Bernoulli',
    subsample=1.,
    border_count=256,
    iterations=10000,
    learning_rate=0.03,
    max_depth=6,
    l2_leaf_reg=1,
    min_data_in_leaf=10,
    score_function='L2',
    model_shrink_mode='Constant',
    # Early stopping, to match es=300 in the py_boost runs.
    # od_wait alone leaves the detector type at CatBoost's documented default
    # (IncToDec with od_pval=0, i.e. switched off), so the 'Iter' detector is
    # requested explicitly: stop 300 iterations after the best one.
    od_type='Iter',
    od_wait=300,
    verbose=100,
)

# The held-out split is the evaluation set, as in the py_boost runs.
model.fit(X, y, eval_set=(X_test, y_test))