#### In this tutorial we will implement distillation of complex model's knowledge to simpler models. A complex model called teacher is TabilarAutoMl object. Simpler models called students are BoostCB and BoostLGBM objects.

In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import NumericRole
from lightautoml.tasks import Task
from lightautoml.addons.distillation import Distiller
from lightautoml.utils.profiler import Profiler
from lightautoml.ml_algo.boost_cb import BoostCB
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.automl.base import MLPipeline
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.automl.base import AutoML

from catboost import CatBoostClassifier, CatBoostRegressor

### 1. Some Setups

In [4]:
RANDOM_STATE = 42
TEST_SIZE = 2000
TARGET_NAME = 'class'
TIMEOUT = 600
DATASET_PATH = 'example_data/dataset_12_mfeat-factors.csv'

np.random.seed(RANDOM_STATE)

### 2. Data loading and preparation

In [3]:
data = read_csv(DATASET_PATH)
# data[['white_piece0_strength', 'white_piece0_file', 'white_piece0_rank',
#        'black_piece0_strength', 'black_piece0_file', 'black_piece0_rank']].astype(float, copy=False)
# data[TARGET_NAME] = LabelEncoder().fit_transform(data[TARGET_NAME])
train, test = train_test_split(data, test_size=0.25, random_state=RANDOM_STATE)

### 3. AutoML and distiller creation

In [9]:
# roles = {'target': TARGET_NAME,
#          NumericRole(discretization=False): ['white_piece0_strength', 'white_piece0_file', 'white_piece0_rank',
#        'black_piece0_strength', 'black_piece0_file', 'black_piece0_rank']}
roles = {'target': TARGET_NAME}

task = Task('multiclass')

automl = TabularUtilizedAutoML(task=task, timeout=TIMEOUT, general_params={'verbose': 0})
distiller = Distiller(automl)

### 4. Distiller fitting and evaluation

In [10]:
distiller.fit(train, roles=roles)
test_pred = distiller.predict(test)
print('Teacher TEST accuracy: {}'.format(accuracy_score(test[roles['target']].values, test_pred.data.argmax(axis=1))))

:	learn: 0.0607304	test: 0.1768808	best: 0.1768808 (900)	total: 12s	remaining: 28.1s
1000:	learn: 0.0552491	test: 0.1726784	best: 0.1726676 (998)	total: 13.4s	remaining: 26.7s
1100:	learn: 0.0498356	test: 0.1676876	best: 0.1676876 (1100)	total: 14.7s	remaining: 25.3s
1200:	learn: 0.0456693	test: 0.1643305	best: 0.1643247 (1198)	total: 16s	remaining: 24s
1300:	learn: 0.0415138	test: 0.1601614	best: 0.1601614 (1300)	total: 17.4s	remaining: 22.7s
1400:	learn: 0.0379228	test: 0.1575179	best: 0.1575179 (1400)	total: 18.8s	remaining: 21.5s
1500:	learn: 0.0343408	test: 0.1536050	best: 0.1536050 (1500)	total: 20.2s	remaining: 20.1s
1600:	learn: 0.0315711	test: 0.1506950	best: 0.1506950 (1600)	total: 21.5s	remaining: 18.8s
1700:	learn: 0.0290776	test: 0.1484531	best: 0.1484511 (1699)	total: 22.8s	remaining: 17.4s
1800:	learn: 0.0271410	test: 0.1470566	best: 0.1470566 (1800)	total: 24.1s	remaining: 16.1s
1900:	learn: 0.0254540	test: 0.1453629	best: 0.1453621 (1899)	total: 25.5s	remaining: 14.7s


In [11]:
accuracy_score(test[roles['target']], test_pred.data.argmax(axis=1))

0.092

### 5. Evaluation of the students on true labels

In [62]:
students = list()
for algo in [BoostCB, BoostLGBM]:
    # TODO: implement students consistent with lightautoml
    reader = PandasToPandasReader(Task('multiclass'), samples=None, max_nan_rate=1, max_constant_rate=1,
                                    advanced_roles=True, drop_score_co=-1, n_jobs=1)
    pipeline_lvl1 = MLPipeline(ml_algos=[algo(default_params={'verbose': 0})],
                                pre_selection=None,
                                features_pipeline=LGBSimpleFeatures(),
                                post_selection=None)
    students.append(AutoML(reader, [[pipeline_lvl1]], skip_conn=False, verbose=0))

In [63]:
preds = dict()
for estimator in students:
    estimator.fit_predict(train, roles={'target': TARGET_NAME})
    preds[estimator.levels[0][0].ml_algos[0].name] = estimator.predict(test)



In [64]:
for key, item in preds.items():
    print(key, accuracy_score(test[roles['target']].values, item.data.argmax(axis=1)))

Lvl_0_Pipe_0_Mod_0_CatBoost 0.8625613565372602
Lvl_0_Pipe_0_Mod_0_LightGBM 0.8548862115127175


### Catboost

In [193]:
cb_model = CatBoostClassifier(loss_function='MultiClass')

In [82]:
cb_model.fit(cb_X_soft_train, cb_y_train, eval_set=(cb_X_soft_val, cb_y_val), early_stopping_rounds=100, verbose=200)

Learning rate set to 0.115175
0:	learn: 1.0084783	test: 1.0078625	best: 1.0078625 (0)	total: 10.7ms	remaining: 10.6s
200:	learn: 0.2879457	test: 0.3024496	best: 0.3024496 (200)	total: 1.41s	remaining: 5.59s
400:	learn: 0.2404575	test: 0.2727211	best: 0.2727211 (400)	total: 3.19s	remaining: 4.76s
600:	learn: 0.2163354	test: 0.2632248	best: 0.2632248 (600)	total: 4.89s	remaining: 3.25s
800:	learn: 0.2007503	test: 0.2610702	best: 0.2609398 (728)	total: 5.98s	remaining: 1.49s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2609397563
bestIteration = 728

Shrink model to first 729 iterations.


<catboost.core.CatBoostClassifier at 0x18a2df760>

In [194]:
cb_model.fit(train.drop(TARGET_NAME, axis=1), train[TARGET_NAME], verbose=200)

Learning rate set to 0.094662
0:	learn: 1.0241425	total: 10.9ms	remaining: 10.9s
200:	learn: 0.3011403	total: 1.38s	remaining: 5.49s
400:	learn: 0.2532611	total: 2.76s	remaining: 4.13s
600:	learn: 0.2309171	total: 4.04s	remaining: 2.68s
800:	learn: 0.2162141	total: 5.74s	remaining: 1.43s
999:	learn: 0.2060449	total: 7.45s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x18aee6760>

In [195]:
accuracy_score(cb_y_test, cb_model.predict(cb_test))

0.8556001784917447

In [180]:
import math


class CustomMetric:
    def __init__(self, metric, is_higher_better, needs_pred_proba):
        self.metric = metric
        self.is_higher_better = is_higher_better
        self.needs_pred_proba = needs_pred_proba

    @staticmethod
    def get_final_error(error, weight):
        return error

    def is_max_optimal(self):
        return self.is_higher_better

    def evaluate(self, approxes, target, weight):
        raise NotImplementedError


class SoftclassCustomMetric(CustomMetric):
    from catboost import MultiRegressionCustomMetric
    def __init__(self, metric, is_higher_better, needs_pred_proba):  # metric is ignored
        super().__init__(metric, is_higher_better, needs_pred_proba)
        self.softlogloss = self.SoftLogLossMetric()  # the metric object to pass to CatBoostRegressor

    def evaluate(self, approxes, target, weight):
        return self.softlogloss.evaluate(approxes, target, weight)

    class SoftLogLossMetric(MultiRegressionCustomMetric):
        def get_final_error(self, error, weight):
            return error

        def is_max_optimal(self):
            return True

        def evaluate(self, approxes, target, weight):
            assert len(target) == len(approxes)
            assert len(target[0]) == len(approxes[0])
            weight_sum = len(target)
            approxes = np.array(approxes)
            approxes = np.exp(approxes)
            approxes = np.multiply(approxes, 1/np.sum(approxes, axis=1)[:, np.newaxis])
            error_sum = soft_log_loss(np.array(target), np.array(approxes))
            return error_sum, weight_sum


class SoftclassObjective(object):
    from catboost import MultiRegressionCustomObjective
    def __init__(self):
        self.softlogloss = self.SoftLogLossObjective()  # the objective object to pass to CatBoostRegressor

    class SoftLogLossObjective(MultiRegressionCustomObjective):
        def calc_ders_multi(self, approxes, targets, weight):
            exp_approx = [math.exp(val) for val in approxes]
            exp_sum = sum(exp_approx)
            exp_approx = [val / exp_sum for val in exp_approx]
            grad = [(targets[j] - exp_approx[j])*weight for j in range(len(targets))]
            hess = [[(exp_approx[j] * exp_approx[j2] - (j==j2)*exp_approx[j]) * weight
                    for j in range(len(targets))] for j2 in range(len(targets))]
            return (grad, hess)

In [181]:
def spunge_augment(X,
                   num_augmented_samples=10000,
                   frac_perturb=0.1,
                   continuous_feature_noise=0.1,
                   **kwargs):
    num_feature_perturb = max(1, int(frac_perturb*len(X.columns)))
    X_aug = pd.concat([X.iloc[[0]].copy()]*num_augmented_samples)
    X_aug.reset_index(drop=True, inplace=True)
    continuous_types = ['float', 'int']
    continuous_featnames = X.select_dtypes(continuous_types).columns

    for i in range(num_augmented_samples): # hot-deck sample some features per datapoint
        og_ind = i % len(X)
        augdata_i = X.iloc[og_ind].copy()
        num_feature_perturb_i = np.random.choice(range(1,num_feature_perturb+1))  # randomly sample number of features to perturb
        cols_toperturb = np.random.choice(list(X.columns), size=num_feature_perturb_i, replace=False)
        for feature in cols_toperturb:
            feature_data = X[feature]
            augdata_i[feature] = feature_data.sample(n=1).values[0]
        X_aug.iloc[i] = augdata_i

    for feature in X.columns:
        if feature in continuous_featnames:
            feature_data = X[feature]
            aug_data = X_aug[feature]
            noise = np.random.normal(scale=np.nanstd(feature_data)*continuous_feature_noise, size=num_augmented_samples)
            mask = np.random.binomial(n=1, p=frac_perturb, size=num_augmented_samples)
            aug_data = aug_data + noise*mask
            X_aug[feature] = pd.Series(aug_data, index=X_aug.index)

    return pd.concat((X, X_aug))

In [196]:
%time train_aug = spunge_augment(train.drop(TARGET_NAME, axis=1), num_augmented_samples=30_000)

CPU times: user 27.5 s, sys: 216 ms, total: 27.7 s
Wall time: 27.9 s


In [210]:
cb_soft_model = CatBoostClassifier(loss_function=SoftclassObjective.SoftLogLossObjective(),
                                   eval_metric=SoftclassCustomMetric.SoftLogLossMetric())

In [79]:
cb_X_soft_train, cb_X_soft_val, cb_y_soft_train, cb_y_soft_val, cb_y_train, cb_y_val = train_test_split(train_aug,
                                                                                                        distiller.predict(train_aug).data,
                                                                                                        train[TARGET_NAME],
                                                                                                        random_state=RANDOM_STATE)

In [199]:
train_aug.sample(10)

Unnamed: 0,white_piece0_strength,white_piece0_file,white_piece0_rank,black_piece0_strength,black_piece0_file,black_piece0_rank
9823,0.0,3.0,2.0,6.0,5.0,8.0
2241,0.0,1.0,5.0,0.0,3.0,3.0
2895,0.0,2.0,6.0,0.0,0.0,1.0
497,7.0,1.0,1.0,6.0,4.0,4.0
24302,6.0,0.0,2.0,4.0,3.0,5.0
23272,6.0,4.0,2.0,0.0,5.156612,1.0
19097,4.0,5.0,1.0,0.0,1.0,6.0
17742,6.0,4.0,1.0,0.0,5.0,1.0
7158,6.0,1.0,0.0,4.11823,5.0,6.0
6786,0.0,0.0,4.0,7.0,3.0,1.0


In [213]:
y_aug = distiller.predict(train_aug).data

IndexError: index 16 is out of bounds for dimension 0 with size 16

In [211]:
cb_soft_model.fit(X=train_aug.astype(int), y=y_aug, verbose=200)

0:	learn: -11.0451632	total: 1.7s	remaining: 28m 21s
200:	learn: -10.5452147	total: 6m 36s	remaining: 26m 17s
400:	learn: -10.6970444	total: 13m 12s	remaining: 19m 43s
600:	learn: -10.9334860	total: 19m 49s	remaining: 13m 9s
800:	learn: -11.1804037	total: 26m 25s	remaining: 6m 33s
999:	learn: -11.3992186	total: 33m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x181dccf40>

In [111]:
accuracy_score(cb_y_test, cb_soft_model.predict_proba(cb_test).argmax(axis=1))

0.8646140116019634

In [212]:
accuracy_score(cb_y_test, cb_soft_model.predict_proba(cb_test).argmax(axis=1))

0.8706827309236947

In [7]:
distiller.teacher.outer_pipes[0].ml_algos[0].models[0][0].reader.roles

AttributeError: 'TabularUtilizedAutoML' object has no attribute 'outer_pipes'

In [171]:
dir(distiller.teacher.outer_pipes[0].ml_algos[0].models[0][0].reader)

['_$_f_record_history_wrapper_-or-cls-DECO',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_create_target',
 '_dropped_features',
 '_get_default_role_from_str',
 '_guess_role',
 '_is_ok_feature',
 '_roles',
 '_used_array_attrs',
 '_used_features',
 'advanced_roles',
 'advanced_roles_guess',
 'advanced_roles_params',
 'class_mapping',
 'cv',
 'dropped_features',
 'fit_read',
 'from_reader',
 'get_own_record_history_wrapper',
 'get_record_history_wrapper',
 'max_constant_rate',
 'max_nan_rate',
 'n_jobs',
 'params',
 'random_state',
 'read',
 'record_history_omit',
 'record_history_only',
 'roles',
 'roles_params',
 'samples',
 'task',
 'upd_used_features',
 'used_arr

In [151]:
distiller.teacher..reader.used_array_attrs

AttributeError: type object 'TabularAutoML' has no attribute 'reader'

In [148]:
accuracy_score(cb_y_test, cb_soft_model.predict_proba(cb_test).argmax(axis=1))

0.8655957161981258

In [37]:
distiller.distill(train, labels=train[TARGET_NAME])

metrics = distiller.eval_metrics(test, metrics=[roc_auc_score, accuracy_score])
metrics

AttributeError: 'TabularUtilizedAutoML' object has no attribute 'reader'

### 6. Teacher knowledge distillation

In [8]:
automl = TabularAutoML(task=task, timeout=30, verbose=0)
distiller = Distiller(automl)
distiller.fit(train, roles=roles)
best_model = distiller.distill(train)
print('Best model after distillation: {}'.format(best_model.levels[0][0].ml_algos[0].name))

Time limit exceeded after calculating fold 2
Time limit exceeded after calculating fold 1
Time limit exceeded after calculating fold 3
Time limit exceeded after calculating fold 1
Best model after distillation: Lvl_0_Pipe_0_Mod_0_LightGBM


### 7. Evaluation of the students on labels derived from teacher

In [9]:
metrics = distiller.eval_metrics(test, metrics=[roc_auc_score,accuracy_score])
metrics

Unnamed: 0,roc_auc_score,accuracy_score
Lvl_0_Pipe_0_Mod_0_CatBoost,0.742482,0.9275
Lvl_0_Pipe_0_Mod_0_LightGBM,0.742203,0.9275


### 8. Profiling report creation

In [10]:
p.profile('profiling_report.html')