# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.automl.blend import WeightedBlender
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.linear_sklearn import LinearLBFGS
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures, LGBAdvancedPipeline
from lightautoml.pipelines.features.linear_pipeline import LinearFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ModelBasedImportanceEstimator, ImportanceCutoffSelector
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.utils.timer import PipelineTimer

# Step 0.2. Parameters 

In [2]:
# Thread count for the LightGBM and linear models.
N_THREADS = 8
# Fold count for AutoML.
N_FOLDS = 5
# Fixed random state shared by the seeded steps of this notebook.
RANDOM_STATE = 42
# Share of the data held out as the test part for the metric check.
TEST_SIZE = 0.2
# Time in seconds for the AutoML run.
TIMEOUT = 600
# Name of the target column.
TARGET_NAME = 'TARGET'

# Step 0.3. Fix torch number of threads and numpy seed 

In [3]:
# Limit the number of threads torch uses to the configured value.
torch.set_num_threads(N_THREADS)
# Seed numpy's global RNG so the random draws made later in the notebook repeat.
np.random.seed(RANDOM_STATE)

# Step 0.4. Change profiling decorators settings 

By default, profiling decorators are turned off to save time and memory. If you want to see the profiling report after using LAMA, you need to turn the decorators on using the command below:

In [4]:
# Profiling decorators are off by default; switch them on so that the
# profiling report at the end of the notebook can be built.
p = Profiler()
p.change_deco_settings(dict(enabled=True))

ALL_FUNCS len = 381
	 Func with no decorator - <function numpy_and_pandas_concat at 0x7fc085cac950>


# Step 0.5. Example data load 

In [5]:
%%time

# Read the sampled example dataset and preview its first rows.
data = pd.read_csv(
    './example_data/test_data_files/sampled_app_train.csv'
)
data.head(n=5)

CPU times: user 76.7 ms, sys: 7.77 ms, total: 84.4 ms
Wall time: 84.1 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# Step 0.6. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [6]:
%%time

# Turn the relative day offsets into calendar-date strings, counted from a
# fixed reference date.
# pd.to_timedelta(..., unit='D') replaces astype('timedelta64[D]'): the
# day-resolution astype cast is not reliably supported across pandas versions,
# while to_timedelta is the documented conversion and maps missing values to NaT.
reference_date = np.datetime64('2018-01-01')

data['BIRTH_DATE'] = (reference_date + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')).astype(str)

# Positive DAYS_EMPLOYED values are capped at 0, so the resulting date never
# falls after the reference date.
employed_offset_days = np.clip(data['DAYS_EMPLOYED'], None, 0)
data['EMP_DATE'] = (reference_date + pd.to_timedelta(employed_offset_days, unit='D')).astype(str)

# Deliberately uninformative columns: one constant, one entirely NaN.
data['constant'] = 1
data['allnan'] = np.nan

# The raw offset columns are replaced by the date strings built above.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

[2020-10-18 16:00:26,242] (INFO): Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2020-10-18 16:00:26,243] (INFO): Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2020-10-18 16:00:26,243] (INFO): NumExpr defaulting to 8 threads.


CPU times: user 96.9 ms, sys: 11.7 ms, total: 109 ms
Wall time: 106 ms


# Step 0.7. Create fake multiclass target 

In [7]:
# Draw one uniform number per row; rows whose draw exceeds 0.5 are relabelled
# as class 2, the rest keep their original target value.
relabel_mask = np.random.rand(len(data)) > .5
data[TARGET_NAME] = np.where(relabel_mask, 2, data[TARGET_NAME].values)
data[TARGET_NAME].value_counts()

2    4924
0    4677
1     399
Name: TARGET, dtype: int64

# Step 0.8. (Optional) Data splitting for train-test 

The block below can be omitted if you are only going to train the model or if you already have separate train and test files:

In [8]:
%%time

# Stratified split on the target column; the size and seed come from the
# parameters defined at the top of the notebook.
train_data, test_data = train_test_split(
    data,
    stratify=data[TARGET_NAME],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
logging.info(
    'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
        train_data.shape, test_data.shape
    )
)

[2020-10-18 16:00:26,335] (INFO): Data splitted. Parts sizes: train_data = (8000, 124), test_data = (2000, 124)


CPU times: user 12.3 ms, sys: 355 µs, total: 12.7 ms
Wall time: 12.3 ms


In [9]:
# Preview the first rows of the training part.
train_data.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan
3273,293399,2,Revolving loans,F,Y,N,0,292500.0,765000.0,38250.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1980-08-03,2013-06-26,1,
812,426509,1,Cash loans,F,N,Y,1,225000.0,349258.5,17959.5,...,0.0,0.0,0.0,0.0,0.0,5.0,1984-05-06,2013-02-17,1,
9630,403114,0,Cash loans,M,Y,Y,2,135000.0,922500.0,29889.0,...,0.0,0.0,0.0,4.0,0.0,4.0,1984-01-23,2009-06-01,1,
7757,366682,2,Cash loans,F,N,N,0,254475.0,1485000.0,39303.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1971-11-28,2016-01-10,1,
9973,289050,0,Cash loans,F,N,Y,0,112500.0,260640.0,25906.5,...,,,,,,,1966-11-27,2002-07-12,1,


# ========= AutoML creation =========

![AutoML pipeline for this task](imgs/tutorial_2_pipeline.png)


## Step 1. Create Timer for pipeline

Here we use a strict timer for the AutoML pipeline, which helps it stay within the time limit:

In [10]:
%%time

# Pipeline-wide timer. The budget comes from the TIMEOUT constant defined in
# Step 0.2 rather than a repeated literal, so changing TIMEOUT takes effect here.
# mode=2 is the setting this tutorial refers to as the strict timer.
timer = PipelineTimer(TIMEOUT, mode=2)

CPU times: user 56 µs, sys: 101 µs, total: 157 µs
Wall time: 160 µs


## Step 2. Create feature selector

In [11]:
%%time

# Task timer for the selector's model, obtained from the pipeline timer.
timer_gbm = timer.get_task_timer('gbm')

# Parts of the selector: a features pipeline, a LightGBM model and an
# importance estimator.
feat_sel_0 = LGBSimpleFeatures()
mod_sel_0 = BoostLGBM(timer=timer_gbm)
imp_sel_0 = ModelBasedImportanceEstimator()

# Importance-cutoff selector assembled from the three parts above.
selector_0 = ImportanceCutoffSelector(
    feat_sel_0,
    mod_sel_0,
    imp_sel_0,
    cutoff=0,
)

CPU times: user 348 µs, sys: 631 µs, total: 979 µs
Wall time: 938 µs



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



## Step 3.1. Create GBMs pipeline for AutoML 

Our GBMs ML pipeline:
- Advanced features for gradient boosting built on selected features (using step 2) 
- 2 different models:
    * LightGBM with params tuning (using OptunaTuner)
    * LightGBM with heuristic params


In [12]:
%%time 

# Advanced LightGBM features; the importance estimator from the selector step
# is passed in as feats_imp.
feats_gbm_0 = LGBAdvancedPipeline(
    top_intersections=4,
    output_categories=True,
    feats_imp=imp_sel_0,
)

# One task timer and one LightGBM model per pipeline entry.
timer_gbm_0 = timer.get_task_timer('gbm')
timer_gbm_1 = timer.get_task_timer('gbm')
gbm_0 = BoostLGBM(timer=timer_gbm_0)
gbm_1 = BoostLGBM(timer=timer_gbm_1)

# Optuna tuner paired with the first model only.
tuner_0 = OptunaTuner(n_trials=20, timeout=30, fit_on_holdout=True)

# GBM pipeline: the (model, tuner) pair plus the untuned second model,
# with the selector applied as pre-selection.
gbm_lvl0 = MLPipeline(
    [(gbm_0, tuner_0), gbm_1],
    pre_selection=selector_0,
    features_pipeline=feats_gbm_0,
    post_selection=None,
)

CPU times: user 243 µs, sys: 438 µs, total: 681 µs
Wall time: 683 µs


## Step 3.2. Create linear pipeline for AutoML 

Our linear pipeline:
- Uses features designed specifically for linear models
- LinearLBFGS as a model
- Without feature selection here

In [13]:
%%time

# Features pipeline for the linear model.
feats_reg_0 = LinearFeatures(
    output_categories=True,
    sparse_ohe='auto',
)

# Linear model with its own task timer from the pipeline timer.
timer_reg = timer.get_task_timer('reg')
reg_0 = LinearLBFGS(timer=timer_reg)

# Linear pipeline: a single model, no pre- or post-selection.
reg_lvl0 = MLPipeline(
    [reg_0],
    pre_selection=None,
    features_pipeline=feats_reg_0,
    post_selection=None,
)

CPU times: user 618 µs, sys: 0 ns, total: 618 µs
Wall time: 620 µs


## Step 4. Create multiclass task and reader

In [14]:
%%time 

# Multiclass task scored with the cross-entropy metric.
task = Task('multiclass', metric='crossentropy')

# Reader that builds the dataset for this task from a pandas DataFrame.
reader = PandasToPandasReader(
    task=task,
    samples=None,
    max_nan_rate=1,
    max_constant_rate=1,
    advanced_roles=True,
    drop_score_co=-1,
    n_jobs=4,
)

CPU times: user 3.2 ms, sys: 0 ns, total: 3.2 ms
Wall time: 3.03 ms


## Step 5. Create blender for 2nd level 

To combine predictions from different models into one vector we use WeightedBlender:

In [15]:
%%time

# Blender used to combine the predictions of the different models into one
# vector; it is handed to AutoML when the pipeline is created in Step 6.
blender = WeightedBlender()

CPU times: user 29 µs, sys: 51 µs, total: 80 µs
Wall time: 82.5 µs


## Step 6. Create AutoML pipeline

In [16]:
%%time

# Assemble AutoML: a single level holding the GBM and linear pipelines,
# sharing the pipeline timer, with the blender created above.
automl = AutoML(
    reader=reader,
    levels=[[gbm_lvl0, reg_lvl0]],
    timer=timer,
    blender=blender,
    skip_conn=False,
)

CPU times: user 376 µs, sys: 0 ns, total: 376 µs
Wall time: 377 µs


## Step 7. Train AutoML on loaded data 

In the cell below we train AutoML with the target column `TARGET` to obtain a fitted model and OOF predictions:

In [17]:
%%time

# Fit AutoML on the training part and keep the out-of-fold predictions.
oof_pred = automl.fit_predict(
    train_data,
    roles={'target': TARGET_NAME},
)
logging.info(
    'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Feats was rejected during automatic roles guess: []
Train process start. Time left 595.9772322177887 secs
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.833296
Early stopping, best iteration is:
[32]	valid's multi_logloss: 0.821232
Time history [3.936596393585205]. Time left 129.95871782302856



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer

[32m[I 2020-10-18 16:00:35,784][0m A new study created in memory with name: no-name-45b0b7a1-8fa3-4d74-8e6e-5180ae986078[0m


Optuna may run 157.22964708010355 secs
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.82247


[32m[I 2020-10-18 16:00:40,362][0m Trial 0 finished with value: -0.8164646521955728 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: -0.8164646521955728.[0m


Early stopping, best iteration is:
[55]	valid's multi_logloss: 0.816465
Time history [4.570739030838013]. Time left 8999999995.427128
Training until validation scores don't improve for 100 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's multi_logloss: 0.821377


[32m[I 2020-10-18 16:00:44,741][0m Trial 1 finished with value: -0.8157873297296464 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: -0.8157873297296464.[0m


Early stopping, best iteration is:
[52]	valid's multi_logloss: 0.815787
Time history [4.366024017333984]. Time left 8999999995.631552
Training until validation scores don't improve for 100 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's multi_logloss: 0.821295


[32m[I 2020-10-18 16:00:49,692][0m Trial 2 finished with value: -0.8177480276487767 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: -0.8157873297296464.[0m


Early stopping, best iteration is:
[58]	valid's multi_logloss: 0.817748
Time history [4.939683198928833]. Time left 8999999995.058239
Training until validation scores don't improve for 100 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's multi_logloss: 0.827363


[32m[I 2020-10-18 16:00:54,044][0m Trial 3 finished with value: -0.8189346228726209 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: -0.8157873297296464.[0m


Early stopping, best iteration is:
[34]	valid's multi_logloss: 0.818935
Time history [4.341119766235352]. Time left 8999999995.656822
Training until validation scores don't improve for 100 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's multi_logloss: 0.828237


[32m[I 2020-10-18 16:00:57,443][0m Trial 4 finished with value: -0.8214827415719629 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 1 with value: -0.8157873297296464.[0m


Early stopping, best iteration is:
[35]	valid's multi_logloss: 0.821483
Time history [3.3893532752990723]. Time left 8999999996.608639
Training until validation scores don't improve for 100 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's multi_logloss: 0.82015


[32m[I 2020-10-18 16:01:01,826][0m Trial 5 finished with value: -0.8166165582090616 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 1 with value: -0.8157873297296464.[0m


Early stopping, best iteration is:
[52]	valid's multi_logloss: 0.816617
Time history [4.373297452926636]. Time left 8999999995.624895
Training until validation scores don't improve for 100 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's multi_logloss: 0.826314


[32m[I 2020-10-18 16:01:07,010][0m Trial 6 finished with value: -0.8201091270335019 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 1 with value: -0.8157873297296464.[0m


Early stopping, best iteration is:
[53]	valid's multi_logloss: 0.820109
Time history [5.175848007202148]. Time left 8999999994.822277
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.821377
Early stopping, best iteration is:
[52]	valid's multi_logloss: 0.815787
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.836264
Early stopping, best iteration is:
[47]	valid's multi_logloss: 0.825268
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.833029
Early stopping, best iteration is:
[28]	valid's multi_logloss: 0.821758
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.841979
Early stopping, best iteration is:
[33]	valid's multi_logloss: 0.829801
Training until validation scores don't improve for 100 rounds
[100]	valid's multi_logloss: 0.837374
Early stopping, best iteration is:
[20]	valid's multi_logloss: 0.827673
Time h

[2020-10-18 16:01:54,210] (INFO): oof_pred:
array([[0.5181386 , 0.02629192, 0.45556957],
       [0.42177942, 0.05007775, 0.52814287],
       [0.4119503 , 0.03302733, 0.5550224 ],
       ...,
       [0.4941727 , 0.02103278, 0.48479462],
       [0.49897087, 0.02025075, 0.4807784 ],
       [0.3574488 , 0.02743687, 0.61511433]], dtype=float32)
Shape = (8000, 3)


Blending, iter 2: score = -0.8218538210578262, weights = [0.4314722  0.20969875 0.35882908]
Blending, iter 3: score = -0.8218538210578262, weights = [0.4314722  0.20969875 0.35882908]
No score update. Terminated
CPU times: user 5min 41s, sys: 4.55 s, total: 5min 46s
Wall time: 1min 27s


## Step 8. Predict to test data and check scores

In [18]:
%%time

# Predict on the held-out part with the fitted AutoML.
test_pred = automl.predict(test_data)
logging.debug(
    'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
)

logging.info('Check scores...')

# Log loss of the out-of-fold predictions against the training target.
oof_score = log_loss(train_data[TARGET_NAME].values, oof_pred.data)
logging.info('OOF score: {}'.format(oof_score))

# Log loss of the test predictions against the test target.
test_score = log_loss(test_data[TARGET_NAME].values, test_pred.data)
logging.info('TEST score: {}'.format(test_score))

[2020-10-18 16:01:54,604] (INFO): Check scores...
[2020-10-18 16:01:54,607] (INFO): OOF score: 0.8218538206294179
[2020-10-18 16:01:54,608] (INFO): TEST score: 0.8241283266842365


CPU times: user 989 ms, sys: 7.46 ms, total: 996 ms
Wall time: 393 ms


## Step 9. Check AUCs for each class in train and test data 

In [19]:
# One-vs-rest ROC AUC per class, for the OOF (train) and test predictions.
for dat, df, name in zip([oof_pred, test_pred], [train_data, test_data], ['train', 'test']):
    logging.debug('Check aucs {0}...'.format(name))
    # Take the class count from the width of the prediction matrix instead of
    # hard-coding 3, so the check follows the number of predicted classes.
    n_classes = dat.data.shape[1]
    for cl in range(n_classes):
        # Column `cl` is scored against label `cl`, as before.
        # NOTE(review): this assumes prediction columns are ordered by class label — confirm.
        is_class = (df[TARGET_NAME].values == cl).astype(np.float32)
        sc = roc_auc_score(is_class, dat.data[:, cl])
        logging.info('Class {0} {1} auc score: {2}'.format(cl, name, sc))

[2020-10-18 16:01:54,617] (INFO): Class 0 train auc score: 0.543005350509457
[2020-10-18 16:01:54,620] (INFO): Class 1 train auc score: 0.7221756734751182
[2020-10-18 16:01:54,623] (INFO): Class 2 train auc score: 0.5337184979081697
[2020-10-18 16:01:54,625] (INFO): Class 0 test auc score: 0.5429253596444981
[2020-10-18 16:01:54,627] (INFO): Class 1 test auc score: 0.7166536458333335
[2020-10-18 16:01:54,629] (INFO): Class 2 test auc score: 0.5279147808256858


## Step 10. Profiling AutoML 

To build the report here, the decorators must have been turned on in Step 0.4. The report is interactive, and you can go as deep into the function call stack as you want:

In [20]:
%%time
# Build the profiling report and check that the file exists afterwards.
report_path = 'my_report_profile.html'
p.profile(report_path)
assert os.path.exists(report_path), 'Profile report failed to build'

	 Func with no stats - <function numpy_and_pandas_concat at 0x7fc085cac950>
FULL_STATS_DF shape = (8791, 6)
RUN_FNAME vc head:
CatMulticlass.forward [115]                 1
WeightedBlender._get_scorer [9]             1
PandasDataset._get_cols_idx [26]            1
TorchBasedLinearEstimator._loss_fn [379]    1
NumericRole.__init__ [225]                  1
Name: run_fname, dtype: int64
CONNECTED COMPONENTS cnt = 1
PATH LENS describe:
count    8792.000000
mean        7.802889
std         2.411472
min         0.000000
25%         6.000000
50%         8.000000
75%         9.000000
max        17.000000
dtype: float64
CPU times: user 950 ms, sys: 16.1 ms, total: 966 ms
Wall time: 958 ms


# Appendix. Profiling report screenshots 

After loading the HTML file with the profiling report, you will see the fully folded report (please wait for the green LOAD OK text, which indicates that loading has finished). If you click the triangle on the left, it unfolds and looks like this:

<img src="imgs/tutorial_3_initial_report.png" alt="Initial profiling report" style="width: 500px;"/>

If we go even deeper, we get a view like this:

<img src="imgs/tutorial_3_unfolded_report.png" alt="Profiling report after several unfoldings on different levels" style="width: 500px;"/>
