# AutoML pipeline preset

## Initial phase

### Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

### General parameters setup

In [2]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 300 # Time in seconds for automl run
TARGET_NAME = 'TARGET' # Target column name

### Fix torch number of threads and numpy seed 

In [3]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

### Change profiling decorators settings 

By default, profiling decorators are turned off for speed and memory reduction. If you want to see profiling report after using LAMA, you need to turn on the decorators using command below: 

In [4]:
p = Profiler()
p.change_deco_settings({'enabled': True})

### Example data load 

In [5]:
%%time

data = pd.read_csv('../LightAutoML/example_data/test_data_files/sampled_app_train.csv')
data.head()

CPU times: user 114 ms, sys: 5.19 ms, total: 119 ms
Wall time: 118 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


### (Optional) Some user feature preparation 

Cell below shows some user feature preparations to create task more difficult (this block can be omitted if you don't want to change the initial data):

In [6]:
%%time

data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)

data['constant'] = 1
data['allnan'] = np.nan

data['report_dt'] = np.datetime64('2018-01-01')

data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

CPU times: user 91.3 ms, sys: 15.6 ms, total: 107 ms
Wall time: 106 ms


### (Optional) Data splitting for train-test 

Block below can be omitted if you are going to train model only or you have specific train and test files:

In [7]:
%%time

train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

[2020-12-03 18:18:52,594] (INFO): Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


CPU times: user 13.6 ms, sys: 889 µs, total: 14.4 ms
Wall time: 13.3 ms


In [8]:
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


## AutoML preset usage


### Create Task

In [9]:
%%time

task = Task('binary', )

CPU times: user 4.83 ms, sys: 467 µs, total: 5.3 ms
Wall time: 4.55 ms


### Setup columns roles

Roles setup here set target column and base date, which is used to calculate date differences:

In [10]:
%%time

roles = {'target': TARGET_NAME,
         DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
         }

CPU times: user 160 µs, sys: 89 µs, total: 249 µs
Wall time: 254 µs


### Create AutoML from preset

To create AutoML model here we use `TabularAutoML` preset, which looks like:

![TabularAutoML preset pipeline](imgs/tutorial_2_pipeline.png)

All params we set above can be send inside preset to change its configuration:

In [11]:
%%time 

automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}}, verbose=0)
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start automl preset with listed constraints:
- time: 300 seconds
- cpus: 4 cores
- memory: 16 gb


Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
Linear model: C = 1e-05 score = 0.6871187944684104
Linear model: C = 5e-05 score = 0.7049890148660134
Linear model: C = 0.0001 score = 0.7231478957187378
Linear model: C = 0.0005 score = 0.7541949313362306
Linear model: C = 0.001 score = 0.7611922745909308
Linear model: C = 0.005 score = 0.7633358457484056
Linear model: C = 0.01 score = 0.7604973512730461
Linear model: C = 0.05 score = 0.7482720464422599
Linear model: C = 1e-05 score = 0.7097380264945651
Linear model: C = 5e-05 score = 0.7196416440217392
Linear model: C = 0.0001 score = 0.726806640625
Linear model: C = 0.0005 score = 0.7401706861413043
Linear model: C = 0.001 score = 0.7423785665760869
Linear model: C = 0.005 score = 0.7420919667119567
Linear model: C = 0.01 score = 0.7412533967391305
Linear mode

[2020-12-03 18:19:14,771] (INFO): A new study created in memory with name: no-name-09dc6f05-5808-41ae-8f7f-c455fff7d50e


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.742162
[200]	valid's auc: 0.746348
[300]	valid's auc: 0.752356
[400]	valid's auc: 0.755585
[500]	valid's auc: 0.756317
[600]	valid's auc: 0.756884
[700]	valid's auc: 0.75582
[800]	valid's auc: 0.75528
Early stopping, best iteration is:
[626]	valid's auc: 0.757253
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:21,806] (INFO): Trial 0 finished with value: 0.7572525939349232 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.746203
[200]	valid's auc: 0.75234
[300]	valid's auc: 0.753404
[400]	valid's auc: 0.755376
[500]	valid's auc: 0.756627
[600]	valid's auc: 0.755125
[700]	valid's auc: 0.754617
Early stopping, best iteration is:
[529]	valid's auc: 0.757215
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:27,250] (INFO): Trial 1 finished with value: 0.7572151749870369 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.739944
[200]	valid's auc: 0.743691
Early stopping, best iteration is:
[16]	valid's auc: 0.747016
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:29,227] (INFO): Trial 2 finished with value: 0.7470158389060837 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.734742
[200]	valid's auc: 0.739169
Early stopping, best iteration is:
[16]	valid's auc: 0.741964
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:31,148] (INFO): Trial 3 finished with value: 0.7419642809414607 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.750041
[200]	valid's auc: 0.746668
[300]	valid's auc: 0.745145
Early stopping, best iteration is:
[100]	valid's auc: 0.750041
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:33,430] (INFO): Trial 4 finished with value: 0.7500414281208739 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.737666
[200]	valid's auc: 0.743386
Early stopping, best iteration is:
[29]	valid's auc: 0.747989
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:35,358] (INFO): Trial 5 finished with value: 0.7479887315511223 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.740505
[200]	valid's auc: 0.749534
[300]	valid's auc: 0.750175
[400]	valid's auc: 0.749908
Early stopping, best iteration is:
[270]	valid's auc: 0.752313
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:39,849] (INFO): Trial 6 finished with value: 0.7523132928139582 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.750041
[200]	valid's auc: 0.746668
[300]	valid's auc: 0.745145
Early stopping, best iteration is:
[100]	valid's auc: 0.750041
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:42,121] (INFO): Trial 7 finished with value: 0.7500414281208739 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 0 with value: 0.7572525939349232.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.754366
[200]	valid's auc: 0.765089
[300]	valid's auc: 0.767933
[400]	valid's auc: 0.768852
[500]	valid's auc: 0.7699
[600]	valid's auc: 0.767789
[700]	valid's auc: 0.766794
Early stopping, best iteration is:
[527]	valid's auc: 0.770579
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:19:47,418] (INFO): Trial 8 finished with value: 0.7705790849463573 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 53}. Best is trial 8 with value: 0.7705790849463573.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.745669
Early stopping, best iteration is:
[51]	valid's auc: 0.757568
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.719153
[200]	valid's auc: 0.724758
[300]	valid's auc: 0.726382
Early stopping, best iteration is:
[295]	valid's auc: 0.727438
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.717248
Early stopping, best iteration is:
[18]	valid's auc: 0.72208
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.727475
Early stopping, best iteration is:
[81]	valid's auc: 0.733467
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.745154
Early stopping, best iteration is:
[84]	valid's auc: 0.747516
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Blending: Optimization starts with equal weights and score 0.7503543502862332

[2020-12-03 18:19:54,797] (INFO): oof_pred:
array([[0.03303517],
       [0.0370398 ],
       [0.03658171],
       ...,
       [0.03143561],
       [0.19412126],
       [0.08112439]], dtype=float32)
Shape = (8000, 1)


CPU times: user 3min 55s, sys: 3.53 s, total: 3min 58s
Wall time: 1min 2s


### Predict to test data and check scores

In [12]:
%%time

test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

[2020-12-03 18:19:55,293] (INFO): Prediction for test data:
array([[0.06299944],
       [0.0771446 ],
       [0.0329024 ],
       ...,
       [0.0590249 ],
       [0.04663964],
       [0.20362368]], dtype=float32)
Shape = (2000, 1)
[2020-12-03 18:19:55,294] (INFO): Check scores...
[2020-12-03 18:19:55,298] (INFO): OOF score: 0.7504438546933155
[2020-12-03 18:19:55,300] (INFO): TEST score: 0.7319972826086957


CPU times: user 766 ms, sys: 11.6 ms, total: 777 ms
Wall time: 496 ms


### Profiling AutoML 

To build report here, we **must** turn on decorators on step 0.4. Report is interactive and you can go as deep into functions call stack as you want:

In [13]:
%%time
p.profile('my_report_profile.html')
assert os.path.exists('my_report_profile.html'), 'Profile report failed to build'

CPU times: user 1.56 s, sys: 32.2 ms, total: 1.59 s
Wall time: 1.59 s


### Create AutoML with time utilization 

Below we are going to create specific AutoML preset for TIMEOUT utilization (try to spend it as much as possible):

In [14]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       general_params = {'nested_cv': False, 'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}}, verbose=0)
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

CUR SETUP FOR RANDOM STATE: {'reader_params': {'random_state': 42}}
FOUND reader_params in kwargs, need to combine
MERGED VARIANT FOR reader_params = {'cv': 5, 'random_state': 42}
Start automl preset with listed constraints:
- time: 299.9985098838806 seconds
- cpus: 4 cores
- memory: 16 gb


Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...
Linear model: C = 1e-05 score = 0.6871294855963779
Linear model: C = 5e-05 score = 0.7049890148660134
Linear model: C = 0.0001 score = 0.7231478957187378
Linear model: C = 0.0005 score = 0.7541949313362306
Linear model: C = 0.001 score = 0.7611922745909308
Linear model: C = 0.005 score = 0.7633411913123894
Linear model: C = 0.01 score = 0.7601873085619898
Linear model: C = 0.05 score = 0.7482773920062435
Linear model: C = 1e-05 score = 0.7097380264945651
Linear model: C = 5e-05 score = 0.7196522588315217
Linear model: C = 0.0001 score = 0.726806640625
Linear model: C = 0.0

[2020-12-03 18:20:19,823] (INFO): A new study created in memory with name: no-name-7c7c2c67-f112-4163-8961-c1d5d0630166


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.755023
[200]	valid's auc: 0.757825
[300]	valid's auc: 0.755419
[400]	valid's auc: 0.755114
Early stopping, best iteration is:
[231]	valid's auc: 0.758846
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:23,530] (INFO): Trial 0 finished with value: 0.7588455720020741 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7588455720020741.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.754708
[200]	valid's auc: 0.760583
[300]	valid's auc: 0.760134
[400]	valid's auc: 0.759915
[500]	valid's auc: 0.761011
Early stopping, best iteration is:
[323]	valid's auc: 0.761593
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:27,493] (INFO): Trial 1 finished with value: 0.7615931918897103 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7615931918897103.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.75312
[200]	valid's auc: 0.758209
[300]	valid's auc: 0.759963
[400]	valid's auc: 0.758375
[500]	valid's auc: 0.760476
[600]	valid's auc: 0.759225
Early stopping, best iteration is:
[489]	valid's auc: 0.760898
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:33,780] (INFO): Trial 2 finished with value: 0.7608982685718257 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7615931918897103.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.762587
[200]	valid's auc: 0.760946
Early stopping, best iteration is:
[48]	valid's auc: 0.764587
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:35,976] (INFO): Trial 3 finished with value: 0.7645867077205981 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.756167
[200]	valid's auc: 0.761133
[300]	valid's auc: 0.758792
Early stopping, best iteration is:
[155]	valid's auc: 0.762539
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:38,736] (INFO): Trial 4 finished with value: 0.7625393567148303 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.746011
[200]	valid's auc: 0.74956
Early stopping, best iteration is:
[32]	valid's auc: 0.755922
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:40,758] (INFO): Trial 5 finished with value: 0.7559215485029749 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.752885
[200]	valid's auc: 0.757087
[300]	valid's auc: 0.759172
[400]	valid's auc: 0.757445
Early stopping, best iteration is:
[245]	valid's auc: 0.759947
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:44,999] (INFO): Trial 6 finished with value: 0.7599467581827221 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.756167
[200]	valid's auc: 0.761133
[300]	valid's auc: 0.758792
Early stopping, best iteration is:
[155]	valid's auc: 0.762539
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:47,715] (INFO): Trial 7 finished with value: 0.7625393567148303 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.756643
[200]	valid's auc: 0.759129
[300]	valid's auc: 0.76115
[400]	valid's auc: 0.761877
[500]	valid's auc: 0.761518
[600]	valid's auc: 0.75892
Early stopping, best iteration is:
[440]	valid's auc: 0.763443
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:20:52,523] (INFO): Trial 8 finished with value: 0.7634427570280802 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 53}. Best is trial 3 with value: 0.7645867077205981.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.744482
Early stopping, best iteration is:
[17]	valid's auc: 0.753527
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.723707
[200]	valid's auc: 0.726005
Early stopping, best iteration is:
[142]	valid's auc: 0.728675
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.720772
[200]	valid's auc: 0.720751
Early stopping, best iteration is:
[160]	valid's auc: 0.724386
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.738409
[200]	valid's auc: 0.737623
Early stopping, best iteration is:
[116]	valid's auc: 0.742708
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.742235
Early stopping, best iteration is:
[29]	valid's auc: 0.750547
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Blending: Optimization starts with equal weigh

[2020-12-03 18:21:22,736] (INFO): A new study created in memory with name: no-name-d6787524-da62-4767-b34d-8ed429e3d6c5


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.722891
[200]	valid's auc: 0.732075
[300]	valid's auc: 0.737672
[400]	valid's auc: 0.741788
[500]	valid's auc: 0.739949
[600]	valid's auc: 0.740163
Early stopping, best iteration is:
[409]	valid's auc: 0.742232
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:21:27,997] (INFO): Trial 0 finished with value: 0.7422315591406471 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7422315591406471.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.731524
[200]	valid's auc: 0.739447
[300]	valid's auc: 0.74561
[400]	valid's auc: 0.745086
[500]	valid's auc: 0.747315
[600]	valid's auc: 0.747449
[700]	valid's auc: 0.7453
Early stopping, best iteration is:
[559]	valid's auc: 0.749026
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:21:33,746] (INFO): Trial 1 finished with value: 0.7490257709639655 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.719347
[200]	valid's auc: 0.727857
[300]	valid's auc: 0.734507
[400]	valid's auc: 0.736779
[500]	valid's auc: 0.737586
Early stopping, best iteration is:
[338]	valid's auc: 0.738383
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:21:38,769] (INFO): Trial 2 finished with value: 0.7383827530723629 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.728579
[200]	valid's auc: 0.730984
[300]	valid's auc: 0.736052
[400]	valid's auc: 0.739815
[500]	valid's auc: 0.739896
[600]	valid's auc: 0.740836
Early stopping, best iteration is:
[427]	valid's auc: 0.7412
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:21:44,298] (INFO): Trial 3 finished with value: 0.7411998652917877 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.728419
[200]	valid's auc: 0.737554
[300]	valid's auc: 0.73942
[400]	valid's auc: 0.742274
[500]	valid's auc: 0.740842
[600]	valid's auc: 0.739142
Early stopping, best iteration is:
[420]	valid's auc: 0.743108
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:21:49,031] (INFO): Trial 4 finished with value: 0.7431082316339785 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.727948
[200]	valid's auc: 0.735651
[300]	valid's auc: 0.743392
[400]	valid's auc: 0.745963
[500]	valid's auc: 0.74631
[600]	valid's auc: 0.747524
[700]	valid's auc: 0.746813
Early stopping, best iteration is:
[578]	valid's auc: 0.748283
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:21:55,136] (INFO): Trial 5 finished with value: 0.7482827375702273 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 1 with value: 0.7490257709639655.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.734497
[200]	valid's auc: 0.733465
Early stopping, best iteration is:
[116]	valid's auc: 0.738126
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.731037
[200]	valid's auc: 0.733988
[300]	valid's auc: 0.731498
Early stopping, best iteration is:
[232]	valid's auc: 0.73644
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.709244
Early stopping, best iteration is:
[74]	valid's auc: 0.71634
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.706994
Early stopping, best iteration is:
[51]	valid's auc: 0.715783
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.747813
[200]	valid's auc: 0.748402
Early stopping, best iteration is:
[156]	valid's auc: 0.753184
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Blending: Optimizati

[2020-12-03 18:22:28,161] (INFO): A new study created in memory with name: no-name-fdd778a6-eeae-4132-8d00-48f0363fc390


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.716156
[200]	valid's auc: 0.72008
[300]	valid's auc: 0.721865
[400]	valid's auc: 0.724094
[500]	valid's auc: 0.72411
[600]	valid's auc: 0.725115
[700]	valid's auc: 0.727082
[800]	valid's auc: 0.7272
[900]	valid's auc: 0.727168
[1000]	valid's auc: 0.728258
[1100]	valid's auc: 0.728745
[1200]	valid's auc: 0.729285
Early stopping, best iteration is:
[1062]	valid's auc: 0.729595
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:22:36,816] (INFO): Trial 0 finished with value: 0.7295946458831138 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7295946458831138.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.71808
[200]	valid's auc: 0.726943
[300]	valid's auc: 0.729039
[400]	valid's auc: 0.731583
[500]	valid's auc: 0.733294
[600]	valid's auc: 0.732706
Early stopping, best iteration is:
[488]	valid's auc: 0.733839
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:22:41,453] (INFO): Trial 1 finished with value: 0.733839023686194 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.733839023686194.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.717508
[200]	valid's auc: 0.724067
[300]	valid's auc: 0.729883
[400]	valid's auc: 0.729985
[500]	valid's auc: 0.729338
Early stopping, best iteration is:
[373]	valid's auc: 0.730899
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:22:46,299] (INFO): Trial 2 finished with value: 0.7308989634951435 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.733839023686194.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.722517
[200]	valid's auc: 0.730359
[300]	valid's auc: 0.73068
[400]	valid's auc: 0.727366
[500]	valid's auc: 0.727382
Early stopping, best iteration is:
[300]	valid's auc: 0.73068
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:22:50,324] (INFO): Trial 3 finished with value: 0.7306797953718107 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.733839023686194.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.720807
[200]	valid's auc: 0.726954
[300]	valid's auc: 0.73293
[400]	valid's auc: 0.734015
[500]	valid's auc: 0.735512
[600]	valid's auc: 0.736303
[700]	valid's auc: 0.737522
[800]	valid's auc: 0.73633
[900]	valid's auc: 0.736437
Early stopping, best iteration is:
[722]	valid's auc: 0.737704
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:22:56,191] (INFO): Trial 4 finished with value: 0.7377038664464294 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 4 with value: 0.7377038664464294.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.716038
[200]	valid's auc: 0.722522
[300]	valid's auc: 0.725441
[400]	valid's auc: 0.726917
[500]	valid's auc: 0.727996
[600]	valid's auc: 0.727809
[700]	valid's auc: 0.729563
[800]	valid's auc: 0.729541
[900]	valid's auc: 0.729343
Early stopping, best iteration is:
[708]	valid's auc: 0.730167
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:02,972] (INFO): Trial 5 finished with value: 0.7301666212293728 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 4 with value: 0.7377038664464294.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.707635
[200]	valid's auc: 0.718599
[300]	valid's auc: 0.722779
[400]	valid's auc: 0.724645
[500]	valid's auc: 0.725757
Early stopping, best iteration is:
[480]	valid's auc: 0.726623
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.743164
[200]	valid's auc: 0.744427
Early stopping, best iteration is:
[151]	valid's auc: 0.750865
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.747145
[200]	valid's auc: 0.752203
Early stopping, best iteration is:
[190]	valid's auc: 0.753949
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.733903
Early stopping, best iteration is:
[44]	valid's auc: 0.747707
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.75087
[200]	valid's auc: 0.754198
Early stopping, best iteration is:
[198]	valid's auc: 0.75

[2020-12-03 18:23:37,598] (INFO): A new study created in memory with name: no-name-51cd9113-32a0-4d67-b7f6-760041c70669


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.720652
[200]	valid's auc: 0.722319
[300]	valid's auc: 0.727868
[400]	valid's auc: 0.728296
[500]	valid's auc: 0.727162
Early stopping, best iteration is:
[381]	valid's auc: 0.729611
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:42,766] (INFO): Trial 0 finished with value: 0.729610682575065 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.729610682575065.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.73826
[200]	valid's auc: 0.733182
[300]	valid's auc: 0.737271
Early stopping, best iteration is:
[111]	valid's auc: 0.740339
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:45,152] (INFO): Trial 1 finished with value: 0.7403392294904074 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.709324
[200]	valid's auc: 0.713478
Early stopping, best iteration is:
[38]	valid's auc: 0.716097
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:47,428] (INFO): Trial 2 finished with value: 0.7160970968242004 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.721362
[200]	valid's auc: 0.719502
[300]	valid's auc: 0.726997
[400]	valid's auc: 0.72774
[500]	valid's auc: 0.729269
[600]	valid's auc: 0.728798
Early stopping, best iteration is:
[479]	valid's auc: 0.729744
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:53,618] (INFO): Trial 3 finished with value: 0.7297443216746582 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.724532
[200]	valid's auc: 0.723843
[300]	valid's auc: 0.725227
[400]	valid's auc: 0.724287
Early stopping, best iteration is:
[265]	valid's auc: 0.72782
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:57,232] (INFO): Trial 4 finished with value: 0.7278199186405161 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.720256
[200]	valid's auc: 0.716696
Early stopping, best iteration is:
[54]	valid's auc: 0.728312
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:23:59,475] (INFO): Trial 5 finished with value: 0.7283117105270192 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.713419
[200]	valid's auc: 0.720443
[300]	valid's auc: 0.724019
[400]	valid's auc: 0.728509
[500]	valid's auc: 0.729739
[600]	valid's auc: 0.729226
[700]	valid's auc: 0.729253
Early stopping, best iteration is:
[510]	valid's auc: 0.730322
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:24:06,474] (INFO): Trial 6 finished with value: 0.730321642584901 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.724532
[200]	valid's auc: 0.723843
[300]	valid's auc: 0.725227
[400]	valid's auc: 0.724287
Early stopping, best iteration is:
[265]	valid's auc: 0.72782
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed


[2020-12-03 18:24:10,039] (INFO): Trial 7 finished with value: 0.7278199186405161 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 1 with value: 0.7403392294904074.


Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.723747
[200]	valid's auc: 0.728306
[300]	valid's auc: 0.72225
Early stopping, best iteration is:
[223]	valid's auc: 0.729023
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.748349
[200]	valid's auc: 0.753338
[300]	valid's auc: 0.748004
Early stopping, best iteration is:
[210]	valid's auc: 0.754851
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725719
Early stopping, best iteration is:
[32]	valid's auc: 0.731042
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.721696
Early stopping, best iteration is:
[78]	valid's auc: 0.730729
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.721627
Early stopping, best iteration is:
[55]	valid's auc: 0.728611
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Blending: Optimizati

[2020-12-03 18:24:18,382] (INFO): oof_pred:
array([[0.0323967 ],
       [0.02751203],
       [0.03151446],
       ...,
       [0.02532397],
       [0.16432318],
       [0.08439195]], dtype=float32)
Shape = (8000, 1)


CPU times: user 16min 16s, sys: 13.6 s, total: 16min 30s
Wall time: 4min 21s


### Predict to test data and check scores for utilized automl

In [15]:
%%time

test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

[2020-12-03 18:24:20,102] (INFO): Prediction for test data:
array([[0.05997877],
       [0.07474159],
       [0.02688554],
       ...,
       [0.04569402],
       [0.03724101],
       [0.21016812]], dtype=float32)
Shape = (2000, 1)
[2020-12-03 18:24:20,102] (INFO): Check scores...
[2020-12-03 18:24:20,106] (INFO): OOF score: 0.7581720606359404
[2020-12-03 18:24:20,109] (INFO): TEST score: 0.734266304347826


CPU times: user 3.27 s, sys: 112 ms, total: 3.38 s
Wall time: 1.72 s


### Profiling utilized AutoML 

To build report here, we **must** turn on decorators on step 0.4. Report is interactive and you can go as deep into functions call stack as you want:

In [16]:
%%time
p.profile('my_report_profile.html')
assert os.path.exists('my_report_profile.html'), 'Profile report failed to build'

CPU times: user 2.55 s, sys: 120 ms, total: 2.67 s
Wall time: 2.66 s


## Appendix. Profiling report screenshots 

After loading HTML with profiling report, you can see fully folded report (please wait for green LOAD OK text for full load finish). If you click on triangle on the left, it unfolds and look like this:  

<img src="imgs/tutorial_2_initial_report.png" alt="Initial profiling report" style="width: 500px;"/>

If we go even deeper we will receive situation like this:

<img src="imgs/tutorial_2_unfolded_report.png" alt="Profiling report after several unfoldings on different levels" style="width: 600px;"/>
