# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time
# Root logger setup: INFO level, records rendered as "[timestamp] (LEVEL): message".
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] (%(levelname)s): %(message)s',
)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

# Step 0.2. Parameters 

In [2]:
# Threads count for LightGBM and linear models (also used for torch below).
N_THREADS = 8

# Number of cross-validation folds used by AutoML.
N_FOLDS = 5

# Fixed random state shared by the numpy seed, the data split and the reader.
RANDOM_STATE = 42

# Fraction of the data held out to check the metric.
TEST_SIZE = 0.2

# Time budget for the AutoML run, in seconds.
TIMEOUT = 600

# Name of the target column.
TARGET_NAME = 'TARGET'

# Step 0.3. Fix torch number of threads and numpy seed 

In [3]:
# Cap the number of threads torch may use.
torch.set_num_threads(N_THREADS)

# Seed numpy's global random generator so numpy-based randomness is repeatable.
np.random.seed(RANDOM_STATE)

# Step 0.4. Change profiling decorators settings 

By default, profiling decorators are turned off to save time and memory. If you want to see a profiling report after using LAMA, you need to turn the decorators on with the command below: 

In [4]:
# The profiler object is reused in Step 5 to build the HTML report.
p = Profiler()

# Switch the profiling decorators on (they are off by default).
p.change_deco_settings(dict(enabled=True))

ALL_FUNCS len = 387


# Step 0.5. Example data load 

In [5]:
%%time

# Read the example dataset shipped with the tutorial.
data = pd.read_csv('./example_data/test_data_files/sampled_app_train.csv')

# Preview the first five rows.
data.head(n=5)

CPU times: user 68.5 ms, sys: 11.9 ms, total: 80.4 ms
Wall time: 80 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# Step 0.6. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [6]:
%%time

# Reference date from which the relative day offsets are counted.
base_date = pd.Timestamp('2018-01-01')

# Turn the day offsets into calendar dates stored as strings.
# pd.to_timedelta(..., unit='D') is used instead of astype('timedelta64[D]'):
# recent pandas releases only accept s/ms/us/ns resolutions in astype, so the
# day-resolution cast fails there.
data['BIRTH_DATE'] = (base_date + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')).astype(str)

# Positive DAYS_EMPLOYED values are clipped to 0, so EMP_DATE is never later than base_date.
data['EMP_DATE'] = (base_date + pd.to_timedelta(data['DAYS_EMPLOYED'].clip(upper=0), unit='D')
                    ).astype(str)

# Degenerate columns: a constant feature and a feature that is entirely NaN.
data['constant'] = 1
data['allnan'] = np.nan

# The raw day offsets are replaced by the date columns created above.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

[2020-10-17 19:31:39,435] (INFO): Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2020-10-17 19:31:39,435] (INFO): Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2020-10-17 19:31:39,436] (INFO): NumExpr defaulting to 8 threads.


CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 110 ms


# Step 0.7. (Optional) Data splitting for train-test 

The block below can be omitted if you are only going to train the model or if you already have separate train and test files:

In [7]:
%%time

# Stratified split on the target so both parts keep the same class balance.
split_parts = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)

# Give both parts a fresh 0..n-1 index, discarding the shuffled original one.
train_data, test_data = (part.reset_index(drop=True) for part in split_parts)

logging.info(
    'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(train_data.shape, test_data.shape)
)

[2020-10-17 19:31:39,509] (INFO): Data splitted. Parts sizes: train_data = (8000, 124), test_data = (2000, 124)


CPU times: user 7.23 ms, sys: 6.3 ms, total: 13.5 ms
Wall time: 12.6 ms


In [8]:
# Preview the first five rows of the training part, including the engineered columns.
train_data.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan
0,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,
1,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,
2,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,
3,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,,1988-04-27,2009-06-05,1,
4,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,


# ========= AutoML preset usage =========


## Step 1. Create Task

In [9]:
%%time

# Binary classification task passed to the AutoML preset below.
task = Task('binary')

CPU times: user 4.02 ms, sys: 0 ns, total: 4.02 ms
Wall time: 3.78 ms


## Step 2. Setup columns roles

The roles setup here sets the target column and the base date, which is used to calculate date differences:

In [10]:
%%time

# The 'target' role points at the target column name.
roles = {'target': TARGET_NAME}

# Datetime role flagged as the base date, assigned to the 'report_dt' column.
# NOTE(review): no cell in this notebook creates a 'report_dt' column - confirm
# that it exists in the loaded CSV, otherwise this role has no column to bind to.
roles[DatetimeRole(base_date=True, seasonality=(), base_feats=False)] = 'report_dt'

CPU times: user 79 µs, sys: 112 µs, total: 191 µs
Wall time: 194 µs


## Step 3. Create AutoML from preset

To create AutoML model here we use `TabularAutoML` preset, which looks like:

![TabularAutoML preset pipeline](imgs/tutorial_2_pipeline.png)

All the params we set above can be passed to the preset to change its configuration (**important note**: currently the types of algorithms used can't be changed):

In [11]:
%%time 

# Build the preset:
#   * timeout       - time budget for the run, in seconds
#   * reader_params - number of CV folds and the random state
#   * tuning_params - limits on tuning iterations and tuning time
#   * lgb_params    - defaults forwarded to LightGBM. The thread-count parameter
#                     is `num_threads`; `n_threads` is not one of its documented
#                     aliases, so that key would not set the thread count.
automl = TabularAutoML(task = task,
                       timeout = TIMEOUT,
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'num_threads': N_THREADS}})

# Fit on the training part and get the out-of-fold predictions for it.
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))


Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Feats was rejected during automatic roles guess: []
Train process start. Time left 598.0537803173065 secs
C = 1e-05 score = 0.6828396704994362
C = 5e-05 score = 0.7006216890913076
C = 0.0001 score = 0.7189302457355764
C = 0.0005 score = 0.7532808398950132
C = 0.001 score = 0.7615504273778405
C = 0.005 score = 0.7635229404878361
C = 0.01 score = 0.7604759690171112
C = 0.05 score = 0.7484537956177066
C = 1e-05 score = 0.706527046535326
C = 5e-05 score = 0.7166456139605979
C = 0.0001 score = 0.7246306046195652
C = 0.0005 score = 0.7377611243206521
C = 0.001 score = 0.7400220788043478
C = 0.005 score = 0.7400220788043479
C = 0.01 score = 0.7395603345788043
C = 0.05 score = 0.740197223165761
C = 0.1 score = 0.7416886039402174
C = 0.5 score = 0.7425590183423914
C = 1 score = 0.7425590183423914
C = 5 score = 0.7425590183423914
C = 1e-05 score = 0.6328390370244565
C = 5e-05 score = 0.6471584154211956
C = 0.0001 score = 0.6595883576766303
C = 0.0005 score = 0.6902757727581521
C = 0.001 score = 


Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer

[32m[I 2020-10-17 19:31:59,107][0m A new study created in memory with name: no-name-ef2397ee-6bac-4ff7-b2f3-e05628a5a9e0[0m


Time history [1.6991512775421143, 1.701791524887085, 3.5021376609802246, 2.2686996459960938, 2.7340776920318604, 1.1866357326507568]. Time left 165.8917441368103
Optuna may run 242.74026489257812 secs
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.742034
[200]	valid's auc: 0.743135
[300]	valid's auc: 0.74313
[400]	valid's auc: 0.742141


[32m[I 2020-10-17 19:32:03,135][0m Trial 0 finished with value: 0.7462941877682806 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7462941877682806.[0m


Early stopping, best iteration is:
[245]	valid's auc: 0.746294
Time history [4.021322011947632]. Time left 8999999995.976622
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.751805
[200]	valid's auc: 0.752554
[300]	valid's auc: 0.752511
[400]	valid's auc: 0.751752


[32m[I 2020-10-17 19:32:07,042][0m Trial 1 finished with value: 0.7536389926819229 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[281]	valid's auc: 0.753639
Time history [3.8956360816955566]. Time left 8999999996.102427
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.746048
[200]	valid's auc: 0.747481
[300]	valid's auc: 0.745375


[32m[I 2020-10-17 19:32:10,674][0m Trial 2 finished with value: 0.7517894275435529 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[157]	valid's auc: 0.751789
Time history [3.6198830604553223]. Time left 8999999996.378101
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.740639
[200]	valid's auc: 0.740387


[32m[I 2020-10-17 19:32:13,345][0m Trial 3 finished with value: 0.7459681083652729 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[76]	valid's auc: 0.745968
Time history [2.6606767177581787]. Time left 8999999997.337381
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.74623
[200]	valid's auc: 0.749357


[32m[I 2020-10-17 19:32:15,572][0m Trial 4 finished with value: 0.7527623201885915 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[63]	valid's auc: 0.752762
Time history [2.2179367542266846]. Time left 8999999997.780197
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.752848
[200]	valid's auc: 0.754163
[300]	valid's auc: 0.752778


[32m[I 2020-10-17 19:32:18,999][0m Trial 5 finished with value: 0.75802235514858 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 5 with value: 0.75802235514858.[0m


Early stopping, best iteration is:
[176]	valid's auc: 0.758022
Time history [3.415576219558716]. Time left 8999999996.58245
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.746904
[200]	valid's auc: 0.750881


[32m[I 2020-10-17 19:32:21,664][0m Trial 6 finished with value: 0.7548898546541153 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 5 with value: 0.75802235514858.[0m


Early stopping, best iteration is:
[63]	valid's auc: 0.75489
Time history [2.656064510345459]. Time left 8999999997.342066
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.74623
[200]	valid's auc: 0.749357


[32m[I 2020-10-17 19:32:23,851][0m Trial 7 finished with value: 0.7527623201885915 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 5 with value: 0.75802235514858.[0m


Early stopping, best iteration is:
[63]	valid's auc: 0.752762
Time history [2.178288698196411]. Time left 8999999997.819908
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.759653
[200]	valid's auc: 0.755146


[32m[I 2020-10-17 19:32:26,036][0m Trial 8 finished with value: 0.7608394673680047 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 53}. Best is trial 8 with value: 0.7608394673680047.[0m


Early stopping, best iteration is:
[81]	valid's auc: 0.760839
Time history [2.1754038333892822]. Time left 8999999997.82272
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.749651
[200]	valid's auc: 0.746749


[32m[I 2020-10-17 19:32:28,456][0m Trial 9 finished with value: 0.7512281433252617 and parameters: {'feature_fraction': 0.5003893829205072, 'num_leaves': 203}. Best is trial 8 with value: 0.7608394673680047.[0m


Early stopping, best iteration is:
[93]	valid's auc: 0.751228
Time history [2.4113969802856445]. Time left 8999999997.586662
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.768382
[200]	valid's auc: 0.769547
[300]	valid's auc: 0.767227


[32m[I 2020-10-17 19:32:29,855][0m Trial 10 finished with value: 0.7719047848143218 and parameters: {'feature_fraction': 0.9889980370406001, 'num_leaves': 16}. Best is trial 10 with value: 0.7719047848143218.[0m


Early stopping, best iteration is:
[180]	valid's auc: 0.771905
Time history [1.3851487636566162]. Time left 8999999998.60709
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.758498
Early stopping, best iteration is:
[84]	valid's auc: 0.764635
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.732592
Early stopping, best iteration is:
[59]	valid's auc: 0.737931
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.716372
Early stopping, best iteration is:
[55]	valid's auc: 0.722179
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.734242
[200]	valid's auc: 0.735776
Early stopping, best iteration is:
[123]	valid's auc: 0.739603
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.753073
[200]	valid's auc: 0.747044
Early stopping, best iteration is:
[126]	valid's auc: 0.754671
Time history [1.6991512775421143, 1.701791524887085, 3.50

[2020-10-17 19:32:33,880] (INFO): oof_pred:
array([[0.03453665],
       [0.0283597 ],
       [0.02641694],
       ...,
       [0.02645098],
       [0.16012143],
       [0.09794779]], dtype=float32)
Shape = (8000, 1)


Blending, iter 4: score = 0.7505945877684255, weights = [0.28602386 0.20071685 0.5132593 ]
No score update. Terminated
CPU times: user 3min 28s, sys: 2.71 s, total: 3min 30s
Wall time: 54.3 s


## Step 4. Predict on test data and check scores

In [12]:
%%time

# Predict for the hold-out part with the fitted model.
test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

logging.info('Check scores...')

# ROC AUC of the out-of-fold predictions against the training target.
oof_auc = roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_auc))

# ROC AUC of the hold-out predictions against the test target.
test_auc = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
logging.info('TEST score: {}'.format(test_auc))

[2020-10-17 19:32:34,291] (INFO): Prediction for test data:
array([[0.05872984],
       [0.07995276],
       [0.03152021],
       ...,
       [0.04771937],
       [0.04523724],
       [0.23591864]], dtype=float32)
Shape = (2000, 1)
[2020-10-17 19:32:34,291] (INFO): Check scores...
[2020-10-17 19:32:34,295] (INFO): OOF score: 0.750626583999461
[2020-10-17 19:32:34,297] (INFO): TEST score: 0.7357167119565218


CPU times: user 747 ms, sys: 39.7 ms, total: 787 ms
Wall time: 412 ms


## Step 5. Profiling AutoML 

To build the report here, we **must** turn on the decorators in step 0.4. The report is interactive, and you can go as deep into the function call stack as you want:

In [13]:
%%time
# Write the profiling report to an HTML file and check that the file exists.
report_path = 'my_report_profile.html'
p.profile(report_path)
assert os.path.exists(report_path), 'Profile report failed to build'

FULL_STATS_DF shape = (11636, 6)
RUN_FNAME vc head:
gini_normalizedc [333]             1
LAMLDataset.set_data [55]          1
CatLinear.forward [319]            1
PandasDataset.to_numpy [21]        1
NumpyDataset._check_dtype [199]    1
Name: run_fname, dtype: int64
CONNECTED COMPONENTS cnt = 1
PATH LENS describe:
count    11637.000000
mean         9.143766
std          2.489588
min          0.000000
25%          8.000000
50%          9.000000
75%         10.000000
max         21.000000
dtype: float64
CPU times: user 997 ms, sys: 59.9 ms, total: 1.06 s
Wall time: 1.05 s


# Appendix. Profiling report screenshots 

After loading the HTML file with the profiling report, you will see the fully folded report (please wait for the green LOAD OK text, which indicates that loading has finished). If you click the triangle on the left, the report unfolds and looks like this:  

<img src="imgs/tutorial_2_initial_report.png" alt="Initial profiling report" style="width: 500px;"/>

If we go even deeper, we get something like this:

<img src="imgs/tutorial_2_unfolded_report.png" alt="Profiling report after several unfoldings on different levels" style="width: 600px;"/>
