# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler

# Step 0.2. Parameters 

In [2]:
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TIMEOUT = 600 # Time in seconds for automl run
TARGET_NAME = 'TARGET' # Target column name

# Step 0.3. Fix torch number of threads and numpy seed 

In [3]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

# Step 0.4. Change profiling decorators settings 

By default, profiling decorators are turned off for speed and memory reduction. If you want to see profiling report after using LAMA, you need to turn on the decorators using command below: 

In [4]:
p = Profiler()
p.change_deco_settings({'enabled': True})

ALL_FUNCS len = 388
	 Func with no decorator - <function numpy_and_pandas_concat at 0x7fed35eb29d8>


# Step 0.5. Example data load 

In [5]:
%%time

data = pd.read_csv('./example_data/test_data_files/sampled_app_train.csv')
data.head()

CPU times: user 67.3 ms, sys: 14.3 ms, total: 81.6 ms
Wall time: 81.2 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# Step 0.6. (Optional) Some user feature preparation 

Cell below shows some user feature preparations to create task more difficult (this block can be omitted if you don't want to change the initial data):

In [6]:
%%time

data['BIRTH_DATE'] = (np.datetime64('2018-01-01') + data['DAYS_BIRTH'].astype(np.dtype('timedelta64[D]'))).astype(str)
data['EMP_DATE'] = (np.datetime64('2018-01-01') + np.clip(data['DAYS_EMPLOYED'], None, 0).astype(np.dtype('timedelta64[D]'))
                    ).astype(str)

data['constant'] = 1
data['allnan'] = np.nan

data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

[2020-10-18 15:58:31,950] (INFO): Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2020-10-18 15:58:31,950] (INFO): Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2020-10-18 15:58:31,951] (INFO): NumExpr defaulting to 8 threads.


CPU times: user 109 ms, sys: 637 µs, total: 110 ms
Wall time: 107 ms


# Step 0.7. (Optional) Data splitting for train-test 

Block below can be omitted if you are going to train model only or you have specific train and test files:

In [7]:
%%time

train_data, test_data = train_test_split(data, 
                                         test_size=TEST_SIZE, 
                                         stratify=data[TARGET_NAME], 
                                         random_state=RANDOM_STATE)
logging.info('Data splitted. Parts sizes: train_data = {}, test_data = {}'
              .format(train_data.shape, test_data.shape))

[2020-10-18 15:58:32,023] (INFO): Data splitted. Parts sizes: train_data = (8000, 124), test_data = (2000, 124)


CPU times: user 12.5 ms, sys: 683 µs, total: 13.2 ms
Wall time: 12.9 ms


In [8]:
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,,1988-04-27,2009-06-05,1,
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,


# ========= AutoML preset usage =========


## Step 1. Create Task

In [9]:
%%time

task = Task('binary', )

CPU times: user 3.94 ms, sys: 0 ns, total: 3.94 ms
Wall time: 3.71 ms


## Step 2. Setup columns roles

Roles setup here set target column and base date, which is used to calculate date differences:

In [10]:
%%time

roles = {'target': TARGET_NAME,
         DatetimeRole(base_date=True, seasonality=(), base_feats=False): 'report_dt',
         }

CPU times: user 87 µs, sys: 140 µs, total: 227 µs
Wall time: 229 µs


## Step 3. Create AutoML from preset

To create AutoML model here we use `TabularAutoML` preset, which looks like:

![TabularAutoML preset pipeline](imgs/tutorial_2_pipeline.png)

All params we set above can be send inside preset to change its configuration (**important note**: currently types of algos used can't be changed):

In [11]:
%%time 

automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       reader_params = {'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 30},
                       lgb_params = {'default_params': {'n_threads': N_THREADS}})
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))


Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Feats was rejected during automatic roles guess: []
Train process start. Time left 598.0229172706604 secs
C = 1e-05 score = 0.6827835420776069
C = 5e-05 score = 0.7006216890913076
C = 0.0001 score = 0.7189302457355764
C = 0.0005 score = 0.7532808398950132
C = 0.001 score = 0.7615504273778405
C = 0.005 score = 0.7635282860518199
C = 0.01 score = 0.7605294246569484
C = 0.05 score = 0.7486569270490883
C = 1e-05 score = 0.706527046535326
C = 5e-05 score = 0.7166456139605978
C = 0.0001 score = 0.7246306046195652
C = 0.0005 score = 0.7377611243206521
C = 0.001 score = 0.7400220788043478
C = 0.005 score = 0.7400220788043479
C = 0.01 score = 0.7396187160326086
C = 0.05 score = 0.7404254415760869
C = 0.1 score = 0.7416673743206523
C = 0.5 score = 0.7426651664402174
C = 1 score = 0.7426651664402174
C = 5 score = 0.7426651664402174
C = 1e-05 score = 0.6328390370244565
C = 5e-05 score = 0.6471584154211956
C = 0.0001 score = 0.6595883576766303
C = 0.0005 score = 0.6902757727581521
C = 0.001 score =


Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer

[32m[I 2020-10-18 15:58:52,412][0m A new study created in memory with name: no-name-ddf46aff-2d44-4029-ba5c-de1795cef25b[0m


Early stopping, best iteration is:
[39]	valid's auc: 0.75595
Time history [1.7170946598052979, 1.7035210132598877, 3.474874496459961, 2.3089780807495117, 3.008502721786499, 1.4105572700500488]. Time left 165.28762729962668
Optuna may run 242.48521542549133 secs
Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.742034
[200]	valid's auc: 0.743135
[300]	valid's auc: 0.74313
[400]	valid's auc: 0.742141


[32m[I 2020-10-18 15:58:57,109][0m Trial 0 finished with value: 0.7462941877682806 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7462941877682806.[0m


Early stopping, best iteration is:
[245]	valid's auc: 0.746294
Time history [4.69086217880249]. Time left 8999999995.307146
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.751805
[200]	valid's auc: 0.752554
[300]	valid's auc: 0.752511
[400]	valid's auc: 0.751752


[32m[I 2020-10-18 15:59:01,834][0m Trial 1 finished with value: 0.7536389926819229 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[281]	valid's auc: 0.753639
Time history [4.713425636291504]. Time left 8999999995.28476
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.746048
[200]	valid's auc: 0.747481
[300]	valid's auc: 0.745375


[32m[I 2020-10-18 15:59:05,682][0m Trial 2 finished with value: 0.7517894275435529 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[157]	valid's auc: 0.751789
Time history [3.8372416496276855]. Time left 8999999996.16086
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.740639
[200]	valid's auc: 0.740387


[32m[I 2020-10-18 15:59:08,731][0m Trial 3 finished with value: 0.7459681083652729 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[76]	valid's auc: 0.745968
Time history [3.038619041442871]. Time left 8999999996.959381
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.74623
[200]	valid's auc: 0.749357


[32m[I 2020-10-18 15:59:11,193][0m Trial 4 finished with value: 0.7527623201885915 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 1 with value: 0.7536389926819229.[0m


Early stopping, best iteration is:
[63]	valid's auc: 0.752762
Time history [2.4532225131988525]. Time left 8999999997.544973
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.752848
[200]	valid's auc: 0.754163
[300]	valid's auc: 0.752778


[32m[I 2020-10-18 15:59:15,272][0m Trial 5 finished with value: 0.75802235514858 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 5 with value: 0.75802235514858.[0m


Early stopping, best iteration is:
[176]	valid's auc: 0.758022
Time history [4.070704936981201]. Time left 8999999995.927485
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.746904
[200]	valid's auc: 0.750881


[32m[I 2020-10-18 15:59:18,324][0m Trial 6 finished with value: 0.7548898546541153 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 5 with value: 0.75802235514858.[0m


Early stopping, best iteration is:
[63]	valid's auc: 0.75489
Time history [3.0408096313476562]. Time left 8999999996.957289
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.74623
[200]	valid's auc: 0.749357


[32m[I 2020-10-18 15:59:20,487][0m Trial 7 finished with value: 0.7527623201885915 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 5 with value: 0.75802235514858.[0m


Early stopping, best iteration is:
[63]	valid's auc: 0.752762
Time history [2.151897430419922]. Time left 8999999997.846025
Training until validation scores don't improve for 200 rounds



Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer



[100]	valid's auc: 0.759653
[200]	valid's auc: 0.755146


[32m[I 2020-10-18 15:59:22,676][0m Trial 8 finished with value: 0.7608394673680047 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 53}. Best is trial 8 with value: 0.7608394673680047.[0m


Early stopping, best iteration is:
[81]	valid's auc: 0.760839
Time history [2.1807444095611572]. Time left 8999999997.817436
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.761524
Early stopping, best iteration is:
[67]	valid's auc: 0.766051
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.730665
[200]	valid's auc: 0.732369
Early stopping, best iteration is:
[185]	valid's auc: 0.735527
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.731371
[200]	valid's auc: 0.734985
Early stopping, best iteration is:
[130]	valid's auc: 0.737777
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.74085
[200]	valid's auc: 0.738833
Early stopping, best iteration is:
[123]	valid's auc: 0.742379
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.749289
Early stopping, best iteration is:
[57]	valid's auc: 0.752744
Time history [1.7170946598052

[2020-10-18 15:59:30,772] (INFO): oof_pred:
array([[0.01766108],
       [0.01031231],
       [0.0174613 ],
       ...,
       [0.01851097],
       [0.15720469],
       [0.10899049]], dtype=float32)
Shape = (8000, 1)


Blending, iter 2: score = 0.7492554232548606, weights = [0.4274692  0.10802531 0.46450552]
Blending, iter 3: score = 0.7492554232548606, weights = [0.4274692  0.10802531 0.46450552]
No score update. Terminated
CPU times: user 3min 45s, sys: 3.05 s, total: 3min 48s
Wall time: 58.7 s


## Step 4. Predict to test data and check scores

In [12]:
%%time

test_pred = automl.predict(test_data)
logging.info('Prediction for test data:\n{}\nShape = {}'
              .format(test_pred, test_pred.shape))

logging.info('Check scores...')
logging.info('OOF score: {}'.format(roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])))
logging.info('TEST score: {}'.format(roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])))

[2020-10-18 15:59:31,200] (INFO): Prediction for test data:
array([[0.06026712],
       [0.07974271],
       [0.02047962],
       ...,
       [0.04342499],
       [0.03358944],
       [0.25128332]], dtype=float32)
Shape = (2000, 1)
[2020-10-18 15:59:31,200] (INFO): Check scores...
[2020-10-18 15:59:31,204] (INFO): OOF score: 0.7492607382434048
[2020-10-18 15:59:31,206] (INFO): TEST score: 0.7318002717391304


CPU times: user 824 ms, sys: 35.8 ms, total: 860 ms
Wall time: 430 ms


## Step 5. Profiling AutoML 

To build report here, we **must** turn on decorators on step 0.4. Report is interactive and you can go as deep into functions call stack as you want:

In [13]:
%%time
p.profile('my_report_profile.html')
assert os.path.exists('my_report_profile.html'), 'Profile report failed to build'

	 Func with no stats - <function numpy_and_pandas_concat at 0x7fed35eb29d8>
FULL_STATS_DF shape = (11269, 6)
RUN_FNAME vc head:
MetricFunc.__call__ [13]                    1
NumericRole.__init__ [305]                  1
TorchBasedLinearEstimator._loss_fn [607]    1
ginic [518]                                 1
encoding_check [7]                          1
Name: run_fname, dtype: int64
CONNECTED COMPONENTS cnt = 1
PATH LENS describe:
count    11270.000000
mean         9.203106
std          2.502676
min          0.000000
25%          8.000000
50%          9.000000
75%         10.000000
max         21.000000
dtype: float64
CPU times: user 1.02 s, sys: 48.2 ms, total: 1.07 s
Wall time: 1.06 s


# Appendix. Profiling report screenshots 

After loading HTML with profiling report, you can see fully folded report (please wait for green LOAD OK text for full load finish). If you click on triangle on the left, it unfolds and look like this:  

<img src="imgs/tutorial_2_initial_report.png" alt="Initial profiling report" style="width: 500px;"/>

If we go even deeper we will receive situation like this:

<img src="imgs/tutorial_2_unfolded_report.png" alt="Profiling report after several unfoldings on different levels" style="width: 600px;"/>
