# Create your own pipeline

## Initial phase

### Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

### General parameters setup

In [2]:
# Thread count for lgbm and linear models
N_THREADS: int = 8
# Fold count for AutoML (handed to the reader as `cv`)
N_FOLDS: int = 5
# Fixed random state, reused wherever a seed is needed
RANDOM_STATE: int = 42
# Share of the data held out for the final metric check
TEST_SIZE: float = 0.2
# Name of the target column
TARGET_NAME: str = 'TARGET'

### Fix torch number of threads and numpy seed 

In [3]:
# Seed the global NumPy RNG so NumPy-driven randomness repeats between runs.
np.random.seed(RANDOM_STATE)
# Seed torch's default RNG as well: torch is imported and configured here, and
# leaving it unseeded would make any torch-backed step non-reproducible.
torch.manual_seed(RANDOM_STATE)
# Limit the number of CPU threads torch may use to the shared thread budget.
torch.set_num_threads(N_THREADS)


### Example data load 

In [5]:
%%time

# Sample dataset location (relative path, resolved against the working directory)
SAMPLE_CSV_PATH = '../LightAutoML/example_data/test_data_files/sampled_app_train.csv'

# Load the full CSV; the trailing bare expression renders the first five rows
data = pd.read_csv(filepath_or_buffer=SAMPLE_CSV_PATH)
data.head(n=5)

CPU times: user 97.6 ms, sys: 28.7 ms, total: 126 ms
Wall time: 125 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


### (Optional) Some user feature preparation 

The cell below applies some user-side feature preparation to make the task more difficult (this block can be omitted if you don't want to change the initial data):

In [6]:
%%time

# Reference date that the day offsets below are added to, and that is also
# stored as-is in the `report_dt` column.
REPORT_DATE = np.datetime64('2018-01-01')

# Convert the integer day counts with pd.to_timedelta rather than casting to
# 'timedelta64[D]': pandas 2.x only supports s/ms/us/ns timedelta resolutions
# as dtypes, so the explicit unit-based conversion is the version-stable form.
birth_offset = pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
# Values above 0 are clipped to 0, so those rows map to the reference date itself.
employment_offset = pd.to_timedelta(data['DAYS_EMPLOYED'].clip(upper=0), unit='D')

# Dates are stored as strings (not datetimes).
data['BIRTH_DATE'] = (REPORT_DATE + birth_offset).astype(str)
data['EMP_DATE'] = (REPORT_DATE + employment_offset).astype(str)

# Deliberately uninformative columns: a constant and an all-NaN feature.
data['constant'] = 1
data['allnan'] = np.nan

data['report_dt'] = REPORT_DATE

# The raw offset columns are replaced by the derived date columns above.
# Rebinding instead of `inplace=True` avoids mutating the frame in place.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

CPU times: user 101 ms, sys: 4.41 ms, total: 105 ms
Wall time: 104 ms


### (Optional) Data splitting for train-test 

The block below can be omitted if you are only going to train the model or if you already have separate train and test files:

In [7]:
%%time

# Hold-out split, stratified on the target column and seeded for repeatability
split_parts = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)
train_data, test_data = split_parts

# Report the resulting shapes of both parts
split_message = 'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
    train_data.shape, test_data.shape
)
logging.info(split_message)

[2020-12-03 18:11:27,047] (INFO): Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


CPU times: user 10.7 ms, sys: 3.39 ms, total: 14 ms
Wall time: 12.9 ms


In [8]:
# Preview the first five rows of the training part (derived columns included)
train_data.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


## AutoML Modules Setup

![AutoML pipeline for this task](imgs/tutorial_1_pipeline.png)


### Create Task and PandasReader

In [9]:
%%time

# Task object describing the problem type ('binary')
task = Task('binary')

# Reader configured with the shared fold count and random state
reader_settings = {'cv': N_FOLDS, 'random_state': RANDOM_STATE}
reader = PandasToPandasReader(task, **reader_settings)

CPU times: user 5 ms, sys: 1.69 ms, total: 6.69 ms
Wall time: 5.19 ms


### Create feature selector (if necessary) 

In [10]:
%%time

# LightGBM defaults for the model whose importances drive the selection
selector_lgbm_defaults = {
    'learning_rate': 0.05,
    'num_leaves': 64,
    'seed': 42,
    'num_threads': N_THREADS,
}
model0 = BoostLGBM(default_params=selector_lgbm_defaults)

# Feature pipeline and importance estimator used only by the selector
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()

# Selector built from the three parts above, with an importance cutoff of 0
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


CPU times: user 2.93 ms, sys: 0 ns, total: 2.93 ms
Wall time: 2.32 ms


### Create 1st level ML pipeline for AutoML 

Our first level ML pipeline:
- Simple features for gradient boosting built on selected features (using the feature selector created above)
- 2 different models:
    * LightGBM with params tuning (using OptunaTuner)
    * LightGBM with heuristic params


In [11]:
%%time 

# Feature pipeline shared by both first level models
pipe = LGBSimpleFeatures()

# Tuning budget: stop after 20 iterations or after 30 seconds
params_tuner1 = OptunaTuner(n_trials=20, timeout=30)

# Defaults for the model that is paired with the Optuna tuner
tuned_lgbm_defaults = {
    'learning_rate': 0.05,
    'num_leaves': 128,
    'seed': 1,
    'num_threads': N_THREADS,
}
model1 = BoostLGBM(default_params=tuned_lgbm_defaults)

# Defaults for the model that is added without a tuner
untuned_lgbm_defaults = {
    'learning_rate': 0.025,
    'num_leaves': 64,
    'seed': 2,
    'num_threads': N_THREADS,
}
model2 = BoostLGBM(default_params=untuned_lgbm_defaults)

# A (model, tuner) tuple attaches the tuner to model1; model2 is passed alone
first_level_algos = [(model1, params_tuner1), model2]
pipeline_lvl1 = MLPipeline(
    first_level_algos,
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

CPU times: user 922 µs, sys: 414 µs, total: 1.34 ms
Wall time: 1.34 ms


### Create 2nd level ML pipeline for AutoML 

Our second level ML pipeline:
- Uses simple features as well, but here the features are the Out-Of-Fold (OOF) predictions of the algorithms from the 1st level
- Only one LGBM model without params tuning
- Without feature selection on this stage because we want to use all OOFs here

In [12]:
%%time

# Separate feature pipeline instance for the second level
pipe1 = LGBSimpleFeatures()

# Second level LightGBM: fixed parameters, created with freeze_defaults=True
second_level_lgbm_defaults = {
    'learning_rate': 0.05,
    'num_leaves': 64,
    'max_bin': 1024,
    'seed': 3,
    'num_threads': N_THREADS,
}
model = BoostLGBM(default_params=second_level_lgbm_defaults, freeze_defaults=True)

# Single-model pipeline with no feature selection before or after
second_level_algos = [model]
pipeline_lvl2 = MLPipeline(
    second_level_algos,
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

CPU times: user 861 µs, sys: 162 µs, total: 1.02 ms
Wall time: 1.03 ms


### Create AutoML pipeline 

The AutoML pipeline consists of:
- Reader for data preparation
- First level ML pipeline (as built in the "Create 1st level ML pipeline for AutoML" section)
- Second level ML pipeline (as built in the "Create 2nd level ML pipeline for AutoML" section)
- `skip_conn=False` here means "do not use the initial features on the second level pipeline"

In [13]:
%%time 

# One inner list per level: first level pipelines, then second level pipelines
automl_levels = [[pipeline_lvl1], [pipeline_lvl2]]

# Assemble the reader and both levels into the AutoML object
automl = AutoML(reader, automl_levels, skip_conn=False, verbose=0)

CPU times: user 735 µs, sys: 0 ns, total: 735 µs
Wall time: 741 µs


### Train AutoML on loaded data 

In the cell below we train AutoML with the target column `TARGET` to obtain a fitted model and OOF predictions:

In [14]:
%%time 

# Column roles: only the target needs to be named explicitly here
train_roles = {'target': TARGET_NAME}

# Fit on the training part; the returned object holds the OOF predictions
oof_pred = automl.fit_predict(train_data, roles=train_roles)

oof_report = 'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
logging.info(oof_report)

Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []
Start fitting LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.716183
Early stopping, best iteration is:
[16]	valid's auc: 0.720694
LightGBM fitting and predicting completed
Optuna may run 6299999991.996463 secs


[2020-12-03 18:11:42,540] (INFO): A new study created in memory with name: no-name-ff745603-6864-422c-91d2-cb43d4e7abd1


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.718332
[200]	valid's auc: 0.716862
Early stopping, best iteration is:
[133]	valid's auc: 0.722645
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:43,990] (INFO): Trial 0 finished with value: 0.7226454127042673 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 108}. Best is trial 0 with value: 0.7226454127042673.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.729792
[200]	valid's auc: 0.731156
Early stopping, best iteration is:
[137]	valid's auc: 0.733123
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:45,335] (INFO): Trial 1 finished with value: 0.7331227181123745 and parameters: {'feature_fraction': 0.5917173949330818, 'num_leaves': 87}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725099
Early stopping, best iteration is:
[49]	valid's auc: 0.732246
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:46,314] (INFO): Trial 2 finished with value: 0.732246045619043 and parameters: {'feature_fraction': 0.7993292420985183, 'num_leaves': 118}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.724741
[200]	valid's auc: 0.727237
[300]	valid's auc: 0.728632
Early stopping, best iteration is:
[262]	valid's auc: 0.732492
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:48,315] (INFO): Trial 3 finished with value: 0.7324919415622945 and parameters: {'feature_fraction': 0.7229163764267956, 'num_leaves': 230}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.724842
Early stopping, best iteration is:
[51]	valid's auc: 0.730375
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:49,232] (INFO): Trial 4 finished with value: 0.7303750982247382 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 103}. Best is trial 1 with value: 0.7331227181123745.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.72937
[200]	valid's auc: 0.731059
Early stopping, best iteration is:
[146]	valid's auc: 0.733144
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:50,668] (INFO): Trial 5 finished with value: 0.7331441003683093 and parameters: {'feature_fraction': 0.6668543055695109, 'num_leaves': 119}. Best is trial 5 with value: 0.7331441003683093.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.730803
[200]	valid's auc: 0.732123
Early stopping, best iteration is:
[130]	valid's auc: 0.738019
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:52,106] (INFO): Trial 6 finished with value: 0.7380192547214695 and parameters: {'feature_fraction': 0.8540362888980227, 'num_leaves': 165}. Best is trial 6 with value: 0.7380192547214695.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.724842
Early stopping, best iteration is:
[51]	valid's auc: 0.730375
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:53,002] (INFO): Trial 7 finished with value: 0.7303750982247382 and parameters: {'feature_fraction': 0.5282057895135501, 'num_leaves': 103}. Best is trial 6 with value: 0.7380192547214695.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.737993
Early stopping, best iteration is:
[44]	valid's auc: 0.746385
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:53,771] (INFO): Trial 8 finished with value: 0.7463850623560038 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 53}. Best is trial 8 with value: 0.7463850623560038.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.72271
[200]	valid's auc: 0.716696
Early stopping, best iteration is:
[115]	valid's auc: 0.725746
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:55,005] (INFO): Trial 9 finished with value: 0.7257458398148297 and parameters: {'feature_fraction': 0.5003893829205072, 'num_leaves': 203}. Best is trial 8 with value: 0.7463850623560038.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.754371
[200]	valid's auc: 0.743461
Early stopping, best iteration is:
[125]	valid's auc: 0.755029
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:55,582] (INFO): Trial 10 finished with value: 0.7550288393176923 and parameters: {'feature_fraction': 0.9889980370406001, 'num_leaves': 16}. Best is trial 10 with value: 0.7550288393176923.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.754371
[200]	valid's auc: 0.743461
Early stopping, best iteration is:
[125]	valid's auc: 0.755029
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:56,150] (INFO): Trial 11 finished with value: 0.7550288393176923 and parameters: {'feature_fraction': 0.9936916651465633, 'num_leaves': 16}. Best is trial 10 with value: 0.7550288393176923.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.74692
Early stopping, best iteration is:
[36]	valid's auc: 0.753652
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:56,535] (INFO): Trial 12 finished with value: 0.7536523565918822 and parameters: {'feature_fraction': 0.9687670811213088, 'num_leaves': 18}. Best is trial 10 with value: 0.7550288393176923.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.752233
[200]	valid's auc: 0.741232
Early stopping, best iteration is:
[102]	valid's auc: 0.75373
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:57,064] (INFO): Trial 13 finished with value: 0.7537298672696464 and parameters: {'feature_fraction': 0.9971650985967954, 'num_leaves': 16}. Best is trial 10 with value: 0.7550288393176923.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.740473
Early stopping, best iteration is:
[32]	valid's auc: 0.744541
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:57,771] (INFO): Trial 14 finished with value: 0.7445408427816177 and parameters: {'feature_fraction': 0.9153368210114127, 'num_leaves': 50}. Best is trial 10 with value: 0.7550288393176923.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.735582
Early stopping, best iteration is:
[30]	valid's auc: 0.742531
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:58,465] (INFO): Trial 15 finished with value: 0.7425309107237359 and parameters: {'feature_fraction': 0.997800619645706, 'num_leaves': 48}. Best is trial 10 with value: 0.7550288393176923.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.755654
Early stopping, best iteration is:
[88]	valid's auc: 0.76154
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:11:58,933] (INFO): Trial 16 finished with value: 0.761539736249873 and parameters: {'feature_fraction': 0.9216897149325091, 'num_leaves': 16}. Best is trial 16 with value: 0.761539736249873.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.738821
[200]	valid's auc: 0.73648
Early stopping, best iteration is:
[116]	valid's auc: 0.743231
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:12:00,289] (INFO): Trial 17 finished with value: 0.7432311796056043 and parameters: {'feature_fraction': 0.9000391526693481, 'num_leaves': 73}. Best is trial 16 with value: 0.761539736249873.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725099
Early stopping, best iteration is:
[49]	valid's auc: 0.732246
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:12:01,301] (INFO): Trial 18 finished with value: 0.732246045619043 and parameters: {'feature_fraction': 0.7972778723084907, 'num_leaves': 160}. Best is trial 16 with value: 0.761539736249873.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.746925
Early stopping, best iteration is:
[49]	valid's auc: 0.751896
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2020-12-03 18:12:01,887] (INFO): Trial 19 finished with value: 0.7518963388232275 and parameters: {'feature_fraction': 0.853875799774876, 'num_leaves': 33}. Best is trial 16 with value: 0.761539736249873.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.755654
Early stopping, best iteration is:
[88]	valid's auc: 0.76154
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.726377
Early stopping, best iteration is:
[91]	valid's auc: 0.728547
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.72082
Early stopping, best iteration is:
[27]	valid's auc: 0.728147
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.719461
[200]	valid's auc: 0.721563
[300]	valid's auc: 0.721441
Early stopping, best iteration is:
[246]	valid's auc: 0.726695
Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.751751
Early stopping, best iteration is:
[88]	valid's auc: 0.756146
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_1_LightGBM ...
Training until validation scores

[2020-12-03 18:12:10,847] (INFO): oof_pred:
array([[0.07475057],
       [0.0326896 ],
       [0.0394009 ],
       ...,
       [0.02765582],
       [0.10423973],
       [0.17042336]], dtype=float32)
Shape = (8000, 1)


CPU times: user 3min 45s, sys: 2.44 s, total: 3min 48s
Wall time: 34.5 s


###  Analyze fitted model  

Below we analyze feature importances of different algos:

In [15]:
# (message template, resolver) pairs, in reporting order. Each resolver is
# evaluated lazily inside the loop so objects are looked up only when their
# turn comes, immediately before the corresponding log line is produced.
importance_reports = [
    ('Feature importances of selector:\n{}',
     lambda: selector),
    ('Feature importances of top level algorithm:\n{}',
     lambda: automl.levels[-1][0].ml_algos[0]),
    ('Feature importances of lowest level algorithm - model 0:\n{}',
     lambda: automl.levels[0][0].ml_algos[0]),
    ('Feature importances of lowest level algorithm - model 1:\n{}',
     lambda: automl.levels[0][0].ml_algos[1]),
]

for report_template, resolve_scored in importance_reports:
    logging.info(report_template.format(resolve_scored().get_features_score()))
    # Separator line after every report
    logging.info('=' * 70)

[2020-12-03 18:14:52,366] (INFO): Feature importances of selector:
EXT_SOURCE_3              1029.681686
EXT_SOURCE_2               894.265428
BIRTH_DATE                 537.081401
EXT_SOURCE_1               424.764621
DAYS_LAST_PHONE_CHANGE     262.583100
                             ...     
FLAG_DOCUMENT_16             0.000000
FLAG_DOCUMENT_14             0.000000
FLAG_DOCUMENT_13             0.000000
FLAG_DOCUMENT_11             0.000000
FLAG_PHONE                   0.000000
Length: 110, dtype: float64
[2020-12-03 18:14:52,370] (INFO): Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_0_LightGBM_prediction_0    2861.708537
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0    2043.129412
dtype: float64
[2020-12-03 18:14:52,374] (INFO): Feature importances of lowest level algorithm - model 0:
EXT_SOURCE_2                  1516.181118
EXT_SOURCE_3                  1409.738812
dtdiff__BIRTH_DATE             765.700954
EXT_SOURCE_1                   596.701360
DAYS_REGISTRATION      

### Predict on test data and check scores

In [16]:
%%time

# Score the held-out part with the fitted AutoML object
test_pred = automl.predict(test_data)
prediction_report = 'Prediction for test data:\n{}\nShape = {}'.format(
    test_pred, test_pred.shape
)
logging.info(prediction_report)

logging.info('Check scores...')

# ROC AUC of the OOF predictions against the training target
oof_auc = roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_auc))

# ROC AUC of the test predictions against the held-out target
test_auc = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
logging.info('TEST score: {}'.format(test_auc))


[2020-12-03 18:14:52,997] (INFO): Prediction for test data:
array([[0.04925682],
       [0.05368711],
       [0.0534688 ],
       ...,
       [0.04521164],
       [0.04988748],
       [0.16970594]], dtype=float32)
Shape = (2000, 1)
[2020-12-03 18:14:52,998] (INFO): Check scores...
[2020-12-03 18:14:53,002] (INFO): OOF score: 0.706138322789459
[2020-12-03 18:14:53,005] (INFO): TEST score: 0.7235682744565217


CPU times: user 362 ms, sys: 35.9 ms, total: 398 ms
Wall time: 147 ms
