# Step 0.0. Install LightAutoML

Uncomment the line below if you did not clone the repository with git (e.g. when running on Colab or Kaggle).

In [1]:
#! pip install -U lightautoml

# Step 0.1. Import necessary libraries 

In [2]:
# Standard python libraries
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.random_forest import RandomForestSklearn
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.utils.profiler import Profiler
from lightautoml.automl.blend import WeightedBlender

from lightautoml.transformers.base import SequentialTransformer
from lightautoml.transformers.numeric import FillnaMedian, FillInf
from lightautoml.pipelines.features.base import FeaturesPipeline, TabularDataFeatures




# Step 0.2. Parameters 

In [3]:
# Thread count handed to the models (LightGBM, random forest) and to torch
N_THREADS = 8

# Number of cross-validation folds used by the AutoML reader
N_FOLDS = 5

# Fixed random state shared by numpy, the train/test split and the reader
RANDOM_STATE = 42

# Fraction of rows held out as the test part for the metric check
TEST_SIZE = 0.2

# Name of the target column
TARGET_NAME = 'TARGET'

# Step 0.3. Fix torch number of threads and numpy seed 

In [4]:
# Cap the number of threads torch may use
torch.set_num_threads(N_THREADS)
# Seed numpy's global random generator
np.random.seed(RANDOM_STATE)

# Step 0.5. Example data load 

Download the dataset from the repository if you did not clone the repository with git.

In [6]:
# Local location of the example dataset: directory, file name and the joined path
DATASET_NAME = 'sampled_app_train.csv'
DATASET_DIR = './example_data/test_data_files'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)

# Remote copy of the same file, used when no local copy exists
DATASET_URL = (
    'https://raw.githubusercontent.com/sberbank-ai-lab/LightAutoML/master/'
    'example_data/test_data_files/sampled_app_train.csv'
)

In [7]:
%%time

# Download the example dataset only when there is no local copy yet.
if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    # Use a timeout and check the HTTP status so that a hung connection or an
    # error page (404/500/...) raises here instead of being saved as the CSV,
    # which would then be reused on every later run.
    response = requests.get(DATASET_URL, timeout=60)
    response.raise_for_status()

    # Write the raw bytes: the file is stored exactly as served, independent of
    # the platform's default text encoding and newline translation.
    with open(DATASET_FULLNAME, 'wb') as output:
        output.write(response.content)

CPU times: user 27 µs, sys: 21 µs, total: 48 µs
Wall time: 81.8 µs


In [8]:
%%time

# Load the example CSV into a DataFrame.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, i.e. the file's
# unnamed first column is kept as a regular column.
data = pd.read_csv(DATASET_FULLNAME)

# Preview the first five rows
data.head(5)

CPU times: user 71.5 ms, sys: 15.3 ms, total: 86.9 ms
Wall time: 86.3 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# Step 0.6. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [9]:
%%time

# Reference date against which the relative day offsets are turned into dates.
report_date = np.datetime64('2018-01-01')

# Convert the day counts with pd.to_timedelta(..., unit='D'): the unit is stated
# explicitly, and it avoids casting to the 'timedelta64[D]' dtype, which recent
# pandas releases reject (only s/ms/us/ns resolutions are supported there).
birth_offset = pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
# Positive DAYS_EMPLOYED values are clipped to 0, so EMP_DATE is never later
# than the reference date.
employed_offset = pd.to_timedelta(np.clip(data['DAYS_EMPLOYED'], None, 0), unit='D')

# Store the resulting dates as strings.
data['BIRTH_DATE'] = (report_date + birth_offset).astype(str)
data['EMP_DATE'] = (report_date + employed_offset).astype(str)

# Extra columns: a constant, an all-NaN column and a constant report date.
data['constant'] = 1
data['allnan'] = np.nan
data['report_dt'] = report_date

# The original day-offset columns are replaced by the date columns above.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

[2021-06-08 16:43:43,428] (INFO): Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
[2021-06-08 16:43:43,429] (INFO): Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2021-06-08 16:43:43,430] (INFO): NumExpr defaulting to 8 threads.


CPU times: user 116 ms, sys: 3.87 ms, total: 120 ms
Wall time: 118 ms


# Step 0.7. (Optional) Data splitting for train-test 

Block below can be omitted if you are going to train model only or you have specific train and test files:

In [10]:
%%time

# Hold out TEST_SIZE of the rows for the metric check; the split is stratified
# on the target column and made reproducible through RANDOM_STATE.
split_parts = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)
train_data, test_data = split_parts

# Report the shapes of both parts
split_message = 'Data splitted. Parts sizes: train_data = {}, test_data = {}'
logging.info(split_message.format(train_data.shape, test_data.shape))

[2021-06-08 16:43:43,513] (INFO): Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


CPU times: user 18.1 ms, sys: 0 ns, total: 18.1 ms
Wall time: 17 ms


In [11]:
# Preview the first five rows of the train part (includes the columns added in step 0.6)
train_data.head(5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


# ========= AutoML creation =========

![AutoML pipeline for this task](imgs/tutorial_1_pipeline.png)


## Step 1. Create Task and PandasReader

In [12]:
%%time

# Binary classification task
task = Task('binary')

# Reader configured with N_FOLDS cross-validation folds and a fixed random state
reader_settings = {'cv': N_FOLDS, 'random_state': RANDOM_STATE}
reader = PandasToPandasReader(task, **reader_settings)

CPU times: user 4.38 ms, sys: 265 µs, total: 4.64 ms
Wall time: 4.08 ms


## Step 2. Create feature selector (if necessary) 

In [13]:
%%time

# LightGBM model whose feature importances drive the selection
selector_lgbm_params = {
    'learning_rate': 0.05,
    'num_leaves': 64,
    'seed': 42,
    'num_threads': N_THREADS,
}
model0 = BoostLGBM(default_params=selector_lgbm_params)

# Features pipeline and importance estimator used by the selector
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()

# Importance-based selector built from the three parts above, with cutoff=0
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


CPU times: user 0 ns, sys: 1.63 ms, total: 1.63 ms
Wall time: 1.36 ms


## Step 3.1. Create 1st level ML pipeline for AutoML 

Our first level ML pipeline:
- Simple features for gradient boosting built on selected features (using step 2) 
- 2 different models:
    * Random forest (scikit-learn) with params tuning (using OptunaTuner)
    * LightGBM with heuristic params


In [14]:
%%time 

class Filler(FeaturesPipeline, TabularDataFeatures):
    """Features pipeline that applies FillInf followed by FillnaMedian."""

    def create_pipeline(self, train):
        """Return the sequential FillInf -> FillnaMedian transformer.

        The ``train`` argument is accepted but not used here.
        """
        fill_steps = [FillInf(), FillnaMedian()]
        return SequentialTransformer(fill_steps)

# Features pipeline: simple LightGBM features in sequential mode, then the Filler step
sequential_features = LGBSimpleFeatures().set_sequential(True)
pipe = sequential_features.append(Filler())

# Optuna tuner: stop after 20 trials or after 50 seconds
params_tuner1 = OptunaTuner(n_trials=20, timeout=50)

# Model 1: random forest (tuned with params_tuner1)
rf_params = {'random_state': 1, 'n_jobs': N_THREADS, 'verbose': 0}
model1 = RandomForestSklearn(default_params=rf_params)

# Model 2: LightGBM with the given default params (no tuner attached)
lgbm_params = {
    'learning_rate': 0.025,
    'num_leaves': 64,
    'seed': 2,
    'num_threads': N_THREADS,
}
model2 = BoostLGBM(default_params=lgbm_params)

# First level: both models share the selector (step 2) and the features pipeline
first_level_algos = [(model1, params_tuner1), model2]
pipeline_lvl1 = MLPipeline(
    first_level_algos,
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

CPU times: user 330 µs, sys: 248 µs, total: 578 µs
Wall time: 587 µs


## Step 3.2. Create 2nd level ML pipeline for AutoML 

Our second level ML pipeline:
- Using simple features as well, but now it will be Out-Of-Fold (OOF) predictions of algos from 1st level
- Only one LGBM model without params tuning
- Without feature selection on this stage because we want to use all OOFs here

In [15]:
%%time

# Features pipeline for the second level
pipe1 = LGBSimpleFeatures()

# Single LightGBM model with frozen default params and no tuner
second_level_params = {
    'learning_rate': 0.05,
    'num_leaves': 64,
    'max_bin': 1024,
    'seed': 3,
    'num_threads': N_THREADS,
}
model = BoostLGBM(default_params=second_level_params, freeze_defaults=True)

# Second level: no feature selection before or after the features pipeline
second_level_algos = [model]
pipeline_lvl2 = MLPipeline(
    second_level_algos,
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

CPU times: user 148 µs, sys: 111 µs, total: 259 µs
Wall time: 267 µs


## Step 4. Create AutoML pipeline 

The AutoML pipeline consists of:
- Reader for data preparation
- First level ML pipeline (as built in step 3.1)
- Second level ML pipeline (as built in step 3.2)
- `skip_conn=False` means here that the initial features are not used on the second level pipeline

In [16]:
%%time 

# Two levels, each holding a single ML pipeline; skip_conn is switched off
automl_levels = [[pipeline_lvl1], [pipeline_lvl2]]
automl = AutoML(reader, automl_levels, skip_conn=False)

CPU times: user 252 µs, sys: 21 µs, total: 273 µs
Wall time: 279 µs


## Step 5. Train AutoML on loaded data 

In the cell below we train AutoML with the target column `TARGET` to obtain a fitted model and OOF predictions:

In [17]:
%%time 

# Fit AutoML on the train part, telling the reader which column is the target
column_roles = {'target': TARGET_NAME}
oof_pred = automl.fit_predict(train_data, roles=column_roles)

# Log the out-of-fold predictions together with their shape
oof_message = 'oof_pred:\n{}\nShape = {}'
logging.info(oof_message.format(oof_pred, oof_pred.shape))

Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999994.654877 secs
Start fitting LightGBM ...

===== Start working with fold 0 for LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.716183
Early stopping, best iteration is:
[16]	valid's auc: 0.720694
LightGBM fitting and predicting completed
Optuna may run 6299999995.020297 secs


[2021-06-08 16:43:50,712] (INFO): A new study created in memory with name: no-name-f3949fda-0f51-4a80-a6d7-9b8a43ff6fc6


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:51,403] (INFO): Score for RF model: 0.742734


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:51,410] (INFO): Trial 0 finished with value: 0.7427340421551176 and parameters: {'min_samples_leaf': 103, 'max_depth': 4}. Best is trial 0 with value: 0.7427340421551176.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:52,104] (INFO): Score for RF model: 0.751693


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:52,110] (INFO): Trial 1 finished with value: 0.7516932073918458 and parameters: {'min_samples_leaf': 93, 'max_depth': 8}. Best is trial 1 with value: 0.7516932073918458.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:52,810] (INFO): Score for RF model: 0.744589


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:52,816] (INFO): Trial 2 finished with value: 0.7445889528574713 and parameters: {'min_samples_leaf': 189, 'max_depth': 5}. Best is trial 1 with value: 0.7516932073918458.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:53,610] (INFO): Score for RF model: 0.753580


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:53,616] (INFO): Trial 3 finished with value: 0.753580191478102 and parameters: {'min_samples_leaf': 103, 'max_depth': 10}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:54,309] (INFO): Score for RF model: 0.745252


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:54,314] (INFO): Trial 4 finished with value: 0.7452518027914535 and parameters: {'min_samples_leaf': 211, 'max_depth': 7}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:55,004] (INFO): Score for RF model: 0.749614


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:55,011] (INFO): Trial 5 finished with value: 0.7496137830021756 and parameters: {'min_samples_leaf': 75, 'max_depth': 8}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:55,705] (INFO): Score for RF model: 0.742659


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:55,711] (INFO): Trial 6 finished with value: 0.7426592042593454 and parameters: {'min_samples_leaf': 117, 'max_depth': 4}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:56,402] (INFO): Score for RF model: 0.752303


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:56,407] (INFO): Trial 7 finished with value: 0.752302601685991 and parameters: {'min_samples_leaf': 104, 'max_depth': 8}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:57,100] (INFO): Score for RF model: 0.746759


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:57,105] (INFO): Trial 8 finished with value: 0.7467592518348648 and parameters: {'min_samples_leaf': 131, 'max_depth': 6}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:57,697] (INFO): Score for RF model: 0.732193


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:57,702] (INFO): Trial 9 finished with value: 0.7321925899792057 and parameters: {'min_samples_leaf': 53, 'max_depth': 2}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:58,500] (INFO): Score for RF model: 0.751789


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:58,505] (INFO): Trial 10 finished with value: 0.751789427543553 and parameters: {'min_samples_leaf': 10, 'max_depth': 10}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:43:59,200] (INFO): Score for RF model: 0.750453


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:43:59,205] (INFO): Trial 11 finished with value: 0.7504530365476209 and parameters: {'min_samples_leaf': 171, 'max_depth': 10}. Best is trial 3 with value: 0.753580191478102.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:00,001] (INFO): Score for RF model: 0.754435


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:00,006] (INFO): Trial 12 finished with value: 0.7544354817154983 and parameters: {'min_samples_leaf': 35, 'max_depth': 9}. Best is trial 12 with value: 0.7544354817154983.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:00,905] (INFO): Score for RF model: 0.732904


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:00,911] (INFO): Trial 13 finished with value: 0.7329035499890416 and parameters: {'min_samples_leaf': 2, 'max_depth': 10}. Best is trial 12 with value: 0.7544354817154983.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:01,706] (INFO): Score for RF model: 0.756798


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:01,711] (INFO): Trial 14 finished with value: 0.7567982209963062 and parameters: {'min_samples_leaf': 51, 'max_depth': 9}. Best is trial 14 with value: 0.7567982209963062.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:02,518] (INFO): Score for RF model: 0.754435


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:02,526] (INFO): Trial 15 finished with value: 0.7544354817154983 and parameters: {'min_samples_leaf': 35, 'max_depth': 9}. Best is trial 14 with value: 0.7567982209963062.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:03,222] (INFO): Score for RF model: 0.750517


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:03,227] (INFO): Trial 16 finished with value: 0.7505171833154257 and parameters: {'min_samples_leaf': 31, 'max_depth': 7}. Best is trial 14 with value: 0.7567982209963062.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:03,921] (INFO): Score for RF model: 0.754115


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:03,927] (INFO): Trial 17 finished with value: 0.7541147478764747 and parameters: {'min_samples_leaf': 50, 'max_depth': 9}. Best is trial 14 with value: 0.7567982209963062.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:04,519] (INFO): Score for RF model: 0.722913


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:04,524] (INFO): Trial 18 finished with value: 0.7229100181214619 and parameters: {'min_samples_leaf': 4, 'max_depth': 2}. Best is trial 14 with value: 0.7567982209963062.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:05,217] (INFO): Score for RF model: 0.747786


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed


[2021-06-08 16:44:05,222] (INFO): Trial 19 finished with value: 0.7477856001197406 and parameters: {'min_samples_leaf': 140, 'max_depth': 9}. Best is trial 14 with value: 0.7567982209963062.


Start fitting Lvl_0_Pipe_0_Mod_0_RFSklearn ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:06,023] (INFO): Score for RF model: 0.756798



===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:06,814] (INFO): Score for RF model: 0.735304



===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:07,622] (INFO): Score for RF model: 0.710179



===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:08,412] (INFO): Score for RF model: 0.710906



===== Start working with fold 4 for Lvl_0_Pipe_0_Mod_0_RFSklearn =====



[2021-06-08 16:44:09,203] (INFO): Score for RF model: 0.738632


Lvl_0_Pipe_0_Mod_0_RFSklearn fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.746508
Early stopping, best iteration is:
[83]	valid's auc: 0.750528

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.709489
[200]	valid's auc: 0.71417
Early stopping, best iteration is:
[134]	valid's auc: 0.715964

===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.717752
[200]	valid's auc: 0.717317
Early stopping, best iteration is:
[127]	valid's auc: 0.724731

===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.71988
[2

[2021-06-08 16:44:17,921] (INFO): oof_pred:
array([[0.05241532],
       [0.05364249],
       [0.06840997],
       ...,
       [0.04890947],
       [0.13222274],
       [0.07969784]], dtype=float32)
Shape = (8000, 1)


CPU times: user 2min 17s, sys: 4.71 s, total: 2min 22s
Wall time: 34.3 s


## Step 6. Analyze fitted model  

Below we analyze feature importances of different algos:

In [18]:
# Each entry pairs a log message template with the callable that produces the
# scores; the callables are invoked one by one inside the loop, so every report
# is computed right before it is logged.
first_level_algos_fitted = automl.levels[0][0].ml_algos
importance_reports = [
    ('Feature importances of selector:\n{}',
     selector.get_features_score),
    ('Feature importances of top level algorithm:\n{}',
     automl.levels[-1][0].ml_algos[0].get_features_score),
    ('Feature importances of lowest level algorithm - model 0:\n{}',
     first_level_algos_fitted[0].get_features_score),
    ('Feature importances of lowest level algorithm - model 1:\n{}',
     first_level_algos_fitted[1].get_features_score),
]

separator = '=' * 70
for message_template, get_scores in importance_reports:
    logging.info(message_template.format(get_scores()))
    logging.info(separator)

[2021-06-08 16:44:17,934] (INFO): Feature importances of selector:
EXT_SOURCE_3              1029.681686
EXT_SOURCE_2               894.265428
BIRTH_DATE                 537.081401
EXT_SOURCE_1               424.764621
DAYS_LAST_PHONE_CHANGE     262.583100
                             ...     
FLAG_DOCUMENT_16             0.000000
FLAG_DOCUMENT_14             0.000000
FLAG_DOCUMENT_13             0.000000
FLAG_DOCUMENT_11             0.000000
FLAG_PHONE                   0.000000
Length: 110, dtype: float64
[2021-06-08 16:44:17,938] (INFO): Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_0_RFSklearn_prediction_0    1787.472717
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0     1623.708955
dtype: float64
[2021-06-08 16:44:18,453] (INFO): Feature importances of lowest level algorithm - model 0:
fillnamed__fillinf__EXT_SOURCE_2                   0.159814
fillnamed__fillinf__EXT_SOURCE_3                   0.135888
fillnamed__fillinf__dtdiff__BIRTH_DATE             0.066009
fillnamed

## Step 7. Predict to test data and check scores

In [19]:
%%time

# Predict on the held-out test part and log the predictions with their shape
test_pred = automl.predict(test_data)
prediction_message = 'Prediction for test data:\n{}\nShape = {}'
logging.info(prediction_message.format(test_pred, test_pred.shape))

logging.info('Check scores...')

# ROC AUC of the out-of-fold predictions against the train target
oof_auc = roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_auc))

# ROC AUC of the test predictions against the test target
test_auc = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
logging.info('TEST score: {}'.format(test_auc))

[2021-06-08 16:44:19,174] (INFO): Prediction for test data:
array([[0.08095095],
       [0.0592024 ],
       [0.05832473],
       ...,
       [0.06434166],
       [0.06747682],
       [0.13112217]], dtype=float32)
Shape = (2000, 1)
[2021-06-08 16:44:19,175] (INFO): Check scores...
[2021-06-08 16:44:19,179] (INFO): OOF score: 0.6937083716809757
[2021-06-08 16:44:19,181] (INFO): TEST score: 0.7122435461956522


CPU times: user 1.04 s, sys: 122 ms, total: 1.16 s
Wall time: 716 ms
