# Step 0.0. Install LightAutoML

Uncomment the line below if you did not clone the repository with git (e.g. when running on Colab or Kaggle).

In [1]:
#! pip install -U lightautoml

# Step 0.1. Import necessary libraries 

In [13]:
# Standard python libraries
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

# Step 0.2. Parameters 

In [14]:
N_THREADS = 8 # thread count passed to torch and to every LightGBM model in this notebook
N_FOLDS = 5 # number of cross-validation folds handed to the reader
RANDOM_STATE = 42 # seed shared by numpy, the reader and the train/test split
TEST_SIZE = 0.2 # fraction of rows held out for the final metric check
TARGET_NAME = 'TARGET' # name of the target column

# Step 0.3. Fix torch number of threads and numpy seed 

In [15]:
# Seed numpy's global random generator so numpy-based randomness is repeatable between runs
np.random.seed(RANDOM_STATE)
# Limit the number of CPU threads torch is allowed to use
torch.set_num_threads(N_THREADS)

# Step 0.4. Example data load 

Download the dataset from the repository if you did not clone it with git.

In [16]:
# Local location of the example dataset
DATASET_DIR = './example_data/test_data_files'
DATASET_NAME = 'sampled_app_train.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)
# Remote copy of the same file, used only when the local file is missing
DATASET_URL = 'https://raw.githubusercontent.com/sberbank-ai-lab/LightAutoML/master/example_data/test_data_files/sampled_app_train.csv'

In [17]:
%%time

# Download the example dataset only when it is not already present locally.
if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    # The timeout keeps the cell from hanging forever on a stalled connection, and
    # raise_for_status() stops an HTTP error page from being saved as the CSV file.
    response = requests.get(DATASET_URL, timeout=60)
    response.raise_for_status()

    # Write the raw bytes: text mode would re-encode the payload with the platform's
    # default encoding and translate line endings on Windows.
    with open(DATASET_FULLNAME, 'wb') as output:
        output.write(response.content)

Wall time: 0 ns


In [18]:
%%time

# Read the example CSV into a DataFrame and preview its first rows.
# NOTE(review): the preview contains an 'Unnamed: 0' column, which suggests the file
# stores an index column that is read here as a regular feature - confirm this is intended.
data = pd.read_csv(DATASET_FULLNAME)
data.head()

Wall time: 96 ms


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


# Step 0.5. (Optional) Some user feature preparation 

The cell below shows some user feature preparation that makes the task more difficult (this block can be omitted if you don't want to change the initial data):

In [19]:
%%time

# Reference date that the relative day offsets in the raw data are added to.
report_date = pd.Timestamp('2018-01-01')

# Convert the day offsets to timedeltas with pd.to_timedelta(unit='D') rather than
# astype('timedelta64[D]'): pandas 2.x only supports s/ms/us/ns timedelta resolutions.
birth_offset = pd.to_timedelta(data['DAYS_BIRTH'], unit='D')
# Positive DAYS_EMPLOYED values are clipped to 0, so they map to the reference date itself.
employed_offset = pd.to_timedelta(data['DAYS_EMPLOYED'].clip(upper=0), unit='D')

# Store the resulting dates as strings.
data['BIRTH_DATE'] = (report_date + birth_offset).astype(str)
data['EMP_DATE'] = (report_date + employed_offset).astype(str)

# Columns that carry no information: a constant and an all-NaN column.
data['constant'] = 1
data['allnan'] = np.nan

# The same reference date in every row.
data['report_dt'] = np.datetime64('2018-01-01')

# The raw offsets are replaced by the date columns created above.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

Wall time: 132 ms


# Step 0.6. (Optional) Data splitting for train-test 

The block below can be omitted if you only want to train a model, or if you already have separate train and test files:

In [20]:
%%time

# Hold out TEST_SIZE of the rows for the final check; stratifying on the target
# keeps the class balance the same in both parts.
train_data, test_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=data[TARGET_NAME],
)

# Report the shapes of both parts.
split_report = 'Data splitted. Parts sizes: train_data = {}, test_data = {}'
logging.info(split_report.format(train_data.shape, test_data.shape))

[2021-08-28 19:24:14,106] (INFO): Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


Wall time: 11 ms


In [21]:
# Preview the training part, including the columns added during feature preparation
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


# ========= AutoML creation =========

![AutoML pipeline for this task](imgs/tutorial_1_pipeline.png)


## Step 1. Create Task and PandasReader

In [22]:
%%time

# Task definition: binary classification
task = Task('binary')
# Reader configured with the task, the number of CV folds and the shared seed
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

Wall time: 2 ms


## Step 2. Create feature selector (if necessary) 

In [23]:
# helper class with group_by features

from lightautoml.pipelines.features.base import FeaturesPipeline
from lightautoml.pipelines.features.base import TabularDataFeatures

from lightautoml.transformers.base import UnionTransformer

class LGBSimpleFeaturesWithGroupBy(LGBSimpleFeatures, FeaturesPipeline, TabularDataFeatures):
    """Simple LightGBM feature pipeline extended with group-by features.

    The transformer built by the parent ``create_pipeline`` is joined with the one
    returned by ``get_group_by`` through a ``UnionTransformer``.

    Args:
        feats_imp: Forwarded unchanged to the parent constructor.
        top_category: Stored as ``top_group_by_categorical``.
        top_numeric: Stored as ``top_group_by_numerical``.
        **kwargs: Accepted by the signature but not forwarded to the parent
            constructor or stored anywhere.
    """

    def __init__(self, feats_imp=None, top_category: int = 1, top_numeric: int = 1, **kwargs):
        super().__init__(feats_imp=feats_imp)

        # Assigned after the parent constructor has run.
        # NOTE(review): presumably read by ``get_group_by`` - confirm in TabularDataFeatures.
        self.top_group_by_numerical = top_numeric
        self.top_group_by_categorical = top_category

    def create_pipeline(self, train):
        """Build the parent's transformer and the group-by transformer for ``train``.

        Args:
            train: Dataset passed on to both the parent ``create_pipeline`` and
                ``get_group_by``.

        Returns:
            A ``UnionTransformer`` over the two transformers, parent first.
        """
        base_features = super().create_pipeline(train)
        group_by_features = self.get_group_by(train)
        branches = [base_features, group_by_features]
        return UnionTransformer(branches)

In [24]:
%%time

# LightGBM model handed to the selector together with its own feature pipeline
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
pipe0 = LGBSimpleFeaturesWithGroupBy()
mbie = ModelBasedImportanceEstimator()
# NOTE(review): cutoff=0 presumably drops only features whose importance is not above 0 - confirm
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Wall time: 0 ns


## Step 3.1. Create 1st level ML pipeline for AutoML 

Our first level ML pipeline:
- Simple features for gradient boosting built on selected features (using step 2) 
- 2 different models:
    * LightGBM with params tuning (using OptunaTuner)
    * LightGBM with heuristic params


In [25]:
%%time 

# Feature pipeline shared by both first-level models
pipe = LGBSimpleFeaturesWithGroupBy()

params_tuner1 = OptunaTuner(n_trials=20, timeout=30) # stop after 20 trials or after 30 seconds
# model1 is paired with the Optuna tuner below; model2 is listed without a tuner
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)

# The selector from step 2 is plugged in as pre-selection; no post-selection is used
pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

Wall time: 0 ns


## Step 3.2. Create 2nd level ML pipeline for AutoML 

Our second level ML pipeline:
- Using simple features as well, but now it will be Out-Of-Fold (OOF) predictions of algos from 1st level
- Only one LGBM model without params tuning
- Without feature selection on this stage because we want to use all OOFs here

In [26]:
%%time

# Plain simple-features pipeline (no group-by features) for the second level
pipe1 = LGBSimpleFeatures()

# Single second-level model, listed without a tuner
# NOTE(review): freeze_defaults=True presumably keeps default_params exactly as given - confirm
model = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'max_bin': 1024, 'seed': 3, 'num_threads': N_THREADS},
    freeze_defaults=True
)

# No feature selection at this level
pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

Wall time: 0 ns


## Step 4. Create AutoML pipeline 

The AutoML pipeline consists of:
- Reader for data preparation
- First level ML pipeline (as built in step 3.1)
- Second level ML pipeline (as built in step 3.2)
- `skip_conn=False`, which here means that the initial features are not passed to the second level pipeline

In [27]:
%%time 

# Two levels, each holding one ML pipeline: the first-level pipeline, then the second-level one
automl = AutoML(reader, [
    [pipeline_lvl1],
    [pipeline_lvl2],
], skip_conn=False)

Wall time: 1 ms


## Step 5. Train AutoML on loaded data 

In the cell below we train AutoML with the target column `TARGET` to obtain a fitted model and OOF predictions:

In [28]:
%%time 

# Fit AutoML on the training part; the 'target' role tells it which column to predict.
# The returned object holds the out-of-fold predictions for the training rows.
oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

Train data shape: (8000, 125)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 9999999993.774479 secs
GroupByPipeline.create_pipeline.cat_feats_to_select:['AMT_ANNUITY']
GroupByPipeline.create_pipeline.num_feats_to_select:['FLAG_CONT_MOBILE']
GroupByTransformer.__fit.begin
GroupByTransformer.__fit.type(dataset.data.to_numpy())=<class 'numpy.ndarray'>
GroupByTransformer.__fit.cat_cols=['fillnamed__le__AMT_ANNUITY']
GroupByTransformer.__fit.num_cols:['FLAG_CONT_MOBILE']
GroupByTransformer.__fit.end
GroupByTransformer.transform.begin
GroupByTransformer.transform.end
GroupByTransformer.transform.begin
GroupByTransformer.transform.end
Start fitting LightGBM ...

===== Start working with fold 0 for LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.732941
[200]	valid's auc: 0.735085
Early stopping, best iteration is:
[160]	valid's auc: 0.739874
LightGBM fitting and predicting completed
GroupByPipe

[2021-08-28 19:24:23,682] (INFO): A new study created in memory with name: no-name-4f8dc331-cc72-4c12-a208-ee8f33ac197e


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.731744
Early stopping, best iteration is:
[85]	valid's auc: 0.733214
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:25,090] (INFO): Trial 0 finished with value: 0.7332135927000978 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: 0.7332135927000978.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.729782
[200]	valid's auc: 0.731337
[300]	valid's auc: 0.73347
Early stopping, best iteration is:
[251]	valid's auc: 0.734785
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:27,588] (INFO): Trial 1 finished with value: 0.7347851885113139 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 1 with value: 0.7347851885113139.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.721667
Early stopping, best iteration is:
[59]	valid's auc: 0.730926
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:28,467] (INFO): Trial 2 finished with value: 0.7309256913150621 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 1 with value: 0.7347851885113139.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725992
[200]	valid's auc: 0.726574
Early stopping, best iteration is:
[124]	valid's auc: 0.731236
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:29,828] (INFO): Trial 3 finished with value: 0.7312357340261185 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 1 with value: 0.7347851885113139.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.722244
[200]	valid's auc: 0.715782
Early stopping, best iteration is:
[104]	valid's auc: 0.723442
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:31,376] (INFO): Trial 4 finished with value: 0.7234419017378428 and parameters: {'feature_fraction': 0.8005575058716043, 'num_leaves': 185}. Best is trial 1 with value: 0.7347851885113139.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.739671
[200]	valid's auc: 0.741788
Early stopping, best iteration is:
[116]	valid's auc: 0.746332
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:32,698] (INFO): Trial 5 finished with value: 0.7463316067161666 and parameters: {'feature_fraction': 0.5102922471479012, 'num_leaves': 248}. Best is trial 5 with value: 0.7463316067161666.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.730076
Early stopping, best iteration is:
[54]	valid's auc: 0.738613
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:33,941] (INFO): Trial 6 finished with value: 0.7386126123236633 and parameters: {'feature_fraction': 0.9162213204002109, 'num_leaves': 66}. Best is trial 5 with value: 0.7463316067161666.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.728354
Early stopping, best iteration is:
[52]	valid's auc: 0.734684
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:34,850] (INFO): Trial 7 finished with value: 0.734683622795623 and parameters: {'feature_fraction': 0.5909124836035503, 'num_leaves': 60}. Best is trial 5 with value: 0.7463316067161666.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.736993
[200]	valid's auc: 0.731177
Early stopping, best iteration is:
[114]	valid's auc: 0.738121
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:36,339] (INFO): Trial 8 finished with value: 0.7381208204371601 and parameters: {'feature_fraction': 0.6521211214797689, 'num_leaves': 141}. Best is trial 5 with value: 0.7463316067161666.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.722266
[200]	valid's auc: 0.720887
Early stopping, best iteration is:
[125]	valid's auc: 0.726473
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:38,016] (INFO): Trial 9 finished with value: 0.7264728365166166 and parameters: {'feature_fraction': 0.7159725093210578, 'num_leaves': 85}. Best is trial 5 with value: 0.7463316067161666.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.734502
Early stopping, best iteration is:
[98]	valid's auc: 0.735015
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:39,818] (INFO): Trial 10 finished with value: 0.7350150477626141 and parameters: {'feature_fraction': 0.9725682721151934, 'num_leaves': 202}. Best is trial 5 with value: 0.7463316067161666.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.754799
[200]	valid's auc: 0.744386
Early stopping, best iteration is:
[125]	valid's auc: 0.758471
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:40,722] (INFO): Trial 11 finished with value: 0.7584713825232131 and parameters: {'feature_fraction': 0.9847685553939332, 'num_leaves': 18}. Best is trial 11 with value: 0.7584713825232131.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.722902
Early stopping, best iteration is:
[48]	valid's auc: 0.729472
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:41,921] (INFO): Trial 12 finished with value: 0.729471697911488 and parameters: {'feature_fraction': 0.7890603020725826, 'num_leaves': 106}. Best is trial 11 with value: 0.7584713825232131.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.747492
[200]	valid's auc: 0.735143
Early stopping, best iteration is:
[102]	valid's auc: 0.748603
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:42,938] (INFO): Trial 13 finished with value: 0.7486034714092509 and parameters: {'feature_fraction': 0.9888586258294667, 'num_leaves': 25}. Best is trial 11 with value: 0.7584713825232131.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.758546
[200]	valid's auc: 0.751731
Early stopping, best iteration is:
[111]	valid's auc: 0.760278
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:43,787] (INFO): Trial 14 finished with value: 0.7602781831497133 and parameters: {'feature_fraction': 0.9851076138758569, 'num_leaves': 17}. Best is trial 14 with value: 0.7602781831497133.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.745578
Early stopping, best iteration is:
[68]	valid's auc: 0.747545
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:44,448] (INFO): Trial 15 finished with value: 0.747545049740473 and parameters: {'feature_fraction': 0.9033741235277806, 'num_leaves': 18}. Best is trial 14 with value: 0.7602781831497133.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.723672
[200]	valid's auc: 0.727162
[300]	valid's auc: 0.726473
Early stopping, best iteration is:
[227]	valid's auc: 0.729926
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:46,743] (INFO): Trial 16 finished with value: 0.729926070850105 and parameters: {'feature_fraction': 0.8564110068519739, 'num_leaves': 108}. Best is trial 14 with value: 0.7602781831497133.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.752105
[200]	valid's auc: 0.745418
Early stopping, best iteration is:
[131]	valid's auc: 0.753463
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:48,200] (INFO): Trial 17 finished with value: 0.7534625890704599 and parameters: {'feature_fraction': 0.9446452628982245, 'num_leaves': 36}. Best is trial 14 with value: 0.7602781831497133.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.726724
[200]	valid's auc: 0.724372
Early stopping, best iteration is:
[101]	valid's auc: 0.727713
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:49,971] (INFO): Trial 18 finished with value: 0.7277130073608417 and parameters: {'feature_fraction': 0.997384741288122, 'num_leaves': 87}. Best is trial 14 with value: 0.7602781831497133.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.74221
Early stopping, best iteration is:
[68]	valid's auc: 0.744616
Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed


[2021-08-28 19:24:51,067] (INFO): Trial 19 finished with value: 0.7446156806773899 and parameters: {'feature_fraction': 0.8461581921081283, 'num_leaves': 39}. Best is trial 14 with value: 0.7602781831497133.


Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.758546
[200]	valid's auc: 0.751731
Early stopping, best iteration is:
[111]	valid's auc: 0.760278

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725697
[200]	valid's auc: 0.724891
Early stopping, best iteration is:
[134]	valid's auc: 0.729768

===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.725538
Early stopping, best iteration is:
[22]	valid's auc: 0.730649

===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's auc: 0.726817
[200]	valid's auc: 0.722062
Early stopping, best iteration is:

[2021-08-28 19:25:03,605] (INFO): oof_pred:
array([[0.09988016],
       [0.05787185],
       [0.06359739],
       ...,
       [0.0633311 ],
       [0.1612968 ],
       [0.08905532]], dtype=float32)
Shape = (8000, 1)


Wall time: 49.4 s


## Step 6. Analyze fitted model  

Below we analyze feature importances of different algos:

In [29]:
# Objects whose feature scores are reported, each paired with its log message template.
first_level_algos = automl.levels[0][0].ml_algos
importance_sources = (
    ('Feature importances of selector:\n{}', selector),
    ('Feature importances of top level algorithm:\n{}', automl.levels[-1][0].ml_algos[0]),
    ('Feature importances of lowest level algorithm - model 0:\n{}', first_level_algos[0]),
    ('Feature importances of lowest level algorithm - model 1:\n{}', first_level_algos[1]),
)

# Log every report followed by a separator line.
for report_template, scored_object in importance_sources:
    logging.info(report_template.format(scored_object.get_features_score()))
    logging.info('=' * 70)

[2021-08-28 19:25:03,618] (INFO): Feature importances of selector:
EXT_SOURCE_3                  2066.325377
EXT_SOURCE_2                  2054.822369
BIRTH_DATE                    1337.189284
DAYS_REGISTRATION             1220.646604
DAYS_ID_PUBLISH               1161.667440
                                 ...     
FLAG_DOCUMENT_13                 0.000000
FLAG_DOCUMENT_9                  0.000000
FLAG_DOCUMENT_14                 0.000000
AMT_REQ_CREDIT_BUREAU_HOUR       0.000000
FLAG_DOCUMENT_18                 0.000000
Length: 110, dtype: float64
[2021-08-28 19:25:03,621] (INFO): Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_0_LightGBM_prediction_0    2410.118824
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0    1601.807475
dtype: float64
[2021-08-28 19:25:03,625] (INFO): Feature importances of lowest level algorithm - model 0:
EXT_SOURCE_2                                                       1533.009782
EXT_SOURCE_3                                                       1

## Step 7. Predict on test data and check scores

In [30]:
%%time

# Score the held-out part with the fitted AutoML.
test_pred = automl.predict(test_data)
prediction_report = 'Prediction for test data:\n{}\nShape = {}'
logging.info(prediction_report.format(test_pred, test_pred.shape))

logging.info('Check scores...')

# ROC AUC of the out-of-fold predictions on the training part.
oof_auc = roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_auc))

# ROC AUC of the predictions on the held-out test part.
test_auc = roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
logging.info('TEST score: {}'.format(test_auc))

GroupByTransformer.transform.begin
GroupByTransformer.transform.end


[2021-08-28 19:25:03,807] (INFO): Prediction for test data:
array([[0.07188743],
       [0.05989537],
       [0.09868789],
       ...,
       [0.0667676 ],
       [0.07799368],
       [0.13785653]], dtype=float32)
Shape = (2000, 1)
[2021-08-28 19:25:03,808] (INFO): Check scores...
[2021-08-28 19:25:03,812] (INFO): OOF score: 0.7037766607797854
[2021-08-28 19:25:03,815] (INFO): TEST score: 0.714366508152174


Wall time: 169 ms


### before
[2021-08-28 13:32:36,516] (INFO): OOF score: 0.6979918272484156

[2021-08-28 13:32:36,519] (INFO): TEST score: 0.7158254076086956

### after
[2021-08-28 19:25:03,812] (INFO): OOF score: 0.7037766607797854

[2021-08-28 19:25:03,815] (INFO): TEST score: 0.714366508152174

### best configuration:
* 1 categorical feature for group-by (`top_category=1`)
* 1 numeric feature for group-by (`top_numeric=1`)
* all the other configurations show lower scores