# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
import torch
from matplotlib import pyplot as plt

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures, LGBSeqSimpleFeatures, LGBMultiSeqSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader, DictToNumpySeqReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender
from lightautoml.dataset.seq_np_pd_dataset import SeqNumpyPandasDataset
from lightautoml.dataset.roles import NumericRole, CategoryRole, DatetimeRole
from lightautoml.dataset.np_pd_dataset import NumpyDataset, PandasDataset
from lightautoml.ml_algo.random_forest import RandomForestSklearn
from lightautoml.validation.np_iterators import TimeSeriesIterator

from lightautoml.pipelines.features.generator_pipeline import FeatureGeneratorPipeline

In [2]:
# Folder holding the IDAO-2021 training CSV files.
# The path can be overridden with the IDAO_DATA_FOLDER environment variable so the
# notebook is runnable on other machines; the original location stays the default.
DATA_FOLDER = os.environ.get(
    'IDAO_DATA_FOLDER',
    '/home/simakovde/hdd/kaggle/idao21_final/idao_2021_train',
)

# Main ("plain") table: one row per client, includes the `sale_flg` target column.
train = pd.read_csv(f'{DATA_FOLDER}/funnel.csv')

# Auxiliary tables; each one carries a `client_id` column that is used further down
# to link it to the plain table.
clients = pd.read_csv(f'{DATA_FOLDER}/client.csv')
deals = pd.read_csv(f'{DATA_FOLDER}/deals.csv')
aum = pd.read_csv(f'{DATA_FOLDER}/aum.csv')
trxn = pd.read_csv(f'{DATA_FOLDER}/trxn.csv')
com = pd.read_csv(f'{DATA_FOLDER}/com.csv')
appl = pd.read_csv(f'{DATA_FOLDER}/appl.csv')
payments = pd.read_csv(f'{DATA_FOLDER}/payments.csv')
balance = pd.read_csv(f'{DATA_FOLDER}/balance.csv')

In [3]:
# (rows, columns) of every loaded table, listed in the order they are named here.
tuple(
    table.shape
    for table in (train, clients, deals, aum, trxn, com, appl, payments, balance)
)

((21498, 16),
 (21498, 8),
 (109016, 8),
 (117392, 4),
 (3035705, 11),
 (113055, 10),
 (12030, 6),
 (188068, 4),
 (1194684, 9))

In [4]:
# Preview the first five rows of the main funnel table (target column: sale_flg).
train.head(n=5)

Unnamed: 0,client_id,sale_flg,sale_amount,contacts,feature_1,client_segment,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,region_cd
0,7513301859607023584,0,,1,7,13.0,571533.0,15717.0,0.0,0.0,0.0,571852.0,472605.0,10.4,12548.0,86.0
1,9157009756404187626,0,,1,3,13.0,3642369.0,94787.0,0.0,0.0,84823.0,3642369.0,3314257.0,8.9,77210.0,2.0
2,-1893104556496814867,0,,1,5,16.0,352826.0,5500.0,0.0,6822.0,0.0,265893.0,204534.0,8.9,5508.0,52.0
3,6886062013213911831,0,,1,4,3.0,6070615.0,40580.0,0.0,30401.0,0.0,2005731.0,1825051.0,7.9,40583.0,86.0
4,-8156468515495593794,1,138018.05,1,7,14.0,3642369.0,97156.0,81488.0,0.0,160308.0,3642369.0,3314257.0,10.4,78108.0,27.0


In [5]:
# Preview the first five rows of the per-client attributes table.
clients.head(n=5)

Unnamed: 0,client_id,gender,age,region,city,citizenship,education,job_type
0,7513301859607023584,F,33.0,0,115,RUSSIA,,
1,9157009756404187626,F,59.0,17,668,RUSSIA,,
2,-1893104556496814867,M,51.0,28,65,RUSSIA,,
3,6886062013213911831,F,56.0,0,40,RUSSIA,,
4,-8156468515495593794,F,34.0,-1,-1,RUSSIA,HIGHER_PROFESSIONAL,


In [6]:
# Preview the first five rows of the balance table.
# NOTE(review): the preview shows several rows that look identical for one client -
# worth confirming whether duplicates are expected in this table.
balance.head(n=5)

Unnamed: 0,client_id,crncy_cd,eop_bal_sum_rur,min_bal_sum_rur,max_bal_sum_rur,avg_bal_sum_rur,month_end_dt,prod_cat_name,prod_group_name
0,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
1,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
2,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
3,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
4,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand


In [7]:
# Possible scenarios for what the auxiliary tables contain:
# the auxiliary tables contain data only for the current, active clients <- implemented
# the auxiliary tables contain data for all clients
# the auxiliary tables contain data only for new clients

In [8]:
# Baseline experiment: only the plain funnel table is supplied ('seq' is None),
# although seq_params still declares a 'clients' entry linked through client_id.
seq_params = {
    'clients': {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    },
}

X_train = {'plain': train, 'seq': None}
task = Task('binary', metric='logloss')

# sale_flg is the target; sale_amount and contacts are excluded from the features.
roles = {
    'target': 'sale_flg',
    'drop': ['sale_amount', 'contacts'],
}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# Single-level AutoML: one LightGBM model on top of the multi-sequence feature pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)

automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=3)

# Score the out-of-fold predictions (first prediction column) against the target.
target_values = train[roles['target']]
oof_scores = oof_pred.data[:, 0]
metric_l = log_loss(target_values, oof_scores)
metric_a = roc_auc_score(target_values, oof_scores)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Check that the fitted pipeline can also run inference on the same input.
_pred = automl.predict(X_train)


[11:50:39] Feats was rejected during automatic roles guess: []
[11:50:39] Layer 1 train process start. Time left 9999999996.62 secs
[11:50:39] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[11:50:39] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:50:40] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:50:41] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:50:42] Fitting Lvl_0_Pipe_0_Mod_0_LightGBM finished. score = -0.34980425030874734
[11:50:42] Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
[11:50:42] Time left 9999999994.03 secs

[11:50:42] Layer 1 training completed.

log-loss: 0.34980425030874734, roc_auc: 0.8192736336435955


In [9]:
%%time
# Experiment: plain funnel table plus the transactions table (trxn),
# linked to the plain table through client_id.
seq_params = {
    'trxn': {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    },
}

X_train = {
    'plain': train,
    'seq': {'trxn': trxn},
}

task = Task('binary', metric='logloss')

# sale_flg is the target; sale_amount and contacts are excluded from the features.
roles = {
    'target': 'sale_flg',
    'drop': ['sale_amount', 'contacts'],
}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# Single-level AutoML: one LightGBM model on top of the multi-sequence feature pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)

automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=3)

# Score the out-of-fold predictions (first prediction column) against the target.
target_values = train[roles['target']]
oof_scores = oof_pred.data[:, 0]
metric_l = log_loss(target_values, oof_scores)
metric_a = roc_auc_score(target_values, oof_scores)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Check that the fitted pipeline can also run inference on the same input.
_pred = automl.predict(X_train)


[11:50:44] Feats was rejected during automatic roles guess: []
[11:50:44] Layer 1 train process start. Time left 9999999997.62 secs
[11:50:53] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[11:50:53] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:50:54] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:50:55] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:50:56] Fitting Lvl_0_Pipe_0_Mod_0_LightGBM finished. score = -0.3468762868274541
[11:50:56] Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
[11:50:56] Time left 9999999985.98 secs

[11:50:56] Layer 1 training completed.

log-loss: 0.3468762868274541, roc_auc: 0.8231983444548147
CPU times: user 29.2 s, sys: 1.98 s, total: 31.2 s
Wall time: 23.5 s


In [10]:
%%time
# Experiment: plain funnel table plus the clients table, linked through client_id.
# The scheme additionally sets 'type': 'lookup'.
# NOTE(review): presumably a one-row-per-client join rather than a sequence - confirm
# against the reader implementation.
seq_params = {
    'clients': {
        'case': 'ids',
        'params': {},
        'scheme': {
            'to': 'plain',
            'from_id': 'client_id',
            'to_id': 'client_id',
            'type': 'lookup',
        },
    },
}

X_train = {
    'plain': train,
    'seq': {'clients': clients},
}
task = Task('binary', metric='logloss')

# sale_flg is the target; sale_amount and contacts are excluded from the features.
roles = {
    'target': 'sale_flg',
    'drop': ['sale_amount', 'contacts'],
}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# Single-level AutoML: one LightGBM model on top of the multi-sequence feature pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)

automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=3)

# Score the out-of-fold predictions (first prediction column) against the target.
target_values = train[roles['target']]
oof_scores = oof_pred.data[:, 0]
metric_l = log_loss(target_values, oof_scores)
metric_a = roc_auc_score(target_values, oof_scores)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Check that the fitted pipeline can also run inference on the same input.
_pred = automl.predict(X_train)


[11:51:06] Feats was rejected during automatic roles guess: []
[11:51:06] Layer 1 train process start. Time left 9999999999.22 secs
[11:51:07] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[11:51:07] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:51:08] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:51:09] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:51:10] Fitting Lvl_0_Pipe_0_Mod_0_LightGBM finished. score = -0.24634341369370094
[11:51:10] Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
[11:51:10] Time left 9999999995.45 secs

[11:51:10] Layer 1 training completed.

log-loss: 0.24634341369370094, roc_auc: 0.9212829858762476
CPU times: user 15.6 s, sys: 190 ms, total: 15.8 s
Wall time: 5.65 s


In [11]:
%%time
# Experiment: plain funnel table plus all eight auxiliary tables.
X_train = {
    'plain': train,
    'seq': {
        'balance': balance,
        'payments': payments,
        'appl': appl,
        'com': com,
        'trxn': trxn,
        'aum': aum,
        'deals': deals,
        'clients': clients,
    },
}

# Every auxiliary table gets the same settings: linked to the plain table through
# client_id. Fresh dict objects are created per table, in the same order as above.
seq_params = {
    table_name: {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    }
    for table_name in X_train['seq']
}
# The clients table is the only one whose scheme also carries 'type': 'lookup'.
seq_params['clients']['scheme']['type'] = 'lookup'

task = Task('binary', metric='logloss')

# sale_flg is the target; sale_amount and contacts are excluded from the features.
roles = {
    'target': 'sale_flg',
    'drop': ['sale_amount', 'contacts'],
}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# Single-level AutoML: one LightGBM model on top of the multi-sequence feature pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)

automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

# verbose=4 here (the smaller experiments use 3), which also logs the LightGBM
# training parameters and per-iteration validation scores.
oof_pred = automl.fit_predict(X_train, roles=roles, verbose=4)

# Score the out-of-fold predictions (first prediction column) against the target.
target_values = train[roles['target']]
oof_scores = oof_pred.data[:, 0]
metric_l = log_loss(target_values, oof_scores)
metric_a = roc_auc_score(target_values, oof_scores)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Check that the fitted pipeline can also run inference on the same input.
_pred = automl.predict(X_train)

[11:51:16] Feats was rejected during automatic roles guess: []
[11:51:16] Layer 1 train process start. Time left 9999999994.92 secs
[11:51:33] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[11:51:33] Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 4, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42}
[11:51:33] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:51:33] Training until validation scores don't improve for 100 rounds
[11:51:34] [100]	valid's binary_logloss: 0.22936
[11:51:34] [200]	valid's binary_logloss: 0.23542
[11:51:34] Early stopping, best iteration is:
[103]	valid's binary_logloss: 0.228787
[11:51:34] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM

In [12]:
%%time
# Experiment: all auxiliary tables, with features produced by FeatureGeneratorPipeline
# followed by LGBSimpleFeatures, instead of LGBMultiSeqSimpleFeatures.
#
# seq_params and X_train are defined FIRST: previously the generator was built from
# `seq_params` before this cell assigned it, so it silently depended on the value left
# over from the previous cell. The generator and the reader now share this one dict.
X_train = {
    'plain': train,
    'seq': {
        'balance': balance,
        'payments': payments,
        'appl': appl,
        'com': com,
        'trxn': trxn,
        'aum': aum,
        'deals': deals,
        'clients': clients,
    },
}

# Every auxiliary table gets the same settings: linked to the plain table through
# client_id. Fresh dict objects are created per table, in the same order as above.
seq_params = {
    table_name: {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    }
    for table_name in X_train['seq']
}
# The clients table is the only one whose scheme also carries 'type': 'lookup'.
seq_params['clients']['scheme']['type'] = 'lookup'

# Example of manually specified interesting values (kept for reference, not used):
#interesting_values = {
#    'bureau': {'CREDIT_ACTIVE': ['Active', 'Closed']},
#    'app_prev': {'NAME_CONTRACT_TYPE': ['Consumer']}
#}

generator = FeatureGeneratorPipeline(seq_params,
                                     max_gener_features=500,
                                     #interesting_values = interesting_values,
                                     generate_interesting_values = True,
                                     per_top_categories = 25,
                                     sample_size = None,
                                     n_jobs = 16)

task = Task('binary', metric='logloss')

# sale_flg is the target; sale_amount and contacts are excluded from the features.
roles={'target': 'sale_flg', 'drop': ['sale_amount', 'contacts']}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# Chain the generated features with the simple LightGBM feature pipeline.
# (`simpleransf` keeps its original spelling in case other cells refer to it.)
simpleransf = LGBSimpleFeatures()
feats = generator.append(simpleransf)

model = BoostLGBM()
pipeline_lvl1 = MLPipeline([model], pre_selection=None, features_pipeline=feats, post_selection=None)

automl = AutoML(reader, [
    [pipeline_lvl1],
], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=4)

# Score the out-of-fold predictions (first prediction column) against the target.
metric_l = log_loss(train[roles['target']], oof_pred.data[:, 0])
metric_a = roc_auc_score(train[roles['target']], oof_pred.data[:, 0])

print('=============')
print(f'log-loss: {metric_l}, roc_auc: {metric_a}')



[11:52:44] Feats was rejected during automatic roles guess: []
[11:52:44] Layer 1 train process start. Time left 9999999995.06 secs
[11:52:44] This selector only for holdout training. fit_on_holout argument added just to be compatible
[11:52:44] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[11:52:48] Interesting values have been generated
[11:52:48] Interesting values have been added to the entityset
[11:52:48] Relationships have been added to the entityset
[11:52:49] 188 are going to be generated
EntitySet scattered to 16 workers in 17 seconds
[11:54:11] Training until validation scores don't improve for 100 rounds
[11:54:13] [100]	valid's binary_logloss: 0.247128
[11:54:15] [200]	valid's binary_logloss: 0.254412
[11:54:15] Early stopping, best iteration is:
[125]	valid's binary_logloss: 0.246491
[11:54:15] LightGBM fitting and predicting completed
[11:54:15] Started iteration 0, chunk = ['ft__plain_client_id.NUM_UNIQUE(clients.edu

[11:54:17] Features in SCI = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.STD(com.count_comm)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_type_name)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.MAX(clients.city)', 'ft__plain_client_id.MEAN(balance.eop_bal_sum_rur)', 'ft__plain_client_id.MEDIAN(trxn.card_id)', 'ft__plain_client_id.ENTROPY(deals.prod_type_name)', 'ft__plain_client_id.MIN(trxn.tran_amt_rur)', 'ft__plain_client_id.STD(trxn.mcc_cd)', 'ft__plain_client_id.MEDIAN(deals.agrmnt_rate_active)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_active)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.ENTROPY(com.channel)', 'ft__plain_client_id.STD(deals.agrmnt_rate_active)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.MIN(aum.balance_rur_amt)', 'ft__plain_client_id.SUM(com.count_comm)', 'ft__plain_client_id.STD(com.otkaz)', 'ft__plain_client_id.

[11:54:19] Features in SCI = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.MEDIAN(trxn.card_id)', 'ft__plain_client_id.STD(trxn.mcc_cd)', 'ft__plain_client_id.SUM(deals.agrmnt_rate_active)', 'ft__plain_client_id.SUM(balance.max_bal_sum_rur)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.STD(deals.agrmnt_rate_active)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.SUM(com.count_comm)', 'ft__plain_client_id.MEAN(trxn.mcc_cd)', 'ft__plain_client_id.STD(trxn.tran_amt_rur)', 'ft__plain_client_id.MEAN(com.count_comm)', 'ft__plain_client_id.MEDIAN(trxn.tran_amt_rur)', 'ft__plain_client_id.SUM(trxn.card_id)', 'ft__plain_client_id.STD(aum.balance_rur_amt)', 'ft__plain_client_id.SUM(balance.eop_bal_sum_rur)', 'ft__plain_client_id.NUM_UNIQUE(trxn.txn_city)', 'ft__plain_client_id.MEAN(balanc

[11:54:22] Features in SCI = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.STD(balance.max_bal_sum_rur)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.MEDIAN(trxn.card_id)', 'ft__plain_client_id.SUM(balance.avg_bal_sum_rur)', 'ft__plain_client_id.STD(trxn.mcc_cd)', 'ft__plain_client_id.SUM(deals.agrmnt_rate_active)', 'ft__plain_client_id.SUM(balance.max_bal_sum_rur)', 'ft__plain_client_id.STD(balance.avg_bal_sum_rur)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.STD(deals.agrmnt_rate_active)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_passive)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.SUM(com.count_comm)', 'ft__plain_client_id.MEAN(trxn.mcc_cd)', 'ft__plain_client_id.STD(trxn.tran_amt_rur)', 'ft__plain_client_id.MEAN(com.count_comm)', 'ft__plain_client_id.NUM_UNIQUE(appl.appl_stts_name_dc)', 'ft

[11:54:26] Features in SCI = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.STD(balance.max_bal_sum_rur)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.MEDIAN(trxn.card_id)', 'ft__plain_client_id.SUM(balance.avg_bal_sum_rur)', 'ft__plain_client_id.STD(trxn.mcc_cd)', 'ft__plain_client_id.SUM(deals.agrmnt_rate_active)', 'ft__plain_client_id.SUM(balance.max_bal_sum_rur)', 'ft__plain_client_id.STD(balance.avg_bal_sum_rur)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.STD(deals.agrmnt_rate_active)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_passive)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.SUM(com.count_comm)', 'ft__plain_client_id.MEAN(trxn.mcc_cd)', 'ft__plain_client_id.STD(trxn.tran_amt_rur)', 'ft__plain_client_id.MIN(clients.region)', 'ft__plain_client_id.ENTROPY(appl.appl_stts_name_dc)', 'ft__pl

[11:54:30] Features in SCI = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.STD(balance.max_bal_sum_rur)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.MEDIAN(trxn.card_id)', 'ft__plain_client_id.SUM(balance.avg_bal_sum_rur)', 'ft__plain_client_id.STD(trxn.mcc_cd)', 'ft__plain_client_id.SUM(deals.agrmnt_rate_active)', 'ft__plain_client_id.SUM(balance.max_bal_sum_rur)', 'ft__plain_client_id.STD(balance.avg_bal_sum_rur)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.STD(balance.crncy_cd)', 'ft__plain_client_id.STD(deals.agrmnt_rate_active)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_passive)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.SUM(com.count_comm)', 'ft__plain_client_id.MEAN(trxn.mcc_cd)', 'ft__plain_client_id.MEDIAN(com.ring_up_flg)', 'ft__plain_client_id.STD(trxn.tran_amt_rur)', 'ft__plain_cl

[11:54:34] LightGBM fitting and predicting completed
[11:54:34] Current score = -0.2389414505791569, current best score = -0.24122479564358676
[11:54:34] Update best score from -0.24122479564358676 to -0.2389414505791569
[11:54:34] Started iteration 6, chunk = ['ft__plain_client_id.MIN(balance.crncy_cd)', 'ft__plain_client_id.NUM_UNIQUE(trxn.txn_country)', 'ft__plain_client_id.MEDIAN(com.dumaet)', 'ft__plain_client_id.MIN(com.agr_flg)', 'ft__plain_client_id.COUNT(clients)', 'ft__plain_client_id.MIN(com.dumaet)', 'ft__plain_client_id.MIN(com.otkaz)', 'ft__plain_client_id.MIN(com.ring_up_flg)', 'ft__plain_client_id.ENTROPY(clients.job_type)', 'ft__plain_client_id.MIN(deals.crncy_cd)', 'ft__plain_client_id.NUM_UNIQUE(clients.gender)', 'ft__plain_client_id.STD(clients.age)', 'ft__plain_client_id.NUM_UNIQUE(clients.job_type)', 'ft__plain_client_id.MAX(deals.crncy_cd)', 'ft__plain_client_id.MEDIAN(com.agr_flg)', 'ft__plain_client_id.ENTROPY(clients.gender)', 'ft__plain_client_id.STD(clients.

[11:54:34] Features in SCI = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.STD(balance.max_bal_sum_rur)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.MEDIAN(trxn.card_id)', 'ft__plain_client_id.SUM(balance.avg_bal_sum_rur)', 'ft__plain_client_id.STD(trxn.mcc_cd)', 'ft__plain_client_id.SUM(deals.agrmnt_rate_active)', 'ft__plain_client_id.SUM(balance.max_bal_sum_rur)', 'ft__plain_client_id.STD(balance.avg_bal_sum_rur)', 'ft__plain_client_id.MIN(com.agr_flg)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.STD(balance.crncy_cd)', 'ft__plain_client_id.STD(deals.agrmnt_rate_active)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_passive)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.SUM(com.count_comm)', 'ft__plain_client_id.MEAN(trxn.mcc_cd)', 'ft__plain_client_id.MEDIAN(com.ring_up_flg)', 'ft__plain_client_i

[11:54:34] Training until validation scores don't improve for 100 rounds
[11:54:36] [100]	valid's binary_logloss: 0.242272
[11:54:38] [200]	valid's binary_logloss: 0.246128
[11:54:39] Early stopping, best iteration is:
[115]	valid's binary_logloss: 0.239854
[11:54:39] LightGBM fitting and predicting completed
[11:54:39] Current score = -0.23985414014928866, current best score = -0.2389414505791569
[11:54:39] Without update for 1 steps. Remove last added group ['ft__plain_client_id.MIN(balance.crncy_cd)', 'ft__plain_client_id.NUM_UNIQUE(trxn.txn_country)', 'ft__plain_client_id.MEDIAN(com.dumaet)', 'ft__plain_client_id.MIN(com.agr_flg)', 'ft__plain_client_id.COUNT(clients)', 'ft__plain_client_id.MIN(com.dumaet)', 'ft__plain_client_id.MIN(com.otkaz)', 'ft__plain_client_id.MIN(com.ring_up_flg)', 'ft__plain_client_id.ENTROPY(clients.job_type)', 'ft__plain_client_id.MIN(deals.crncy_cd)', 'ft__plain_client_id.NUM_UNIQUE(clients.gender)', 'ft__plain_client_id.STD(clients.age)', 'ft__plain_clie

[11:54:39] Update mapped importance
[11:54:39] Finally selected feats = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.city)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.MIN(balance.avg_bal_sum_rur)', 'ft__plain_client_id.NUM_UNIQUE(trxn.txn_comment_1)', 'ft__plain_client_id.ENTROPY(trxn.txn_comment_1)', 'ft__plain_client_id.MIN(balance.max_bal_sum_rur)', 'ft__plain_client_id.ENTROPY(balance.prod_group_name)', 'ft__plain_client_id.MAX(clients.age)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.MEDIAN(aum.balance_rur_amt)', 'ft__plain_client_id.SUM(trxn.tran_amt_rur)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_type_name)', 'ft__plain_client_id.ENTROPY(trxn.txn_city)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_active)', 'ft__plain_client_id.MEDIAN(deals.agrmnt_rate_active)', 'ft__plain_client_id.MEAN(com.count_comm)', 'ft__plain_client_id.NUM_UNIQUE(trxn.txn_city)', 'ft__plain_client_id.MIN(payments.sum_rur

[11:55:58] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[11:55:58] Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 4, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42}
[11:55:58] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:55:58] Training until validation scores don't improve for 100 rounds
[11:55:59] [100]	valid's binary_logloss: 0.226793
[11:56:01] [200]	valid's binary_logloss: 0.231778
[11:56:01] Early stopping, best iteration is:
[110]	valid's binary_logloss: 0.226447
[11:56:01] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[11:56:02] Training until validation scores don't improve for 100 rounds
[11:56:03] [100]	valid's binary_logloss: 0.230118


In [13]:
%%time
# Time inference with the AutoML fitted in the previous cell, on the same training
# input; the result is stored but not inspected here.
_pred = automl.predict(X_train)

EntitySet scattered to 16 workers in 17 seconds
CPU times: user 26.6 s, sys: 13.6 s, total: 40.1 s
Wall time: 1min 23s
