# Step 0.1. Import necessary libraries 

In [1]:
# Standard python libraries
import logging
import os
import pickle
import time

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
import torch
from matplotlib import pyplot as plt

# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures, LGBSeqSimpleFeatures, LGBMultiSeqSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader, DictToNumpySeqReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender
from lightautoml.dataset.seq_np_pd_dataset import SeqNumpyPandasDataset
from lightautoml.dataset.roles import NumericRole, CategoryRole, DatetimeRole
from lightautoml.dataset.np_pd_dataset import NumpyDataset, PandasDataset
from lightautoml.ml_algo.random_forest import RandomForestSklearn
from lightautoml.validation.np_iterators import TimeSeriesIterator

from lightautoml.pipelines.features.generator_pipeline import FeatureGeneratorPipeline

In [2]:
# Folder holding the competition CSV tables. The IDAO_DATA_FOLDER environment
# variable overrides the original author's location, so the notebook can be
# re-run on another machine without editing this cell.
DATA_FOLDER = os.environ.get(
    'IDAO_DATA_FOLDER',
    '/home/simakovde/hdd/kaggle/idao21_final/idao_2021_train',
)

# Fail early with an explicit message instead of a less obvious error from
# the first read_csv call.
if not os.path.isdir(DATA_FOLDER):
    raise FileNotFoundError(
        f'Data folder not found: {DATA_FOLDER!r}. '
        'Set the IDAO_DATA_FOLDER environment variable to the folder with the CSV files.'
    )

# Main ("plain") table; it contains the `sale_flg` target used by the modelling cells.
train = pd.read_csv(os.path.join(DATA_FOLDER, 'funnel.csv'))

# Auxiliary tables; the modelling cells link them to `train` through `client_id`.
clients = pd.read_csv(os.path.join(DATA_FOLDER, 'client.csv'))
deals = pd.read_csv(os.path.join(DATA_FOLDER, 'deals.csv'))
aum = pd.read_csv(os.path.join(DATA_FOLDER, 'aum.csv'))
trxn = pd.read_csv(os.path.join(DATA_FOLDER, 'trxn.csv'))
com = pd.read_csv(os.path.join(DATA_FOLDER, 'com.csv'))
appl = pd.read_csv(os.path.join(DATA_FOLDER, 'appl.csv'))
payments = pd.read_csv(os.path.join(DATA_FOLDER, 'payments.csv'))
balance = pd.read_csv(os.path.join(DATA_FOLDER, 'balance.csv'))

In [3]:
# (rows, columns) of every loaded table, in loading order.
tuple(
    table.shape
    for table in (train, clients, deals, aum, trxn, com, appl, payments, balance)
)

((21498, 16),
 (21498, 8),
 (109016, 8),
 (117392, 4),
 (3035705, 11),
 (113055, 10),
 (12030, 6),
 (188068, 4),
 (1194684, 9))

In [4]:
# Preview the first rows of the main table.
train.head(n=5)

Unnamed: 0,client_id,sale_flg,sale_amount,contacts,feature_1,client_segment,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,region_cd
0,7513301859607023584,0,,1,7,13.0,571533.0,15717.0,0.0,0.0,0.0,571852.0,472605.0,10.4,12548.0,86.0
1,9157009756404187626,0,,1,3,13.0,3642369.0,94787.0,0.0,0.0,84823.0,3642369.0,3314257.0,8.9,77210.0,2.0
2,-1893104556496814867,0,,1,5,16.0,352826.0,5500.0,0.0,6822.0,0.0,265893.0,204534.0,8.9,5508.0,52.0
3,6886062013213911831,0,,1,4,3.0,6070615.0,40580.0,0.0,30401.0,0.0,2005731.0,1825051.0,7.9,40583.0,86.0
4,-8156468515495593794,1,138018.05,1,7,14.0,3642369.0,97156.0,81488.0,0.0,160308.0,3642369.0,3314257.0,10.4,78108.0,27.0


In [5]:
# Preview the first rows of the clients table.
clients.head(n=5)

Unnamed: 0,client_id,gender,age,region,city,citizenship,education,job_type
0,7513301859607023584,F,33.0,0,115,RUSSIA,,
1,9157009756404187626,F,59.0,17,668,RUSSIA,,
2,-1893104556496814867,M,51.0,28,65,RUSSIA,,
3,6886062013213911831,F,56.0,0,40,RUSSIA,,
4,-8156468515495593794,F,34.0,-1,-1,RUSSIA,HIGHER_PROFESSIONAL,


In [6]:
# Preview the first rows of the balance table.
balance.head(n=5)

Unnamed: 0,client_id,crncy_cd,eop_bal_sum_rur,min_bal_sum_rur,max_bal_sum_rur,avg_bal_sum_rur,month_end_dt,prod_cat_name,prod_group_name
0,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
1,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
2,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
3,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand
4,7513301859607023584,810.0,0.0,0.0,0.0,0.0,2018-09-30,CURRENT ACCOUNTS,Cash on demand


In [7]:
# auxiliary tables contain data only for the current, active clients <- implemented
# auxiliary tables contain data for all clients
# auxiliary tables contain data only for new clients

In [8]:
# Baseline run: only the plain `train` table is handed to the reader
# ('seq' is None), although `seq_params` still declares a 'clients' entry.
task = Task('binary', metric='logloss')
roles = {'target': 'sale_flg', 'drop': ['sale_amount', 'contacts']}

seq_params = {
    'clients': {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    },
}
X_train = {'plain': train, 'seq': None}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# One-level AutoML holding a single LightGBM pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)
automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=3)

# Score the first prediction column returned by fit_predict against the target.
metric_l, metric_a = (
    score(train[roles['target']], oof_pred.data[:, 0])
    for score in (log_loss, roc_auc_score)
)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Also exercise the inference path of the fitted object on the same input.
_pred = automl.predict(X_train)


[13:36:38] Feats was rejected during automatic roles guess: []
[13:36:38] Layer 1 train process start. Time left 9999999996.60 secs
[13:36:38] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[13:36:38] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:36:39] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:36:40] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:36:40] Fitting Lvl_0_Pipe_0_Mod_0_LightGBM finished. score = -0.34980425030874734
[13:36:40] Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
[13:36:40] Time left 9999999993.92 secs

[13:36:40] Layer 1 training completed.

log-loss: 0.34980425030874734, roc_auc: 0.8192736336435955


In [9]:
%%time
# Same experiment with the transactions table attached as a sequence source,
# linked to the plain table through `client_id`.
task = Task('binary', metric='logloss')
roles = {'target': 'sale_flg', 'drop': ['sale_amount', 'contacts']}

seq_params = {
    'trxn': {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    },
}
X_train = {'plain': train, 'seq': {'trxn': trxn}}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# One-level AutoML holding a single LightGBM pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)
automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=3)

# Score the first prediction column returned by fit_predict against the target.
metric_l, metric_a = (
    score(train[roles['target']], oof_pred.data[:, 0])
    for score in (log_loss, roc_auc_score)
)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Also exercise the inference path of the fitted object on the same input.
_pred = automl.predict(X_train)


[13:36:43] Feats was rejected during automatic roles guess: []
[13:36:43] Layer 1 train process start. Time left 9999999997.50 secs
[13:36:52] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[13:36:52] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:36:53] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:36:54] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:36:55] Fitting Lvl_0_Pipe_0_Mod_0_LightGBM finished. score = -0.3468762868274541
[13:36:55] Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
[13:36:55] Time left 9999999985.94 secs

[13:36:55] Layer 1 training completed.

log-loss: 0.3468762868274541, roc_auc: 0.8231983444548147
CPU times: user 29.4 s, sys: 2.2 s, total: 31.6 s
Wall time: 24 s


In [10]:
%%time
# Same experiment with the clients table attached; its scheme additionally
# carries 'type': 'lookup'.
task = Task('binary', metric='logloss')
roles = {'target': 'sale_flg', 'drop': ['sale_amount', 'contacts']}

seq_params = {
    'clients': {
        'case': 'ids',
        'params': {},
        'scheme': {
            'to': 'plain',
            'from_id': 'client_id',
            'to_id': 'client_id',
            'type': 'lookup',
        },
    },
}
X_train = {'plain': train, 'seq': {'clients': clients}}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# One-level AutoML holding a single LightGBM pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)
automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=3)

# Score the first prediction column returned by fit_predict against the target.
metric_l, metric_a = (
    score(train[roles['target']], oof_pred.data[:, 0])
    for score in (log_loss, roc_auc_score)
)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Also exercise the inference path of the fitted object on the same input.
_pred = automl.predict(X_train)


[13:37:05] Feats was rejected during automatic roles guess: []
[13:37:05] Layer 1 train process start. Time left 9999999999.21 secs
[13:37:06] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[13:37:06] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:37:07] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:37:08] ===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:37:09] Fitting Lvl_0_Pipe_0_Mod_0_LightGBM finished. score = -0.24634341369370094
[13:37:09] Lvl_0_Pipe_0_Mod_0_LightGBM fitting and predicting completed
[13:37:09] Time left 9999999995.37 secs

[13:37:09] Layer 1 training completed.

log-loss: 0.24634341369370094, roc_auc: 0.9212829858762476
CPU times: user 15.8 s, sys: 203 ms, total: 16 s
Wall time: 5.7 s


In [11]:
%%time
# Experiment with every auxiliary table attached at once.
X_train = {
    'plain': train,
    'seq': {
        'balance': balance,
        'payments': payments,
        'appl': appl,
        'com': com,
        'trxn': trxn,
        'aum': aum,
        'deals': deals,
        'clients': clients,
    },
}

# Every table uses the same linkage to the plain table via `client_id`;
# a fresh dict is built per table, in the same order as X_train['seq'].
seq_params = {
    table_name: {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    }
    for table_name in X_train['seq']
}
# Only the clients table is additionally marked with 'type': 'lookup'.
seq_params['clients']['scheme']['type'] = 'lookup'

task = Task('binary', metric='logloss')
roles = {'target': 'sale_flg', 'drop': ['sale_amount', 'contacts']}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# One-level AutoML holding a single LightGBM pipeline.
feats = LGBMultiSeqSimpleFeatures()
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)
automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=4)

# Score the first prediction column returned by fit_predict against the target.
metric_l, metric_a = (
    score(train[roles['target']], oof_pred.data[:, 0])
    for score in (log_loss, roc_auc_score)
)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')

# Also exercise the inference path of the fitted object on the same input.
_pred = automl.predict(X_train)

[13:37:16] Feats was rejected during automatic roles guess: []
[13:37:16] Layer 1 train process start. Time left 9999999994.74 secs
[13:37:32] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[13:37:32] Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 4, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42}
[13:37:32] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:37:32] Training until validation scores don't improve for 100 rounds
[13:37:33] [100]	valid's binary_logloss: 0.22936
[13:37:33] [200]	valid's binary_logloss: 0.23542
[13:37:33] Early stopping, best iteration is:
[103]	valid's binary_logloss: 0.228787
[13:37:33] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LightGBM

In [12]:
%%time
# Experiment with FeatureGeneratorPipeline placed in front of the simple
# LightGBM feature pipeline, using every auxiliary table.
#
# Disabled example of an explicit `interesting_values` mapping, left by the
# author for reference (values are generated automatically below instead):
# interesting_values = {
#     'bureau': {'CREDIT_ACTIVE': ['Active', 'Closed']},
#     'app_prev': {'NAME_CONTRACT_TYPE': ['Consumer']}
# }

X_train = {
    'plain': train,
    'seq': {
        'balance': balance,
        'payments': payments,
        'appl': appl,
        'com': com,
        'trxn': trxn,
        'aum': aum,
        'deals': deals,
        'clients': clients,
    },
}

# Every table, clients included, uses the same linkage to the plain table via
# `client_id`; a fresh dict is built per table, in the order of X_train['seq'].
seq_params = {
    table_name: {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'client_id', 'to_id': 'client_id'},
    }
    for table_name in X_train['seq']
}

generator = FeatureGeneratorPipeline(
    seq_params,
    max_gener_features=500,
    # interesting_values=interesting_values,
    generate_interesting_values=True,
    per_top_categories=25,
    sample_size=None,
    n_jobs=16,
)

task = Task('binary', metric='logloss')
roles = {'target': 'sale_flg', 'drop': ['sale_amount', 'contacts']}

reader = DictToNumpySeqReader(task=task, cv=3, seq_params=seq_params)

# Append the simple LightGBM feature pipeline to the generator.
simpleransf = LGBSimpleFeatures()
feats = generator.append(simpleransf)

# One-level AutoML holding a single LightGBM pipeline.
model = BoostLGBM()
pipeline_lvl1 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=feats,
    post_selection=None,
)
automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

oof_pred = automl.fit_predict(X_train, roles=roles, verbose=4)

# Score the first prediction column returned by fit_predict against the target.
metric_l, metric_a = (
    score(train[roles['target']], oof_pred.data[:, 0])
    for score in (log_loss, roc_auc_score)
)

print('=============', f'log-loss: {metric_l}, roc_auc: {metric_a}', sep='\n')



[13:38:01] Feats was rejected during automatic roles guess: []
[13:38:01] Layer 1 train process start. Time left 9999999994.94 secs
[13:38:01] This selector only for holdout training. fit_on_holout argument added just to be compatible
[13:38:01] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[13:38:06] Interesting values have been generated
[13:38:06] Interesting values have been added to the entityset
[13:38:06] Relationships have been added to the entityset
[13:38:06] 188 are going to be generated
EntitySet scattered to 16 workers in 17 seconds
[13:39:27] Training until validation scores don't improve for 100 rounds
[13:39:29] [100]	valid's binary_logloss: 0.230961
[13:39:31] Early stopping, best iteration is:
[86]	valid's binary_logloss: 0.230555
[13:39:31] LightGBM fitting and predicting completed
[13:39:31] Started iteration 0, chunk = ['ft__plain_client_id.NUM_UNIQUE(clients.education)', 'ft__plain_client_id.MAX(clients.city)', 

[13:39:32] Features in SCI = ['ft__plain_client_id.COUNT(balance)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_type_name)', 'ft__plain_client_id.ENTROPY(balance.prod_cat_name)', 'ft__plain_client_id.ENTROPY(balance.prod_group_name)', 'ft__plain_client_id.ENTROPY(com.channel)', 'ft__plain_client_id.ENTROPY(trxn.txn_city)', 'ft__plain_client_id.ENTROPY(trxn.txn_comment_1)', 'ft__plain_client_id.MAX(balance.max_bal_sum_rur)', 'ft__plain_client_id.MAX(clients.age)', 'ft__plain_client_id.MAX(clients.city)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.MAX(deals.agrmnt_rate_active)', 'ft__plain_client_id.MAX(payments.sum_rur)', 'ft__plain_client_id.MAX(trxn.card_id)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.MAX(trxn.tran_amt_rur)', 'ft__plain_client_id.MEAN(balance.eop_bal_sum_rur)', 'ft__plain_client_id.MEAN(balance.min_bal_sum_rur)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.MEAN(com.count_comm)', 'ft__plain_client_id.MEAN(

[13:39:35] Features in SCI = ['ft__plain_client_id.COUNT(balance)', 'ft__plain_client_id.COUNT(trxn)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_type_name)', 'ft__plain_client_id.ENTROPY(balance.prod_group_name)', 'ft__plain_client_id.ENTROPY(com.prod)', 'ft__plain_client_id.ENTROPY(deals.prod_type_name)', 'ft__plain_client_id.ENTROPY(trxn.txn_city)', 'ft__plain_client_id.ENTROPY(trxn.txn_comment_1)', 'ft__plain_client_id.MAX(balance.avg_bal_sum_rur)', 'ft__plain_client_id.MAX(balance.min_bal_sum_rur)', 'ft__plain_client_id.MAX(clients.age)', 'ft__plain_client_id.MAX(clients.city)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.MEAN(aum.balance_rur_amt)', 'ft__plain_client_id.MEAN(balance.avg_bal_sum_rur)', 'ft__plain_client_id.MEAN(clients.age)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.MEAN(clients.region)', 'ft__plain_client_id.MEAN(com.count_comm)', 'ft__plain_client_id.MEAN(com.not_ring_up_flg)'

[13:39:37] Features in SCI = ['ft__plain_client_id.COUNT(appl)', 'ft__plain_client_id.COUNT(aum)', 'ft__plain_client_id.COUNT(balance)', 'ft__plain_client_id.COUNT(com)', 'ft__plain_client_id.COUNT(deals)', 'ft__plain_client_id.COUNT(payments)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_type_name)', 'ft__plain_client_id.ENTROPY(balance.prod_group_name)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.ENTROPY(trxn.txn_city)', 'ft__plain_client_id.ENTROPY(trxn.txn_comment_1)', 'ft__plain_client_id.MAX(aum.balance_rur_amt)', 'ft__plain_client_id.MAX(balance.eop_bal_sum_rur)', 'ft__plain_client_id.MAX(clients.age)', 'ft__plain_client_id.MAX(clients.city)', 'ft__plain_client_id.MAX(clients.region)', 'ft__plain_client_id.MAX(deals.agrmnt_sum_rur)', 'ft__plain_client_id.MAX(trxn.mcc_cd)', 'ft__plain_client_id.MAX(trxn.merchant_cd)', 'ft__plain_client_id.MEAN(balance.max_bal_sum_rur)', 'ft__plain_client_id.MEAN(clients.city)', 'ft__plain_client_id.MEAN(com.agr_flg)', 'ft_

[13:39:39] Features in SCI = ['ft__plain_client_id.COUNT(appl)', 'ft__plain_client_id.COUNT(aum)', 'ft__plain_client_id.COUNT(balance)', 'ft__plain_client_id.COUNT(com)', 'ft__plain_client_id.COUNT(deals)', 'ft__plain_client_id.COUNT(payments)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_group_name)', 'ft__plain_client_id.ENTROPY(appl.appl_prod_type_name)', 'ft__plain_client_id.ENTROPY(appl.appl_sale_channel_name)', 'ft__plain_client_id.ENTROPY(appl.appl_stts_name_dc)', 'ft__plain_client_id.ENTROPY(balance.prod_group_name)', 'ft__plain_client_id.ENTROPY(payments.pmnts_name)', 'ft__plain_client_id.ENTROPY(trxn.tsp_name)', 'ft__plain_client_id.ENTROPY(trxn.txn_city)', 'ft__plain_client_id.ENTROPY(trxn.txn_comment_1)', 'ft__plain_client_id.ENTROPY(trxn.txn_country)', 'ft__plain_client_id.MAX(aum.balance_rur_amt)', 'ft__plain_client_id.MAX(balance.eop_bal_sum_rur)', 'ft__plain_client_id.MAX(clients.age)', 'ft__plain_client_id.MAX(clients.city)', 'ft__plain_client_id.MAX(clients.region)', 

[13:39:42] Started iteration 5, chunk = ['ft__plain_client_id.STD(deals.crncy_cd)', 'ft__plain_client_id.SUM(com.not_ring_up_flg)', 'ft__plain_client_id.MEDIAN(balance.max_bal_sum_rur)', 'ft__plain_client_id.SUM(com.otkaz)', 'ft__plain_client_id.SUM(clients.age)', 'ft__plain_client_id.MEDIAN(com.not_ring_up_flg)', 'ft__plain_client_id.MIN(com.count_comm)', 'ft__plain_client_id.MEDIAN(balance.min_bal_sum_rur)', 'ft__plain_client_id.MEDIAN(clients.age)', 'ft__plain_client_id.MIN(clients.region)', 'ft__plain_client_id.STD(balance.crncy_cd)', 'ft__plain_client_id.MEDIAN(trxn.merchant_cd)', 'ft__plain_client_id.MIN(deals.agrmnt_rate_passive)', 'ft__plain_client_id.MAX(com.agr_flg)', 'ft__plain_client_id.MEDIAN(balance.eop_bal_sum_rur)', 'ft__plain_client_id.NUM_UNIQUE(com.channel)', 'ft__plain_client_id.MEDIAN(com.ring_up_flg)', 'ft__plain_client_id.ENTROPY(aum.product_code)', 'ft__plain_client_id.NUM_UNIQUE(aum.product_code)', 'ft__plain_client_id.NUM_UNIQUE(com.prod)', 'ft__plain_client_i

[13:39:42] Training until validation scores don't improve for 100 rounds
[13:39:43] [100]	valid's binary_logloss: 0.224025
[13:39:45] [200]	valid's binary_logloss: 0.227809
[13:39:45] Early stopping, best iteration is:
[107]	valid's binary_logloss: 0.222708
[13:39:45] LightGBM fitting and predicting completed
[13:39:45] Current score = -0.22270783039936146, current best score = -0.224535651317646
[13:39:45] Update best score from -0.224535651317646 to -0.22270783039936146
[13:39:45] Started iteration 6, chunk = ['ft__plain_client_id.NUM_UNIQUE(clients.gender)', 'ft__plain_client_id.NUM_UNIQUE(clients.job_type)', 'ft__plain_client_id.MAX(deals.crncy_cd)', 'ft__plain_client_id.MIN(balance.crncy_cd)', 'ft__plain_client_id.MEDIAN(deals.crncy_cd)', 'ft__plain_client_id.MEDIAN(com.dumaet)', 'ft__plain_client_id.MIN(deals.crncy_cd)', 'ft__plain_client_id.STD(clients.age)', 'ft__plain_client_id.STD(clients.region)', 'ft__plain_client_id.MEDIAN(balance.crncy_cd)', 'ft__plain_client_id.SUM(com.d

[13:39:45] Training until validation scores don't improve for 100 rounds
[13:39:46] [100]	valid's binary_logloss: 0.230255
[13:39:47] Early stopping, best iteration is:
[89]	valid's binary_logloss: 0.228171
[13:39:47] LightGBM fitting and predicting completed
[13:39:47] Current score = -0.228170615324404, current best score = -0.22270783039936146
[13:39:47] Without update for 1 steps. Remove last added group ['ft__plain_client_id.NUM_UNIQUE(clients.gender)', 'ft__plain_client_id.NUM_UNIQUE(clients.job_type)', 'ft__plain_client_id.MAX(deals.crncy_cd)', 'ft__plain_client_id.MIN(balance.crncy_cd)', 'ft__plain_client_id.MEDIAN(deals.crncy_cd)', 'ft__plain_client_id.MEDIAN(com.dumaet)', 'ft__plain_client_id.MIN(deals.crncy_cd)', 'ft__plain_client_id.STD(clients.age)', 'ft__plain_client_id.STD(clients.region)', 'ft__plain_client_id.MEDIAN(balance.crncy_cd)', 'ft__plain_client_id.SUM(com.dumaet)', 'ft__plain_client_id.MEDIAN(com.agr_flg)', 'ft__plain_client_id.SUM(com.agr_flg)', 'ft__plain_cl

[13:39:47] Selection completed
[13:39:47] 84 features have been selected from generated
EntitySet scattered to 16 workers in 17 seconds
[13:40:57] Start fitting Lvl_0_Pipe_0_Mod_0_LightGBM ...
[13:40:57] Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 128, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 1, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 4, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42}
[13:40:57] ===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LightGBM =====
[13:40:57] Training until validation scores don't improve for 100 rounds
[13:40:58] [100]	valid's binary_logloss: 0.227175
[13:40:59] [200]	valid's binary_logloss: 0.232547
[13:41:00] Early stopping, best iteration is:
[111]	valid's binary_logloss: 0.226815
[13:41:00] ===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_Li

In [13]:
%%time
# Run prediction with the `automl` object fitted in the previous cell on the
# same `X_train` input; the result is kept in `_pred` and the cell's `%%time`
# magic reports how long the call takes.
_pred = automl.predict(X_train)

EntitySet scattered to 16 workers in 18 seconds
CPU times: user 25.4 s, sys: 12.9 s, total: 38.3 s
Wall time: 1min 16s


In [14]:
# Persist the fitted AutoML object to `automl_se.pkl` in the current working
# directory. `pickle` is imported in the imports cell at the top of the notebook.
# NOTE: unpickling executes arbitrary code, so only load this file back from a
# trusted location.
with open('automl_se.pkl', 'wb') as f:
    pickle.dump(automl, f)