# Step 0.0 Install LightAutoML

In [1]:
#! pip install -U lightautoml

# Step 0.1 Import necessary libraries

In [1]:
# Standard python libraries
import os
from os.path import join as pjoin
import time
import pickle

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split

# Imports from lightautoml package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM

from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.reader.base import DictToNumpySeqReader
from lightautoml.tasks import Task

# Import Feature Generator Transformer
from lightautoml.pipelines.features.generator_pipeline import FeatureGeneratorPipeline

# Step 0.2 Example data load

In [3]:
# Folder that holds the zipped CSV tables for this example
data_dir = 'data2'

# Short table name -> path of its archive inside `data_dir`
filepaths = {
    table_name: pjoin(data_dir, archive_name)
    for table_name, archive_name in (
        ('app_train', 'application_train.zip'),
        ('app_test', 'application_test.zip'),
        ('bureau', 'bureau.zip'),
        ('credit_bl', 'credit_card_balance.zip'),
        ('install_pays', 'installments_payments.zip'),
        ('pc_balance', 'POS_CASH_balance.zip'),
        ('app_prev', 'previous_application.zip'),
    )
}

In [4]:
# Load every archive into a DataFrame keyed by its short table name,
# printing each table's shape as it is read.
dataframes = {}
for df_name, df_path in filepaths.items():
    loaded = pd.read_csv(df_path, encoding='latin1')
    dataframes[df_name] = loaded
    print(df_name, loaded.shape)

app_train (5000, 122)
app_test (5000, 121)
bureau (49227, 17)
credit_bl (113392, 23)
install_pays (392693, 8)
pc_balance (287697, 8)
app_prev (49100, 37)


# Step 0.3 Create sequential star scheme dictionary

In [5]:
# Every second-level table uses the same settings: it is linked to the
# 'plain' table through the SK_ID_CURR column on both sides.
# A fresh settings dict is built per table, so entries share no state.
seq_params = {
    table_name: {
        'case': 'ids',
        'params': {},
        'scheme': {'to': 'plain', 'from_id': 'SK_ID_CURR', 'to_id': 'SK_ID_CURR'},
    }
    for table_name in ('bureau', 'app_prev', 'credit_bl', 'install_pays', 'pc_balance')
}

Create a dict with second-level tables.

In [6]:
# Pick the second-level tables out of the loaded frames, keeping their short names
seq_data = {
    table_name: dataframes[table_name]
    for table_name in ('bureau', 'app_prev', 'credit_bl', 'install_pays', 'pc_balance')
}

Define train and test data samples.

In [7]:
# Each sample pairs a main ('plain') table with the second-level ('seq') tables.
# Train and test both point at the same `seq_data` dict.
X_train = dict(plain=dataframes['app_train'], seq=seq_data)

X_test = dict(plain=dataframes['app_test'], seq=seq_data)

# Step 1. Create Task and Sequential Reader for the star scheme data

In [8]:
# Column roles: 'TARGET' is the column to predict
roles = dict(target='TARGET')

# Binary classification task evaluated with log-loss
task = Task('binary', metric='logloss')

# Reader that receives the task and the second-level table settings
reader = DictToNumpySeqReader(task=task, seq_params=seq_params)

# Step 2. Create Feature Generator Pipeline

Define interesting values for feature generation in corresponding slices (optional).

In [9]:
# {table_name: {column: [values]}} -- category values to slice generated features by
interesting_values = dict(
    bureau=dict(CREDIT_ACTIVE=['Active', 'Closed']),
    app_prev=dict(NAME_CONTRACT_TYPE=['Consumer']),
)

In [10]:
# Sanity check: (rows, columns) of the main training table
dataframes['app_train'].shape

(5000, 122)

Params of feature generator:
- seq_params: sequence-related params.
- max_gener_features: maximum generated features.
- max_depth: maximum allowed depth of features.
- agg_primitives: list of aggregation primitives.
- trans_primitives: list of transform primitives.
- interesting_values: categorical values in the form of {table_name: {column: [values]}} for feature generation in corresponding slices.
- generate_interesting_values: whether to generate features in slices of unique categories or not.
- per_top_categories: percent of most frequent categories for feature generation in corresponding slices. If the number of unique values is less than 10, then all values are used.
- sample_size: size of data to make generated feature selection on it.
- n_jobs: number of processes to run in parallel

In [11]:
# Feature generator over the second-level tables described by `seq_params`
generator = FeatureGeneratorPipeline(
    seq_params,
    max_gener_features=500,                 # maximum generated features
    interesting_values=interesting_values,  # explicit category slices defined above
    generate_interesting_values=True,       # also generate features in slices of unique categories
    per_top_categories=25,                  # percent of most frequent categories used for slices
    sample_size=None,                       # size of data for generated-feature selection
    n_jobs=16,                              # number of processes to run in parallel
)


# Step 3. Create one-level ML pipeline for AutoML

In [12]:
# The single model used on this level
model = BoostLGBM()

# Features pipeline: the generator with the simple LightGBM feature transform appended
simpleransf = LGBSimpleFeatures()
feats = generator.append(simpleransf)

# One ML pipeline holding the model, with no pre- or post-selection step
pipeline_lvl1 = MLPipeline(
    [model],
    features_pipeline=feats,
    pre_selection=None,
    post_selection=None,
)

In [13]:
# AutoML with a single level that contains only `pipeline_lvl1`
automl = AutoML(reader, [[pipeline_lvl1]], skip_conn=False)

# Step 4. Train AutoML on loaded data

In [14]:
%%time

# Fit AutoML on the training dict; `roles` names the target column.
# The returned predictions are scored against TARGET later in the notebook.
train_pred = automl.fit_predict(X_train, roles=roles, verbose=2)

[20:23:11] Layer [1m1[0m train process start. Time left 9999999992.53 secs
[20:23:11] This selector only for holdout training. fit_on_holout argument added just to be compatible
[20:23:11] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
EntitySet scattered to 16 workers in 22 seconds
[20:24:00] [1mLightGBM[0m fitting and predicting completed
[20:24:01] [1mLightGBM[0m fitting and predicting completed
[20:24:04] [1mLightGBM[0m fitting and predicting completed
[20:24:07] [1mLightGBM[0m fitting and predicting completed
[20:24:09] [1mLightGBM[0m fitting and predicting completed
[20:24:11] [1mLightGBM[0m fitting and predicting completed
[20:24:14] [1mLightGBM[0m fitting and predicting completed
[20:24:16] [1mLightGBM[0m fitting and predicting completed
EntitySet scattered to 16 workers in 20 seconds
[20:24:46] Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
[20:24:46] ===== Start working with [1mfold 0[0m for [1mLv

# Step 5. Analyze fitted model

In [15]:
# Per-feature scores reported by the BoostLGBM model used in the pipeline
feature_imps = model.get_features_score()
feature_imps

EXT_SOURCE_3                                       804.856992
EXT_SOURCE_2                                       721.087537
EXT_SOURCE_1                                       159.317814
DAYS_ID_PUBLISH                                    131.567411
ft__plain_SK_ID_CURR.MEDIAN(bureau.DAYS_CREDIT)    130.531718
                                                      ...    
ord__FLAG_OWN_CAR                                    0.000000
FLAG_DOCUMENT_6                                      0.000000
FLAG_PHONE                                           0.000000
FLAG_EMP_PHONE                                       0.000000
FLAG_DOCUMENT_5                                      0.000000
Length: 244, dtype: float64

In [16]:
# Names of the features whose score is strictly positive
feature_imps[feature_imps > 0].index

Index(['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'DAYS_ID_PUBLISH',
       'ft__plain_SK_ID_CURR.MEDIAN(bureau.DAYS_CREDIT)',
       'ft__plain_SK_ID_CURR.MEAN(bureau.DAYS_CREDIT)',
       'ft__plain_SK_ID_CURR.MIN(install_pays.AMT_PAYMENT)',
       'ft__plain_SK_ID_CURR.STD(install_pays.DAYS_ENTRY_PAYMENT)',
       'DAYS_EMPLOYED',
       'ft__plain_SK_ID_CURR.MEDIAN(bureau.DAYS_CREDIT_UPDATE)',
       ...
       'ord__WALLSMATERIAL_MODE', 'ord__NAME_CONTRACT_TYPE',
       'ord__NONLIVINGAPARTMENTS_AVG', 'FLAG_WORK_PHONE', 'FLOORSMIN_MODE',
       'FLOORSMIN_MEDI', 'FLAG_EMAIL', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'FLOORSMAX_MEDI', 'FLAG_DOCUMENT_8'],
      dtype='object', length=220)

In [17]:
# True labels from the main training table and the first prediction column
y_true = X_train['plain'][roles['target']]
y_score = train_pred.data[:, 0]

metric_l = log_loss(y_true, y_score)
metric_a = roc_auc_score(y_true, y_score)

print(f'log-loss: {metric_l}, roc_auc: {metric_a}')

log-loss: 0.24695388874122873, roc_auc: 0.7370297876927845


In [18]:
# Predict for the test table; X_test uses the same second-level tables as X_train
test_pred = automl.predict(X_test)

EntitySet scattered to 16 workers in 22 seconds


# Step 6. Pickle the model (optional)

In [19]:
# Save the fitted AutoML object to disk ...
with open('ft_model.pickle', 'wb') as model_out:
    pickle.dump(automl, model_out)

# ... and read it straight back. Only unpickle files you created yourself:
# loading a pickle can execute arbitrary code.
with open('ft_model.pickle', 'rb') as model_in:
    automl = pickle.load(model_in)

# Check that the restored object can still predict
_pred = automl.predict(X_train)

EntitySet scattered to 16 workers in 21 seconds
