# Создание кастомного пайплайна

## Подготовка

### Шаг 1. Установка LightAutoML

Раскомментируйте команду ниже для запуска в Colab/Kaggle.

In [None]:
#!pip install -U lightautoml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightautoml
  Downloading LightAutoML-0.3.7.3-py3-none-any.whl (319 kB)
[K     |████████████████████████████████| 319 kB 5.0 MB/s 
[?25hCollecting catboost>=0.26.1
  Downloading catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.1 MB/s 
[?25hCollecting torch<1.9
  Downloading torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (804.1 MB)
[K     |████████████████████████████████| 804.1 MB 2.5 kB/s 
Collecting lightgbm<=3.2.1,>=2.3
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 47.4 MB/s 
[?25hCollecting autowoe>=1.2
  Downloading AutoWoE-1.3.2-py3-none-any.whl (215 kB)
[K     |████████████████████████████████| 215 kB 57.1 MB/s 
[?25hCollecting cmaes
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting importlib-metadata<2.0,>=1.0
  Downloadi

### Шаг 2. Импорт необходимых библиотек

In [None]:
# Standard python libraries
import os
import time
import requests


# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch


# Imports from our package
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

### Шаг 3. Параметры

In [None]:
# Thread count (passed to torch and to the LightGBM models below)
N_THREADS = 8
# Number of folds used by AutoML
N_FOLDS = 5
# Fixed random state reused across the notebook
RANDOM_STATE = 42
# Share of the data held out as the test part for the metric check
TEST_SIZE = 0.2
# Name of the target column
TARGET_NAME = 'TARGET'

### Шаг 4. Фиксируем случайный сид и число потоков

In [None]:
# Limit the number of CPU threads torch may use
torch.set_num_threads(N_THREADS)
# Seed NumPy's global random generator
np.random.seed(RANDOM_STATE)

### Шаг 5. Загрузка данных

Загрузим данные из репозитория

In [None]:
# Remote copy of the sample dataset
DATASET_URL = 'https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/master/examples/data/sampled_app_train.csv'

# Local directory / file name the dataset is cached under
DATASET_DIR = '../data/'
DATASET_NAME = 'sampled_app_train.csv'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)

In [None]:
# Download the dataset once and cache it locally; later runs reuse the cached file.
if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(DATASET_URL, timeout=60)
    # Fail loudly on HTTP errors: otherwise an error page would be saved as the CSV
    # and the existence check above would keep that broken file on later runs.
    response.raise_for_status()

    dataset = response.text
    # Explicit encoding so the written file does not depend on the locale default.
    with open(DATASET_FULLNAME, 'w', encoding='utf-8') as output:
        output.write(dataset)

In [None]:
# Load the cached CSV into a DataFrame and preview its first rows
data = pd.read_csv(DATASET_FULLNAME)
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,313802,0,Cash loans,M,N,Y,0,270000.0,327024.0,15372.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,319656,0,Cash loans,F,N,N,0,108000.0,675000.0,19737.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,207678,0,Revolving loans,F,Y,Y,2,112500.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
3,381593,0,Cash loans,F,N,N,1,67500.0,142200.0,9630.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
4,258153,0,Cash loans,F,Y,Y,0,337500.0,1483231.5,46570.5,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


### Шаг 6. (Опционально) Кастомная предобработка признаков

В ячейке ниже показана некоторая дополнительная предобработка признаков (эту ячейку можно опустить, если вы не хотите менять исходные данные)

In [None]:
# Anchor date: the day offsets below are turned into calendar dates relative to it.
report_date = np.datetime64('2018-01-01')

# pd.to_timedelta(..., unit='D') is used instead of casting to 'timedelta64[D]':
# pandas >= 2.0 only supports s/ms/us/ns timedelta resolutions, so a day-resolution
# cast is not portable across pandas versions.
data['BIRTH_DATE'] = (report_date + pd.to_timedelta(data['DAYS_BIRTH'], unit='D')).astype(str)
# Positive DAYS_EMPLOYED values are clipped to 0, i.e. they map onto the anchor date itself.
data['EMP_DATE'] = (report_date + pd.to_timedelta(data['DAYS_EMPLOYED'].clip(upper=0), unit='D')).astype(str)

# A constant column and an all-NaN column
# (presumably to show how uninformative features are handled - TODO confirm)
data['constant'] = 1
data['allnan'] = np.nan

data['report_dt'] = report_date

# The raw day offsets are now represented by BIRTH_DATE / EMP_DATE.
data.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

### Шаг 7. Разбиение данных на обучение и тест

In [None]:
# Hold-out split stratified by the target column
train_data, test_data = train_test_split(
    data,
    test_size=TEST_SIZE,
    stratify=data[TARGET_NAME],
    random_state=RANDOM_STATE,
)
print(
    'Data splitted. Parts sizes: train_data = {}, test_data = {}'.format(
        train_data.shape, test_data.shape
    )
)

Data splitted. Parts sizes: train_data = (8000, 125), test_data = (2000, 125)


In [None]:
# Preview the first rows of the training part
train_data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BIRTH_DATE,EMP_DATE,constant,allnan,report_dt
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0.0,0.0,0.0,1.0,0.0,1985-06-28,2012-06-21,1,,2018-01-01
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0.0,0.0,0.0,0.0,3.0,1953-12-27,2018-01-01,1,,2018-01-01
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-21,2016-06-17,1,,2018-01-01
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,,,,,,1988-04-27,2009-06-05,1,,2018-01-01
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0.0,0.0,0.0,0.0,2.0,1975-06-13,1997-01-22,1,,2018-01-01


![](https://github.com/sb-ai-lab/LightAutoML/raw/master/imgs/tutorial_1_laml_big.png)

![](https://github.com/sb-ai-lab/LightAutoML/raw/master/imgs/tutorial_1_ml_pipeline.png)

## Создание AutoML

![AutoML pipeline for this task](https://github.com/sb-ai-lab/LightAutoML/raw/master/imgs/tutorial_1_pipeline.png)

### Шаг 1. Создание объектов Task и PandasToPandasReader

In [None]:
# Task of type 'binary' and a reader configured with N_FOLDS folds and a fixed random state
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

In [None]:
# Fit the reader on the training part and build a dataset from it.
# The target role uses the shared TARGET_NAME constant rather than a repeated literal,
# so it stays in sync with the AutoML fit call.
train_ds = reader.fit_read(train_data, roles={'target': TARGET_NAME})
train_ds.shape

(8000, 110)

In [None]:
# Check which dataset class the reader produced
type(train_ds)

lightautoml.dataset.np_pd_dataset.PandasDataset

In [None]:
# Target values stored in the dataset built by the reader
train_ds.target

6444    0
3586    0
9349    0
7734    0
2174    0
       ..
2895    0
2317    1
5505    0
268     0
525     0
Name: TARGET, Length: 8000, dtype: int64

In [None]:
# Feature table held by the dataset
train_ds.data

Unnamed: 0,HOUSETYPE_MODE,REGION_RATING_CLIENT_W_CITY,DEF_30_CNT_SOCIAL_CIRCLE,EMERGENCYSTATE_MODE,BASEMENTAREA_MODE,EMP_DATE,YEARS_BEGINEXPLUATATION_MODE,FLOORSMIN_MEDI,FLAG_DOCUMENT_14,COMMONAREA_MODE,...,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_MON,AMT_INCOME_TOTAL,NAME_CONTRACT_TYPE,DEF_60_CNT_SOCIAL_CIRCLE,ENTRANCES_AVG,FLOORSMIN_MODE,FLAG_DOCUMENT_16,FLAG_DOCUMENT_13,ORGANIZATION_TYPE
0,block of flats,2.0,0.0,No,0.0666,2012-06-21,0.9881,0.0417,0.0,0.0473,...,1.0,0.0,90000.0,Cash loans,0.0,0.1379,0.0417,0.0,0.0,Self-employed
1,block of flats,2.0,0.0,No,0.2275,2018-01-01,0.9921,,0.0,,...,0.0,0.0,180000.0,Cash loans,0.0,0.2759,,0.0,0.0,XNA
2,block of flats,2.0,0.0,No,0.0470,2016-06-17,0.9886,0.4167,0.0,0.0488,...,0.0,0.0,112500.0,Cash loans,0.0,0.0690,0.4167,0.0,0.0,Business Entity Type 3
3,block of flats,2.0,0.0,No,0.0551,2009-06-05,0.9821,0.0417,0.0,0.0121,...,,,67500.0,Cash loans,0.0,0.0690,0.0417,0.0,0.0,Self-employed
4,block of flats,2.0,0.0,No,0.0000,1997-01-22,0.9752,0.0833,0.0,0.0014,...,0.0,0.0,67500.0,Revolving loans,0.0,0.0690,0.0833,0.0,0.0,School
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,block of flats,2.0,0.0,No,0.1127,2014-03-25,0.9777,0.2083,0.0,0.0113,...,0.0,0.0,112500.0,Revolving loans,0.0,0.2069,0.2083,0.0,0.0,Self-employed
7996,block of flats,2.0,0.0,No,,2016-07-25,0.9816,,0.0,,...,0.0,0.0,157500.0,Revolving loans,0.0,0.2069,,0.0,0.0,Construction
7997,,2.0,0.0,,,2010-05-28,,,0.0,,...,0.0,0.0,202500.0,Cash loans,0.0,,,0.0,0.0,Kindergarten
7998,,2.0,0.0,,,2016-05-09,,,0.0,,...,,,135000.0,Revolving loans,0.0,,,0.0,0.0,Business Entity Type 3


In [None]:
# Role assigned to each feature column
train_ds.roles

{'HOUSETYPE_MODE': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'freq'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'REGION_RATING_CLIENT_W_CITY': Numeric role, dtype <class 'numpy.float32'>. Additional params: [('force_input', False), ('prob', False), ('discretization', False)],
 'DEF_30_CNT_SOCIAL_CIRCLE': Category role, dtype <class 'numpy.float32'>. Additional params: [('encoding_type', 'freq'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'EMERGENCYSTATE_MODE': Category role, dtype <class 'object'>. Additional params: [('encoding_type', 'freq'), ('unknown', 5), ('force_input', False), ('label_encoded', False), ('ordinal', False)],
 'BASEMENTAREA_MODE': Numeric role, dtype <class 'numpy.float32'>. Additional params: [('force_input', False), ('prob', False), ('discretization', False)],
 'EMP_DATE': Datetime role, dtype <class 'numpy.datetime64'>. Additional params:

In [None]:
# Re-create the task and the reader; the previous reader has already been fitted via fit_read
# (presumably so that AutoML below starts from an unfitted reader - TODO confirm)
task = Task('binary')
reader = PandasToPandasReader(task, cv=N_FOLDS, random_state=RANDOM_STATE)

### Шаг 2.1. Создание кастомного feature пайплайна

In [None]:
from lightautoml.transformers.numeric import FillnaMedian, StandardScaler
from lightautoml.transformers.categorical import LabelEncoder, TargetEncoder
from lightautoml.pipelines.utils import get_columns_by_role
# SequentialTransformer lives in lightautoml.transformers.base,
# next to UnionTransformer and ColumnsSelector.
from lightautoml.transformers.base import SequentialTransformer, UnionTransformer, ColumnsSelector
from lightautoml.pipelines.features.base import FeaturesPipeline


class MyCustomFeatures(FeaturesPipeline):
    """Custom feature pipeline with separate branches for categorical and numeric columns.

    Categorical columns are label-encoded and then target-encoded; numeric
    columns get median imputation followed by standard scaling. The branches
    are combined with a ``UnionTransformer``.
    """

    def create_pipeline(self, train):
        """Build the transformer for the given training dataset.

        Args:
            train: Dataset whose column roles decide which branches are created.

        Returns:
            ``UnionTransformer`` over the created branches. A branch is skipped
            when no column has the corresponding role.
        """
        pipes = []

        # Column names grouped by role
        num_cols = get_columns_by_role(train, 'Numeric')
        cat_cols = get_columns_by_role(train, 'Category')

        if cat_cols:
            # Categorical branch: select -> label encoding -> target encoding
            te_pipe = SequentialTransformer([
                ColumnsSelector(cat_cols),
                LabelEncoder(),
                TargetEncoder()
            ])
            pipes.append(te_pipe)

        if num_cols:
            # Numeric branch: select -> fill NaNs with the median -> standardize
            scale_pipe = SequentialTransformer([
                ColumnsSelector(num_cols),
                FillnaMedian(),
                StandardScaler()
            ])
            pipes.append(scale_pipe)

        return UnionTransformer(pipes)

In [None]:
# Fit the custom feature pipeline on the reader's dataset and transform it
my_features = MyCustomFeatures()
train_features = my_features.fit_transform(train_ds)
# Show the type and shape of the transformed dataset
type(train_features), train_features.shape

(lightautoml.dataset.np_pd_dataset.NumpyDataset, (8000, 108))

Добавим кастомный пайплайн признаков `my_features` к существующему пайплайну `LGBSimpleFeatures`:

In [None]:
# Start from the stock LightGBM feature pipeline and attach the custom one to it
pipe = LGBSimpleFeatures()
pipe = pipe.append(my_features)
# Alternative: pipe.set_sequential(True).append(my_features) - for sequential application of pipelines

### Шаг 2.2. Создание feature selector (при необходимости)

In [None]:
# LightGBM model that is only used to rank features for the selector
model0 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'seed': 42,
        'num_threads': N_THREADS,
    },
)
mbie = ModelBasedImportanceEstimator()
pipe0 = LGBSimpleFeatures()
# cutoff=0 is the importance threshold handed to the selector
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)



### Шаг 3.1. Создание ML пайплайнов 1-го уровня для AutoML

ML пайплайн 1-го уровня состоит из:
- Простые признаки для градиентного бустинга на основе отобранных признаков (см. шаг 2)
- 2 модели:
    * LightGBM с подбором гиперпараметров (с помощью OptunaTuner)
    * LightGBM с гиперпараметрами, рассчитанными эвристически

In [None]:
# Tuner budget: stop after 20 trials or after 30 seconds
params_tuner1 = OptunaTuner(n_trials=20, timeout=30)

# First model: paired with the Optuna tuner below
model1 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': N_THREADS,
    },
)
# Second model: no tuner attached
model2 = BoostLGBM(
    default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': N_THREADS,
    },
)

# To use only the stock features, rebind `pipe = LGBSimpleFeatures()` before this call.
pipeline_lvl1 = MLPipeline(
    [
        (model1, params_tuner1),
        model2,
    ],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

### Шаг 3.2. Создание ML пайплайнов 2-го уровня для AutoML

Второй уровень ML пайплайнов будет состоять из:
- Такие же простые признаки, но полученные как Out-Of-Fold (OOF) предсказания от алгоритмов 1-го уровня
- Одна LGBM модель без подбора гиперпараметров
- Отбора признаков на этой стадии не будет, мы хотим использовать все OOF предсказания

In [None]:
# Second-level model: a single LightGBM with frozen default parameters and no tuner
model = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': N_THREADS,
    },
    freeze_defaults=True,
)

# Stock feature pipeline for the second level; no feature selection before or after
pipe1 = LGBSimpleFeatures()
pipeline_lvl2 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

### Шаг 4. Создание AutoML пайплайна

AutoML пайплайн состоит из:
- Reader для подготовки данных
- 1-й уровень ML пайплайнов (см. шаг 3.1)
- 2-й уровень ML пайплайнов (см. шаг 3.2)
- `skip_conn=False` означает «не использовать исходные признаки данных в моделях 2-го уровня»

In [None]:
# Two stacked levels, one ML pipeline per level; skip_conn is set to False explicitly
automl = AutoML(
    reader,
    [[pipeline_lvl1], [pipeline_lvl2]],
    skip_conn=False,
)

### Шаг 5. Обучение AutoML на загруженных данных

В следующей ячейке мы обучим AutoML на целевую переменную `TARGET`, на выходе получим обученную модель и OOF предсказания:

In [None]:
# Fit AutoML on the training part; the returned value holds the out-of-fold predictions
oof_pred = automl.fit_predict(train_data, roles={'target': TARGET_NAME})
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

INFO:lightautoml.reader.base:[1mTrain data shape: (8000, 125)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 9999999996.29 secs
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 100 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's auc: 0.725521
DEBUG:lightautoml.ml_algo.boost_lgbm:[200]	valid's auc: 0.727269
DEBUG:lightautoml.ml_algo.boost_lgbm:Early stopping, best iteration is:
[153]	valid's auc: 0.730648
INFO:lightautoml.ml_algo.base:[1mLightGBM[0m fitting and predicting completed
INFO:lightautoml.ml_algo.tuning.optuna:Start hyperparameters optimization for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ... Time budget is 30.00 secs
INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-63d91bdf-ef45-4a74-b630-eb687a751848
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve 

oof_pred:
array([[0.09544795],
       [0.06756714],
       [0.06756714],
       ...,
       [0.06497721],
       [0.11555641],
       [0.09543057]], dtype=float32)
Shape = (8000, 1)


### Шаг 6. Анализируем обученную модель

Ниже мы анализируем важности признаков разных алгоритмов:

In [None]:
def _print_scores(template, scores):
    """Print one importance report followed by a separator line."""
    print(template.format(scores))
    print('=' * 70)


# Scores computed by the feature selector
_print_scores('Feature importances of selector:\n{}',
              selector.get_features_score())

# First algorithm of the first pipeline on the last level
_print_scores('Feature importances of top level algorithm:\n{}',
              automl.levels[-1][0].ml_algos[0].get_features_score())

# Both algorithms of the first pipeline on the first level
_print_scores('Feature importances of lowest level algorithm - model 0:\n{}',
              automl.levels[0][0].ml_algos[0].get_features_score())
_print_scores('Feature importances of lowest level algorithm - model 1:\n{}',
              automl.levels[0][0].ml_algos[1].get_features_score())

Feature importances of selector:
EXT_SOURCE_2         1989.559315
EXT_SOURCE_3         1893.068587
BIRTH_DATE           1225.729354
DAYS_REGISTRATION    1200.938859
EMP_DATE             1100.489046
                        ...     
FLAG_DOCUMENT_18        0.000000
FLAG_DOCUMENT_5         0.000000
FLAG_DOCUMENT_9         0.000000
FLAG_EMP_PHONE          0.000000
FLAG_DOCUMENT_16        0.000000
Length: 110, dtype: float64
Feature importances of top level algorithm:
Lvl_0_Pipe_0_Mod_0_LightGBM_prediction_0    1921.580413
Lvl_0_Pipe_0_Mod_1_LightGBM_prediction_0       0.000000
dtype: float64
Feature importances of lowest level algorithm - model 0:
EXT_SOURCE_2                                     1513.385413
EXT_SOURCE_3                                     1340.909902
dtdiff__BIRTH_DATE                               1032.414607
oof__le__REGION_POPULATION_RELATIVE               740.429211
SK_ID_CURR                                        739.805585
                                           

### Шаг 7. Предсказание для тестовых данных и подсчет метрик

In [None]:
# Predict for the held-out test part with the fitted AutoML
test_pred = automl.predict(test_data)
print(
    'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
)

# ROC AUC of the out-of-fold train predictions and of the test predictions
print('Check scores...')
print('OOF score: {}'.format(
    roc_auc_score(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
))
print('TEST score: {}'.format(
    roc_auc_score(test_data[TARGET_NAME].values, test_pred.data[:, 0])
))

Prediction for test data:
array([[0.07139476],
       [0.07544799],
       [0.06545913],
       ...,
       [0.06437033],
       [0.06437033],
       [0.09935696]], dtype=float32)
Shape = (2000, 1)
Check scores...
OOF score: 0.7019827458463896
TEST score: 0.7204415760869565
