In [None]:
!pip install -U lightautoml

In [2]:
# Standard python libraries
import os
import time
import requests

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [3]:
# Reproducibility and resource settings shared by the cells below.
N_THREADS = 4       # thread count given to torch and, later, to the AutoML presets
RANDOM_STATE = 42   # seed for NumPy; also reused for the data split and the reader

# The two calls are independent of each other: cap torch's thread pool,
# then seed NumPy's global random generator.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

## 1. Загрузка данных

In [4]:
# Remote source of the demo dataset ...
DATASET_URL = 'https://raw.githubusercontent.com/sb-ai-lab/LightAutoML/master/examples/data/sampled_app_train.csv'

# ... and the local directory / file name it is cached under.
DATASET_NAME = 'sampled_app_train.csv'
DATASET_DIR = 'data/'
DATASET_FULLNAME = os.path.join(DATASET_DIR, DATASET_NAME)

In [5]:
# Download the dataset once; later runs reuse the local copy.
if not os.path.exists(DATASET_FULLNAME):
    os.makedirs(DATASET_DIR, exist_ok=True)

    # Fail loudly on an HTTP error or a hung connection. Otherwise an error
    # page would be saved as the CSV, and the existence check above would keep
    # reusing that broken file on every later run.
    response = requests.get(DATASET_URL, timeout=60)
    response.raise_for_status()

    # Write the raw bytes so the platform's default text encoding and newline
    # translation cannot alter the file contents.
    with open(DATASET_FULLNAME, 'wb') as output:
        output.write(response.content)

In [6]:
TARGET_NAME = 'TARGET'  # name of the label column

# Read from DATASET_FULLNAME (the path the download cell writes to) instead of
# repeating the literal path, so changing DATASET_DIR / DATASET_NAME cannot
# leave this cell pointing at a different file.
data = pd.read_csv(DATASET_FULLNAME)
data.sample(5)  # last expression: a few random rows shown as the cell output

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
6252,220929,0,Cash loans,M,N,Y,1,181084.5,1350000.0,37125.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
4684,200351,0,Cash loans,M,N,Y,1,270000.0,1563840.0,66388.5,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0
1731,198362,0,Cash loans,F,N,N,0,157500.0,1303812.0,38119.5,...,0,0,0,0,,,,,,
4742,391215,0,Cash loans,F,N,Y,0,180000.0,364896.0,17685.0,...,0,0,0,0,0.0,0.0,0.0,0.0,2.0,3.0
4521,141764,0,Cash loans,M,N,Y,0,202500.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0


Разделим данные на обучение и тест

In [7]:
# Stratified 80/20 hold-out split: stratifying on the target column keeps the
# class proportions the same in both parts; the fixed seed makes it repeatable.
tr_data, te_data = train_test_split(
    data, test_size=0.2, random_state=RANDOM_STATE, stratify=data[TARGET_NAME]
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

# Preview of the training part (rendered as the cell output).
tr_data.head()

Data splitted. Parts sizes: tr_data = (8000, 122), te_data = (2000, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
6444,112261,0,Cash loans,F,N,N,1,90000.0,640080.0,31261.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
3586,115058,0,Cash loans,F,N,Y,0,180000.0,239850.0,23850.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
9349,326623,0,Cash loans,F,N,Y,0,112500.0,337500.0,31086.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
7734,191976,0,Cash loans,M,Y,Y,1,67500.0,135000.0,9018.0,...,0,0,0,0,,,,,,
2174,281519,0,Revolving loans,F,N,Y,0,67500.0,202500.0,10125.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


## 2. Создание и обучение пресета `TabularAutoML`

In [8]:
# Task type for LightAutoML. Available names:
#   'binary'     - binary classification
#   'reg'        - regression
#   'multiclass' - multiclass classification
# The name is required; loss and metric are set explicitly here.
task = Task('binary', loss='logloss', metric='auc')

In [9]:
# Column roles: which column is the label and which columns to discard.
roles = dict(
    target=TARGET_NAME,   # required
    drop=['SK_ID_CURR'],  # user id column is removed
)

In [10]:
TIMEOUT = 30 * 60  # seconds, i.e. 30 minutes; passed to the presets as `timeout`
N_FOLDS = 5        # passed to the reader as its 'cv' setting

In [11]:
# Baseline preset, limited to TIMEOUT seconds and N_THREADS CPU cores.
automl = TabularAutoML(
    task=task,  # required
    cpu_limit=N_THREADS,
    timeout=TIMEOUT,
    # reader settings: parallel jobs, CV setting and seed
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

In [12]:
%%time
oof_preds = automl.fit_predict(tr_data, roles=roles, verbose=1)

[12:46:54] Stdout logging level is INFO.
[12:46:54] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[12:46:54] Task: binary

[12:46:54] Start automl preset with listed constraints:
[12:46:54] - time: 1800.00 seconds
[12:46:54] - CPU: 4 cores
[12:46:54] - memory: 16 GB

[12:46:54] [1mTrain data shape: (8000, 122)[0m

[12:46:57] Layer [1m1[0m train process start. Time left 1797.39 secs
[12:46:57] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[12:47:00] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7354017993149616[0m
[12:47:00] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[12:47:00] Time left 1793.66 secs

[12:47:03] [1mSelector_LightGBM[0m fitting and predicting completed
[12:47:04] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[12:47:22] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.735377350367659[0m
[12:47:22] [1mLvl_0_Pipe_1_Mod_0_LightGBM[

In [13]:
# Text description of the fitted model: the blended models and their weights.
model_summary = automl.create_model_str_desc()
print(model_summary)

Final prediction for new objects (level 0) = 
	 0.25199 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.24665 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.08711 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) +
	 0.06325 * (5 averaged models Lvl_0_Pipe_1_Mod_2_CatBoost) +
	 0.35101 * (5 averaged models Lvl_0_Pipe_1_Mod_3_Tuned_CatBoost) 


In [14]:
%%time

# Predict on the hold-out part with the fitted preset, then print the
# predictions together with their shape.
te_preds = automl.predict(te_data)
print(f'Prediction for te_data:\n{te_preds}\nShape = {te_preds.shape}')

Prediction for te_data:
array([[0.06484319],
       [0.07277071],
       [0.03308925],
       ...,
       [0.0629824 ],
       [0.04203793],
       [0.2079497 ]], dtype=float32)
Shape = (2000, 1)
CPU times: user 1.74 s, sys: 38.7 ms, total: 1.78 s
Wall time: 888 ms


Используем метрику ROC-AUC для оценки качества

In [15]:
# ROC-AUC of the first prediction column against the true labels:
# predictions from fit_predict vs. the training part (OOF score) and
# predictions from predict vs. the hold-out part (HOLDOUT score).
print(f'OOF score: {roc_auc_score(tr_data[TARGET_NAME].values, oof_preds.data[:, 0])}')
print(f'HOLDOUT score: {roc_auc_score(te_data[TARGET_NAME].values, te_preds.data[:, 0])}')

OOF score: 0.75204706783775
HOLDOUT score: 0.7344361413043479


Пресет `TabularAutoML` обучался около 6,5 минуты, несмотря на то, что `TIMEOUT` установлен равным 30 минутам. Чтобы использовать всё отведённое время, воспользуемся пресетом `TabularUtilizedAutoML` вместо `TabularAutoML`.

## 3. Создание и обучение `TabularUtilizedAutoML`

In [16]:
# Same task, limits and reader settings as the baseline preset, but using the
# "utilized" preset class so that more of the time budget gets spent.
utilized_automl = TabularUtilizedAutoML(
    task=task,
    cpu_limit=N_THREADS,
    timeout=TIMEOUT,
    # reader settings: parallel jobs, CV setting and seed
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)

In [17]:
%%time 
oof_pred = utilized_automl.fit_predict(tr_data, roles=roles, verbose=1)

[12:53:33] Start automl [1mutilizator[0m with listed constraints:
[12:53:33] - time: 1800.00 seconds
[12:53:33] - CPU: 4 cores
[12:53:33] - memory: 16 GB

[12:53:33] [1mIf one preset completes earlier, next preset configuration will be started[0m

[12:53:33] Start 0 automl preset configuration:
[12:53:33] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[12:53:33] Stdout logging level is INFO.
[12:53:33] Task: binary

[12:53:33] Start automl preset with listed constraints:
[12:53:33] - time: 1800.00 seconds
[12:53:33] - CPU: 4 cores
[12:53:33] - memory: 16 GB

[12:53:33] [1mTrain data shape: (8000, 122)[0m

[12:53:35] Layer [1m1[0m train process start. Time left 1797.48 secs
[12:53:36] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[12:53:39] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7354017993149616[0m
[12:53:39] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting

[13:08:27] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[13:08:31] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.7370124534433578[0m
[13:08:31] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[13:08:31] Time left 902.05 secs

[13:08:34] [1mSelector_LightGBM[0m fitting and predicting completed
[13:08:35] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[13:08:49] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.7396506011570942[0m
[13:08:49] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[13:08:49] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ... Time budget is 118.38 secs
[13:10:51] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[13:10:51] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...
[13:10:53] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.7484952735932873[0m
[13:10:53]

In [18]:
# Text description of the fitted utilized model: per-config weights and the
# structure of each underlying model.
utilized_summary = utilized_automl.create_model_str_desc()
print(utilized_summary)

Final prediction for new objects = 
	0.17570 * 1 averaged models with config = "conf_0_sel_type_0.yml" and different CV random_states. Their structures: 

	    Model #0.
		Final prediction for new objects (level 0) = 
			 0.37190 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
			 0.08570 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
			 0.09661 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) +
			 0.05502 * (5 averaged models Lvl_0_Pipe_1_Mod_2_CatBoost) +
			 0.39078 * (5 averaged models Lvl_0_Pipe_1_Mod_3_Tuned_CatBoost) 


	+ 0.13009 * 1 averaged models with config = "conf_1_sel_type_1.yml" and different CV random_states. Their structures: 

	    Model #0.
		Final prediction for new objects (level 0) = 
			 0.28397 * (5 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
			 0.20168 * (5 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
			 0.15632 * (5 averaged models Lvl_0_Pipe_1_Mod_1_Tuned_LightGBM) +
			 0.35804 * (5 averaged models Lvl_0_Pipe_1_Mod_3_Tuned_CatBoost)

In [19]:
%%time

# Predict on the hold-out part with the fitted utilized preset, then print the
# predictions together with their shape.
te_pred = utilized_automl.predict(te_data)
print(f'Prediction for te_data:\n{te_pred}\nShape = {te_pred.shape}')

Prediction for te_data:
array([[0.06552619],
       [0.09151144],
       [0.03593913],
       ...,
       [0.05892349],
       [0.04312227],
       [0.23252055]], dtype=float32)
Shape = (2000, 1)
CPU times: user 4.08 s, sys: 22.5 ms, total: 4.1 s
Wall time: 2.03 s


In [20]:
# ROC-AUC of the first prediction column against the true labels, this time
# for the utilized preset: fit_predict output vs. the training part (OOF
# score) and predict output vs. the hold-out part (HOLDOUT score).
print(f'OOF score: {roc_auc_score(tr_data[TARGET_NAME].values, oof_pred.data[:, 0])}')
print(f'HOLDOUT score: {roc_auc_score(te_data[TARGET_NAME].values, te_pred.data[:, 0])}')

OOF score: 0.7592782160517331
HOLDOUT score: 0.7355638586956523


Метрика ROC-AUC немного подросла: OOF — с 0.7520 до 0.7593, HOLDOUT — с 0.7344 до 0.7356.