In [1]:
import os
# Configure CUDA device selection through environment variables before any
# CUDA-using library is imported in the cells below.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "1",
})

In [2]:
# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import median_absolute_error, accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.




In [3]:
# utils

def map_class(x, task, reader):
    """Look ``x`` up in ``reader`` for multiclass tasks; return it unchanged otherwise.

    Args:
        x: Value to translate.
        task: Object exposing a ``name`` attribute.
        reader: Indexable mapping used only when ``task.name == 'multiclass'``.

    Returns:
        ``reader[x]`` for a multiclass task, else ``x`` itself.
    """
    return reader[x] if task.name == 'multiclass' else x

# Element-wise version of map_class built with np.vectorize, so it can be
# called on whole arrays of values instead of one value at a time.
mapped = np.vectorize(map_class)

def score(task, y_true, y_pred):
    """Compute the evaluation metric that corresponds to the task type.

    Args:
        task: Object exposing a ``name`` attribute; one of ``'binary'``,
            ``'multiclass'``, ``'reg'`` or ``'multi:reg'``.
        y_true: Ground-truth targets.
        y_pred: Model predictions. For ``'multiclass'`` the predicted class is
            taken as the argmax over axis 1.

    Returns:
        ROC AUC for binary tasks, accuracy for multiclass tasks and median
        absolute error for the regression tasks.

    Raises:
        ValueError: If ``task.name`` is not one of the supported task names.
    """
    if task.name == 'binary':
        return roc_auc_score(y_true, y_pred)
    elif task.name == 'multiclass':
        return accuracy_score(y_true, np.argmax(y_pred, 1))
    elif task.name in ('reg', 'multi:reg'):
        return median_absolute_error(y_true, y_pred)
    else:
        # Raising a bare string is itself a TypeError in Python 3 and would
        # mask the real problem; raise a proper exception instead.
        raise ValueError(f'Task is not correct: {task.name!r}')
        
def take_pred_from_task(pred, task):
    """Select the part of a prediction array that is relevant for the task.

    Args:
        pred: Prediction array supporting ``pred[:, 0]`` indexing.
        task: Object exposing a ``name`` attribute; one of ``'binary'``,
            ``'reg'``, ``'multiclass'`` or ``'multi:reg'``.

    Returns:
        The first column of ``pred`` for ``'binary'`` and ``'reg'`` tasks;
        ``pred`` unchanged for ``'multiclass'`` and ``'multi:reg'`` tasks.

    Raises:
        ValueError: If ``task.name`` is not one of the supported task names.
    """
    if task.name in ('binary', 'reg'):
        return pred[:, 0]
    elif task.name in ('multiclass', 'multi:reg'):
        return pred
    else:
        # Raising a bare string is itself a TypeError in Python 3 and would
        # mask the real problem; raise a proper exception instead.
        raise ValueError(f'Task is not correct: {task.name!r}')
        
def use_plr(USE_PLR):
    """Return the embedder key for continuous features.

    Args:
        USE_PLR: Flag; any truthy value selects the PLR embedder.

    Returns:
        ``"plr"`` when ``USE_PLR`` is truthy, otherwise ``"cont"``.
    """
    return "plr" if USE_PLR else "cont"

### 0.2 Constants

Here we set up the constants used in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `N_FOLDS` - number of folds in LightAutoML inner CV
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - holdout data part size
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset
- `TASK` - task name, 'reg', 'binary', 'multiclass', 'multi:reg'
- `ALGOS_FOR_BLEND` - algorithms used in blending
- `USE_PLR` - if True use PLR embedder for continuous features, else Basic Embedder
- `TRAIN_BS` - train batch size

In [4]:
# Random seed; passed to np.random.seed in the next cell.
RANDOM_STATE = 42
# Thread count; passed to torch.set_num_threads in the next cell.
N_THREADS = 16

In [5]:
# Seed both NumPy and PyTorch: the model trained below is a torch network, so
# seeding NumPy alone does not pin torch's random number generator.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Cap the number of intra-op CPU threads torch may use.
torch.set_num_threads(N_THREADS)

### 0.3 Data loading

In [6]:
# Load the train and test tables of the playground-series-s4e7 dataset from the
# relative ../input directory (the fit log below reports train shape (11504798, 12)).
train = pd.read_csv('../input/playground-series-s4e7/train.csv')
test = pd.read_csv('../input/playground-series-s4e7/test.csv')

In [7]:
# X_train, X_val = train_test_split(train, test_size=0.2, random_state=42, shuffle=True, stratify=train.Response)

In [8]:
# Binary classification task.
task = Task('binary')

# TabularAutoML preset restricted to a single neural-network algorithm (AutoInt).
# Thread count and seed come from the notebook-level constants so this cell
# stays consistent with the configuration cell instead of repeating literals.
automl = TabularAutoML(
    task = task,
    timeout = 600 * 3600,  # 2,160,000 seconds, i.e. effectively no time limit for this run
    cpu_limit = N_THREADS,
    general_params = {"use_algos": [['autoint']]}, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
    nn_params = {
        "n_epochs": 10,
        "bs": 1024,
        "num_workers": 0,
        "path_to_save": None,
        "freeze_defaults": True,
        "cont_embedder": 'plr',
        'cat_embedder': 'weighted',
        "hidden_size": 32,
        'verbose_bar': True,
        "snap_params": { 'k': 1, 'early_stopping': True, 'patience': 1, 'swa': True }
    },
    nn_pipeline_params = {"use_qnt": False, "use_te": False},
    reader_params = {'n_jobs': N_THREADS, 'cv': 5, 'random_state': RANDOM_STATE, 'advanced_roles': True}
)

# Column roles: 'Response' is the target and 'id' is dropped from the features.
column_roles = {
    'target': 'Response',
    'drop': ['id'],
}

# Fit on the full training frame and keep the out-of-fold predictions.
out_of_fold_predictions = automl.fit_predict(
    train,  # valid_data=X_val,
    roles=column_roles,
    verbose=4,
)

[20:58:21] Stdout logging level is DEBUG.
[20:58:21] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[20:58:21] Task: binary

[20:58:21] Start automl preset with listed constraints:
[20:58:21] - time: 2160000.00 seconds
[20:58:21] - CPU: 16 cores
[20:58:21] - memory: 16 GB

[20:58:21] [1mTrain data shape: (11504798, 12)[0m

[20:58:48] Feats was rejected during automatic roles guess: []
[20:58:49] Layer [1m1[0m train process start. Time left 2159971.84 secs
[20:59:16] number of text features: 0 
[20:59:16] number of categorical features: 8 
[20:59:16] number of continuous features: 2 
[20:59:16] Start fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m ...
[20:59:16] Training params: {'num_workers': 0, 'pin_memory': False, 'max_length': 256, 'is_snap': False, 'input_bn': False, 'max_emb_size': 256, 'bert_name': None, 'pooling': 'cls', 'device': device(type='cuda', index=0), 'use_cont': True, 'use_cat': True, 'use_text': False, 'lan

train (loss=0.250588): 100%|██████████| 8988/8988 [08:33<00:00, 17.49it/s]
val: 100%|██████████| 2248/2248 [01:43<00:00, 21.82it/s]


[21:09:38] Epoch: 0, train loss: 0.2505876123905182, val loss: 0.2447975128889084, val metric: 0.8902458299818045


train (loss=0.243603): 100%|██████████| 8988/8988 [08:39<00:00, 17.30it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.83it/s]


[21:20:01] Epoch: 1, train loss: 0.2436031848192215, val loss: 0.24361677467823029, val metric: 0.8912519269723287


train (loss=0.241971): 100%|██████████| 8988/8988 [08:42<00:00, 17.21it/s]
val: 100%|██████████| 2248/2248 [01:43<00:00, 21.73it/s]


[21:30:28] Epoch: 2, train loss: 0.24197076261043549, val loss: 0.24329131841659546, val metric: 0.8913994607435136


train (loss=0.240614): 100%|██████████| 8988/8988 [08:44<00:00, 17.15it/s]
val: 100%|██████████| 2248/2248 [01:43<00:00, 21.68it/s]


[21:40:56] Epoch: 3, train loss: 0.24061401188373566, val loss: 0.24300727248191833, val metric: 0.891664029821912


train (loss=0.239419): 100%|██████████| 8988/8988 [08:43<00:00, 17.18it/s]
val: 100%|██████████| 2248/2248 [01:43<00:00, 21.81it/s]


[21:51:24] Epoch: 4, train loss: 0.23941949009895325, val loss: 0.24315482378005981, val metric: 0.891591445472411


val: 100%|██████████| 2248/2248 [01:42<00:00, 21.94it/s]


[21:53:07] Early stopping: val loss: 0.24300727248191833, val metric: 0.891664029821912
[21:53:08] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m =====


train (loss=0.250512): 100%|██████████| 8988/8988 [08:33<00:00, 17.52it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.94it/s]


[22:03:27] Epoch: 0, train loss: 0.2505122125148773, val loss: 0.2445298582315445, val metric: 0.8899244088318721


train (loss=0.243447): 100%|██████████| 8988/8988 [08:44<00:00, 17.15it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.87it/s]


[22:13:55] Epoch: 1, train loss: 0.24344712495803833, val loss: 0.24380099773406982, val metric: 0.8908066239654691


train (loss=0.241932): 100%|██████████| 8988/8988 [08:44<00:00, 17.14it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.89it/s]


[22:24:23] Epoch: 2, train loss: 0.2419317662715912, val loss: 0.24354542791843414, val metric: 0.8911563496434213


train (loss=0.240811): 100%|██████████| 8988/8988 [08:45<00:00, 17.11it/s]
val: 100%|██████████| 2248/2248 [01:43<00:00, 21.77it/s]


[22:34:53] Epoch: 3, train loss: 0.24081052839756012, val loss: 0.24330024421215057, val metric: 0.8914351696553349


train (loss=0.239801): 100%|██████████| 8988/8988 [08:48<00:00, 16.99it/s]
val: 100%|██████████| 2248/2248 [01:43<00:00, 21.77it/s]


[22:45:26] Epoch: 4, train loss: 0.239800825715065, val loss: 0.24344559013843536, val metric: 0.8913363137333145


val: 100%|██████████| 2248/2248 [01:43<00:00, 21.79it/s]


[22:47:10] Early stopping: val loss: 0.24330024421215057, val metric: 0.8914351696553349
[22:47:11] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m =====


train (loss=0.250487): 100%|██████████| 8988/8988 [08:31<00:00, 17.56it/s]
val: 100%|██████████| 2248/2248 [01:41<00:00, 22.12it/s]


[22:57:28] Epoch: 0, train loss: 0.25048723816871643, val loss: 0.24473683536052704, val metric: 0.8899995544876889


train (loss=0.24329): 100%|██████████| 8988/8988 [08:32<00:00, 17.55it/s] 
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.88it/s]


[23:07:44] Epoch: 1, train loss: 0.24329034984111786, val loss: 0.24374517798423767, val metric: 0.8910647257353954


train (loss=0.241433): 100%|██████████| 8988/8988 [08:43<00:00, 17.18it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.85it/s]


[23:18:11] Epoch: 2, train loss: 0.24143289029598236, val loss: 0.24335059523582458, val metric: 0.8913061573596993


train (loss=0.240049): 100%|██████████| 8988/8988 [08:46<00:00, 17.08it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.93it/s]


[23:28:40] Epoch: 3, train loss: 0.2400493174791336, val loss: 0.24359077215194702, val metric: 0.8911720925219182


val: 100%|██████████| 2248/2248 [01:42<00:00, 21.96it/s]


[23:30:24] Early stopping: val loss: 0.24335059523582458, val metric: 0.8913061573596993
[23:30:25] ===== Start working with [1mfold 3[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m =====


train (loss=0.250395): 100%|██████████| 8988/8988 [08:32<00:00, 17.54it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.93it/s]


[23:40:43] Epoch: 0, train loss: 0.25039467215538025, val loss: 0.24486181139945984, val metric: 0.8898996397579347


train (loss=0.243486): 100%|██████████| 8988/8988 [08:36<00:00, 17.41it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 21.94it/s]


[23:51:03] Epoch: 1, train loss: 0.2434862107038498, val loss: 0.2437405288219452, val metric: 0.8908294017131517


train (loss=0.2419): 100%|██████████| 8988/8988 [08:41<00:00, 17.23it/s]  
val: 100%|██████████| 2248/2248 [01:41<00:00, 22.11it/s]


[00:01:27] Epoch: 2, train loss: 0.24189972877502441, val loss: 0.24347320199012756, val metric: 0.8912214780468363


train (loss=0.240541): 100%|██████████| 8988/8988 [08:38<00:00, 17.32it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 22.04it/s]


[00:11:49] Epoch: 3, train loss: 0.24054066836833954, val loss: 0.24357271194458008, val metric: 0.8911722623086288


val: 100%|██████████| 2248/2248 [01:41<00:00, 22.09it/s]


[00:13:32] Early stopping: val loss: 0.24347320199012756, val metric: 0.8912214780468363
[00:13:33] ===== Start working with [1mfold 4[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m =====


train (loss=0.250646): 100%|██████████| 8988/8988 [08:23<00:00, 17.85it/s]
val: 100%|██████████| 2248/2248 [01:41<00:00, 22.23it/s]


[00:23:41] Epoch: 0, train loss: 0.25064602494239807, val loss: 0.24381543695926666, val metric: 0.890817505865249


train (loss=0.243595): 100%|██████████| 8988/8988 [08:28<00:00, 17.68it/s]
val: 100%|██████████| 2248/2248 [01:41<00:00, 22.20it/s]


[00:33:51] Epoch: 1, train loss: 0.24359457194805145, val loss: 0.24350899457931519, val metric: 0.8914025949208267


train (loss=0.241855): 100%|██████████| 8988/8988 [08:32<00:00, 17.54it/s]
val: 100%|██████████| 2248/2248 [01:42<00:00, 22.01it/s]


[00:44:07] Epoch: 2, train loss: 0.24185453355312347, val loss: 0.2431061714887619, val metric: 0.8919181065491354


train (loss=0.240343): 100%|██████████| 8988/8988 [08:32<00:00, 17.53it/s]
val: 100%|██████████| 2248/2248 [01:41<00:00, 22.16it/s]


[00:54:22] Epoch: 3, train loss: 0.24034304916858673, val loss: 0.2430184781551361, val metric: 0.8918571935670279


train (loss=0.23903): 100%|██████████| 8988/8988 [08:33<00:00, 17.51it/s] 
val: 100%|██████████| 2248/2248 [01:42<00:00, 22.01it/s]


[01:04:38] Epoch: 4, train loss: 0.23902958631515503, val loss: 0.24313180148601532, val metric: 0.8916047217014021


val: 100%|██████████| 2248/2248 [01:41<00:00, 22.06it/s]


[01:06:21] Early stopping: val loss: 0.2430184781551361, val metric: 0.8918571935670279
[01:06:26] Fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m finished. score = [1m0.8914542101523242[0m
[01:06:26] [1mLvl_0_Pipe_0_Mod_0_TorchNN_autoint_0[0m fitting and predicting completed
[01:06:26] Time left 2145114.59 secs

[01:06:26] [1mLayer 1 training completed.[0m

[01:06:27] [1mAutoml preset training completed in 14885.52 seconds[0m

[01:06:27] Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (5 averaged models Lvl_0_Pipe_0_Mod_0_TorchNN_autoint_0) 



In [9]:
# ROC AUC of the out-of-fold predictions against the training target; the value
# shown below equals the score reported at the end of the fit log.
roc_auc_score(train.Response, out_of_fold_predictions.data)

0.8914542101523242

In [10]:
# Predict on the test frame with the fitted AutoML object. The output below
# shows five passes over the test set, presumably one per fold model.
pred = automl.predict(test)

test: 100%|██████████| 7491/7491 [02:56<00:00, 42.54it/s]
test: 100%|██████████| 7491/7491 [02:55<00:00, 42.63it/s]
test: 100%|██████████| 7491/7491 [02:55<00:00, 42.74it/s]
test: 100%|██████████| 7491/7491 [02:54<00:00, 42.96it/s]
test: 100%|██████████| 7491/7491 [02:54<00:00, 42.83it/s]


In [11]:
import joblib
# Persist a tuple of (first column of the OOF predictions, first column of the
# test predictions) to one file.
joblib.dump((out_of_fold_predictions.data[:, 0], pred.data[:, 0]), 'autoint_5fold_oof_test_089145.jbl')
# Persist the fitted AutoML object itself to a second file.
joblib.dump(automl, 'autoint_5fold_model_089145.jbl')

['autoint_5fold_model_089145.jbl']