This notebook is based on @alexryzhkov's [LightAutoML baseline TPS May 2021](https://www.kaggle.com/alexryzhkov/lightautoml-baseline-tps-may-2021/). It trains LightAutoML on [Kaggler](https://github.com/jeongyoonlee/Kaggler)'s [DAE](https://www.kaggle.com/jeongyoonlee/tps5-dae-features) features combined with target-encoded and label-encoded versions of the raw features.

Enjoy~!

# Step 0.0. Install LightAutoML & Kaggler

In [None]:
!pip install -U lightautoml

In [None]:
!pip install -U Kaggler

# Step 0.1. Import necessary libraries 

In [None]:
# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

import numpy as np
import lightgbm as lgb
import os
import pandas as pd
from pathlib import Path
import seaborn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from warnings import simplefilter

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, TargetEncoder, LabelEncoder
# Print the version of the Kaggler package that was just imported.
print(kaggler.__version__)

In [None]:
# Allow up to 500 columns when a wide frame is displayed.
# The fully-qualified option key is used on purpose: pandas resolves option names
# by pattern, and the bare 'max_columns' pattern matches more than one option on
# recent pandas releases (e.g. 'styler.render.max_columns'), which raises OptionError.
pd.set_option('display.max_columns', 500)

# Silence every warning category for the rest of the session.
simplefilter('ignore')

# Step 0.2. Parameters 

In [None]:
# Column names used when reading the competition CSV files.
id_col = 'id'
target_col = 'target'

# Experiment identity: <algorithm>_<feature set>_v<version>.
algo_name = 'lml'
feature_name = 'dae_te_le'
version = 1
model_name = f'{algo_name}_{feature_name}_v{version}'

# Input locations.
data_dir = Path('../input/tabular-playground-series-may-2021')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')
dae_feature_file = '../input/tps5-dae-features/dae.h5'

# Output file names, all derived from the model name.
submission_file = f'{model_name}.sub.csv'
predict_val_file = f'{model_name}.val.txt'
predict_tst_file = f'{model_name}.tst.txt'

In [None]:
# --- LightAutoML settings ---
TARGET_NAME = 'target'    # name of the target column handed to AutoML via `roles`
TIMEOUT = 3 * 60 * 60     # AutoML time budget in seconds (3 hours)
N_THREADS = 4             # passed as `cpu_limit` and as the reader's `n_jobs`
N_FOLDS = 5               # folds cnt for AutoML
RANDOM_STATE = 42         # fixed random state
TEST_SIZE = 0.2           # test size for metric check

# --- Feature-engineering settings ---
n_class = 4
n_fold = 5                # number of splits of the StratifiedKFold given to the target encoder
seed = 42                 # random_state of that StratifiedKFold
encoding_dim = 128

# Step 0.3. Data load 

In [None]:
# Read the train, test and sample-submission CSVs the same way:
# each uses the `id` column as its index.
trn, tst, sub = (
    pd.read_csv(csv_path, index_col=id_col)
    for csv_path in (train_file, test_file, sample_file)
)
print(trn.shape, tst.shape, sub.shape)

In [None]:
# Turn the string target into an integer label: take the part after the
# underscore, parse it as int, and shift it down by one.
y = (
    trn[target_col]
    .str.split('_')
    .str[1]
    .astype(int)
    .sub(1)
)

# Stack train (without the target column) on top of test; the first `n_trn`
# rows of `df` are therefore the training rows.
n_trn = len(trn)
df = pd.concat([trn.drop(columns=target_col), tst], axis=0)
feature_cols = list(df.columns)
print(y.shape, df.shape)

# Step 0.5. Add new features

In [None]:
# Pre-computed DAE features, stored under the 'data' key of the HDF5 file.
df_dae = pd.read_hdf(dae_feature_file, key='data')

# Target encoding: the encoder is fitted on the training rows only and then
# applied to the stacked train + test frame.
# NOTE(review): the training rows are encoded with an encoder fitted on those same
# rows; confirm against the Kaggler docs whether `fit_transform` (out-of-fold
# encoding) is the intended call for the training part.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
te = TargetEncoder(cv=cv)
te.fit(trn[feature_cols], y)
df_te = te.transform(df[feature_cols])
# Prefix from `feature_cols` (the columns actually encoded) rather than `df.columns`.
df_te.columns = [f'te_{x}' for x in feature_cols]

# Label encoding, fitted on train + test together.
le = LabelEncoder(min_obs=50)
df_le = le.fit_transform(df[feature_cols])
df_le.columns = [f'le_{x}' for x in feature_cols]

# Column-wise concat aligns on the index; if the three frames did not share the
# same index, extra rows filled with NaN would be created silently and the
# positional train/test split below would be wrong.
all_df = pd.concat([df_le, df_te, df_dae], axis=1)
assert len(all_df) == len(df), (
    f'feature blocks are not row-aligned: {len(all_df)} rows after concat, expected {len(df)}'
)

# First n_trn rows are train, the rest are test (same order as in `df`).
train_data, test_data = all_df[:n_trn], all_df[n_trn:]
print(train_data.shape, test_data.shape)

In [None]:
# Attach the target to the training features.
# `train_data` is a slice of `all_df`, so writing a column into it in place is a
# chained assignment (pandas emits SettingWithCopyWarning, which is silenced in
# this notebook). `assign` builds a new frame instead; values are aligned on the
# index exactly as with `train_data[target_col] = y`.
train_data = train_data.assign(**{target_col: y})
print(train_data.shape)
train_data.head()

# ========= AutoML preset usage =========


## Step 1. Create Task & Setup Column Roles

In [None]:
# Multiclass task; the only role declared explicitly is the target column.
task = Task('multiclass')
roles = {'target': TARGET_NAME}

## Step 3. Train on full data 

In [None]:
%%time 

automl = TabularUtilizedAutoML(task = task, 
                               timeout = TIMEOUT,
                               cpu_limit = N_THREADS,
                               reader_params = {'n_jobs': N_THREADS},
                               configs_list=[
                                   '../input/lightautoml-configs/conf_0_sel_type_0.yml',
                                   '../input/lightautoml-configs/conf_1_sel_type_1.yml'
                               ])
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

In [None]:
%%time

# Fast feature importances calculation
fast_fi = automl.get_feature_scores('fast', silent = False)
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

## Step 4. Predict for test data and check OOF score

In [None]:
%%time

test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(log_loss(train_data[TARGET_NAME].values, oof_pred.data)))

## Step 5. Prepare submission

In [None]:
# Overwrite every value of the sample-submission frame with the test predictions.
# The assignment is positional, so `sub` keeps its own index and column labels.
# The result is then written to `submission_file` as CSV.
sub.iloc[:, :] = test_pred.data
sub.to_csv(submission_file)