# TPS 6 - Supervised DAE + AutoLGB

The contents of this notebook are organized as follows:

1. Installing and loading libraries: installs `Kaggler` and loads the libraries and data
2. Model definition and training: shows how to train LightGBM with `Kaggler`'s `AutoLGB`
3. Submission

Enjoy~!

# Part 1. Loading Libraries and Data

In [None]:
import gc
import joblib
import lightgbm as lgb
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from warnings import simplefilter

In [None]:
!pip install kaggler

In [None]:
# Kaggler is imported in its own cell, after the pip install cell above,
# rather than together with the other imports at the top of the notebook.
import kaggler
from kaggler.model import AutoLGB
# Show which Kaggler version is in use.
print(kaggler.__version__)

In [None]:
# Apply matplotlib's 'fivethirtyeight' style sheet to figures created after this point.
plt.style.use('fivethirtyeight')
# Show up to 100 columns when displaying a DataFrame.
# The fully-qualified option name is required: the short pattern 'max_columns' also
# matches 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError there.
pd.set_option('display.max_columns', 100)
# Silence all warnings for the rest of the notebook.
simplefilter('ignore')

In [None]:
# --- Model identity: combined into `model_name`, which prefixes every output file ---
algo_name = 'lgb'
feature_name = 'le_te_sdae'
version = 2
model_name = f'{algo_name}_{feature_name}_v{version}'

# --- Inputs: competition CSVs and the directory holding the per-fold feature files ---
data_dir = Path('../input/tabular-playground-series-jun-2021')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

feature_dir = Path('../input/tps-6-dae-features')

# --- Outputs: validation predictions, test predictions and the submission CSV ---
build_dir = Path('.')
predict_val_file = build_dir.joinpath(f'{model_name}.val.txt')
predict_tst_file = build_dir.joinpath(f'{model_name}.tst.txt')
submission_file = build_dir.joinpath(f'{model_name}.sub.csv')

# --- Column names ---
id_col, target_col = 'id', 'target'

# --- Cross-validation / problem size ---
n_fold, seed, n_class = 5, 42, 9

# Number of `sdae_*` feature names generated when the feature column list is built.
encoding_dim = 128

# NOTE(review): `ratio`, `batch_size` and `lr` are not referenced by the cells shown in
# this notebook; presumably carried over from the DAE feature notebook -- confirm.
ratio = 4
batch_size = ratio * 64
lr = ratio * 0.0001

In [None]:
# Read the train, test and sample-submission CSVs, each indexed by the id column,
# then report their shapes.
trn, tst, sub = (
    pd.read_csv(csv_path, index_col=id_col)
    for csv_path in (train_file, test_file, sample_file)
)
print(trn.shape, tst.shape, sub.shape)

In [None]:
# Build integer class labels from the target strings: split on '_', take the second
# piece, cast it to int and shift by one so the labels start at zero.
y = trn[target_col].str.split('_').str.get(1).astype(int).sub(1)

# Remember how many rows belong to the training set before stacking train and test.
n_trn = len(trn)

# Stack the training features (target column removed) on top of the test features.
df = pd.concat([trn.drop(columns=target_col), tst])
print(df.shape)

# Part 2. Model Training with AutoLGB

In [None]:
# Column names for the loaded feature matrices: an `le_` and a `te_` name per raw
# column, followed by `encoding_dim` names of the form `sdae_<k>`.
# NOTE(review): presumably le/te/sdae stand for label-encoded, target-encoded and
# supervised-DAE features -- confirm against the feature-generation notebook.
feature_cols = [
    *(f'le_{col}' for col in df.columns),
    *(f'te_{col}' for col in df.columns),
    *(f'sdae_{k}' for k in range(encoding_dim)),
]
print(len(feature_cols))

In [None]:
# Hyper-parameters and boosting-round count taken from an earlier AutoLGB run.
# With `is_tuned` set to True the training loop skips AutoLGB tuning and uses
# `params` / `n_best` exactly as written here.
is_tuned = True
n_best = 376  # passed to `lgb.train` as the number of boosting rounds
params = dict(
    bagging_freq=1,
    verbosity=-1,
    seed=seed,
    num_threads=-1,
    feature_pre_filter=False,
    num_class=n_class,
    objective='multiclass',
    metric='multi_logloss',
    boosting='gbdt',
    bagging_fraction=0.5,
    feature_fraction=0.8,
    lambda_l1=10,
    lambda_l2=10,
    learning_rate=0.013959172480364537,
    max_depth=6,
    min_child_samples=25,
    num_leaves=31,
)

In [None]:
# Stratified K-fold training: for each fold, load that fold's feature matrices, train a
# LightGBM model, store out-of-fold predictions in `P` and accumulate the fold-averaged
# test predictions in `P_tst`.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

# `verbose_eval` was removed from `lgb.train` in LightGBM 4.0. Use the logging callback
# when the installed version provides it, otherwise fall back to the old keyword.
if hasattr(lgb, 'log_evaluation'):
    log_kwargs = {'callbacks': [lgb.log_evaluation(period=100)]}
else:
    log_kwargs = {'verbose_eval': 100}

P = np.zeros((n_trn, n_class), dtype=float)
P_tst = np.zeros((tst.shape[0], n_class), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    # `cv.split` yields positional row indices while `y` is indexed by the id column,
    # so select positionally. Plain `y[i_trn]` would treat the numbers as id labels and
    # is only correct when the ids happen to be 0..n-1.
    y_trn, y_val = y.iloc[i_trn], y.iloc[i_val]

    # Per-fold feature matrices (fold numbers start at 1).
    # NOTE(review): assumes these files were generated with the same fold split as `cv`
    # -- confirm against the feature-generation notebook.
    # `joblib.load` unpickles its input; only load feature files from a trusted source.
    X_trn_i = joblib.load(str(feature_dir / f'{feature_name}.trn{i}.joblib'))
    X_val_i = joblib.load(str(feature_dir / f'{feature_name}.val{i}.joblib'))
    X_tst_i = joblib.load(str(feature_dir / f'{feature_name}.tst{i}.joblib'))

    # Tune once, on the first fold that reaches this point, then reuse the resulting
    # parameters and round count for every remaining fold.
    if not is_tuned:
        clf = AutoLGB(objective='multiclass', metric='multi_logloss', params={'num_class': n_class},
                      sample_size=X_trn_i.shape[0], feature_selection=False, random_state=seed)
        clf.tune(pd.DataFrame(X_trn_i, columns=feature_cols), y_trn)
        features = clf.features
        params = clf.params
        n_best = clf.n_best
        print(f'{n_best}')
        print(f'{params}')
        print(f'{features}')
        is_tuned = True

    trn_data = lgb.Dataset(X_trn_i, y_trn)
    val_data = lgb.Dataset(X_val_i, y_val)
    # Train for exactly `n_best` rounds; the validation set is passed for evaluation
    # logging (every 100 rounds).
    clf = lgb.train(params, trn_data, n_best, val_data, **log_kwargs)

    P[i_val] = clf.predict(X_val_i)
    # Each fold contributes 1/n_fold of the final test prediction.
    P_tst += clf.predict(X_tst_i) / n_fold
    print(f'CV #{i} Loss: {log_loss(y_val, P[i_val]):.6f}')

In [None]:
# Report the log loss of the out-of-fold predictions over all training rows.
print(f'CV Loss: {log_loss(y, P):.6f}')

# Write the out-of-fold and the test prediction matrices as text, six decimals each.
for out_path, preds in ((predict_val_file, P), (predict_tst_file, P_tst)):
    np.savetxt(out_path, preds, fmt='%.6f')

# Part 3. Submission

In [None]:
# Overwrite every column of the sample submission with the fold-averaged test
# predictions, write the result to the submission CSV, and preview the first rows.
sub[list(sub.columns)] = P_tst
sub.to_csv(path_or_buf=submission_file)
sub.head(n=5)

Hope this helps.