 # Load Dependencies

In [None]:
# Uninstall the `typing` package first (presumably the PyPI backport conflicts with
# the install below -- TODO confirm), then install pytorch_tabnet from GitHub.
!pip uninstall -y typing
# NOTE(review): this installs the tip of the `develop` branch with no commit pin,
# so a later re-run may pick up different library code.
!pip install  "git+https://github.com/dreamquark-ai/tabnet.git@develop#egg=pytorch_tabnet" --upgrade

In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
#from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder

#from pytorch_tabnet.multitask import TabNetMultiTaskClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
import torch

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Read the three competition files in one pass: the training rows, the test rows
# and the sample submission that is later used as the output template.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

In [None]:
# Encode the string labels "Class_1" ... "Class_9" as the integer codes 0 ... 8.
#
# A dictionary lookup is used instead of np.select: np.select falls back to its
# default value of 0 for anything that matches no condition, so an unexpected
# label -- or simply running this cell a second time, when the column already
# holds integers -- silently turned rows into class 0.
target_codes = {f"Class_{k}": k - 1 for k in range(1, 10)}

# Skip the conversion when the column is already integer-coded so that the cell
# can be re-run safely.
if not pd.api.types.is_integer_dtype(train["target"]):
    encoded_target = train["target"].map(target_codes)
    unknown_rows = encoded_target.isna()
    if unknown_rows.any():
        # Fail loudly instead of inventing a class for labels we do not know.
        raise ValueError(
            f"Unexpected target labels: {train.loc[unknown_rows, 'target'].unique().tolist()}"
        )
    train["target"] = encoded_target.astype(int)

# Simple preprocessing

In [None]:
# Stack train and test into one frame so the label encoding applied later sees
# every value of every feature. ignore_index=True gives the combined frame a
# unique row index instead of two overlapping ones.
full = pd.concat([train, test], axis=0, ignore_index=True)

# Cast the feature columns -- and only those -- to strings so they are treated
# as categorical. The previous `full.iloc[:, 1:].applymap(str)` also covered the
# `target` column, turning the labels into strings such as "0.0" / "nan", and
# `applymap` is deprecated in recent pandas releases.
feature_cols = [col for col in full.columns if col not in ("id", "target")]
full = full.astype({col: str for col in feature_cols})

In [None]:
# Snapshot of per-column dtype and cardinality, taken before any column is encoded.
types = full.dtypes
nunique = full.nunique()

# Names of the columns that get label-encoded, and the number of classes found in each.
categorical_columns, categorical_dims = [], {}

for col in full.columns.drop(['id', 'target']):
    # A column is encoded when it holds objects or has fewer than 200 distinct values.
    is_categorical = types[col] == 'object' or nunique[col] < 200
    if not is_categorical:
        continue

    print(col, full[col].nunique())

    # Missing values become their own "X" category, then every value is replaced
    # by its integer code.
    encoder = LabelEncoder()
    full[col] = encoder.fit_transform(full[col].fillna("X").values)

    categorical_columns.append(col)
    categorical_dims[col] = len(encoder.classes_)

# Define categorical features for embeddings

In [None]:
# Columns that are never fed to the model.
unused_feat = ['Set', 'id']

# Model inputs: every column of `full` except the unused ones and the label.
features = [col for col in full.columns if col not in unused_feat + ['target']]

# Position (within `features`) and number of classes of each categorical feature,
# kept in the same order so the two lists line up.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

# Embedding width per categorical feature: ceil(ln(number of classes)).
# - `int` replaces `np.int`, which was removed in NumPy 1.24 and raises AttributeError.
# - The width is clamped to at least 1 because a single-class feature gives ln(1) = 0,
#   i.e. a zero-width embedding.
cat_emb_dims = np.maximum(np.ceil(np.log(cat_dims)), 1).astype(int).tolist()

In [None]:
# Row masks over `full`: which rows carry an id from the training file and which
# carry an id from the test file.
train_rows = full.id.isin(train.id)
test_rows = full.id.isin(test.id)

# Pull the feature matrix and the label vector out once, then slice both with each mask.
feature_values = full[features].values
target_values = full['target'].values

X, y = feature_values[train_rows], target_values[train_rows]
X_test, y_test = feature_values[test_rows], target_values[test_rows]

# The unsliced arrays are not needed beyond this point.
del feature_values, target_values

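The LightAutoML predictions loaded below (out-of-fold for train, submission for test) are appended to the feature matrix as extra columns. They come from this notebook: https://www.kaggle.com/gomes555/tps-jun2021-lightautoml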

In [None]:
# Load the LightAutoML out-of-fold predictions (for the training rows) and its
# submission predictions (for the test rows).
oof_lightautoml = pd.read_csv('../input/tps-jun2021-lightautoml/oof_lightautoml.csv')
sub_lightautoml = pd.read_csv('../input/tps-jun2021-lightautoml/sub_lightautoml.csv')

# Drop the id column and give the nine prediction columns explicit names.
lightautoml_cols = ['pred_lightautoml' + str(i) for i in range(1, 10)]

oof_lightautoml = oof_lightautoml.drop('id', axis=1)
oof_lightautoml.columns = lightautoml_cols

sub_lightautoml = sub_lightautoml.drop('id', axis=1)
sub_lightautoml.columns = lightautoml_cols

# Sanity checks before stacking:
# - pd.concat(axis=1) pads with NaN when the row counts differ, which would
#   silently corrupt the feature matrix;
# - running this cell twice would append the prediction columns a second time.
# NOTE(review): rows are matched purely by position -- this assumes both files are
# in the same row order as train/test; confirm against the source notebook.
assert X.shape[1] == len(features) and X_test.shape[1] == len(features), \
    "LightAutoML columns already appended - rebuild X / X_test before re-running this cell"
assert len(oof_lightautoml) == len(X), "oof_lightautoml row count differs from X"
assert len(sub_lightautoml) == len(X_test), "sub_lightautoml row count differs from X_test"

# Append the predictions to the right of the existing features, so the
# categorical column indices computed earlier stay valid.
X = pd.concat([pd.DataFrame(X), oof_lightautoml], axis=1).values
X_test = pd.concat([pd.DataFrame(X_test), sub_lightautoml], axis=1).values

# Self-Supervised Training

In [None]:
# TabNetPretrainer
# All settings are collected in one dictionary first, then handed to the constructor.
pretrainer_config = dict(
    # network size
    n_d=64,
    n_a=64,
    n_steps=3,
    n_independent=1,
    n_shared=1,
    # categorical embeddings
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dims,
    # attention / sparsity
    gamma=1.2,
    lambda_sparse=0.,
    mask_type='sparsemax',
    # optimisation: Adam, with the learning rate halved on plateau down to min_lr
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    scheduler_params=dict(mode="min", patience=3, min_lr=1e-5, factor=0.5),
    verbose=1,
)
unsupervised_model = TabNetPretrainer(**pretrainer_config)

In [None]:
# Fit the pretrainer. Note the roles: the test matrix is the training input and
# the train matrix is the evaluation set.
# NOTE(review): presumably intentional, since pretraining uses no labels -- confirm.
unsupervised_model.fit(
    X_train=X_test,
    eval_set=[X],
    pretraining_ratio=0.5,
    # stopping criteria
    max_epochs=30,
    patience=25,
    # batching / data loading
    batch_size=256,
    virtual_batch_size=256,
    drop_last=True,
    num_workers=1,
)

In [None]:
# Run the pretrained model over X; it returns the reconstruction together with
# the embedded input, and the two are expected to have the same shape.
pretrain_outputs = unsupervised_model.predict(X)
reconstructed_X, embedded_X = pretrain_outputs
assert reconstructed_X.shape == embedded_X.shape

In [None]:
#unsupervised_explain_matrix, unsupervised_masks = unsupervised_model.explain(X)

In [None]:
#fig, axs = plt.subplots(1, 3, figsize=(20,20))
#
#for i in range(3):
#    axs[i].imshow(unsupervised_masks[i][:50])
#    axs[i].set_title(f"mask {i}")

In [None]:
## Save and load
# The model is saved under a base path and read back from the matching .zip file
# into a fresh pretrainer, which is the object handed to the classifier later.
pretrain_base_path = './test_pretrain'
pretrain_archive_path = './test_pretrain.zip'

unsupervised_model.save_model(pretrain_base_path)

loaded_pretrain = TabNetPretrainer()
loaded_pretrain.load_model(pretrain_archive_path)

# Baseline

In [None]:
N_SPLITS=5


def make_tabnet_classifier():
    """Return a new, unfitted TabNetClassifier with the baseline settings.

    The optimizer and scheduler dictionaries are rebuilt on every call, so each
    fold receives its own copies.
    """
    optimizer_params = dict(lr=1e-1, weight_decay=1e-5)
    scheduler_params = dict(mode='min',
                            factor=0.5,
                            patience=3,
                            is_batch_level=False,)
    return TabNetClassifier(
        # network size
        n_d=64,
        n_a=64,
        n_steps=3,
        n_independent=1,
        n_shared=1,
        # attention / regularisation
        gamma=1.2,
        lambda_sparse=1e-5,
        mask_type='sparsemax',
        clip_value=2,
        seed=0,
        # categorical embeddings
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=cat_emb_dims,
        # optimisation
        optimizer_fn=torch.optim.Adam,
        optimizer_params=optimizer_params,
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        scheduler_params=scheduler_params,
        verbose=1,
    )


# Stratified K-fold cross-validation. `tab_oof` collects the out-of-fold class
# probabilities for every training row; `tab_pred` accumulates the average of the
# per-fold test predictions.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=2021, shuffle=True)
tab_oof = np.zeros((X.shape[0], 9))
tab_pred = 0

for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[valid_index], y[valid_index]

    # Timer covers model construction, fitting and prediction for this fold.
    start = time.time()

    clf = make_tabnet_classifier()
    clf.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['logloss'],
        # stopping criteria
        max_epochs=100,
        patience=10,
        # batching / data loading
        batch_size=2048,
        virtual_batch_size=256,
        drop_last=True,
        num_workers=0,
        pin_memory=True,
        # start from the pretrained model loaded earlier
        from_unsupervised=loaded_pretrain,
    )

    # Store this fold's validation probabilities and add its share of the test prediction.
    tab_oof[valid_index, :] = clf.predict_proba(X_valid)
    tab_pred += clf.predict_proba(X_test) / N_SPLITS

    tab_logloss = log_loss(y_valid, tab_oof[valid_index])
    print(f"score: {tab_logloss:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")

    del clf

# Overall score across all out-of-fold predictions.
tab_logloss = log_loss(y, tab_oof)
print(f"Final logloss score: {tab_logloss} ✔️ ")

In [None]:
# Overwrite every column after the first in the sample-submission frame with the
# fold-averaged test predictions, then write the result without the row index.
# (Presumably the first column is the id and the rest are the nine class columns --
# verify against sample_submission.csv.)
sub.iloc[:, 1:] = tab_pred
sub.to_csv("sub_tab_default.csv", index=False)

In [None]:
# Save the out-of-fold predictions: the training ids side by side with one
# probability column per class.
class_columns = ["Class_1", "Class_2", "Class_3",
                 "Class_4", "Class_5", "Class_6",
                 "Class_7", "Class_8", "Class_9"]
oof_probabilities = pd.DataFrame(tab_oof, columns=class_columns)
oof_tab = pd.concat([train.id, oof_probabilities], axis=1)

# NOTE(review): the file name says "optuned", yet the matching submission is
# written as "sub_tab_default.csv" -- confirm which name downstream notebooks expect.
oof_tab.to_csv("oof_tab_optuned.csv", index=False)

# Draft

In [None]:
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=314, stratify=y)
#
#X_train, y_train = X_train.values, y_train.values
#X_val, y_val = X_val.values, y_val.values
#
#scaler = MinMaxScaler()
#scaler.fit(X_train)
#
#X_train = scaler.transform(X_train)
#X_val = scaler.transform(X_val)
#X_test = scaler.transform(X_test)

In [None]:
## TabNetPretrainer
#unsupervised_model = TabNetPretrainer(
#    optimizer_fn=torch.optim.Adam,
#    optimizer_params=dict(lr=2e-2),
#    mask_type='sparsemax'
#)
#
#unsupervised_model.fit(
#    X_train=X_train,
#    eval_set=[X_val],
#    pretraining_ratio=0.5,
#)

In [None]:
#model = TabNetClassifier(verbose = 1)
#
#model.fit(
#    X_train=X_train, y_train=y_train,
#    eval_set=[(X_train, y_train), (X_val, y_val)],
#    eval_name=['train', 'val'],
#    eval_metric=['logloss'],
#    max_epochs=30, 
#    patience=15,
#    from_unsupervised=unsupervised_model
#)

In [None]:
#fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
#
## plot losses
#axs[0].plot(model.history['loss'])
#
## plot logloss
#axs[1].plot(model.history['train_logloss'])
#axs[1].plot(model.history['val_logloss'])
#
## plot learning rates
#axs[2].plot(model.history['lr'])
#
#fig.tight_layout()
#plt.show()

In [None]:
#val_preds = model.predict_proba(X_val)
#val_logloss = [log_loss(y_pred=task_pred, y_true=y_val[:,1])
#             for task_idx, task_pred in enumerate(val_preds)]
#
#np.mean(val_logloss)

In [None]:
#task_preds = model.predict_proba(X_test)
#tab_pred = np.mean(task_preds, axis=0)

In [None]:
#sub.iloc[:, 1:] = tab_pred
#sub.to_csv("sub_tab_default.csv", index=False)

# Optuna

In [None]:
#def objective(trial):
#    
#    global X, y, X_test
#    
#    hyperparams = {
#        'n_a_d': trial.suggest_categorical('n_a_d', [8, 16, 24, 32, 64, 128]),
#        'n_steps': trial.suggest_int('n_steps', 3, 10, 1),
#        'gamma': trial.suggest_categorical('gamma', [1.0, 1.2, 1.5, 2.0]),
#        'lambda': trial.suggest_categorical('lambda', [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0]),
#        'batch_size': trial.suggest_categorical('batch_size', [1024, 2048, 4096, 8192, 16384, 32768]),
#        'virtual_batch_size': trial.suggest_categorical('virtual_batch_size', [128, 256, 512, 1024]),
#        'lr': trial.suggest_categorical('lr', [0.005, 0.01, 0.02, 0.025]),
#        'gamma_decay': trial.suggest_categorical('gamma_decay', [0.4, 0.8, 0.9, 0.95]),
#        #'mask_type': trial.suggest_categorical('mask_type', ['entmax', 'sparsemax']),
#        'batch_momentum': trial.suggest_categorical('batch_momentum', [0.6, 0.7, 0.8, 0.9, 0.95, 0.98]),
#    }
#    
#    model = TabNetMultiTaskClassifier(
#        n_d=hyperparams['n_a_d'],
#        n_a=hyperparams['n_a_d'],
#        gamma=hyperparams['gamma'],
#        optimizer_fn=torch.optim.Adam,
#        optimizer_params={'lr':hyperparams['lr']},
#        scheduler_params={"step_size":hyperparams['n_steps'],
#                          "gamma":hyperparams['gamma_decay']},
#        scheduler_fn=torch.optim.lr_scheduler.StepLR,
#        mask_type='entmax',
#        lambda_sparse=hyperparams['lambda'],
#        momentum=hyperparams['batch_momentum'],
#        verbose = 0
#    )
#
#    model.fit(
#        X_train=X_train, y_train=y_train,
#        eval_set=[(X_train, y_train), (X_val, y_val)],
#        eval_name=['train', 'val'],
#        max_epochs=MAX_EPOCHS, 
#        patience=PATIENCE,
#        batch_size=hyperparams['batch_size'],
#        virtual_batch_size=hyperparams['virtual_batch_size'],
#        num_workers=0,
#        drop_last=False
#    )
#
#    val_preds = model.predict_proba(X_val)
#    val_logloss = [log_loss(y_pred=task_pred, y_true=y_val[:,task_idx])
#                 for task_idx, task_pred in enumerate(val_preds)]
#    
#    del model
#
#    return np.mean(val_logloss)

In [None]:
#study = optuna.create_study(direction='minimize',
#                            sampler=optuna.samplers.TPESampler(multivariate=True, seed=123))
#
#study.optimize(objective, 
#               timeout=60*60*6, 
#               #n_trials=2, 
#               gc_after_trial=False)

In [None]:
#study.best_value

In [None]:
#plot_optimization_history(study)

In [None]:
#optuna.visualization.plot_parallel_coordinate(study)

In [None]:
#plot_param_importances(study)

In [None]:
#study.best_params

# Final Model

In [None]:
#tab_oof = np.zeros((X.shape[0], 9))
#tab_pred = 0
#
#for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
#    print(f"➜ FOLD :{fold}")
#    X_train = X.values[train_idx]
#    y_train = y.values[train_idx]
#    X_val = X.values[val_idx]
#    y_val = y.values[val_idx]
#    
#    y_train = y_train.reshape(-1, 1)
#    y_train = np.hstack([y_train]*NB_TASKS)
#
#    y_val = y_val.reshape(-1, 1)
#    y_val = np.hstack([y_val]*NB_TASKS)
#
#    scaler = StandardScaler()
#    scaler.fit(X_train)
#    
#    X_train = scaler.transform(X_train)
#    X_val = scaler.transform(X_val)
#    X_test = scaler.transform(X_test)
#    
#    
#    start = time.time()
#    
#    model = TabNetMultiTaskClassifier(
#        n_d=study.best_params['n_a_d'],
#        n_a=study.best_params['n_a_d'],
#        optimizer_fn=torch.optim.Adam,
#        optimizer_params=dict(lr=0.2),
#        scheduler_params={"step_size":study.best_params['n_steps'],
#                          "gamma":study.best_params['gamma']},
#        scheduler_fn=torch.optim.lr_scheduler.StepLR,
#        mask_type='entmax',
#        lambda_sparse=study.best_params['lambda'],
#        momentum=study.best_params['batch_momentum'],
#        verbose = 1
#    )
#    
#    model.fit(
#        X_train=X_train, y_train=y_train,
#        eval_set=[(X_train, y_train), (X_val, y_val)],
#        eval_name=['train', 'val'],
#        max_epochs=MAX_EPOCHS, 
#        patience=PATIENCE,
#        batch_size=BATCH_SIZE, 
#        virtual_batch_size=VIRTUAL_BATCH_SIZE,
#        num_workers=0,
#        drop_last=False
#    )
#    
#    val_preds = model.predict_proba(X_val)
#    val_logloss = [log_loss(y_pred=task_pred, y_true=y_val[:,task_idx])
#                 for task_idx, task_pred in enumerate(val_preds)]
#    
#    tab_oof[val_idx,:] = np.mean(val_preds, axis=0)
#    
#    task_pred = model.predict_proba(X_test)
#    tab_pred += np.mean(task_pred, axis=0) / K
#    
#    print(f"score: {np.mean(val_logloss):.6f} ")
#    print(f"elapsed: {time.time()-start:.2f} sec\n")
#    
#    del model
#
#tab_logloss = log_loss(y, tab_oof)
#print(f"Final logloss score: {tab_logloss} ✔️ ")

In [None]:
#PATIENCE = 15

In [None]:
#final_model = TabNetMultiTaskClassifier(
#    n_d=study.best_params['n_a_d'],
#    n_a=study.best_params['n_a_d'],
#    gamma=study.best_params['gamma'],
#    optimizer_fn=torch.optim.Adam,
#    optimizer_params=dict(lr=study.best_params['lr']),
#    scheduler_params={"step_size":study.best_params['n_steps'],
#                      "gamma":study.best_params['gamma_decay']},
#    scheduler_fn=torch.optim.lr_scheduler.StepLR,
#    mask_type='entmax',
#    lambda_sparse=study.best_params['lambda'],
#    momentum=study.best_params['batch_momentum'],
#    verbose = 1
#)
#
#final_model.fit(
#    X_train=X_train, y_train=y_train,
#    eval_set=[(X_train, y_train), (X_val, y_val)],
#    eval_name=['train', 'val'],
#    max_epochs=MAX_EPOCHS, 
#    patience=PATIENCE,
#    batch_size=study.best_params['batch_size'], 
#    virtual_batch_size=study.best_params['virtual_batch_size']
#)
#
#task_pred = final_model.predict_proba(X_test)
#tab_pred = np.mean(task_pred, axis=0)

In [None]:
#fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 5))
#
## plot losses
#axs[0].plot(final_model.history['loss'])
#
## plot logloss
#axs[1].plot(final_model.history['train_logloss'])
#axs[1].plot(final_model.history['val_logloss'])
#
## plot learning rates
#axs[2].plot(final_model.history['lr'])
#
#fig.tight_layout()
#plt.show()

# Submission

In [None]:
#sub.iloc[:, 1:] = tab_pred
#sub.to_csv("sub_tab_optuned.csv", index=False)

In [None]:
#oof_tab = pd.concat([train.id,
#                     pd.DataFrame(tab_oof, 
#                              columns=["Class_1", "Class_2", "Class_3",
#                                       "Class_4", "Class_5", "Class_6",
#                                       "Class_7", "Class_8", "Class_9"])],
#                    axis=1)
#oof_tab.to_csv("oof_tab_optuned.csv", index=False)

# Sources

- https://arxiv.org/pdf/1908.07442.pdf
- https://reposhub.com/python/deep-learning/dreamquark-ai-tabnet.html
- https://towardsdatascience.com/modelling-tabular-data-with-googles-tabnet-ba7315897bfb
- https://github.com/google-research/google-research/blob/master/tabnet/tabnet_model.py
- https://github.com/hussius/tabnet_fork/blob/master/opt_tabnet.py
- https://www.kaggle.com/optimo/tabnetbaseline/