# About this notebook

This is a very simple baseline showing how to use TabNet, as I saw some starter notebooks with very poor results.

There is nothing very specific to this competition in this notebook. It is a simple adaptation of a previous notebook I shared for another competition: https://www.kaggle.com/optimo/tabnetregressor-baseline


In [None]:
!pip install pytorch-tabnet

# install develop branch
# !pip install  "git+https://github.com/dreamquark-ai/tabnet.git@develop#egg=pytorch_tabnet" --upgrade

In [None]:
import numpy as np 
import pandas as pd 
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import plotly.express as px
from matplotlib import pyplot as plt

In [None]:
# Read the competition tables: training data, test data and the submission template.
df_train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2021/train.csv'
)
df_test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2021/test.csv'
)
df_sub = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv'
)

# Zero out every "Class*" column of the template so that fold predictions
# can later be accumulated into it.
df_sub[df_sub.columns[df_sub.columns.str.startswith("Class")].tolist()] = 0

In [None]:
# Alternative setup: treat every feature as numerical.
# CAT_COLS = []
# NUM_COLS = [c for c in df_train.columns if c.startswith("feature_")]

# Setup used here: every "feature_*" column is handled as a categorical variable.
# This might be useful, and it is the only trick of this notebook.
NUM_COLS = []
CAT_COLS = list(filter(lambda name: name.startswith("feature_"), df_train.columns))

# Column order fed to the model: categorical columns first, then numerical ones.
FEATURES = [*CAT_COLS, *NUM_COLS]


# TabNet expects categorical columns as integer codes, so each one gets its own
# LabelEncoder fitted on the training data. The fitted encoders are kept in
# `encoders` (keyed by column name) so the test set can reuse the same mapping.
encoders = {}
for cat_col in CAT_COLS:
    label_enc = LabelEncoder().fit(df_train[cat_col])
    encoders[cat_col] = label_enc
    # Overwrite the training column with its integer codes.
    df_train[cat_col] = label_enc.transform(df_train[cat_col])
    
# Apply the encoders fitted on train to the test set.
for cat_col in CAT_COLS:
    label_enc = encoders[cat_col]
    # Explicit raw value -> integer code lookup, so unseen values can be handled by hand.
    le_dict = {
        raw_value: code
        for raw_value, code in zip(label_enc.classes_,
                                   label_enc.transform(label_enc.classes_))
    }
    # Fallback code for values never seen in train: the code of a "low_frequency"
    # class when the encoder has one, otherwise the most common (already encoded)
    # training value. Another fallback value might make more sense.
    default_val = le_dict.get("low_frequency")
    if default_val is None:
        default_val = df_train[cat_col].mode().values[0]
    df_test[cat_col] = df_test[cat_col].apply(lambda x: le_dict.get(x, default_val))
    
# Keep numerical test values inside the [min, max] range observed in train.
for num_col in NUM_COLS:
    df_test[num_col] = df_test[num_col].clip(
        lower=df_train[num_col].min(),
        upper=df_train[num_col].max(),
    )

In [None]:
# Number of distinct (encoded) values of each categorical column, in CAT_COLS order.
cat_dims = df_train[CAT_COLS].nunique().to_list()
# Position of each categorical column inside the FEATURES matrix.
cat_idxs = [FEATURES.index(cat_col) for cat_col in CAT_COLS]
# Embedding size per categorical column: ceil(ln(cardinality)).
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (it raises AttributeError there).
cat_emb_dims = np.ceil(np.log(cat_dims)).astype(int).tolist()
# cat_emb_dims = np.ceil(np.clip((np.array(cat_dims)) / 2, a_min=1, a_max=50)).astype(int).tolist()
# cat_emb_dims=1

# Plain numpy arrays for training: feature matrix, target vector, test feature matrix.
X = df_train[FEATURES].values
y = df_train["target"].values

X_test = df_test[FEATURES].values

## Pretraining

In [None]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# Network hyper-parameters, used by both the pretrainer and the classifier.
N_D = N_A = 64              # passed as n_d / n_a (32 was also tried)
N_INDEP = N_SHARED = 1      # passed as n_independent / n_shared (2 was also tried)
N_STEPS = 3                 # passed as n_steps (2 was also tried)
GAMMA = 1.2                 # passed as gamma
MASK_TYPE = "sparsemax"     # passed as mask_type

# Pretraining run settings.
BS = 256                    # used as batch_size and virtual_batch_size for pretraining
MAX_EPOCH = 30              # used as max_epochs for pretraining
PRETRAIN = True             # switch the self-supervised pretraining step on or off


if PRETRAIN:
    # Keyword arguments for the self-supervised TabNet pretrainer. The
    # architecture values are the shared constants; sparsity regularisation is
    # disabled (lambda_sparse=0.) and the learning rate is halved when the
    # monitored loss plateaus.
    pretrain_params = {
        "n_d": N_D,
        "n_a": N_A,
        "n_steps": N_STEPS,
        "n_independent": N_INDEP,
        "n_shared": N_SHARED,
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": cat_emb_dims,
        "gamma": GAMMA,
        "lambda_sparse": 0.,
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": 2e-2},
        "mask_type": MASK_TYPE,
        "scheduler_params": {
            "mode": "min",
            "patience": 3,
            "min_lr": 1e-5,
            "factor": 0.5,
        },
        "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
        "verbose": 1,
    }

    pretrainer = TabNetPretrainer(**pretrain_params)

    # The pretrainer is fitted on the test features and evaluated on the
    # training features.
    pretrainer.fit(
        X_train=X_test,
        eval_set=[X],
        max_epochs=MAX_EPOCH,
        patience=25,
        batch_size=BS,
        virtual_batch_size=BS,  # 128 was also tried
        num_workers=1,
        drop_last=True,
        # The bigger your pretraining_ratio the harder it is to reconstruct
        pretraining_ratio=0.5,
    )

## Actual training

In [None]:
# Settings for the supervised training stage.
BS = 2048               # batch size for the classifier
MAX_EPOCH = 100         # upper bound on training epochs per fold
LAMBDA_SPARSE = 1e-5    # passed as lambda_sparse to the classifier

# Cross-validation layout: N_SPLITS folds are created and the first NB_FOLDS
# of them are trained (NB_FOLDS must not exceed N_SPLITS).
N_SPLITS = NB_FOLDS = 5
# Stratified, shuffled K-fold split with a fixed seed so folds are reproducible.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=2021, shuffle=True)


LR = 1e-1 # 5e-2

# Per-fold test probabilities are summed in a local array instead of being
# added in place to df_sub: the in-place accumulation kept stacking new
# predictions on top of the previous (already averaged) result every time
# this cell was re-run, silently corrupting the submission.
test_preds_sum = None
fold_nb = 1
for train_index, valid_index in skf.split(X, y):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    tabnet_params = dict(n_d=N_D,
                         n_a=N_A,
                         n_steps=N_STEPS, gamma=GAMMA,
                         n_independent=N_INDEP, n_shared=N_SHARED,
                         lambda_sparse=LAMBDA_SPARSE,
                         seed=0,
                         clip_value=2,
                         cat_idxs=cat_idxs,
                         cat_dims=cat_dims,
                         cat_emb_dim=cat_emb_dims,
                         mask_type=MASK_TYPE,
                         device_name='auto',
                         optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=LR, weight_decay=1e-5),
                         # Alternative scheduler that was tried (OneCycleLR):
                         # scheduler_params=dict(max_lr=LR,
                         #                       steps_per_epoch=int(X_train.shape[0] / BS),
                         #                       epochs=MAX_EPOCH,
                         #                       #final_div_factor=100,
                         #                       is_batch_level=True),
                         # scheduler_fn=torch.optim.lr_scheduler.OneCycleLR,
                         # Scheduler in use: halve the learning rate on plateau.
                         scheduler_params=dict(mode='min',
                                               factor=0.5,
                                               patience=3,
                                               is_batch_level=False,),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=1)
    # Defining TabNet model
    model = TabNetClassifier(**tabnet_params)

    # Train on the fold, optionally starting from the pretrained weights;
    # log loss is tracked on both the train and validation parts.
    model.fit(X_train=X_train, y_train=y_train,
              from_unsupervised=pretrainer if PRETRAIN else None,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_name=["train", "valid"],
              eval_metric=["logloss"],
              batch_size=BS,
              virtual_batch_size=256,
              max_epochs=MAX_EPOCH,
              drop_last=True,
              pin_memory=True,
              patience=10,
             )

    test_preds = model.predict_proba(X_test)
    if test_preds_sum is None:
        test_preds_sum = test_preds
    else:
        test_preds_sum = test_preds_sum + test_preds
    fold_nb+=1

    if fold_nb > NB_FOLDS:
        break

# Average over the folds that were actually trained. This equals NB_FOLDS in
# the normal case, but stays correct if NB_FOLDS is set larger than N_SPLITS
# (the loop then stops after N_SPLITS folds).
n_trained_folds = fold_nb - 1
df_sub[list(model.classes_)] = test_preds_sum / n_trained_folds


In [None]:
# Write the averaged predictions to disk without the DataFrame index.
# `index` is documented as a boolean, so pass False explicitly instead of
# relying on None being falsy.
df_sub.to_csv('submission.csv', index=False)