## Pip install pytorch-tabnet

In [None]:
!pip install pytorch-tabnet

In [None]:
import numpy as np 
import pandas as pd 
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import plotly.express as px
from matplotlib import pyplot as plt

In [None]:
# Read the train, test and sample-submission tables in one pass.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2021/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/test.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv',
    )
)

# Feature groups are identified by their column-name prefix.
CAT_COLS = [name for name in df_train.columns if name.startswith("cat")]
NUM_COLS = [name for name in df_train.columns if name.startswith("cont")]

# Categories seen fewer than this many times in train are pooled together.
LOW_FREQ_THRESH = 50

# One fitted LabelEncoder per categorical column, keyed by column name.
encoders = {}
for cat_col in CAT_COLS:
    # Pool rare training categories into a single "low_frequency" value,
    # applying the same replacement to the test set.
    counts = df_train[cat_col].value_counts()
    low_freq_values = counts[counts < LOW_FREQ_THRESH].index
    if len(low_freq_values) > 0:
        for frame in (df_train, df_test):
            rare_rows = frame[cat_col].isin(low_freq_values)
            frame.loc[rare_rows, cat_col] = "low_frequency"

    # Integer-encode the training column and keep the encoder for later use.
    encoders[cat_col] = LabelEncoder()
    df_train[cat_col] = encoders[cat_col].fit_transform(df_train[cat_col])
    
# Encode the test set with the encoders fitted on the training set.
for cat_col in CAT_COLS:
    label_enc = encoders[cat_col]
    known_classes = label_enc.classes_
    le_dict = dict(zip(known_classes, label_enc.transform(known_classes)))
    # Values unknown to the encoder fall back to the "low_frequency" code when
    # that bucket exists, otherwise to the most common (already encoded)
    # training value. Another fallback might make more sense.
    default_val = (
        le_dict["low_frequency"]
        if "low_frequency" in le_dict
        else df_train[cat_col].mode().values[0]
    )
    df_test[cat_col] = df_test[cat_col].apply(lambda raw: le_dict.get(raw, default_val))
    
# Numerical features: clip the test values to the training range, then add a
# 25-quantile binned copy of each feature as an extra categorical column.
# Binning taken from https://www.kaggle.com/siavrez/kerasembeddings
for num_col in NUM_COLS:
    train_values = df_train[num_col]
    df_test[num_col] = np.clip(df_test[num_col], train_values.min(), train_values.max())

    binned_col = f'q_{num_col}'
    # Bin edges are computed on train and reused unchanged for test.
    df_train[binned_col], bins_ = pd.qcut(train_values, 25, retbins=True, labels=list(range(25)))
    df_test[binned_col] = pd.cut(df_test[num_col], bins=bins_, labels=False, include_lowest=True)
    CAT_COLS.append(binned_col)

# CAT_COLS now also holds the binned columns appended above.
FEATURES = CAT_COLS + NUM_COLS

In [None]:
# Number of distinct training values of every categorical feature, and the
# position of each categorical feature inside FEATURES.
cat_dims = df_train[CAT_COLS].nunique().to_list()
cat_idxs = [FEATURES.index(cat_col) for cat_col in CAT_COLS]
# Embedding size per feature: half the cardinality, rounded up, kept in [1, 50].
# The builtin `int` replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (it now raises AttributeError).
cat_emb_dims = np.ceil(np.clip(np.array(cat_dims) / 2, a_min=1, a_max=50)).astype(int).tolist()

In [None]:
# NumPy arrays of the model inputs (same column order for train and test)
# and of the training target.
X, X_test = (frame[FEATURES].values for frame in (df_train, df_test))
y = df_train["target"].values

### Let's try to use information contained in test set to pretrain models

The code below fits the pretrainer on X_test and uses X as the evaluation set. As the pretraining seems quite smooth, it might help to try pretraining on both X and X_test to learn from more data.

In [None]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# Hyper-parameters. The architecture values (N_D ... GAMMA) are read again when
# the classifier is built; BS and MAX_EPOCH are reassigned before that step.
N_D = 16
N_A = 16
N_INDEP = 2
N_SHARED = 2
N_STEPS = 1
MASK_TYPE = "sparsemax"
GAMMA = 1.5
BS = 512
MAX_EPOCH = 20
PRETRAIN = True


if PRETRAIN:
    pretrain_params = {
        "n_d": N_D,
        "n_a": N_A,
        "n_steps": N_STEPS,
        "n_independent": N_INDEP,
        "n_shared": N_SHARED,
        "cat_idxs": cat_idxs,
        "cat_dims": cat_dims,
        "cat_emb_dim": cat_emb_dims,
        "gamma": GAMMA,
        "lambda_sparse": 0.,
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": 2e-2},
        "mask_type": MASK_TYPE,
        "scheduler_params": {
            "mode": "min",
            "patience": 3,
            "min_lr": 1e-5,
            "factor": 0.5,
        },
        "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
        "verbose": 1,
    }

    pretrainer = TabNetPretrainer(**pretrain_params)

    # Fit on the test features and evaluate on the train features.
    pretrainer.fit(
        X_train=X_test,
        eval_set=[X],
        max_epochs=MAX_EPOCH,
        patience=25,
        batch_size=BS,
        virtual_batch_size=BS,
        num_workers=0,
        drop_last=True,
        # The bigger your pretraining_ratio the harder it is to reconstruct
        pretraining_ratio=0.5,
    )

In [None]:
BS = 2048
MAX_EPOCH = 20
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)

# Out-of-fold predictions, filled fold by fold with validation predictions.
df_train['oof_preds'] = np.nan

# Classifier settings that are identical for every fold.
base_tabnet_params = {
    "n_d": N_D,
    "n_a": N_A,
    "n_steps": N_STEPS,
    "gamma": GAMMA,
    "n_independent": N_INDEP,
    "n_shared": N_SHARED,
    "lambda_sparse": 1e-5,
    "seed": 0,
    "clip_value": 2,
    "cat_idxs": cat_idxs,
    "cat_dims": cat_dims,
    "cat_emb_dim": cat_emb_dims,
    "mask_type": MASK_TYPE,
    "device_name": 'auto',
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": {"lr": 5e-2, "weight_decay": 1e-5},
    "scheduler_fn": torch.optim.lr_scheduler.OneCycleLR,
    "verbose": 1,
}

fold_nb = 1
for train_index, valid_index in skf.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[valid_index], y[valid_index]

    # The scheduler's steps_per_epoch depends on the size of this fold's
    # training split, so the scheduler settings are built inside the loop.
    tabnet_params = dict(
        base_tabnet_params,
        scheduler_params={
            "max_lr": 5e-2,
            "steps_per_epoch": int(X_train.shape[0] / BS),
            "epochs": MAX_EPOCH,
            "is_batch_level": True,
        },
    )

    # Defining TabNet model
    model = TabNetClassifier(**tabnet_params)

    model.fit(
        X_train=X_train,
        y_train=y_train,
        # Start from the pretrainer when pretraining is enabled.
        from_unsupervised=pretrainer if PRETRAIN else None,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=["train", "valid"],
        eval_metric=["auc"],
        batch_size=BS,
        virtual_batch_size=256,
        max_epochs=MAX_EPOCH,
        drop_last=True,
        pin_memory=True,
        patience=10,
    )

    # Last column of predict_proba, scored on the held-out part of the fold.
    val_preds = model.predict_proba(X_valid)[:, -1]
    print('auc:', roc_auc_score(y_true=y_valid, y_score=val_preds))

    df_train.loc[valid_index, 'oof_preds'] = val_preds

    # One test-prediction column per fold, stored in the submission frame.
    test_preds = model.predict_proba(X_test)[:, -1]
    df_sub[f"fold_{fold_nb}"] = test_preds
    fold_nb += 1

In [None]:
# AUC of the out-of-fold predictions against the training target.
global_auc = roc_auc_score(df_train["target"], df_train["oof_preds"])
print(f"Global AUC score : {global_auc:.4f}")

# A few plots to understand one model (last fold)

In [None]:
# Global feature importance of `model`, plotted in ascending order.
importance_order = np.argsort(model.feature_importances_)
ranked_features = [FEATURES[pos] for pos in importance_order]
ranked_importances = model.feature_importances_[importance_order]

fig = px.bar(x=ranked_features, y=ranked_importances,
             title="Global feature importance")
fig.update_xaxes(type='category', title='Feature Name')
fig.update_yaxes(title='Importance')

In [None]:
LIMIT_EXPLAIN = 60000

# Explain at most LIMIT_EXPLAIN rows of X_valid.
explain_mat, masks = model.explain(X_valid[:LIMIT_EXPLAIN])

# Rescale every row so that its importances sum to 1.
row_totals = explain_mat.sum(axis=1).reshape(-1, 1)
normalized_explain_mat = explain_mat / row_totals

# Append the prediction as a last column to better understand the
# correlation between features and predictions.
preds_column = val_preds[:LIMIT_EXPLAIN].reshape(-1, 1)
explain_and_preds = np.hstack([normalized_explain_mat, preds_column])

In [None]:
# Order the rows by prediction (last column) before drawing the heatmap.
rows_by_prediction = explain_and_preds[:, -1].argsort()
heatmap_labels = {"x": "Features", "y": "Samples", "color": "Importance"}

px.imshow(explain_and_preds[rows_by_prediction],
          labels=heatmap_labels,
          x=FEATURES + ["prediction"],
          title="Sample wise feature importance (reality is more complex than global feature importance)")

# How to read this

It looks like only a few features are enough to get a pretty decent score...

This definitely should be investigated!

In [None]:
# Keep only the columns that are exactly zero in fewer than 90% of the rows.
# The threshold is based on the number of rows actually explained, which can
# be smaller than LIMIT_EXPLAIN when the validation fold is short; using
# LIMIT_EXPLAIN there would let all-zero columns through and produce NaN
# correlations.
n_explained = explain_and_preds.shape[0]
non_null_idx = (explain_and_preds == 0).sum(axis=0) < 0.9 * n_explained
kept_names = np.array(FEATURES + ["prediction"])[non_null_idx]

# Pairwise correlation between the kept columns (features and prediction).
correlation_importance = np.corrcoef(explain_and_preds[:, non_null_idx].T)

px.imshow(correlation_importance,
          labels=dict(x="Features", y="Features", color="Correlation"),
          x=kept_names, y=kept_names,
          title="Correlation between attention mechanism for each feature and predictions")

# Create submission file

In [None]:
# Average the per-fold prediction columns into `target`, then drop them.
fold_cols = [col for col in df_sub.columns if col.startswith("fold")]
final_sub = (
    df_sub
    .assign(target=df_sub[fold_cols].mean(axis=1))
    .drop(columns=fold_cols)
)

In [None]:
# Write the averaged predictions to submission.csv, without the index column.
final_sub.to_csv('submission.csv', index=False)