### In this Notebook I'm going to use PyTorch TabNet

In [None]:
!pip install pytorch-tabnet

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time

import optuna

import torch
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

import os

In [None]:
# Directory that holds the competition input files.
BASE_DIR = '/kaggle/input/tabular-playground-series-mar-2021'

# Full paths of the training and test CSV files inside BASE_DIR.
FILE_TRAIN = f'{BASE_DIR}/train.csv'
FILE_TEST = f'{BASE_DIR}/test.csv'

In [None]:
# Load the training CSV (path in FILE_TRAIN) into a DataFrame.
orig_data = pd.read_csv(FILE_TRAIN)

In [None]:
# Preview the first rows of the raw training data.
orig_data.head()

In [None]:
# Notebook-wide settings.
# FOLDS: presumably the number of cross-validation folds -- it is not used in
#        the cells visible here, TODO confirm against the rest of the notebook.
# PREDICTOR: name of the label column extracted from the train/validation frames.
FOLDS, PREDICTOR = 5, 'target'

# Columns that must never be fed to the model as features.
unused_feat = ['id']

In [None]:
# Work on a copy so the raw frame in `orig_data` stays untouched: the
# categorical columns of `data` are overwritten by the label encoding below.
data = orig_data.copy()

In [None]:
# Model input columns.
# Continuous features are named cont0 .. cont10.
num_col_list = [f'cont{i}' for i in range(11)]

# Categorical features are named cat0 .. cat18.
cat_col_list = [f'cat{i}' for i in range(19)]

# Numerical columns first, then the categorical ones.
all_col_list = [*num_col_list, *cat_col_list]

In [None]:
# Label-encode every categorical column of `data` in place.

# Snapshot of the per-column dtypes and cardinalities, taken before any
# column is overwritten.
types = data.dtypes
nunique = data.nunique()

# Same list object as cat_col_list, under the name used when building the
# TabNet categorical index/dimension lists.
categorical_columns = cat_col_list

# column name -> number of classes seen by its encoder
categorical_dims = {}

# Fitted encoders, in cat_col_list order; kept so the test set can be
# transformed with exactly the same mapping.
enc_list = []

for col in categorical_columns:
    # `nunique` was computed before any column was encoded, so it still
    # holds the cardinality of the raw column.
    print(col, nunique[col])

    l_enc = LabelEncoder()
    data[col] = l_enc.fit_transform(data[col].values)

    categorical_dims[col] = len(l_enc.classes_)
    enc_list.append(l_enc)

In [None]:
# Split `data` into a training part and a validation part.
FRAC = 0.8          # fraction of the rows that goes to the training set

# Fixed seed for the shuffle. Without it every run produces a different
# train/validation split, so validation scores (and therefore the Optuna
# study built on them) are not comparable between runs.
SPLIT_SEED = 42

N_TRAIN = int(data.shape[0] * FRAC)
N_VALID = data.shape[0] - N_TRAIN

# Shuffle all rows before cutting, so the split does not depend on file order.
data = data.sample(frac=1, random_state=SPLIT_SEED)

# Positional slicing: first N_TRAIN shuffled rows train, the rest validate.
df_train = data.iloc[:N_TRAIN]
df_valid = data.iloc[N_TRAIN:]

print('Number of records in train dataset:', N_TRAIN)
print('Number of records in validation dataset:', N_VALID)

In [None]:
# Extract the label arrays first: the column selection below drops the
# PREDICTOR column from both frames.
label_train, label_valid = df_train[PREDICTOR].values, df_valid[PREDICTOR].values

# Keep only the model input columns, in all_col_list order.
df_train, df_valid = df_train[all_col_list], df_valid[all_col_list]

In [None]:
# Feature names in the column order of df_train, without ids and the label.
features = [
    name for name in df_train.columns
    if name not in unused_feat and name != PREDICTOR
]

# Positions of the categorical features inside `features`.
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]

# Number of classes of each categorical feature, in the same order as cat_idxs.
cat_dims = [categorical_dims[name] for name in features if name in categorical_columns]

In [None]:
# Preview the training features after encoding and column selection.
df_train.head()

In [None]:
# Training-loop settings shared by every Optuna trial.
BATCH_SIZE = 4 * 1024   # passed to fit() as batch_size
EPOCHS = 30             # passed to fit() as max_epochs

In [None]:
def objective(trial):
    """Train one TabNet classifier with trial-sampled settings and score it.

    The trial samples ``N_D`` (also used for ``n_a``), ``GAMMA``, ``N_STEPS``
    and ``LAMBDA_SPARSE``; every other model and training setting is fixed.
    The model is fitted on the notebook-level ``df_train`` / ``label_train``
    and evaluated on both the training and the validation data each epoch.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The highest validation AUC recorded in the training history.
    """
    # Hyperparameters chosen by Optuna.
    n_d = trial.suggest_int('N_D', 8, 32)
    n_a = n_d  # attention width is tied to the decision width
    gamma = trial.suggest_float('GAMMA', 1.0, 2.0)
    # A step of 1 is the default; passing it positionally is deprecated.
    n_steps = trial.suggest_int('N_STEPS', 1, 3)
    # suggest_loguniform is deprecated; log=True samples the same
    # log-uniform range under the same parameter name.
    lambda_sparse = trial.suggest_float('LAMBDA_SPARSE', 1e-5, 1e-1, log=True)

    clf = TabNetClassifier(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        # StepLR: multiply the learning rate by 0.9 every 4 scheduler steps.
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        scheduler_params={"step_size": 4, "gamma": 0.9},
        mask_type='sparsemax',
        n_d=n_d,
        n_a=n_a,
        gamma=gamma,
        n_steps=n_steps,
        lambda_sparse=lambda_sparse,
    )

    clf.fit(
        df_train.values, label_train,
        eval_set=[(df_train.values, label_train), (df_valid.values, label_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['auc'],
        max_epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        patience=5,
    )

    # The score of a trial is the best validation AUC seen during training.
    score = np.max(clf.history['valid_auc'])

    return score

In [None]:
# Create a study that maximizes the value returned by `objective`
# (the best validation AUC of each trial).
study = optuna.create_study(
    study_name='tabnet-study1',
    direction='maximize',
)

# Stop after 100 trials or after 8 hours (timeout is given in seconds),
# whichever limit is reached first.
study.optimize(
    objective,
    n_trials=100,
    timeout=8 * 3600,
)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
from optuna.visualization import plot_optimization_history

# Plot the objective value of the study's trials; the call is the last
# expression of the cell so that the notebook displays the returned figure.
plot_optimization_history(study)

In [None]:
from optuna.visualization import plot_param_importances

# Plot the estimated importance of each sampled hyperparameter; the call is
# the last expression of the cell so that the notebook displays the figure.
plot_param_importances(study)

### Predictions on the test set