# Example making hyperparameter tuning for Tab Net

Author: Plamen Pasliev

File from [github.com](https://github.com/plamenpasliev/tabnet/blob/adding-gridsearch-example/hyperparameter_optimization_example.ipynb).

> Note: In this notebook, I am studying the article mentioned above. Some changes may have been made to the code during its implementation.

In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

# Download census-income dataset

In [9]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_name = 'census-income'
out = Path(os.getcwd()+'/datasets/'+dataset_name+'.csv')

In [10]:
out.parent.mkdir(parents=True, exist_ok=True)
if out.exists():
    print("File already exists.")
else:
    print("Downloading file...")
    wget.download(url, out.as_posix())

File already exists.


# Load data and split

In [11]:
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "target",
]

In [12]:
train = pd.read_csv(out, names=cols)
target = 'target'
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid", "test"], p =[.8, .1, .1], size=(train.shape[0],))

train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index
test_indices = train[train.Set=="test"].index

In [13]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,Set
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,train
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,train
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,train
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,train
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,train


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  target          32561 non-null  object
 15  Set             32561 non-null  object
dtypes: int64(6), object(10)
memory usage: 4.0+ MB


# Simple preprocessing

Label encode categorical features and fill empty cells.

In [15]:
categorical_columns = []
categorical_dims =  {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

for col in train.columns[train.dtypes == 'float64']:
    train.fillna(train.loc[train_indices, col].mean(), inplace=True)

workclass 9
education 16
marital-status 7
occupation 15
relationship 6
race 5
sex 2
native-country 42
target 2
Set 3


In [16]:
train.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target,Set
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0,1
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0,1
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0,1
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0,1
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0,1
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0,1
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1,1
8,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39,1,1
9,42,4,159449,9,13,2,4,0,4,1,5178,0,40,39,1,2


In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       32561 non-null  int64
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  int64
 4   education-num   32561 non-null  int64
 5   marital-status  32561 non-null  int64
 6   occupation      32561 non-null  int64
 7   relationship    32561 non-null  int64
 8   race            32561 non-null  int64
 9   sex             32561 non-null  int64
 10  capital-gain    32561 non-null  int64
 11  capital-loss    32561 non-null  int64
 12  hours-per-week  32561 non-null  int64
 13  native-country  32561 non-null  int64
 14  target          32561 non-null  int64
 15  Set             32561 non-null  int64
dtypes: int64(16)
memory usage: 4.0 MB


# Define categorical features for categorical embeddings

In [24]:
unused_feat = ['Set']

features = [ col for col in train.columns if col not in unused_feat+[target]] 

cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

# define your embedding sizes : here just a random choice
cat_emb_dim = [5, 4, 3, 6, 2, 2, 1, 10]

print(cat_idxs)
print(cat_dims)

[1, 3, 5, 6, 7, 8, 9, 13]
[9, 16, 7, 15, 6, 5, 2, 42]


# Hyperparameter optimization

In this section, we will build a wrapper around the TabNetClassifier which supports early stopping.

In [25]:
X = train[features].values
y = train[target].values

In [29]:
X[0:10,:]

array([[    39,      7,  77516,      9,     13,      4,      1,      1,
             4,      1,   2174,      0,     40,     39],
       [    50,      6,  83311,      9,     13,      2,      4,      0,
             4,      1,      0,      0,     13,     39],
       [    38,      4, 215646,     11,      9,      0,      6,      1,
             4,      1,      0,      0,     40,     39],
       [    53,      4, 234721,      1,      7,      2,      6,      0,
             2,      1,      0,      0,     40,     39],
       [    28,      4, 338409,      9,     13,      2,     10,      5,
             2,      0,      0,      0,     40,      5],
       [    37,      4, 284582,     12,     14,      2,      4,      5,
             4,      0,      0,      0,     40,     39],
       [    49,      4, 160187,      6,      5,      3,      8,      1,
             2,      0,      0,      0,     16,     23],
       [    52,      6, 209642,     11,      9,      2,      4,      0,
             4,      1,  

In [32]:
X.shape

(32561, 14)

In [20]:
num_workers = os.cpu_count() if torch.cuda.is_available() else 0

In [21]:
class TabNetTuner(TabNetClassifier):
    def fit(self, X, y, *args, **kwargs):
        # Dirty trick => would be better to add n_d in grid, or fix it in __init__ of tuner
        self.n_d = self.n_a
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y
        )
        
        return super().fit(
            X_train,
            y_train,
            patience=20,
            eval_set=[(X_valid, y_valid)],
            num_workers=num_workers,
            max_epochs=1000,
            batch_size=1024,
            virtual_batch_size=128
        )

In [22]:
clf = TabNetTuner(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs, device_name='cpu')



In [28]:
print(clf)

TabNetTuner(n_d=8, n_a=8, n_steps=3, gamma=1.3, cat_idxs=[1, 3, 5, 6, 7, 8, 9, 13], cat_dims=[9, 16, 7, 15, 6, 5, 2, 42], cat_emb_dim=[5, 4, 3, 6, 2, 2, 1, 10], n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='sparsemax', input_dim=None, output_dim=None, device_name='cpu', n_shared_decoder=1, n_indep_decoder=1)


In [23]:
# Let's generate embedding size based on cat dims
cat_emb_dim_list = []
for max_dim in [1, 5, 10, 20, 50]:
    cat_emb_dim_list.append([min(nb // 2, max_dim) for nb in cat_dims])
cat_emb_dim_list

[[1, 1, 1, 1, 1, 1, 1, 1],
 [4, 5, 3, 5, 3, 2, 1, 5],
 [4, 8, 3, 7, 3, 2, 1, 10],
 [4, 8, 3, 7, 3, 2, 1, 20],
 [4, 8, 3, 7, 3, 2, 1, 21]]

In [26]:
grid = {
    "n_a": [3, 5, 8, 13, 21],
    "cat_emb_dim": cat_emb_dim_list,
    "n_independent": [0, 1, 2, 5],
    "n_shared": [0, 1, 2],
    "n_steps": [1, 3, 5, 8],
    "clip_value": [1],
    "gamma": [0.5, 1.3, 3],
    "momentum": [0.1, 0.05, 0.02, 0.005],
    "lambda_sparse": [0.1, 0.01, 0.001],
    "optimizer_params": [
        {'lr': 0.01}, 
        {'lr': 0.02}, 
        {'lr': 0.001}],
    "verbose": [0]
}

In [29]:
search = RandomizedSearchCV(
    clf,
    grid,
    n_iter=5,
    scoring="roc_auc",
    n_jobs=1,
    refit=False,
    cv=3,
    verbose=1,
    pre_dispatch=0,
    random_state=0,
    return_train_score=False,
)

In [30]:
search.fit(X, y)
search.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits





Early stopping occurred at epoch 37 with best_epoch = 17 and best_val_0_auc = 0.89103





Early stopping occurred at epoch 51 with best_epoch = 31 and best_val_0_auc = 0.89437





Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.89287





Early stopping occurred at epoch 105 with best_epoch = 85 and best_val_0_auc = 0.89888





Early stopping occurred at epoch 70 with best_epoch = 50 and best_val_0_auc = 0.89263





Early stopping occurred at epoch 116 with best_epoch = 96 and best_val_0_auc = 0.89779





Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.90484





Early stopping occurred at epoch 62 with best_epoch = 42 and best_val_0_auc = 0.90673





Early stopping occurred at epoch 50 with best_epoch = 30 and best_val_0_auc = 0.91027





Early stopping occurred at epoch 65 with best_epoch = 45 and best_val_0_auc = 0.89964





Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.90263





Early stopping occurred at epoch 47 with best_epoch = 27 and best_val_0_auc = 0.89908





Early stopping occurred at epoch 86 with best_epoch = 66 and best_val_0_auc = 0.89158





Early stopping occurred at epoch 73 with best_epoch = 53 and best_val_0_auc = 0.89445





Early stopping occurred at epoch 177 with best_epoch = 157 and best_val_0_auc = 0.90298




{'verbose': 0,
 'optimizer_params': {'lr': 0.02},
 'n_steps': 1,
 'n_shared': 2,
 'n_independent': 5,
 'n_a': 3,
 'momentum': 0.005,
 'lambda_sparse': 0.001,
 'gamma': 1.3,
 'clip_value': 1,
 'cat_emb_dim': [4, 5, 3, 5, 3, 2, 1, 5]}

In [31]:
search.best_params_

{'verbose': 0,
 'optimizer_params': {'lr': 0.02},
 'n_steps': 1,
 'n_shared': 2,
 'n_independent': 5,
 'n_a': 3,
 'momentum': 0.005,
 'lambda_sparse': 0.001,
 'gamma': 1.3,
 'clip_value': 1,
 'cat_emb_dim': [4, 5, 3, 5, 3, 2, 1, 5]}

In [32]:
clf = TabNetClassifier(cat_idxs=cat_idxs, cat_dims=cat_dims, device_name='cpu', **search.best_params_)

In [33]:
clf.fit(
    X[train_indices],
    y[train_indices],
    patience=20,
    eval_set=[(X[valid_indices], y[valid_indices])]
)


Early stopping occurred at epoch 36 with best_epoch = 16 and best_val_0_auc = 0.90362




In [34]:
preds = clf.predict_proba(X[test_indices])[:, 1]

In [35]:
roc_auc_score(y_score=preds, y_true=y[test_indices])

0.8987008278445827