In [8]:
import os
import gc
import pandas as pd
from sklearn.model_selection import train_test_split

from pytorch_tabnet.tab_model import TabNetClassifier
import torch

from sklearn.impute import SimpleImputer

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
DATA_DIRECTORY = ""

In [10]:
train = pd.read_csv(os.path.join(DATA_DIRECTORY, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIRECTORY, 'test.csv'))
labels = pd.read_csv(os.path.join(DATA_DIRECTORY, 'labels.csv'))

In [11]:
test_id = test['SK_ID_CURR']

In [12]:
imputer = SimpleImputer(strategy = 'median')
imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

In [13]:
target = labels.to_numpy()
del labels
gc.collect()

77

In [14]:
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.30, random_state=8)

In [15]:
# https://towardsdatascience.com/tabnet-deep-neural-network-for-structured-tabular-data-39eb4b27a9e4
tabnet_clf = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":10, # how to use learning rate scheduler
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='entmax' # "sparsemax"
                      ) 

Device used : cpu


In [25]:
tabnet_clf.fit(
    x_train,y_train[:,0],
    eval_set=[(x_train, y_train[:,0]), (x_val, y_val[:,0])],
    eval_name=['train', 'valid'],
    eval_metric=['auc','accuracy'],
    max_epochs=1000 , patience=50,
    batch_size=256, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)    

epoch 0  | loss: 0.61076 | train_auc: 0.77175 | train_accuracy: 0.74201 | valid_auc: 0.7711  | valid_accuracy: 0.74042 |  0:02:23s
epoch 1  | loss: 0.56527 | train_auc: 0.78888 | train_accuracy: 0.69157 | valid_auc: 0.77553 | valid_accuracy: 0.68739 |  0:04:41s
epoch 2  | loss: 0.54917 | train_auc: 0.80296 | train_accuracy: 0.66217 | valid_auc: 0.77216 | valid_accuracy: 0.65395 |  0:07:00s
epoch 3  | loss: 0.53056 | train_auc: 0.81838 | train_accuracy: 0.69091 | valid_auc: 0.76808 | valid_accuracy: 0.67918 |  0:09:17s
epoch 4  | loss: 0.51182 | train_auc: 0.8294  | train_accuracy: 0.70444 | valid_auc: 0.75974 | valid_accuracy: 0.68901 |  0:11:32s
epoch 5  | loss: 0.49595 | train_auc: 0.83775 | train_accuracy: 0.70741 | valid_auc: 0.76093 | valid_accuracy: 0.68872 |  0:13:47s
epoch 6  | loss: 0.48385 | train_auc: 0.84128 | train_accuracy: 0.7125  | valid_auc: 0.75622 | valid_accuracy: 0.69052 |  0:16:04s
epoch 7  | loss: 0.47527 | train_auc: 0.84801 | train_accuracy: 0.68734 | valid_auc

In [29]:
predictions = tabnet_clf.predict_proba(test)[:, 1]
submission = pd.DataFrame({'SK_ID_CURR': test_id.values, 'TARGET': predictions})
submission.to_csv('tabnet.csv', index = False)