TabNet (https://github.com/dreamquark-ai/tabnet/) starter kernel for the Riiid challenge

Feature engineering is based on:
- https://www.kaggle.com/code1110/riiid-gbdt-pipeline-baseline
- https://www.kaggle.com/lgreig/simple-lgbm-baseline
- https://www.kaggle.com/jsylas/riiid-lgbm-starter

In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl >/dev/null
!pip install "../input/pytorch-tabnet/pytorch_tabnet-1.2.0-py3-none-any.whl" >> quit

# Preprocess

In [None]:
# Used most of coding from this kernel 
import random
import os
import operator
import riiideducation

import datatable as dt
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter

import torch
from pytorch_tabnet.tab_model import TabNetClassifier

# `sns` is used below but the import list never binds it, so import seaborn
# here; without it this cell fails with a NameError.
import seaborn as sns

# Global plot appearance: seaborn "talk" context plus matplotlib's
# 'fivethirtyeight' style sheet.
sns.set_context("talk")
style.use('fivethirtyeight')

# Config

In [None]:
class CFG:
    """Central place for the notebook's data, training and TabNet settings."""

    # ---- Data selection and reproducibility ----
    SEED = 42
    START_IDX = 90000000
    TEST_SIZE = 0.2

    # ---- Training loop ----
    LR = 0.01
    N_EPOCHS = 5
    PATIENCE = 3
    BATCH_SZ = 1024
    VIRTUAL_BS = 128
    CLIP = 1.0
    MOMENTUM = 0.1
    LAMBDA = 0

    # ---- TabNet architecture ----
    # Width of the decision prediction layer. Larger values add capacity
    # to the model, at the risk of overfitting.
    ND = 8
    # Width of the attention embedding for each mask. The paper reports
    # that n_d == n_a is usually a good choice.
    NA = 8
    # Number of steps in the architecture (usually between 3 and 10).
    N_STEPS = 3
    # Coefficient for feature reusage in the masks. A value close to 1
    # makes mask selection least correlated between layers.
    # Values range from 1.0 to 2.0.
    GAMMA = 1.3
    # Number of independent Gated Linear Unit layers at each step
    # (usual values range from 1 to 5).
    N_INDEPENDENT = 1
    # Number of shared Gated Linear Units at each step
    # (usual values range from 1 to 5).
    N_SHARED = 3
    # Masking function used for feature selection: either "sparsemax"
    # (the default) or "entmax".
    MASK_TYPE = 'sparsemax'

# UTILS

In [None]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode so repeated runs with the same seed are comparable.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # NOTE: the running interpreter fixed its hash seed at start-up; this
    # export only influences Python processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # Reproducibility needs deterministic cuDNN kernels (this flag used to be
    # set to False, which defeats the purpose of seeding) and no auto-tuner
    # picking algorithms at run time. Both flags are inert without CUDA.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
# Apply the configured seed to all random number generators.
seed_everything(CFG.SEED)

# PREPARE TRAIN SET

In [None]:
# Handle on the competition environment; its test iterator is requested later.
env = riiideducation.make_env()

# Read the CSV with datatable, then convert to a pandas DataFrame.
train = dt.fread("../input/riiid-test-answer-prediction/train.csv").to_pandas()

# Discard lecture rows (keep content_type_id == False) and order the
# remaining rows by ascending timestamp.
train = train.loc[train["content_type_id"] == False].sort_values("timestamp")

# Neither helper column is used after filtering and ordering.
train = train.drop(columns=["timestamp", "content_type_id"])

In [None]:
# Mean of answered_correctly for each content_id.
results_c = (
    train.groupby("content_id")["answered_correctly"]
    .mean()
    .to_frame("answered_correctly_content")
)

# Mean and total of answered_correctly for each user.
results_u = (
    train.groupby("user_id")["answered_correctly"]
    .agg(["mean", "sum"])
    .rename(columns={"mean": "answered_correctly_user"})
)

In [None]:
# Select roughly the last 10% of the training set: every row from
# CFG.START_IDX onwards.
X = train.iloc[CFG.START_IDX:, :]

#del train

# Merge with the per-user and per-content features computed previously
X = pd.merge(X, results_u, on=['user_id'], how="left")
X = pd.merge(X, results_c, on=['content_id'], how="left")

# Remove all lectures (rows with answered_correctly == -1), then group each
# user's rows together.
X = X[X.answered_correctly != -1]
X = X.sort_values(['user_id'])

# Get ground truth
Y = X[["answered_correctly"]]
X = X.drop(["answered_correctly"], axis=1)

# Categorical encoding.
# Missing values are replaced with False BEFORE fitting, mirroring what the
# submission loop does at inference time. Fitting on the raw column instead
# would hand the encoder a mix of booleans and missing values: a third
# category the model never meets at inference, and one that cannot be
# ordered reliably against True/False.
explained = X["prior_question_had_explanation"].fillna(False).astype(bool)
lb_make = LabelEncoder()
X["prior_question_had_explanation_enc"] = lb_make.fit_transform(explained)

# Keep relevant features
X = X[['answered_correctly_user', 'answered_correctly_content', 'sum', 'prior_question_elapsed_time', 'prior_question_had_explanation_enc']]
# Fill whatever is still missing with 0.5. Assigning the result (rather than
# inplace=True) avoids mutating a frame obtained by column selection.
X = X.fillna(0.5)

X.head()

# TRAIN SPLIT

In [None]:
# Debugging aid: uncomment to work on a small sample.
#X = X[:2000]
#Y = Y[:2000]

# Hold-out split without shuffling, so the row order of X/Y is preserved;
# CFG.TEST_SIZE is the validation fraction.
Xt, Xv, Yt, Yv = train_test_split(
    X,
    Y,
    test_size=CFG.TEST_SIZE,
    shuffle=False,
    random_state=CFG.SEED,
)

# MODEL

In [None]:
# Column position and number of distinct values of the single categorical
# feature; both are handed to TabNet when the model is built.
encoded_col = 'prior_question_had_explanation_enc'
cat_idxs = Xt.columns.get_loc(encoded_col)
cat_dims = Xt[encoded_col].nunique()

print(cat_idxs, cat_dims)

In [None]:
# Constructor arguments, grouped by concern; every value comes from CFG
# except the categorical metadata computed from the training split.
tabnet_params = dict(
    # architecture
    n_d=CFG.ND,
    n_a=CFG.NA,
    n_steps=CFG.N_STEPS,
    gamma=CFG.GAMMA,
    n_independent=CFG.N_INDEPENDENT,
    n_shared=CFG.N_SHARED,
    mask_type=CFG.MASK_TYPE,
    # categorical feature handling
    cat_idxs=[cat_idxs],
    cat_dims=[cat_dims],
    cat_emb_dim=1,
    # optimisation / regularisation
    optimizer_params=dict(lr=CFG.LR),
    momentum=CFG.MOMENTUM,
    lambda_sparse=CFG.LAMBDA,
    clip_value=CFG.CLIP,
    # Optional LR schedule, currently disabled:
    #scheduler_params=dict(milestones=[20, 50, 80], gamma=0.5),
    #scheduler_fn=torch.optim.lr_scheduler.MultiStepLR,
    verbose=1,
)
model = TabNetClassifier(**tabnet_params)

# Training data, validation data and loop settings for fit().
fit_params = dict(
    X_train=Xt.values,
    y_train=Yt['answered_correctly'].values,
    X_valid=Xv.values,
    y_valid=Yv['answered_correctly'].values,
    max_epochs=CFG.N_EPOCHS,
    patience=CFG.PATIENCE,
    batch_size=CFG.BATCH_SZ,
    virtual_batch_size=CFG.VIRTUAL_BS,
    num_workers=0,
    # NOTE(review): per the pytorch-tabnet docs, weights=1 enables automated
    # class-balanced sampling - confirm this is intended.
    weights=1,
    drop_last=False,
)
model.fit(**fit_params)

In [None]:
# Plot losses
#plt.plot(model.history['train']['loss'])
#plt.plot(model.history['valid']['loss'])

In [None]:
# Plot learning rate
#plt.plot(model.history['train']['lr'])

In [None]:
# Plot metric
#plt.plot(np.array(model.history['train']['metric']) * -1)
#plt.plot(np.array(model.history['valid']['metric']) * -1)

# SUBMIT

In [None]:
# Iterator over the competition's test batches; consumed by the submission loop.
iter_test = env.iter_test()

In [None]:
# Score every test batch delivered by the competition API and submit it.
for (test_df, sample_prediction_df) in iter_test:
    # Attach the per-user and per-content aggregates computed on the training data.
    test_df = pd.merge(test_df, results_u, on=['user_id'], how="left")
    test_df = pd.merge(test_df, results_c, on=['content_id'], how="left")

    # Rows with no matching aggregate get neutral defaults. Results are
    # assigned back instead of using inplace=True on a selected column,
    # which is not guaranteed to write through to the frame.
    test_df['answered_correctly_user'] = test_df['answered_correctly_user'].fillna(0.5)
    test_df['answered_correctly_content'] = test_df['answered_correctly_content'].fillna(0.5)
    test_df['sum'] = test_df['sum'].fillna(0)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)

    # Reuse the encoder fitted on the training data. Refitting on each batch
    # (fit_transform) would derive the mapping from whatever values that
    # batch happens to contain, e.g. a batch holding only True would be
    # encoded as 0.
    test_df["prior_question_had_explanation_enc"] = lb_make.transform(test_df["prior_question_had_explanation"])

    # Same feature columns, in the same order, as the training matrix.
    test_ = test_df[['answered_correctly_user', 'answered_correctly_content', 'sum','prior_question_elapsed_time','prior_question_had_explanation_enc']]
    test_ = test_.fillna(0.5)   # should be modified !

    # Submit the probability of a correct answer (column 1 of predict_proba)
    # rather than the hard 0/1 labels returned by predict(): labels discard
    # the ranking information a score-based metric such as ROC AUC needs.
    test_df['answered_correctly'] = model.predict_proba(test_.values)[:, 1]
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])