### Tabnet Starter
A simple starter notebook that predicts with a weighted ensemble of four models: TabNet, a linear model (OLS from the statsmodels library), a fastai tabular neural network, and LightGBM.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
!pip install /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl

In [None]:
from collections import defaultdict
import datatable as dt

import statsmodels.api as sm
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt
import riiideducation
from pathlib import Path
import seaborn as sns

from pytorch_tabnet.tab_model import TabNetClassifier

### Load data

In [None]:
# Root directory under which all input datasets are expected.
path = Path('/kaggle/input')
# Fail with an explicit, descriptive error instead of a bare assert
# (asserts carry no message here and are removed when Python runs with -O).
if not path.exists():
    raise FileNotFoundError(f"Input directory not found: {path}")

In [None]:
%%time

# Columns to load from the training file. Only the keys are used below to
# select columns; the dtype strings are not passed to the reader.
data_types_dict = dict(
    user_id='int32',
    content_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)
target = 'answered_correctly'

# Read the selected columns with datatable and convert to a pandas frame.
wanted_columns = set(data_types_dict)
train_df = dt.fread(path/"riidtrainjay/train.jay", columns=wanted_columns).to_pandas()

In [None]:
%%time

# Discard rows whose target is -1 (presumably rows without an answer, e.g. lectures).
train_df = train_df[train_df[target] != -1].reset_index(drop=True)
# 'timestamp' is not among the columns requested when the file is read, so a
# plain drop raises KeyError; tolerate its absence instead.
train_df.drop(columns=['timestamp'], inplace=True, errors='ignore')

### Feature generation

In [None]:
# Missing flags are treated as "no explanation" and the flag is stored as 0/1.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a selected column, which is not guaranteed to
# modify the frame itself.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False).astype('uint8')
)

In [None]:
# 'lag' is the target of the same user's previous row; the first row of each
# user has no predecessor and gets 0. The fill is assigned back rather than
# applied in place on a selected column.
train_df['lag'] = train_df.groupby('user_id')[target].shift().fillna(0)

In [None]:
# Print a summary (columns, dtypes, memory usage) of the training frame.
train_df.info()

In [None]:
%%time

# Running accuracy of each user *before* the current row:
# cumulative sum of the lagged target divided by the number of earlier rows.
# The first row of every user divides 0 by 0 and is therefore NaN.
lag_by_user = train_df.groupby(['user_id'])['lag']
train_df['user_correctness'] = lag_by_user.cumsum() / lag_by_user.cumcount()
del lag_by_user
# The helper column is no longer needed once the feature exists.
train_df.drop(columns=['lag'], inplace=True)

In [None]:
# Per-user and per-content totals of the target: 'sum' (number correct) and
# 'count' (number of answered rows).
user_agg, content_agg = (
    train_df.groupby(group_key)[target].agg(['sum', 'count'])
    for group_key in ('user_id', 'content_id')
)

In [None]:
# Store the elapsed time in the smallest float dtype that can hold it.
train_df['prior_question_elapsed_time'] = pd.to_numeric(
    train_df['prior_question_elapsed_time'], downcast='float'
)

In [None]:
%%time

# Residual of each answer against the mean correctness of its content item,
# then the per-user sum of those residuals.
content_mean_correct = content_agg['sum'] / content_agg['count']
train_df['residual'] = train_df[target] - train_df['content_id'].map(content_mean_correct)
residual_agg = train_df.groupby('user_id')['residual'].agg(['sum'])

In [None]:
# Per-user sum of the elapsed time and number of rows.
# 'size' is the built-in equivalent of `lambda x: len(x)` (rows per group,
# missing values included) and avoids calling a Python function per group.
# NOTE(review): the sum skips missing values while the row count does not, and
# the inference loop only counts non-missing values - confirm which is intended.
prior_question_elapsed_time_agg = train_df.groupby('user_id').agg({'prior_question_elapsed_time': ['sum', 'size']})
prior_question_elapsed_time_agg.columns = ['sum', 'count']
prior_question_elapsed_time_agg['count'] = prior_question_elapsed_time_agg['count'].astype('int32')
prior_question_elapsed_time_agg.info()

In [None]:
# Covariance-style feature between answer correctness and elapsed time, per user:
# product of the two deviations from the user's own means, summed per user and
# divided by the user's row count.

# Per-user means (one value per user, so these stay small).
user_mean_correct = user_agg['sum'] / user_agg['count']
user_mean_elapsed = prior_question_elapsed_time_agg['sum'] / prior_question_elapsed_time_agg['count']

train_df['user_prior_question_elapsed_time_diff'] = (
    (train_df[target] - train_df['user_id'].map(user_mean_correct))
    * (train_df['prior_question_elapsed_time'] - train_df['user_id'].map(user_mean_elapsed))
)
user_prior_question_elapsed_time_diff_agg = (
    train_df.groupby('user_id')['user_prior_question_elapsed_time_diff'].agg(['sum'])
)
train_df['user_prior_question_elapsed_time_diff_mean'] = train_df['user_id'].map(
    user_prior_question_elapsed_time_diff_agg['sum'] / user_agg['count']
)

### Create dataset

In [None]:
import math

# Number of most recent rows kept per user for training.
USER_TRIES = 70
# One tenth of that (rounded up) is taken per user for validation.
VALID_TRIES = math.ceil(USER_TRIES / 10)

In [None]:
# Keep only the last USER_TRIES rows of every user and renumber the index.
train_df = (
    train_df
    .groupby('user_id')
    .tail(USER_TRIES)
    .reset_index(drop=True)
)

In [None]:
# Show (rows, columns) of the truncated training frame.
train_df.shape

##### Question related

In [None]:
# Question metadata: only these four columns are read, with the given dtypes.
data_types_dict = dict(
    question_id='int16',
    part='int8',
    bundle_id='int16',
    tags='string',
)

questions_csv = path/'riiid-test-answer-prediction/questions.csv'
questions_df = pd.read_csv(
    questions_csv,
    dtype=data_types_dict,
    usecols=data_types_dict.keys(),
)

In [None]:
# Give every distinct tag string an integer code, in order of first appearance.
distinct_tag_strings = questions_df['tags'].unique()
unique_tags_combos_keys = dict(zip(distinct_tag_strings, range(len(distinct_tag_strings))))
# Look the code up per row, then shrink to the smallest integer dtype that fits.
questions_df['tags_encoded'] = pd.to_numeric(
    questions_df['tags'].apply(lambda tag_string: unique_tags_combos_keys[tag_string]),
    downcast='integer',
)
questions_df.info()

In [None]:
def extract_tag_factory(tag_pos):
    """Build a function that extracts the tag at position `tag_pos` from a tag string.

    The returned function takes a whitespace-separated string of integer tags,
    orders the tags by numeric value and returns the one at index `tag_pos`.
    It returns the sentinel 255 when the input is not a string or has fewer
    than `tag_pos + 1` tags.
    """
    def extract_tag(x):
        if not isinstance(x, str):
            return 255
        # Convert before sorting: sorting the raw strings would order them
        # lexicographically (e.g. "10" before "9").
        tags = sorted(int(token) for token in x.split())
        if tag_pos < len(tags):
            return tags[tag_pos]
        return 255
    return extract_tag
        
# Create tag_1 .. tag_3 from the first three tag positions, stored as uint8.
for tag_pos in range(3):
    tag_column = f'tag_{tag_pos + 1}'
    extracted = questions_df['tags'].apply(extract_tag_factory(tag_pos))
    questions_df[tag_column] = extracted.astype('uint8')

In [None]:
# Attach the question metadata to every training row (content_id == question_id)
# and discard the duplicate key column that the join brings in.
train_df = train_df.merge(
    questions_df,
    how='left',
    left_on='content_id',
    right_on='question_id',
)
train_df.drop(columns=['question_id'], inplace=True)

In [None]:
# Number of answers recorded for the row's content item.
content_answer_count = train_df['content_id'].map(content_agg['count'])
train_df['content_count'] = content_answer_count.astype('int32')
# From here on 'content_id' no longer holds the id: it is overwritten with the
# mean correctness of that content item.
content_mean_correct = content_agg['sum'] / content_agg['count']
train_df['content_id'] = train_df['content_id'].map(content_mean_correct)

In [None]:
# Per-user mean elapsed time (sum / count from the per-user aggregate), mapped onto every row.
user_elapsed_time_mean = prior_question_elapsed_time_agg['sum'] / prior_question_elapsed_time_agg['count']
train_df['prior_question_elapsed_time_mean'] = train_df['user_id'].map(user_elapsed_time_mean)

In [None]:
# Per-user mean residual: summed residuals divided by the user's answer count.
user_residual_mean = residual_agg['sum'] / user_agg['count']
train_df['residual_user_mean'] = train_df['user_id'].map(user_residual_mean)

In [None]:
# Replace missing values with the column mean. The filled column is assigned
# back to the frame instead of calling fillna(inplace=True) on a selected
# column, which is not guaranteed to modify the frame itself.
for column in ('prior_question_elapsed_time', 'user_correctness'):
    train_df[column] = train_df[column].fillna(train_df[column].mean())

In [None]:
# Shrink both float features to the smallest float dtype that can hold them.
for float_column in ('user_correctness', 'content_id'):
    narrowed = pd.to_numeric(train_df[float_column], downcast='float')
    train_df[float_column] = narrowed

In [None]:
# Validation set: the last VALID_TRIES rows of every user.
# An explicit copy is taken because valid_df is modified later on; writing to
# a slice of train_df triggers pandas' SettingWithCopyWarning.
valid_df = train_df.groupby('user_id').tail(VALID_TRIES).copy()
# The validation rows are deliberately NOT removed from train_df: the fastai
# data loaders are built from train_df and select them via valid_df.index.
# NOTE(review): models fitted directly on train_df have therefore seen the
# validation rows, so their validation scores are optimistic.

In [None]:
# Replace values equal to the column mean with 0.0.
# NOTE(review): presumably this is meant to turn the mean-imputed entries of
# 'user_correctness' back into 0 - confirm. Each frame's mean is recomputed
# here (after imputation and the float downcast, and separately for valid_df),
# so it may not equal the value that was originally filled in - verify that
# any rows are actually replaced.
train_df['user_correctness'] = train_df['user_correctness'].replace(train_df['user_correctness'].mean(), 0.0)
valid_df['user_correctness'] = valid_df['user_correctness'].replace(valid_df['user_correctness'].mean(), 0.0)

In [None]:
# Display the training frame.
train_df

### Training

In [None]:
# Model input columns, in the order used to build every feature matrix.
features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'user_correctness',
    'part',
    'content_id',  # holds the content's mean correctness, not the raw id
    'content_count',
    'tags_encoded',
    'tag_1',
    'tag_2',  # tag_3 is created on questions_df but is not used as a feature
    'prior_question_elapsed_time_mean',
    'residual_user_mean'
]

##### Linear model

In [None]:
%%time

# Ordinary least squares on the raw feature columns (no constant column is added here).
model = sm.OLS(endog=train_df[target], exog=train_df[features])
lin_model = model.fit()
# Validation ROC AUC of the linear predictions (shown as the cell output).
roc_auc_score(
    y_true=valid_df[target],
    y_score=lin_model.predict(valid_df[features]),
)

In [None]:
# Sanity check: linear-model predictions for the first 10 validation rows, passed as a plain array.
lin_model.predict(valid_df[features].values[:10])

##### Fast AI

In [None]:
# NOTE(review): EPOCHS is not referenced by any cell visible in this notebook
# (the fastai fit call passes a literal epoch count) - confirm whether it should be.
EPOCHS=5
# Batch size passed to the fastai data loaders.
BATCH_SIZE=4096

In [None]:
# Feature columns treated as categorical.
cat_features = ['tags_encoded', 'tag_1', 'tag_2']

In [None]:
# Every feature that is not categorical is treated as continuous (order of `features` is kept).
cont_features = [name for name in features if name not in cat_features]

In [None]:
# Display the continuous feature names.
cont_features

In [None]:
# Check fast ai version
import fastai
from fastai.tabular.all import *

fastai.__version__

In [None]:
# Print column dtypes and memory usage before building the fastai data loaders.
train_df.info()

In [None]:
# Cast the target to float32; the fastai loss defined in this notebook feeds
# it to F.binary_cross_entropy together with the model output.
train_df[target] = train_df[target].astype('float32')

In [None]:
%%time

# fastai data loaders built from the full training frame; the rows whose index
# appears in valid_df become the validation split.
loader_options = dict(
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_features,
    cont_names=cont_features,
    y_names=target,
    valid_idx=valid_df.index,
    bs=BATCH_SIZE,
)
dls = TabularDataLoaders.from_df(train_df, **loader_options)

In [None]:
def my_auc(inp, targ):
    "ROC AUC metric: flattens predictions `inp` and targets `targ`, then scores them with scikit-learn's roc_auc_score"
    scores, labels = flatten_check(inp, targ)
    # roc_auc_score works on numpy arrays, so both tensors are moved to the CPU first.
    return roc_auc_score(labels.cpu().numpy(), scores.cpu().numpy())

In [None]:
def bce(inp,targ):
    "Binary cross entropy between flattened predictions `inp` and flattened targets `targ`"
    flat_inp, flat_targ = flatten_check(inp, targ)
    return F.binary_cross_entropy(flat_inp, flat_targ)

In [None]:
# Tabular learner with two hidden layers (200 and 100 units), reporting my_auc as its metric.
learn = tabular_learner(dls, layers=[200,100], metrics=my_auc)

In [None]:
# Display the network architecture.
learn.model

In [None]:
# Append a sigmoid to the model's layer stack so the output lies in (0, 1),
# which F.binary_cross_entropy (used by `bce`) requires.
learn.model.layers.add_module('sigmoid', nn.Sigmoid())
# Train with the binary cross entropy defined in this notebook.
learn.loss_func = bce

In [None]:
# Run the learning-rate finder; the fit cell reads `lr_min` from the result.
lr_find_res = learn.lr_find()

In [None]:
%%time

# Train for 3 epochs with the one-cycle schedule. fastai v2's fit_one_cycle
# takes the peak learning rate as `lr_max`; it has no `lr` keyword.
learn.fit_one_cycle(3, lr_max=lr_find_res.lr_min)

In [None]:
def predict_batch(self, df):
    "Return the learner's predictions for the rows of `df` as a numpy array"
    test_dl = self.dls.test_dl(df)
    # Cast the continuous inputs to float32 before predicting.
    test_dl.dataset.conts = test_dl.dataset.conts.astype(np.float32)
    # Only the raw predictions are needed; inputs, targets and decoded
    # predictions returned alongside them are discarded.
    _, batch_preds, _, _ = self.get_preds(dl=test_dl, with_input=True, with_decoded=True)
    return batch_preds.numpy()

# Stored as a plain instance attribute, so it is not bound to the learner:
# callers must pass the learner explicitly as the first argument.
learn.predict_batch = predict_batch

##### LightGBM

In [None]:
import lightgbm as lgb

In [None]:
cat_features = ['tags_encoded', 'tag_1', 'tag_2']

# Options shared by the training and evaluation datasets.
shared_dataset_options = dict(categorical_feature=cat_features, free_raw_data=False)

lgb_train = lgb.Dataset(train_df[features], train_df[target], **shared_dataset_options)
# The evaluation set references the training set.
lgb_eval = lgb.Dataset(valid_df[features], valid_df[target], reference=lgb_train, **shared_dataset_options)

In [None]:
# Evaluation metrics reported during LightGBM training.
METRICS = ['auc']

# LightGBM training parameters.
params = dict(
    objective='binary',
    seed=42,
    metric=METRICS,
    learning_rate=0.05,
    max_bin=800,
    num_leaves=80,
)

In [None]:
%%time

# Upper bound on boosting rounds; early stopping may end training sooner.
NUM_BOOST_ROUNDS = 300

# Filled by lgb.train with the metric history of each evaluated dataset.
evals_result = {}

lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=NUM_BOOST_ROUNDS,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=20,
    evals_result=evals_result,
    verbose_eval=20,
)

In [None]:
# Plot the feature importances of the trained LightGBM model.
lgb.plot_importance(lgb_model)

##### TabNet

In [None]:
import torch

# Positions of the categorical columns within the feature matrix
# (the matrix columns follow the order of `features`).
cat_column_positions = [features.index(name) for name in cat_features]

# TabNet classifier with a step learning-rate scheduler.
# NOTE(review): only cat_idxs is passed, no cat_dims - confirm against the
# pytorch-tabnet documentation that the categorical columns are actually embedded.
clf_tabnet = TabNetClassifier(
    cat_idxs=cat_column_positions,
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 50, "gamma": 0.9},
)
clf_tabnet

In [None]:
%%time

# Fit TabNet model
# Train TabNet on the feature matrix for 2 epochs, evaluating on the validation
# rows after each epoch. The arrays are built inline so no extra reference to
# them is kept after the call.
clf_tabnet.fit(
    X_train=train_df[features].values, y_train=train_df[target].values,
    eval_set=[(valid_df[features].values, valid_df[target].values)],
    max_epochs=2,
    batch_size=8192 * 4
)

In [None]:
# Sanity check: class probabilities for the first 1000 validation rows.
preds = clf_tabnet.predict_proba(valid_df[features].values[:1000])
# Shape of the second probability column (the one the ensemble uses).
preds[:,1].shape

### Predict

In [None]:
# Running statistics carried into inference, as defaultdicts so that unseen
# users / contents start from 0.
# The user totals are cast to int32 rather than int16: an int16 cast wraps
# around silently for any total above 32767.
user_sum_dict = user_agg['sum'].astype('int32').to_dict(defaultdict(int))
user_count_dict = user_agg['count'].astype('int32').to_dict(defaultdict(int))
content_sum_dict = content_agg['sum'].astype('int32').to_dict(defaultdict(int))
content_count_dict = content_agg['count'].astype('int32').to_dict(defaultdict(int))
residual_sum_dict = residual_agg['sum'].astype('float32').to_dict(defaultdict(int))

In [None]:
# Per-user running sum and count of the elapsed time, as defaultdicts that
# yield 0 for users not seen in training.
elapsed_sum_series = prior_question_elapsed_time_agg['sum'].astype('int64')
elapsed_count_series = prior_question_elapsed_time_agg['count'].astype('int32')
prior_question_elapsed_time_sum_dict = elapsed_sum_series.to_dict(defaultdict(int))
prior_question_elapsed_time_count_dict = elapsed_count_series.to_dict(defaultdict(int))

In [None]:
# Create the competition environment and the iterator that yields the test groups.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Previous test group, kept so its answers can be attached once the next group arrives.
prior_test_df = None

In [None]:
def clip(count):
    """Return `count` with values below 1e-8 raised to 1e-8.

    Used to keep denominators away from zero; there is no upper bound.
    """
    lower_bound = 1e-8
    return np.clip(count, lower_bound, np.inf)

In [None]:
import ast

# The training mean used to impute missing elapsed times does not change
# during inference, so it is computed once instead of for every test group.
elapsed_time_fill_value = train_df['prior_question_elapsed_time'].mean()

for (test_df, sample_prediction_df) in iter_test:
    # ------------------------------------------------------------------
    # 1. Fold the answers of the previous group into the running statistics.
    # ------------------------------------------------------------------
    if prior_test_df is not None:
        # The previous group's answers arrive as a list literal in the first
        # row of the current group.
        # NOTE(security): the original code called eval() on this externally
        # supplied string; ast.literal_eval accepts Python literals only and
        # cannot execute code.
        prior_test_df[target] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        # Rows with target -1 carry no answer and do not update the statistics.
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)

        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        prior_question_elapsed_times = prior_test_df['prior_question_elapsed_time'].values
        targets = prior_test_df[target].values

        for user_id, content_id, prior_question_elapsed_time, answered_correctly in zip(user_ids, content_ids, prior_question_elapsed_times, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            # Residual of this answer against the content's updated mean correctness.
            mean_accuracy = content_sum_dict[content_id] / clip(content_count_dict[content_id])
            residual_sum_dict[user_id] += answered_correctly - mean_accuracy

            # Missing elapsed times contribute to neither the sum nor the count.
            if not np.isnan(prior_question_elapsed_time):
                prior_question_elapsed_time_sum_dict[user_id] += prior_question_elapsed_time
                prior_question_elapsed_time_count_dict[user_id] += 1

    # Keep the raw group so its answers can be attached on the next iteration.
    prior_test_df = test_df.copy()

    # ------------------------------------------------------------------
    # 2. Build the feature columns for the current group.
    # ------------------------------------------------------------------
    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)

    if test_df.empty:
        # A group without any question row has nothing to score; submit an
        # empty prediction frame instead of handing empty matrices to the models.
        test_df[target] = np.zeros(0, dtype=np.float64)
        env.predict(test_df[['row_id', target]])
        continue

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('uint8')

    # Look up the running statistics of every row. The integer buffers are
    # wide enough for the values held in the dictionaries (the elapsed-time sum
    # starts out as int64; a 32-bit buffer could not hold large totals).
    row_user_ids = test_df['user_id'].values
    row_content_ids = test_df['content_id'].values
    user_sum = np.array([user_sum_dict[u] for u in row_user_ids], dtype=np.int32)
    user_count = np.array([user_count_dict[u] for u in row_user_ids], dtype=np.int32)
    res_sum = np.array([residual_sum_dict[u] for u in row_user_ids], dtype=np.float32)
    content_sum = np.array([content_sum_dict[c] for c in row_content_ids], dtype=np.int32)
    content_count = np.array([content_count_dict[c] for c in row_content_ids], dtype=np.int32)
    prior_question_elapsed_time_sum = np.array(
        [prior_question_elapsed_time_sum_dict[u] for u in row_user_ids], dtype=np.int64
    )
    prior_question_elapsed_time_count = np.array(
        [prior_question_elapsed_time_count_dict[u] for u in row_user_ids], dtype=np.int32
    )

    # Unseen users / contents have a count of 0; clip keeps the divisions finite.
    content_count = clip(content_count)
    user_count = clip(user_count)
    prior_question_elapsed_time_count = clip(prior_question_elapsed_time_count)
    test_df['user_correctness'] = user_sum / user_count
    test_df['residual_user_mean'] = res_sum / user_count
    test_df['content_count'] = content_count
    # As in training, 'content_id' is overwritten with the content's mean correctness.
    test_df['content_id'] = content_sum / content_count
    test_df['prior_question_elapsed_time_mean'] = prior_question_elapsed_time_sum / prior_question_elapsed_time_count

    # Assigned back rather than filled in place on a selected column.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(elapsed_time_fill_value)

    test_df[cat_features] = test_df[cat_features].apply(pd.to_numeric, downcast='integer')
    # Zero-fill only the model inputs. Filling the whole frame would also touch
    # non-feature columns such as the string-typed 'tags', which cannot hold 0.
    test_df = test_df.fillna({name: 0 for name in features})

    # ------------------------------------------------------------------
    # 3. Weighted ensemble of the four models, then submit.
    # ------------------------------------------------------------------
    test_df[target] = np.average([
        clf_tabnet.predict_proba(test_df[features].values)[:,1],
        lin_model.predict(test_df[features]),
        learn.predict_batch(learn, test_df[features])[:,0],
        lgb_model.predict(test_df[features])
    ], weights=[0.25, 0.2, 0.25, 0.3], axis=0)

    env.predict(test_df[['row_id', target]])

In [None]:
# Re-run the ensemble on the last test group left over from the loop
# (same members and weights as inside the loop).
ensemble_weights = [0.25, 0.2, 0.25, 0.3]
ensemble_members = [
    clf_tabnet.predict_proba(test_df[features].values)[:,1],
    lin_model.predict(test_df[features]),
    learn.predict_batch(learn, test_df[features])[:,0],
    lgb_model.predict(test_df[features]),
]
test_df[target] = np.average(ensemble_members, weights=ensemble_weights, axis=0)

In [None]:
# Display the LightGBM predictions for the last test group on their own.
lgb_model.predict(test_df[features])

In [None]:
# Display the last test group with its features and predictions.
test_df