### Ignite Starter
A simple starter notebook that uses the [Ignite](https://github.com/pytorch/ignite) library for prediction.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [None]:
!pip install /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl

In [None]:
from collections import defaultdict
import datatable as dt

import statsmodels.api as sm
from sklearn.metrics import roc_auc_score

from matplotlib import pyplot as plt
import riiideducation
from pathlib import Path
import seaborn as sns

In [None]:
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [None]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, RunningAverage, ConfusionMatrix
from ignite.handlers import ModelCheckpoint, EarlyStopping

### Load data

In [None]:
# Root directory holding all inputs. Fail early with an explicit error (an
# `assert` carries no message and is stripped when Python runs with -O).
path = Path('/kaggle/input')
if not path.exists():
    raise FileNotFoundError(f"Input directory not found: {path}")

In [None]:
%%time

# Name of the label column used throughout the notebook.
target = 'answered_correctly'

# Columns to read from the .jay file. Only the keys are used here (as the
# column selection for fread); the dtype strings are not applied in this cell.
data_types_dict = {
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'bool',
}

wanted_columns = set(data_types_dict)
train_df = dt.fread(path / "riidtrainjay/train.jay", columns=wanted_columns).to_pandas()

In [None]:
# These columns are not in the column set requested from fread, so they are
# normally absent; `del` would raise KeyError. Drop them only if present.
train_df.drop(
    columns=['task_container_id', 'content_type_id', 'row_id'],
    errors='ignore',
    inplace=True,
)

In [None]:
# Print a summary (columns, dtypes, memory) of the freshly loaded frame.
train_df.info()

In [None]:
%%time

# Discard rows whose target is -1 (presumably non-question rows — confirm)
# and renumber the index.
train_df = train_df[train_df[target] != -1].reset_index(drop=True)
# 'timestamp' is not part of the column set loaded by fread, so a plain drop
# raises KeyError; tolerate its absence.
train_df.drop(columns=['timestamp'], errors='ignore', inplace=True)

### Feature generation

In [None]:
# Treat a missing flag as False and store the flag as 0/1.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify train_df itself.
train_df['prior_question_had_explanation'] = (
    train_df['prior_question_had_explanation'].fillna(False).astype('uint8')
)

In [None]:
# Previous target value of the same user; a user's first row has no
# predecessor and gets 0. Assigned back in one step instead of a chained
# fillna(inplace=True) on a column selection.
train_df['lag'] = train_df.groupby('user_id')[target].shift().fillna(0)

In [None]:
# Print the frame summary again after the first feature columns were added.
train_df.info()

In [None]:
%%time

# Running accuracy of each user up to (not including) the current row:
# cumulative sum of the lagged target divided by the number of earlier rows.
# On a user's first row this is 0 / 0, i.e. NaN.
lag_by_user = train_df.groupby(['user_id'])['lag']
train_df['user_correctness'] = lag_by_user.cumsum() / lag_by_user.cumcount()
del lag_by_user
train_df.drop(columns=['lag'], inplace=True)

In [None]:
# Per-user and per-content totals of the target: 'sum' and 'count' columns.
user_agg, content_agg = (
    train_df.groupby(key)[target].agg(['sum', 'count'])
    for key in ('user_id', 'content_id')
)

In [None]:
# Store the elapsed time in the smallest float dtype that can hold it.
train_df['prior_question_elapsed_time'] = pd.to_numeric(
    train_df['prior_question_elapsed_time'], downcast='float'
)

In [None]:
%%time

# Residual = actual answer minus the mean target of that content item;
# the residuals are then summed per user.
content_mean_target = content_agg['sum'] / content_agg['count']
train_df['residual'] = train_df[target] - train_df['content_id'].map(content_mean_target)
residual_agg = train_df.groupby('user_id')['residual'].agg(['sum'])

In [None]:
# Per-user sum of the elapsed time plus the number of rows of that user.
# 'size' counts every row of the group (NaN included), which is exactly what
# the previous `lambda x: len(x)` returned, but it runs inside pandas instead
# of calling a Python function once per user.
prior_question_elapsed_time_agg = train_df.groupby('user_id').agg(
    {'prior_question_elapsed_time': ['sum', 'size']}
)
prior_question_elapsed_time_agg.columns = ['sum', 'count']
prior_question_elapsed_time_agg['count'] = prior_question_elapsed_time_agg['count'].astype('int32')
prior_question_elapsed_time_agg.info()

### Create dataset

In [None]:
import math

# Number of most recent rows kept per user.
USER_TRIES = 70
# One tenth of that, rounded up; used as the per-user validation tail length.
VALID_TRIES = math.ceil(USER_TRIES / 10)

In [None]:
# Keep only the last USER_TRIES rows of every user and renumber the index.
train_df = (
    train_df
    .groupby('user_id')
    .tail(USER_TRIES)
    .reset_index(drop=True)
)

In [None]:
# Show (rows, columns) of the truncated training frame.
train_df.shape

##### Question related

In [None]:
# Columns (and their dtypes) to read from questions.csv.
# Note: this rebinds `data_types_dict`, previously used for the training file.
data_types_dict = {
    'question_id': 'int16',
    'part': 'int8',
    'bundle_id': 'int16',
    'tags': 'string',
}

questions_csv = path / 'riiid-test-answer-prediction/questions.csv'
questions_df = pd.read_csv(questions_csv, usecols=list(data_types_dict), dtype=data_types_dict)

In [None]:
# Give every distinct value of the 'tags' column an integer code, in order of
# first appearance, and store that code in the smallest fitting integer dtype.
unique_tags_combos_keys = {}
for code, combo in enumerate(questions_df['tags'].unique()):
    unique_tags_combos_keys[combo] = code

encoded_tags = questions_df['tags'].apply(unique_tags_combos_keys.__getitem__)
questions_df['tags_encoded'] = pd.to_numeric(encoded_tags, downcast='integer')
questions_df.info()

In [None]:
def extract_tag_factory(tag_pos):
    """Return a function that extracts the tag at position ``tag_pos``.

    The returned function splits its argument on whitespace, sorts the tokens
    as strings (lexicographic order, not numeric) and returns the token at
    ``tag_pos`` converted to ``int``. It returns 255 when the argument is not
    a ``str`` or has too few tokens.
    """
    def extract_tag(x):
        if not isinstance(x, str):
            return 255
        ordered = sorted(x.split())
        if tag_pos >= len(ordered):
            return 255
        return int(ordered[tag_pos])
    return extract_tag
        
# Create tag_1..tag_3: extract the tag at each position (255 when missing),
# then replace each distinct value by a dense code in order of first appearance.
for tag_pos in range(3):
    column = f'tag_{tag_pos + 1}'
    raw_tags = questions_df['tags'].apply(extract_tag_factory(tag_pos)).astype('uint8')
    unique_tag_keys = {value: code for code, value in enumerate(raw_tags.unique())}
    questions_df[column] = raw_tags.apply(unique_tag_keys.__getitem__)

In [None]:
# Attach the question metadata to every training row, then remove the join
# key that duplicates 'content_id'.
train_df = train_df.merge(
    questions_df,
    how='left',
    left_on='content_id',
    right_on='question_id',
).drop(columns=['question_id'])

In [None]:
# 'content_count': number of rows per content item.
# 'content_id' is then overwritten with the mean target of that content item.
content_ids = train_df['content_id']
train_df['content_count'] = content_ids.map(content_agg['count']).astype('int32')
train_df['content_id'] = content_ids.map(content_agg['sum'] / content_agg['count'])

In [None]:
# Per-user elapsed-time sum divided by the per-user row count, mapped onto rows.
elapsed_mean_by_user = (
    prior_question_elapsed_time_agg['sum'] / prior_question_elapsed_time_agg['count']
)
train_df['prior_question_elapsed_time_mean'] = train_df['user_id'].map(elapsed_mean_by_user)

In [None]:
# Per-user residual sum divided by the per-user target count, mapped onto rows.
residual_mean_by_user = residual_agg['sum'] / user_agg['count']
train_df['residual_user_mean'] = train_df['user_id'].map(residual_mean_by_user)

In [None]:
# Replace missing values by the column mean. The filled column is assigned
# back instead of calling fillna(inplace=True) on a column selection, which is
# not guaranteed to modify train_df itself.
for column in ('prior_question_elapsed_time', 'user_correctness'):
    train_df[column] = train_df[column].fillna(train_df[column].mean())

In [None]:
# Store both columns in the smallest float dtype that can hold them.
for column in ('user_correctness', 'content_id'):
    compact = pd.to_numeric(train_df[column], downcast='float')
    train_df[column] = compact

In [None]:
# Hold out the last VALID_TRIES rows of every user for validation.
# `.copy()` makes valid_df an independent frame: it is modified later
# (column assignment, in-place scaling), which on a slice of train_df
# triggers SettingWithCopyWarning and has ambiguous semantics.
valid_df = train_df.groupby('user_id').tail(VALID_TRIES).copy()
train_df.drop(index=valid_df.index, inplace=True)

In [None]:
# In each split, set 'user_correctness' values that are exactly equal to that
# split's own column mean to 0.0.
# NOTE(review): presumably meant to reset the rows filled with the mean
# before the split — confirm, since the mean is recomputed per split here.
for frame in (train_df, valid_df):
    split_mean = frame['user_correctness'].mean()
    frame['user_correctness'] = frame['user_correctness'].replace(split_mean, 0.0)

In [None]:
# Display the training frame as the cell output.
train_df

### Training

In [None]:
# Model inputs, in the order they are listed to the model code below.
features = [
    # interaction-level inputs
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
    "user_correctness",
    # question-level inputs
    "part",
    "content_id",
    "content_count",
    "tags_encoded",
    "tag_1",
    "tag_2",
    # per-user aggregates
    "prior_question_elapsed_time_mean",
    "residual_user_mean",
]

#### Ignite

In [None]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype

In [None]:
# Features fed through embeddings; every other feature is treated as continuous.
cat_features = ['part', 'tags_encoded', 'tag_1', 'tag_2']
cont_features = [name for name in features if name not in cat_features]

In [None]:
class Riiid(torch.utils.data.Dataset):
    """Dataset yielding ``[categorical, continuous, target]`` per row.

    Args:
        df: frame holding all referenced columns.
        cat_fields: list of categorical column names (stored as int64).
        cont_fields: list of continuous column names (stored as float32).
        target: name of the target column, or ``None`` when there is no
            target; in that case a zero placeholder is used.
    """

    def __init__(self, df, cat_fields, cont_fields, target):
        # Row count comes from the frame itself, so it does not depend on
        # having at least one categorical column.
        n = len(df)
        self.cats = self._stack_columns(df, cat_fields, np.int64)
        self.conts = self._stack_columns(df, cont_fields, np.float32)
        if target is not None:
            self.y = df[target].values.astype(np.float32)
        else:
            # Placeholder with the same dtype and shape (n,) as a real
            # target, so batches look the same with and without labels.
            self.y = np.zeros(n, dtype=np.float32)

    @staticmethod
    def _stack_columns(df, fields, dtype):
        """Stack the given columns into an (n_rows, n_fields) array of dtype."""
        columns = [df[name].values for name in fields]
        if not columns:
            # np.stack rejects an empty list; return an (n_rows, 0) array.
            return np.zeros((len(df), 0), dtype=dtype)
        return np.stack(columns, 1).astype(dtype)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return [self.cats[idx], self.conts[idx], self.y[idx]]

In [None]:
def scale_vars(df, mapper, cols):
    """Scale columns of ``df`` in place and return the mapper that was used.

    When ``mapper`` is None, a DataFrameMapper with one StandardScaler per
    numeric column of ``df`` that is listed in ``cols`` is fitted on ``df``.
    The mapper's transformed output is then written back into ``df`` under
    the mapper's transformed column names.
    """
    if mapper is None:
        scaler_specs = []
        for name in df.columns:
            if is_numeric_dtype(df[name]) and name in cols:
                scaler_specs.append(([name], StandardScaler()))
        mapper = DataFrameMapper(scaler_specs).fit(df)
    # Transform first: `transformed_names_` is read only after the transform ran.
    scaled = mapper.transform(df)
    df[mapper.transformed_names_] = scaled
    return mapper

In [None]:
%%time

# Fit the scalers on the training frame only (scale_vars also scales it in
# place), then apply those same scalers to the validation frame. Fitting a
# second scaler on valid_df would standardise the two splits with different
# statistics. The extra transform() calls whose results were discarded are gone.
mapper = scale_vars(train_df, None, cont_features)
scale_vars(valid_df, mapper, cont_features)
valid_df[mapper.transformed_names_].shape

In [None]:
# Wrap both splits in the dataset class defined above.
train_ds, valid_ds = (
    Riiid(frame, cat_features, cont_features, target)
    for frame in (train_df, valid_df)
)

In [None]:
# Loader / training configuration.
NUM_WORKERS = 6
BATCH_SIZE = 8192 * 16
EPOCHS = 5
MODEL_PATH = "riid-output"

# Shuffle the training data every epoch: the rows come out of per-user
# groupby operations, so unshuffled batches would each contain long runs of
# rows from the same users. Validation order is left untouched.
train_dl = DataLoader(train_ds, BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
valid_dl = DataLoader(valid_ds, BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

In [None]:
# Convert the categorical columns of train_df to ordered categoricals.
for v in cat_features:
    train_df[v] = train_df[v].astype('category').cat.as_ordered()

# Embedding table size per categorical feature. The datasets feed the raw
# integer codes into the embeddings, so each table must cover the largest code
# that can occur. All categorical features come from questions_df, therefore
# its maximum code (+1) is used as a lower bound; sizing only by the number of
# categories seen in train_df can leave valid codes out of range.
cat_sz = [
    (c, max(len(train_df[c].cat.categories) + 1, int(questions_df[c].max()) + 1))
    for c in cat_features
]
# (cardinality, embedding width): width is half the cardinality, capped at 50.
embed_sizes = [(cardinality, min(50, (cardinality + 1) // 2)) for _, cardinality in cat_sz]
embed_sizes

In [None]:
class RiiidModel(nn.Module):
    """Feed-forward network over categorical embeddings and continuous inputs.

    Every categorical column has its own embedding. The embedding outputs
    (after dropout) are concatenated with the batch-normalised continuous
    inputs and passed through a stack of Linear -> ReLU -> optional
    BatchNorm -> Dropout layers, a final linear layer and a sigmoid.

    Args:
        embed_sizes: iterable of ``(cardinality, embedding_dim)`` pairs, one
            per categorical column; every cardinality must be at least 2.
        n_cont: number of continuous input columns.
        emb_drop: dropout probability applied to the concatenated embeddings.
        out_sz: number of output units.
        sizes: widths of the hidden layers.
        drops: dropout probability of each hidden layer, one per entry of
            ``sizes``.
        use_bn: apply batch norm after each hidden activation when True.

    Raises:
        ValueError: if ``drops`` and ``sizes`` differ in length, or if the
            model would have no input features at all.
    """

    def __init__(self, embed_sizes, n_cont, emb_drop, out_sz, sizes,
                 drops, use_bn=False):
        super().__init__()
        embed_sizes = list(embed_sizes)
        sizes = list(sizes)
        drops = list(drops)
        for i, (c, s) in enumerate(embed_sizes):
            assert c > 1, f"cardinality must be >=2, got embed_sizes[{i}]: ({c},{s})"
        # forward() zips the hidden layers with the dropouts; a length
        # mismatch would silently skip layers and only fail later with a
        # shape error in the output layer.
        if len(drops) != len(sizes):
            raise ValueError(
                f"drops must have one entry per hidden layer: "
                f"got {len(drops)} drops for {len(sizes)} sizes"
            )
        self.embs = nn.ModuleList([nn.Embedding(c, s)
                                   for c, s in embed_sizes])
        for emb in self.embs:
            self.emb_init(emb)
        n_emb = sum(e.embedding_dim for e in self.embs)
        if n_emb + n_cont == 0:
            raise ValueError("model needs at least one categorical or continuous input")
        self.n_emb, self.n_cont = n_emb, n_cont
        sizes = [n_emb + n_cont] + sizes
        self.linears = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)])
        self.batch_norms = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in sizes[1:]])
        for o in self.linears:
            nn.init.kaiming_normal_(o.weight.data)
        self.outp = nn.Linear(sizes[-1], out_sz)
        nn.init.kaiming_normal_(self.outp.weight.data)
        self.emb_drop = nn.Dropout(emb_drop)
        self.drops = nn.ModuleList([nn.Dropout(drop)
                                    for drop in drops])
        # Batch norm over the continuous inputs; applied regardless of use_bn.
        self.bn = nn.BatchNorm1d(n_cont)
        self.use_bn = use_bn

    def forward(self, x_cat, x_cont):
        """Return sigmoid outputs for a batch of categorical/continuous inputs.

        ``x_cat`` is indexed as ``x_cat[:, i]`` for the i-th embedding.
        """
        if self.n_emb > 0:
            x = [e(x_cat[:, i]) for i, e in enumerate(self.embs)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont > 0:
            x2 = self.bn(x_cont)
            x = torch.cat([x, x2], 1) if self.n_emb != 0 else x2
        for l, d, b in zip(self.linears, self.drops, self.batch_norms):
            x = F.relu(l(x))
            if self.use_bn:
                x = b(x)
            x = d(x)
        x = self.outp(x)
        x = torch.sigmoid(x)
        return x

    def emb_init(self, x):
        """Initialise an embedding uniformly in [-sc, sc], sc = 2 / (dim + 1)."""
        x = x.weight.data
        sc = 2 / (x.size(1) + 1)
        x.uniform_(-sc, sc)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
def create_model():
    """Build a RiiidModel with the notebook's fixed settings on `device`."""
    model = RiiidModel(
        embed_sizes,
        len(cont_features),
        emb_drop=0.04,
        out_sz=1,
        sizes=[200, 100],
        drops=[0.001, 0.01],
        use_bn=True,
    )
    # nn.Module.to() moves the parameters and returns the module itself.
    return model.to(device)


riid_model = create_model()

In [None]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.metrics.roc_auc import ROC_AUC
from tqdm.notebook import tqdm

# Initial learning rate.
LR = 0.006

# The model already applies a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(riid_model.parameters(), lr=LR, weight_decay=0.01)

# Each scheduler step multiplies the learning rate by 0.2 (step_size=1).
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.2)

# Metrics attached to the evaluator further below.
val_metrics = dict(auc=ROC_AUC(), loss=Loss(criterion))

In [None]:
def prepare_batch(batch, device):
    """Unpack a (categorical, continuous, target) batch and move it to `device`."""
    cats, conts, labels = batch
    moved = tuple(tensor.to(device) for tensor in (cats, conts, labels))
    return moved

def train_step(trainer, batch):
    """Run one optimisation step on `batch` and return the loss as a float."""
    riid_model.train()
    optimizer.zero_grad()
    x_cat, x_cont, y = prepare_batch(batch, device=device)
    # Targets get a trailing dimension before being compared with the output.
    batch_loss = criterion(riid_model(x_cat, x_cont), y.unsqueeze(1))
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

def predict_on_batch(engine, batch):
    """Return (predictions, targets with a trailing dimension) for `batch`.

    The model is put in eval mode and the forward pass runs without gradients.
    """
    riid_model.eval()
    with torch.no_grad():
        x_cat, x_cont, y = prepare_batch(batch, device=device)
        predictions = riid_model(x_cat, x_cont)
        targets = y.unsqueeze(1)
    return predictions, targets

In [None]:
from ignite.engine.engine import Engine
from ignite.handlers import ModelCheckpoint, global_step_from_engine

# One engine runs the optimisation steps, the other produces predictions
# for the metrics.
trainer = Engine(train_step)
evaluator = Engine(predict_on_batch)


def _auc_score(engine):
    """Score used to rank checkpoints: the evaluator's "auc" metric."""
    return engine.state.metrics["auc"]


# Keeps the n_saved best models with respect to the score function.
model_checkpoint = ModelCheckpoint(
    MODEL_PATH,
    filename_prefix="best",
    n_saved=1,
    score_name="auc",
    score_function=_auc_score,
    global_step_transform=global_step_from_engine(trainer),
)

for metric_name in val_metrics:
    val_metrics[metric_name].attach(evaluator, metric_name)

# Run the checkpoint handler every time an evaluator run completes.
evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": riid_model});

In [None]:
%%time

desc = "ITERATION - loss: {:.5f}"
pbar = tqdm(initial=0, leave=False, total=len(train_dl), desc=desc.format(0))
log_interval = 5

@trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
def log_training_loss(engine):
    """Show the latest training loss on the progress bar."""
    pbar.desc = desc.format(engine.state.output)
    pbar.update(log_interval)

# Uncomment this if you want training-set evaluation as well. It turned out
# to be a bit slow.
# @trainer.on(Events.EPOCH_COMPLETED)
# def log_training_results(engine):
#     pbar.refresh()
#     evaluator.run(train_dl)
#     metrics = evaluator.state.metrics
#     auc = metrics["auc"]
#     loss = metrics["loss"]
#     tqdm.write(
#         f"Training Results - Epoch: {engine.state.epoch}  AUC: {auc:.5f} Loss: {loss:.5f}"
#     )

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """Evaluate on the validation loader, print AUC/loss and reset the bar."""
    evaluator.run(valid_dl)
    metrics = evaluator.state.metrics
    auc = metrics["auc"]
    loss = metrics["loss"]
    tqdm.write(
        "Validation Results - Epoch: {}  AUC: {:.5f} Loss: {:.5f}".format(
            engine.state.epoch, auc, loss
        )
    )
    pbar.n = pbar.last_print_n = 0

@trainer.on(Events.EPOCH_COMPLETED)
def step_lr_scheduler(engine):
    """Advance the LR schedule once per epoch.

    The scheduler was created but only stepped inside the disabled
    training-evaluation handler, so the learning rate never decayed.
    """
    exp_lr_scheduler.step()
    tqdm.write(f"Optimizer learning rate: {optimizer.param_groups[0]['lr']}")

@trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED)
def log_time(engine):
    """Print how long the event that just fired took."""
    tqdm.write(
        "{} took {} seconds".format(trainer.last_event_name.name, trainer.state.times[trainer.last_event_name.name])
    )

# Close the progress bar even when training is interrupted or fails.
try:
    trainer.run(train_dl, max_epochs=EPOCHS)
finally:
    pbar.close()

In [None]:
def predict(df):
    """Return the model's predictions for every row of `df` as a 1-D array.

    `df` is scaled in place with the module-level `mapper` that was fitted
    earlier in the notebook. Fitting a new scaler on each incoming frame (the
    previous behaviour) standardises every batch with its own statistics; a
    one-row batch would be scaled to all zeros.

    Returns an empty float32 array for an empty frame.
    """
    if len(df) == 0:
        # DataLoader rejects batch_size=0.
        return np.zeros(0, dtype=np.float32)
    scale_vars(df, mapper, cont_features)
    ds = Riiid(df, cat_features, cont_features, None)
    # One batch holding the whole frame; num_workers=0 avoids starting a
    # worker process on every call.
    dl = DataLoader(ds, len(df), shuffle=False, num_workers=0)
    y_pred, _ = predict_on_batch(None, next(iter(dl)))
    # squeeze(1) drops only the output dimension, so a single-row frame still
    # yields a length-1 array rather than a 0-d one.
    return y_pred.squeeze(1).cpu().numpy()

### Predict

In [None]:
# load best model from disk

import os

riid_model = create_model()

# Pick the most recently written file in the checkpoint directory instead of
# whatever os.listdir happens to return first (its order is arbitrary).
checkpoint_files = [p for p in Path(MODEL_PATH).iterdir() if p.is_file()]
if not checkpoint_files:
    raise FileNotFoundError(f"No checkpoint found in {MODEL_PATH!r}")
latest_checkpoint = max(checkpoint_files, key=lambda p: p.stat().st_mtime)

# map_location lets a checkpoint written on GPU load on a CPU-only machine.
riid_model.load_state_dict(torch.load(latest_checkpoint, map_location=device))

In [None]:
# Turn the aggregates into defaultdicts so ids without an entry start at 0.
def _as_default_dict(series, dtype):
    """Cast `series` to `dtype` and convert it into a defaultdict(int)."""
    return series.astype(dtype).to_dict(defaultdict(int))


user_sum_dict = _as_default_dict(user_agg['sum'], 'int16')
user_count_dict = _as_default_dict(user_agg['count'], 'int16')
content_sum_dict = _as_default_dict(content_agg['sum'], 'int32')
content_count_dict = _as_default_dict(content_agg['count'], 'int32')
residual_sum_dict = _as_default_dict(residual_agg['sum'], 'float32')

In [None]:
# Per-user elapsed-time sum and row count as defaultdicts (missing users -> 0).
# The sum is cast to int64: a float -> int32 cast overflows silently once a
# user's accumulated time exceeds the int32 range.
prior_question_elapsed_time_sum_dict = prior_question_elapsed_time_agg['sum'].astype('int64').to_dict(defaultdict(int))
prior_question_elapsed_time_count_dict = prior_question_elapsed_time_agg['count'].astype('int32').to_dict(defaultdict(int))

In [None]:
# No previous test batch exists before the first iteration.
prior_test_df = None

# Competition environment and its iterator over test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
def clip(count):
    """Bound `count` from below by 1e-8 so it is safe to divide by."""
    lower, upper = 1e-8, np.inf
    return np.clip(count, lower, upper)

In [None]:
# Fill value for missing elapsed times. train_df does not change inside the
# loop, so the mean is computed once instead of on every test batch.
# NOTE(review): train_df's column was scaled in place before training, so
# this is the mean of the scaled values — confirm this is the intended fill.
elapsed_time_fill = train_df['prior_question_elapsed_time'].mean()

for (test_df, sample_prediction_df) in iter_test:
    if prior_test_df is not None:
        # The first row of the new batch carries the answers of the previous
        # batch; use them to update the running statistics.
        # NOTE(security): eval() executes arbitrary code. The string comes
        # from the competition API here; ast.literal_eval would be the safe
        # choice for untrusted input.
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)

        user_ids = prior_test_df['user_id'].values
        content_ids = prior_test_df['content_id'].values
        prior_question_elapsed_times = prior_test_df['prior_question_elapsed_time'].values
        targets = prior_test_df[target].values

        for user_id, content_id, prior_question_elapsed_time, answered_correctly in zip(user_ids, content_ids, prior_question_elapsed_times, targets):
            user_sum_dict[user_id] += answered_correctly
            user_count_dict[user_id] += 1
            content_sum_dict[content_id] += answered_correctly
            content_count_dict[content_id] += 1
            # Residual against the content mean that already includes this answer.
            mean_accuracy = content_sum_dict[content_id] / clip(content_count_dict[content_id])
            residual_sum_dict[user_id] += answered_correctly - mean_accuracy

            # Missing elapsed times change neither the sum nor the count.
            prior_question_elapsed_time_sum_dict[user_id] += 0 if np.isnan(prior_question_elapsed_time) else prior_question_elapsed_time
            prior_question_elapsed_time_count_dict[user_id] += 0 if np.isnan(prior_question_elapsed_time) else 1

    # Keep the unfiltered batch: the next batch's answers refer to all its rows.
    prior_test_df = test_df.copy()

    test_df = pd.merge(test_df, questions_df, left_on='content_id', right_on='question_id', how='left')
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False).astype('uint8')

    n_rows = len(test_df)
    user_sum = np.zeros(n_rows, dtype=np.int16)
    user_count = np.zeros(n_rows, dtype=np.int16)
    res_sum = np.zeros(n_rows, dtype=np.float32)
    content_sum = np.zeros(n_rows, dtype=np.int32)
    content_count = np.zeros(n_rows, dtype=np.int32)
    # int64: accumulated elapsed times can exceed the int32 range.
    prior_question_elapsed_time_sum = np.zeros(n_rows, dtype=np.int64)
    prior_question_elapsed_time_count = np.zeros(n_rows, dtype=np.int32)

    # Look up the running statistics of every row's user and content item.
    for i, (user_id, content_id) in enumerate(zip(test_df['user_id'].values, test_df['content_id'].values)):
        user_sum[i] = user_sum_dict[user_id]
        user_count[i] = user_count_dict[user_id]
        res_sum[i] = residual_sum_dict[user_id]
        content_sum[i] = content_sum_dict[content_id]
        content_count[i] = content_count_dict[content_id]
        prior_question_elapsed_time_sum[i] = prior_question_elapsed_time_sum_dict[user_id]
        prior_question_elapsed_time_count[i] = prior_question_elapsed_time_count_dict[user_id]

    # Bound the denominators away from zero before dividing.
    content_count = clip(content_count)
    user_count = clip(user_count)
    prior_question_elapsed_time_count = clip(prior_question_elapsed_time_count)
    test_df['user_correctness'] = user_sum / user_count
    test_df['residual_user_mean'] = res_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_id'] = content_sum / content_count
    test_df['prior_question_elapsed_time_mean'] = prior_question_elapsed_time_sum / prior_question_elapsed_time_count

    # Assigned back instead of a chained fillna(inplace=True) on a column
    # selection, which is not guaranteed to modify test_df itself.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(elapsed_time_fill)

    test_df.fillna(0, inplace=True)
    test_df[cat_features] = test_df[cat_features].apply(pd.to_numeric, downcast='integer')
    # A batch can be empty after removing non-question rows; there is nothing
    # to predict in that case.
    if n_rows:
        test_df[target] = predict(test_df)
    else:
        test_df[target] = np.zeros(0, dtype=np.float32)

    env.predict(test_df[['row_id', target]])

In [None]:
# Display the last processed test batch as the cell output.
test_df