# Main changes and references

1. Version 1: A warm-up scheduler is added, since Transformer-based models are known to usually need a warm-up phase.
https://ufal.mff.cuni.cz/pbml/110/art-popel-bojar.pdf
The learning rate starts several orders of magnitude below the target learning rate, gradually increases to that level, and then decays again following one of several schedules. In later epochs the learning rate may jump back up again (this is called a restart).

2. Version 2: @gilfernandes's modification, stacked attention layers with non-shared weights, is added: https://www.kaggle.com/gilfernandes/riiid-self-attention-transformer, and a simple function for checking the model's size is added as well.

**Reference**: 
- https://www.kaggle.com/leadbest/sakt-with-randomization-state-updates 
- https://www.kaggle.com/wangsg/a-self-attentive-model-for-knowledge-tracing
- https://www.kaggle.com/manikanthr5/riiid-sakt-model-training-public
- https://www.kaggle.com/marisakamozz/cv-strategy-in-the-kaggle-environment
- https://huggingface.co/transformers/v1.2.0/_modules/pytorch_transformers/optimization.html


In [None]:
import gc
import psutil
import random
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

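`WARMUP_EPOCHS` is the number of warm-up epochs, during which the learning rate increases linearly from 0 to the desired learning rate. After that the learning rate starts to decay. `NUM_RESTARTS` is passed to the scheduler as `cycles`, i.e. the number of cosine decay cycles that follow the warm-up, and therefore controls how often the learning rate restarts over all the epochs.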

In [None]:
# --- Learning-rate schedule (the scheduler is stepped once per epoch) ---
WARMUP_EPOCHS = 10  # passed to the scheduler as `warmup_steps`
NUM_RESTARTS = 2    # passed to the scheduler as `cycles`
EPOCHS = 40         # number of training epochs; also the scheduler's `t_total`

# --- Data loading / model size ---
BATCH_SIZE = 256        # batch size of the training DataLoader
VAL_BATCH_SIZE = 2048   # batch size of the validation DataLoader
MAX_SEQ = 160           # default window length of the datasets; the model sees MAX_SEQ - 1 positions
NUM_HEADS = 8           # passed to SAKTModel as `heads`
NUM_EMBED = 128         # passed to SAKTModel as `embed_dim`
NUM_LAYERS = 2          # passed to SAKTModel as `enc_layers`

# Sequences shorter than this are dropped when building the training samples.
ACCEPTED_USER_CONTENT_SIZE = 3
# Sizes the embedding tables and offsets the "answered correctly" interaction ids.
# Presumably the number of distinct question ids in the dataset -- TODO confirm.
n_skill = 13523

LEARNING_RATE = 5e-4  # peak learning rate handed to Adam

# Data
Here we use @marisakamozz's almost unbiased CV split files: https://www.kaggle.com/marisakamozz/cv-strategy-in-the-kaggle-environment

In [None]:
%%time

# Compact dtypes for the columns that are kept, to reduce memory usage.
# int16 is enough for `content_id` as long as ids stay below 32768
# (n_skill is 13523 in this notebook).
dtypes = {'timestamp': 'int64', 
          'user_id': 'int32' ,
          'content_id': 'int16',
          'content_type_id': 'int8',
          'answered_correctly':'int8'}
# Only these columns are used by the rest of the notebook.
train_cols = ['timestamp', 
              'user_id', 
              'content_id', 
              'content_type_id', 
              'answered_correctly']
# Load the training split, keep the needed columns and downcast them.
train_df = pd.read_parquet('../input/cv-strategy-in-the-kaggle-environment/cv4_train.parquet')
train_df = train_df[train_cols]
train_df = train_df.astype(dtypes)

# for col, dtype in dtypes.items():
#     train_df[col] = train_df[col].astype(dtype)
print(train_df.dtypes)

## Preprocess

In [None]:
# Keep only rows whose content_type_id is 0 (presumably the question rows,
# as opposed to lectures -- TODO confirm against the competition data spec).
train_df = train_df[train_df.content_type_id == False]
# Sort globally by timestamp so that each user's rows end up in chronological
# order inside the per-user groups built below.
train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop=True)
# One entry per user: a tuple (content_id array, answered_correctly array).
group = train_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id')\
    .apply(lambda r: (r['content_id'].values,
                      r['answered_correctly'].values))
# The raw frame is no longer needed; release it before the next big allocation.
del train_df
gc.collect();

In [None]:
%%time
# Load the validation split with the same columns and dtypes as the training split.
valid_df = pd.read_parquet('../input/cv-strategy-in-the-kaggle-environment/cv4_valid.parquet')
valid_df = valid_df[train_cols]
valid_df = valid_df.astype(dtypes)

# Keep only rows whose content_type_id is 0, as for the training data.
# NOTE(review): unlike the training frame, this frame is not sorted by
# timestamp before grouping -- presumably the parquet file is already in
# chronological order per user; TODO confirm.
valid_df = valid_df[valid_df.content_type_id == False]
# One entry per user: a tuple (content_id array, answered_correctly array).
val_group = valid_df[['user_id', 'content_id', 'answered_correctly']]\
    .groupby('user_id')\
    .apply(lambda r: (r['content_id'].values,
                      r['answered_correctly'].values))
del valid_df
gc.collect();

In [None]:
class SAKTDataset(Dataset):
    """Fixed-length training/validation windows built from per-user histories.

    Args:
        group: pandas Series indexed by user id whose values are tuples
            ``(content_id array, answered_correctly array)``.
        n_skill: offset added to a question id when it was answered correctly,
            so that (question, correct) pairs get their own interaction id.
        max_seq: window length. Histories longer than this are split into
            several samples (contribution by Manikanth Reddy); shorter ones
            are left-padded with zeros in ``__getitem__``.

    Histories (or leading remainders of split histories) with fewer than
    ``ACCEPTED_USER_CONTENT_SIZE`` entries are dropped.
    """

    def __init__(self, group, n_skill, max_seq=MAX_SEQ):
        super().__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill
        self.samples = {}
        self.user_ids = []

        for user_id in group.index:
            q, qa = group[user_id]
            total_questions = len(q)
            if total_questions < ACCEPTED_USER_CONTENT_SIZE:
                continue

            if total_questions <= self.max_seq:
                self._add_sample(str(user_id), q, qa)
                continue

            # Long history: the leading remainder becomes sample "<user>_0"
            # (if long enough), followed by full windows "<user>_1", ...
            initial = total_questions % self.max_seq
            if initial >= ACCEPTED_USER_CONTENT_SIZE:
                self._add_sample(f"{user_id}_0", q[:initial], qa[:initial])
            for seq in range(total_questions // self.max_seq):
                start = initial + seq * self.max_seq
                end = start + self.max_seq
                self._add_sample(f"{user_id}_{seq+1}", q[start:end], qa[start:end])

    def _add_sample(self, key, q, qa):
        """Register one sample under ``key``, keeping ids and samples in sync."""
        self.user_ids.append(key)
        self.samples[key] = (q, qa)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return ``(x, target_id, label)``, each of length ``max_seq - 1``.

        ``x`` holds the past interactions (question id, plus ``n_skill`` when
        the answer was correct), ``target_id`` the question asked at the next
        position and ``label`` whether that question was answered correctly.
        Padding positions are 0 in all three arrays.
        """
        user_id = self.user_ids[index]
        q_, qa_ = self.samples[user_id]
        seq_len = len(q_)

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)
        # Right-align the history; when seq_len == max_seq this fills the
        # whole buffer, so no separate branch is needed.
        if seq_len:
            q[-seq_len:] = q_
            qa[-seq_len:] = qa_

        target_id = q[1:]
        label = qa[1:]

        x = q[:-1].copy()
        x += (qa[:-1] == 1) * self.n_skill

        return x, target_id, label

In [None]:
# Training samples are reshuffled every epoch.
dataset = SAKTDataset(group, n_skill)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)

# Validation samples are served in a fixed order and no batch is dropped.
valid_dataset = SAKTDataset(val_group, n_skill)
val_loader = DataLoader(
    valid_dataset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=4,
)

# Sanity check: pull one batch from each loader and compare tensor shapes.
item = next(iter(dataloader))
val_item = next(iter(val_loader))

for train_part, val_part in zip(item, val_item):
    print(train_part.shape, val_part.shape)

# Model

In [None]:
class FFN(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> BatchNorm1d -> Linear -> Dropout.

    Args:
        state_size: input and output feature size of both linear layers.
        bn_size: number of features given to ``nn.BatchNorm1d``. The block is
            fed ``[bs, s_len, embed]`` tensors by ``TransformerBlock``, so this
            has to match the sequence length seen by the model.
        dropout: dropout probability applied to the output.
    """

    def __init__(self, state_size=200, bn_size=MAX_SEQ-1, dropout=0.2):
        super().__init__()
        self.state_size = state_size

        # Submodule names and creation order are kept stable on purpose:
        # they determine checkpoint keys and seeded weight initialisation.
        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(dropout)
        self.bn = nn.BatchNorm1d(bn_size)

    def forward(self, x):
        hidden = self.bn(self.relu(self.lr1(x)))
        return self.dropout(self.lr2(hidden))

def future_mask(seq_length):
    """Return a ``[seq_length, seq_length]`` boolean tensor masking future positions.

    Entry ``(i, j)`` is True exactly when ``j > i``, i.e. for the positions a
    query at step ``i`` is not allowed to attend to.
    """
    # Lower triangle including the diagonal marks the visible positions;
    # its complement is the strictly-upper triangle.
    visible = np.tri(seq_length, seq_length, k=0, dtype=bool)
    return torch.from_numpy(~visible)


class TransformerBlock(nn.Module):
    """Multi-head attention followed by a feed-forward block, each with a
    residual connection, LayerNorm and dropout.

    Args:
        embed_dim: feature size of the inputs and outputs.
        heads: number of attention heads.
        dropout: dropout probability used in the attention, after each
            LayerNorm and inside the feed-forward block.
    """

    def __init__(self, embed_dim, 
                 heads = 8, 
                 dropout = 0.2):
        super().__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)

    def forward(self, value, key, query, att_mask):
        """Run one attention + feed-forward step.

        NOTE: the parameter names do not match their roles. The first argument
        (``value``) is used as the attention *query* and as the residual input,
        while the third argument (``query``) is used as the attention *value*.
        The names are kept for backward compatibility; the keyword call below
        makes the mapping explicit.

        Args:
            value: ``[s_len, bs, embed]`` tensor used as attention query.
            key: ``[s_len, bs, embed]`` tensor used as attention key.
            query: ``[s_len, bs, embed]`` tensor used as attention value.
            att_mask: attention mask forwarded as ``attn_mask``.

        Returns:
            Tuple ``(x, att_weight)`` where ``x`` is ``[bs, s_len, embed]`` and
            ``att_weight`` are the weights returned by the attention module.
        """
        att_output, att_weight = self.multi_att(query=value, key=key, value=query,
                                                attn_mask=att_mask)
        att_output = self.dropout(self.layer_normal(att_output + value))
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]
        x = self.ffn(att_output)
        x = self.dropout(self.layer_normal_2(x + att_output))
        # The former `x.squeeze(-1)` was a no-op for embed_dim > 1 and dropped
        # the feature axis for embed_dim == 1, so it is removed.
        return x, att_weight
    
class Encoder(nn.Module):
    """Embeds interactions and questions and runs them through stacked
    ``TransformerBlock`` layers with non-shared weights.

    Args:
        n_skill: sizes the embedding tables (``2 * n_skill + 1`` interaction
            ids, ``n_skill + 1`` question ids).
        max_seq: window length; positions ``0 .. max_seq - 2`` get a learned
            positional embedding.
        embed_dim: embedding / feature size.
        dropout: dropout probability for the embeddings and the blocks.
        num_layers: number of stacked blocks.
        heads: number of attention heads in each block.

    NOTE(review): the blocks build their ``FFN`` with its default
    ``bn_size=MAX_SEQ-1``, so inputs must have ``MAX_SEQ - 1`` positions
    regardless of the ``max_seq`` given here.
    """

    def __init__(self, n_skill, 
                 max_seq=100, 
                 embed_dim=128, 
                 dropout = 0.2, 
                 num_layers=1, 
                 heads = 8):
        super().__init__()
        self.n_skill, self.embed_dim = n_skill, embed_dim
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        # Forward `heads` and `dropout`; previously both were accepted but
        # ignored, so every block silently used its own defaults.
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, heads=heads, dropout=dropout)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, question_ids):
        """Encode a batch.

        Args:
            x: ``[bs, s_len]`` integer tensor of past interaction ids.
            question_ids: ``[bs, s_len]`` integer tensor of target question ids.

        Returns:
            Tuple ``(x, att_weight)``: the encoded sequence ``[bs, s_len, embed]``
            and the attention weights of the last block (``None`` when there
            are no blocks).
        """
        device = x.device
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)
        pos_x = self.pos_embedding(pos_id)
        x = self.dropout(x + pos_x)
        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = self.e_embedding(question_ids)
        e = e.permute(1, 0, 2)

        # The causal mask only depends on the sequence length: build it once.
        att_mask = future_mask(e.size(0)).to(device)
        att_weight = None
        for layer in self.layers:
            # Question embeddings act as attention queries, the running
            # interaction representation as keys and values.
            x, att_weight = layer(e, x, x, att_mask=att_mask)
            x = x.permute(1, 0, 2)  # back to [s_len, bs, embed] for the next block
        x = x.permute(1, 0, 2)
        return x, att_weight

class SAKTModel(nn.Module):
    """Encoder plus a linear head producing one logit per sequence position.

    Args:
        n_skill, max_seq, embed_dim, dropout, heads: forwarded to ``Encoder``.
        enc_layers: forwarded to ``Encoder`` as ``num_layers``.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = 0.2,
                 enc_layers=1, heads = 8):
        super().__init__()
        self.encoder = Encoder(
            n_skill=n_skill,
            max_seq=max_seq,
            embed_dim=embed_dim,
            dropout=dropout,
            num_layers=enc_layers,
            heads=heads,
        )
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return ``(logits, att_weight)``; the trailing size-1 axis of the
        linear head's output is squeezed away."""
        encoded, att_weight = self.encoder(x, question_ids)
        logits = self.pred(encoded).squeeze(-1)
        return logits, att_weight

# Scheduler

The schedulers below are adapted from https://huggingface.co/transformers/v2.0.0/main_classes/optimizer_schedules.html#schedules

In [None]:
from torch.optim.lr_scheduler import LambdaLR
import math

class WarmupCosineSchedule(LambdaLR):
    """Linear warmup followed by cosine decay.

    The learning-rate multiplier rises linearly from 0 to 1 during the first
    `warmup_steps` steps, then follows a cosine curve over the remaining
    `t_total - warmup_steps` steps. With the default `cycles=0.5` that curve
    goes from 1 down to 0; other values stretch or repeat the cosine.
    """

    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
        # Attributes must exist before LambdaLR.__init__, which already
        # evaluates the lambda once.
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        self.name = 'Warmup Cosine'
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        """Multiplier applied to the base learning rate at `step`."""
        in_warmup = step < self.warmup_steps
        if in_warmup:
            return float(step) / float(max(1.0, self.warmup_steps))
        # Fraction of the post-warmup phase that has elapsed.
        decay_steps = float(max(1, self.t_total - self.warmup_steps))
        progress = float(step - self.warmup_steps) / decay_steps
        cosine = math.cos(math.pi * float(self.cycles) * 2.0 * progress)
        return max(0.0, 0.5 * (1. + cosine))
    
class WarmupCosineWithHardRestartsSchedule(LambdaLR):
    """Linear warmup followed by cosine decay cycles with hard restarts.

    The learning-rate multiplier rises linearly from 0 to 1 during the first
    `warmup_steps` steps. The remaining steps are split into `cycles` cosine
    decays (default 1.); at the start of each cycle the multiplier jumps back
    to 1. Once `t_total` is reached the multiplier is 0.
    """

    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
        # Attributes must exist before LambdaLR.__init__, which already
        # evaluates the lambda once.
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        self.name = 'Warmup Cosine With Hard Restarts'
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        """Multiplier applied to the base learning rate at `step`."""
        warmup = self.warmup_steps
        if step < warmup:
            return float(step) / float(max(1, warmup))
        # Fraction of the post-warmup phase that has elapsed.
        progress = float(step - warmup) / float(max(1, self.t_total - warmup))
        if progress >= 1.0:
            return 0.0
        # Position inside the current cycle, in [0, 1).
        phase = (float(self.cycles) * progress) % 1.0
        return max(0.0, 0.5 * (1. + math.cos(math.pi * phase)))


# Create the model
Here we add a function that checks the model's size, just to make sure the model does not grow too large.

In [None]:
def get_num_params(model):
    """Return the number of trainable scalar parameters of ``model``.

    Only parameters with ``requires_grad=True`` are counted. The result is a
    plain Python ``int``. ``numel()`` is used rather than
    ``np.prod(p.size())`` because the latter yields the float ``1.0`` for
    0-dim parameters.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model from the notebook-level hyperparameters. `max_seq` is set to
# MAX_SEQ so that it agrees with the window length produced by the datasets.
model = SAKTModel(n_skill, embed_dim=NUM_EMBED, max_seq=MAX_SEQ, 
                  enc_layers=NUM_LAYERS, heads=NUM_HEADS, dropout=0.1)

# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()

model.to(device)
criterion.to(device);
# Report the number of trainable parameters as a quick size check.
n_params = get_num_params(model)
print(f"Current model has {n_params} parameters.")

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device="cuda"):
    """Train ``model`` for one pass over ``dataloader``.

    The loss is computed over every sequence position, while accuracy and AUC
    are computed on the last position of each sequence only.

    Args:
        model: module called as ``model(x, target_id)`` and returning
            ``(logits, attention_weights)``.
        dataloader: iterable of ``(x, target_id, label)`` batches.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss taking ``(logits, float labels)``.
        device: device the batches are moved to.

    Returns:
        Tuple ``(loss, acc, auc)``: mean batch loss, last-position accuracy and
        last-position ROC AUC.
    """
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []
    # Iterate directly; the loader's length is not needed, which also keeps
    # this usable with loaders that do not define __len__.
    for item in dataloader:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()
        label = item[2].to(device).float()

        optimizer.zero_grad()
        output, _ = model(x, target_id)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())

        # Metrics only look at the final position of every sequence.
        # detach() replaces the legacy `.data` access.
        last_output = output[:, -1].detach()
        last_label = label[:, -1]
        pred = (torch.sigmoid(last_output) >= 0.5).long()

        num_corrects += (pred == last_label).sum().item()
        num_total += len(last_label)

        labels.extend(last_label.view(-1).cpu().numpy())
        # Raw logits are fine for AUC: it only depends on the ranking.
        outs.extend(last_output.view(-1).cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(train_loss)

    return loss, acc, auc

def valid_epoch(model, valid_iterator, criterion, device="cuda"):
    """Evaluate ``model`` on ``valid_iterator`` without updating it.

    The loss is computed over every sequence position, while accuracy and AUC
    are computed on the last position of each sequence only.

    Args:
        model: module called as ``model(x, target_id)`` and returning
            ``(logits, attention_weights)``.
        valid_iterator: iterable of ``(x, target_id, label)`` batches.
        criterion: loss taking ``(logits, float labels)``.
        device: device the batches are moved to.

    Returns:
        Tuple ``(loss, acc, auc)``: mean batch loss, last-position accuracy and
        last-position ROC AUC.
    """
    model.eval()

    valid_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    # Nothing in this loop needs gradients, including the loss.
    with torch.no_grad():
        for item in valid_iterator:
            x = item[0].to(device).long()
            target_id = item[1].to(device).long()
            label = item[2].to(device).float()

            output, _ = model(x, target_id)
            loss = criterion(output, label)
            valid_loss.append(loss.item())

            # Metrics only look at the final position of every sequence.
            last_prob = torch.sigmoid(output[:, -1])
            last_label = label[:, -1]
            pred = (last_prob >= 0.5).long()

            num_corrects += (pred == last_label).sum().item()
            num_total += len(last_label)

            labels.extend(last_label.view(-1).cpu().numpy())
            outs.extend(last_prob.view(-1).cpu().numpy())

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.mean(valid_loss)

    return loss, acc, auc

## Scheduler is here
After each epoch, `scheduler.step()` needs to be called.

In [None]:
# LEARNING_RATE is the peak rate; the scheduler scales it by a multiplier in [0, 1].
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Alternative schedule without restarts:
# scheduler = WarmupCosineSchedule(optimizer, 
#                         warmup_steps=WARMUP_EPOCHS, 
#                         t_total=EPOCHS)
# All step counts are expressed in epochs because the scheduler is stepped
# once per epoch in the training loop.
scheduler = WarmupCosineWithHardRestartsSchedule(optimizer, 
                                                 warmup_steps=WARMUP_EPOCHS, 
                                                 cycles=NUM_RESTARTS, 
                                                 t_total=EPOCHS)
lr = []                      # learning-rate history, one entry per epoch
best_auc = 0                 # best validation AUC seen so far
best_epoch = 0               # epoch index at which best_auc was reached
saving_threshold =  0.7675   # a checkpoint is written only above this validation AUC

In [None]:
# Train for EPOCHS epochs, stepping the scheduler once per epoch and
# checkpointing whenever the validation AUC improves beyond saving_threshold.
for epoch in range(EPOCHS):
    # Record the learning rate *before* training so that the logged/plotted
    # value is the one actually used during this epoch (it used to be read
    # after scheduler.step(), i.e. it was the next epoch's rate).
    # NOTE(review): the warm-up multiplier is 0 at step 0, so with
    # WARMUP_EPOCHS > 0 the first epoch runs with a learning rate of 0.
    lr.append(optimizer.param_groups[0]['lr'])
    loss, acc, auc = train_epoch(model, 
                                 dataloader, 
                                 optimizer, 
                                 criterion, 
                                 device)
    scheduler.step()
    print(f"\nEpoch - [{epoch+1}/{EPOCHS}]")
    print(f"Learning rate : {lr[-1]:.4e} ")
    print(f"train: loss - {loss:.3f} acc - {acc:.3f} auc - {auc:.3f}")
    val_loss, val_acc, val_auc = valid_epoch(model, 
                                             val_loader, 
                                             criterion, 
                                             device)
    print(f"valid: loss - {val_loss:.3f} acc - {val_acc:.3f} auc - {val_auc:.3f}")
    
    if best_auc < val_auc:
        print(f'epoch - {epoch + 1} best model with val auc: {val_auc}')
        best_auc = val_auc
        best_epoch = epoch
        # Only persist checkpoints that clear the quality bar; the AUC is part
        # of the file name, so every improvement gets its own file.
        if best_auc > saving_threshold:
            model_name = f"sakt_layer_{NUM_LAYERS}_head_{NUM_HEADS}"
            model_name += f"_embed_{NUM_EMBED}_seq_{MAX_SEQ}"
            model_name += f"_auc_{val_auc:.4f}.pt"
            torch.save(model.state_dict(), model_name)

In [None]:
# Plot the learning-rate values collected in `lr` during training,
# titled with the scheduler's human-readable name.
plt.figure(figsize=(10, 6))
plt.plot(lr)
plt.xlabel('epoch')
plt.ylabel('Learning rate')
plt.suptitle(f'{scheduler.name}')
plt.show()

In [None]:
# Drop the notebook-level names of the datasets before inference.
# NOTE(review): `dataloader` and `val_loader` were built from these objects and
# presumably still reference them, so this alone may not free memory -- verify.
del dataset, valid_dataset

# Test

In [None]:
class TestDataset(Dataset):
    """Inference dataset: one item per row of ``test_df``.

    Args:
        samples: pandas Series indexed by user id whose values are tuples
            ``(content_id array, answered_correctly array)`` holding each
            user's known history.
        test_df: frame with at least ``user_id`` and ``content_id`` columns;
            every row is a question to predict.
        n_skill: offset added to a question id when it was answered correctly.
        max_seq: window length; only the last ``max_seq`` history entries are
            used.
    """

    def __init__(self, samples, test_df, n_skill, max_seq=MAX_SEQ): #HDKIM 100
        super().__init__()
        self.samples = samples
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        self.n_skill = n_skill
        self.max_seq = max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return ``(x, questions)``, each of length ``max_seq - 1``.

        ``x`` holds the user's most recent interactions and ``questions`` the
        question asked after each of them, ending with the row's own
        ``content_id``; the prediction for the row is read at the last
        position. Users without history get all-zero padding.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info["user_id"]
        target_id = test_info["content_id"]

        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            q_, qa_ = self.samples[user_id]
            # Always copy into the zero-padded int buffers. Rebinding q/qa to
            # slices of the stored arrays (as was done for long histories)
            # kept their narrow dtypes, so items had inconsistent dtypes and
            # the in-place offset below could overflow for larger n_skill.
            q_ = q_[-self.max_seq:]
            qa_ = qa_[-self.max_seq:]
            seq_len = len(q_)
            if seq_len:  # an empty history leaves the padding untouched
                q[-seq_len:] = q_
                qa[-seq_len:] = qa_

        x = q[1:].copy()
        x += (qa[1:] == 1) * self.n_skill

        questions = np.append(q[2:], [target_id])

        return x, questions

In [None]:
# def find_files(name, path):
#     result = []
#     for root, dirs, files in os.walk(path):
#         for _file in files:
#             if name in _file:
#                 result.append(os.path.join(root, _file))
#     return result


# try: 
#     model = SAKTModel(n_skill, 
#                   embed_dim=NUM_EMBED, 
#                   max_seq=MAX_SEQ, 
#                   enc_layers=NUM_LAYERS, 
#                   heads=NUM_HEADS, dropout=0.1)

#     model.to(device)
#     print("Loading the best AUC model trained in the current notebook")
#     model_name = f"../working/sakt_layer_{NUM_LAYERS}_head_{NUM_HEADS}"
#     model_name += f"_embed_{NUM_EMBED}_seq_{MAX_SEQ}"
#     model_name += f"_auc_{val_auc:.4f}.pt"
#     model.load_state_dict(torch.load(model_name, map_location=device))
# except:
# #     model_files = find_files('sakt', '../working/')
# #     print(f"Loading {model_files[-1]}")
# #     model.load_state_dict(torch.load(model_files[-1], map_location=device))
#     print("Use current trained model.")

In [None]:
import riiideducation

# Create the competition environment and the iterator that yields the test
# data; the inference loop consumes it as (test_df, sample_prediction_df) pairs.
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
%%time
model.eval()
prev_test_df = None

# Each iteration receives one group of test rows. The answers to the previous
# group arrive with the current one, so the per-user histories in `group` are
# updated with the previous group before predicting the current one.
for (test_df, sample_prediction_df) in tqdm(iter_test):
    # Skip the history update when memory is nearly exhausted.
    if (prev_test_df is not None) and (psutil.virtual_memory().percent < 95):
        # SECURITY NOTE(review): eval() executes arbitrary code. The value comes
        # from the competition API here; ast.literal_eval would be a safer way
        # to parse it if it is a plain list literal -- confirm before switching.
        prev_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
        
        prev_group = prev_test_df[['user_id', 
                                   'content_id', 
                                   'answered_correctly']]\
            .groupby('user_id').apply(lambda r: (
                                            r['content_id'].values,
                                            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            if prev_user_id in group.index:
                # Known user: append the new interactions and keep only the
                # most recent MAX_SEQ entries.
                group[prev_user_id] = (
                    np.append(group[prev_user_id][0], 
                              prev_group[prev_user_id][0])[-MAX_SEQ:], 
                    np.append(group[prev_user_id][1], 
                              prev_group[prev_user_id][1])[-MAX_SEQ:]
                )
 
            else:
                # New user: start a history from this group's interactions.
                group[prev_user_id] = (
                    prev_group[prev_user_id][0], 
                    prev_group[prev_user_id][1]
                )

    prev_test_df = test_df.copy()
    
    # Work on an explicit copy so that adding the prediction column below does
    # not write into a slice of the original frame.
    test_df = test_df[test_df.content_type_id == False].copy()
    test_dataset = TestDataset(group, test_df, n_skill)
    # One batch for the whole group; max(1, ...) avoids DataLoader rejecting
    # batch_size=0 when no rows are left after the filter above.
    test_dataloader = DataLoader(test_dataset, batch_size=max(1, len(test_df)), shuffle=False)
    
    outs = []

    for item in tqdm(test_dataloader):
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()

        with torch.no_grad():
            output, att_weight = model(x, target_id)
        # The prediction for each row is the probability at the last position.
        outs.extend(torch.sigmoid(output)[:, -1].view(-1).cpu().numpy())
        
    test_df['answered_correctly'] = outs
    env.predict(test_df.loc[test_df['content_type_id'] == 0, 
                            ['row_id', 'answered_correctly']])

In [None]:
# Sanity check: load the submission file from the working directory and look
# at the distribution of the predicted probabilities.
sub = pd.read_csv('../working/submission.csv')
sub['answered_correctly'].hist(bins=15)