In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import math
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [None]:
# Transformer capacity (forwarded to SAINTModel when the model is built).
embed_dim = 256
num_head = 8
num_layer = 2

# n_skill sizes the exercise (content_id) embedding table;
# max_seq is the per-user window length used by the datasets.
n_skill = 13523
max_seq = 130

# Mini-batch size for the train/validation DataLoaders.
batch_size = 256

## Load data

In [None]:
%%time
# Narrow integer dtypes requested for the columns that are read.
dtype = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    answered_correctly='int8',
)

# Only the columns at positions 1, 2, 3, 4 and 7 of the CSV are loaded.
train_df = pd.read_csv(
    "/kaggle/input/riiid-test-answer-prediction/train.csv",
    usecols=[1, 2, 3, 4, 7],
    dtype=dtype,
)

In [None]:
# Question metadata; its "question_id" and "part" columns are merged onto the interactions later.
questions_csv_path = "../input/riiid-test-answer-prediction/questions.csv"
question_df = pd.read_csv(questions_csv_path)

## Preprocess

In [None]:
# Keep rows whose content_type_id is 0/False (presumably questions rather than
# lectures — confirm against the dataset description), then attach each row's
# "part" from the question metadata via a left join on the question id.
train_df = train_df.loc[train_df["content_type_id"] == False].merge(
    question_df[["question_id", "part"]],
    left_on="content_id",
    right_on="question_id",
    how="left",
)

In [None]:
def _collect_user_history(rows):
    """Bundle one user's rows into a (content_id, answered_correctly, part) tuple of arrays."""
    return (
        rows['content_id'].values,
        rows['answered_correctly'].values,
        rows['part'].values,
    )


# One entry per user_id, holding that user's three aligned arrays.
history_columns = ['user_id', 'content_id', 'answered_correctly', 'part']
group = train_df[history_columns].groupby('user_id').apply(_collect_user_history)

# The flat frame is no longer needed once the per-user series exists.
del train_df
gc.collect()

## Dataset

In [None]:
class RIIIDDataset(Dataset):
    """Per-user interaction windows for training / validating the model.

    Each item is one user's history cut to at most ``max_seq`` interactions,
    left-padded with zeros, and then shifted by one step so that position ``i``
    pairs the response given at step ``i`` with the question asked at step
    ``i + 1`` (whose correctness is the label).

    Args:
        group: Object indexed by user id (must support ``group.index`` and
            ``group[user_id]``) whose values are
            ``(content_id, answered_correctly, part)`` array triples.
        n_skill: Stored on the instance; not otherwise used by this class.
        subset: ``"train"`` enables random window / random prefix sampling.
            Any other value yields a deterministic window (the most recent
            ``max_seq`` interactions, or the full history when shorter).
        max_seq: Window length before the one-step shift.
    """

    def __init__(self, group, n_skill, subset="train", max_seq=100):
        super(RIIIDDataset, self).__init__()
        self.max_seq = max_seq
        self.n_skill = n_skill # 13523
        self.group = group
        self.subset = subset

        # Users need at least 2 interactions: one to condition on, one to predict.
        self.user_ids = [
            user_id for user_id in group.index
            if len(group[user_id][0]) >= 2
        ]

    def __len__(self):
        return len(self.user_ids)

    def _select_window(self, seq_len):
        """Return ``(start, end)`` bounds of the history slice to use.

        Random sampling happens only for the training subset (90% of draws);
        validation always gets the same window so its metrics are comparable
        across epochs.
        """
        randomize = self.subset == "train" and random.random() > 0.1

        if seq_len >= self.max_seq:
            if randomize:
                # Random contiguous window of exactly max_seq interactions.
                start = random.randint(0, seq_len - self.max_seq)
                return start, start + self.max_seq
            # Most recent max_seq interactions.
            return seq_len - self.max_seq, seq_len

        if randomize:
            # Random-length prefix (at least 2 interactions) of a short history.
            return 0, random.randint(2, seq_len)
        return 0, seq_len

    def __getitem__(self, index):
        """Return ``(response, content_id, part, correct)``, each of length ``max_seq - 1``."""
        user_id = self.user_ids[index] # Pick a user
        content_id_, correct_, part_ = self.group[user_id]

        start, end = self._select_window(len(content_id_))
        window_len = end - start

        # Zero is the padding value; real data is right-aligned in the window.
        content_id = np.zeros(self.max_seq, dtype=int)
        part = np.zeros(self.max_seq, dtype=int)
        correct = np.zeros(self.max_seq, dtype=int)

        content_id[-window_len:] = content_id_[start:end]
        part[-window_len:] = part_[start:end]
        correct[-window_len:] = correct_[start:end]

        # Shift by one: the previous response is the decoder input, while the
        # current question / part / correctness are the encoder input and label.
        response = correct[:-1]
        content_id = content_id[1:]
        part = part[1:]
        correct = correct[1:]

        return response, content_id, part, correct

In [None]:
# Hold out 10% of the per-user entries for validation.
train, val = train_test_split(group, test_size=0.1)

# Both loaders use identical batching, shuffling and worker settings.
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=8)

train_dataset = RIIIDDataset(train, n_skill, subset="train", max_seq=max_seq)
train_dataloader = DataLoader(train_dataset, **loader_options)
del train

val_dataset = RIIIDDataset(val, n_skill, subset="val", max_seq=max_seq)
val_dataloader = DataLoader(val_dataset, **loader_options)
del val

print("train dataset ", len(train_dataset), " validation dataset ", len(val_dataset))

## Model and training

In [None]:
class SAINTModel(nn.Module):
    """Transformer encoder-decoder over exercise and response sequences.

    The encoder consumes the sum of exercise-id, position and part embeddings;
    the decoder consumes the sum of response and position embeddings. A linear
    head maps each decoder output to one logit per position.

    Args:
        n_skill: Largest exercise id; the exercise table has ``n_skill + 1`` rows.
        max_seq: Window length before the one-step shift, so the position
            tables have ``max_seq - 1`` rows.
        embed_dim: Width of all embeddings and of the transformer.
        num_head: Number of attention heads.
        num_layer: Number of encoder layers and of decoder layers.
    """

    def __init__(self, n_skill, max_seq=100,
                 embed_dim=512, num_head=8, num_layer=4):
        super(SAINTModel, self).__init__()
        self.n_skill = n_skill

        # Decoder side: binary response plus learned position.
        self.res_embedding = nn.Embedding(2, embed_dim)
        self.res_pos = nn.Embedding(max_seq-1, embed_dim)

        # Encoder side: exercise id, learned position and part.
        self.ex_embedding = nn.Embedding(n_skill+1, embed_dim)
        self.ex_pos = nn.Embedding(max_seq-1, embed_dim)

        self.part_embedding = nn.Embedding(8, embed_dim)

        self.transformer = nn.Transformer(d_model=embed_dim, nhead=num_head,
                                        num_decoder_layers=num_layer, num_encoder_layers=num_layer)

        self.pred = nn.Linear(embed_dim, 1)

        self._reset_parameters()

    def _reset_parameters(self):
        r"""Initiate parameters in the model."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    @staticmethod
    def _causal_mask(num_queries, num_keys, device):
        """Additive attention mask: query ``i`` may attend keys ``0..i`` only.

        Allowed entries are 0 and blocked (future) entries are ``-inf``.
        """
        blocked = torch.full((num_queries, num_keys), float("-inf"), device=device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, r, e, part):
        """Compute one logit per sequence position.

        Args:
            r: Response ids, indexed as ``[batch, seq]``.
            e: Exercise ids, indexed as ``[batch, seq]``.
            part: Part ids, same shape as ``e``.

        Returns:
            Tensor ``[batch, seq]`` of logits (no sigmoid applied).
        """
        device = r.device
        src_len, tgt_len = e.size(1), r.size(1)

        src = self.ex_embedding(e)
        src = src + self.ex_pos(torch.arange(src_len, device=device).unsqueeze(0))
        src = src + self.part_embedding(part)

        tgt = self.res_embedding(r)
        tgt = tgt + self.res_pos(torch.arange(tgt_len, device=device).unsqueeze(0))

        src_mask = self._causal_mask(src_len, src_len, device)
        tgt_mask = self._causal_mask(tgt_len, tgt_len, device)
        # Without this mask the decoder at position i would cross-attend to
        # encoder states of exercises after i. At inference only the last
        # position is read, so training would see context inference never has.
        memory_mask = self._causal_mask(tgt_len, src_len, device)

        src = src.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        tgt = tgt.permute(1, 0, 2)
        x = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask,
                             memory_mask=memory_mask)
        x = x.permute(1, 0, 2)

        output = self.pred(x)

        return output.squeeze(-1)

In [None]:
# Build the network from the hyper-parameters set in the configuration cell.
model = SAINTModel(
    n_skill,
    max_seq=max_seq,
    embed_dim=embed_dim,
    num_head=num_head,
    num_layer=num_layer,
)

## Train Model

In [None]:
def get_cosine_schedule_with_warmup(
    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
):
    """
    Build a LambdaLR scheduler with linear warmup followed by cosine decay.

    The multiplier applied to the optimizer's initial lr rises linearly from 0 to 1
    over the first ``num_warmup_steps`` steps, then follows a cosine curve that
    reaches 0 at ``num_training_steps`` (for the default ``num_cycles``).

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            Optimizer whose learning rate is scheduled.
        num_warmup_steps (:obj:`int`):
            Length of the linear warmup phase, in scheduler steps.
        num_training_steps (:obj:`int`):
            Total number of scheduler steps.
        num_cycles (:obj:`float`, `optional`, defaults to 0.5):
            Number of cosine waves after warmup; 0.5 is a single half-cosine
            going from the maximum down to 0.
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            Index of the last epoch when resuming training.

    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` with the schedule described above.
    """
    # Both denominators are clamped to at least 1 to avoid division by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            progress = float(current_step - num_warmup_steps) / decay_span
            cosine = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
            return max(0.0, 0.5 * (1.0 + cosine))
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, lr_lambda, last_epoch)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.BCEWithLogitsLoss()

# Schedule lengths are expressed in optimizer steps: 4 epochs' worth of
# warmup and 50 epochs' worth of steps in total.
steps_per_epoch = len(train_dataset)/batch_size
num_warmup_steps = steps_per_epoch * 4
num_training_steps = steps_per_epoch * 50
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the network and the loss module onto the selected device.
for module in (model, criterion):
    module.to(device)
print('model to gpu')

In [None]:
def train_epoch(model, train_iterator, optim, criterion, scheduler, device="cpu"):
    """Run one optimisation pass over ``train_iterator``.

    The loss is taken over every position whose part id is non-zero. Accuracy
    and AUC are tracked on the final position of each sequence only. The
    scheduler is stepped once per batch.

    Args:
        model: Called as ``model(r, e, p)``; returns one logit per position.
        train_iterator: Yields ``(response, content_id, part, label)`` batches.
        optim: Optimizer updating ``model``.
        criterion: Loss taking ``(logits, float labels)``.
        scheduler: Learning-rate scheduler.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean batch loss, last-position accuracy, last-position AUC)``.
    """
    model.train()

    batch_losses = []
    correct_count = 0
    seen_count = 0
    all_labels = []
    all_scores = []

    tbar = tqdm(train_iterator)
    for batch in tbar:
        r, e, p = (batch[i].to(device).long() for i in range(3))
        label = batch[3].to(device).float()

        # Positions with part id 0 are excluded from the loss.
        keep = p != 0

        optim.zero_grad()
        output = model(r, e, p)

        loss = criterion(
            torch.masked_select(output, keep),
            torch.masked_select(label, keep),
        )
        loss.backward()
        optim.step()
        scheduler.step()
        batch_losses.append(loss.item())

        # Metrics use only the last position of every sequence.
        last_output = output[:, -1]
        last_label = label[:, -1]
        pred = (torch.sigmoid(last_output) >= 0.5).long()

        correct_count += (pred == last_label).sum().item()
        seen_count += len(last_label)

        all_labels.extend(last_label.view(-1).data.cpu().numpy())
        all_scores.extend(last_output.view(-1).data.cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = correct_count / seen_count
    auc = roc_auc_score(all_labels, all_scores)
    loss = np.average(batch_losses)

    return loss, acc, auc

In [None]:
def validation(model, val_iterator, criterion, device="cpu"):
    """Evaluate ``model`` on ``val_iterator`` without updating weights.

    Loss, accuracy and AUC are all computed over every position whose part id
    is non-zero.

    Args:
        model: Called as ``model(r, e, p)``; returns one logit per position.
        val_iterator: Yields ``(response, content_id, part, label)`` batches.
        criterion: Loss taking ``(logits, float labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean batch loss, accuracy, AUC)``.
    """
    model.eval()

    val_loss = []
    num_corrects = 0
    num_total = 0
    labels = []
    outs = []

    tbar = tqdm(val_iterator)
    for item in tbar:
        r = item[0].to(device).long()
        e = item[1].to(device).long()
        p = item[2].to(device).long()
        label = item[3].to(device).float()
        tgt_mask = (p != 0)

        with torch.no_grad():
            output = model(r, e, p)

            # masked_select returns 1-D tensors of the kept positions.
            output = torch.masked_select(output, tgt_mask)
            label = torch.masked_select(label, tgt_mask)

            loss = criterion(output, label)
        val_loss.append(loss.item())

        pred = (torch.sigmoid(output) >= 0.5).long()

        num_corrects += (pred == label).sum().item()
        num_total += len(label)

        # view(-1) keeps the tensors 1-D even when a batch has a single kept
        # position; squeeze(-1) would collapse that case to 0-d and make
        # list.extend() fail.
        labels.extend(label.view(-1).cpu().numpy())
        outs.extend(output.view(-1).cpu().numpy())

        tbar.set_description('loss - {:.4f}'.format(loss))

    acc = num_corrects / num_total
    auc = roc_auc_score(labels, outs)
    loss = np.average(val_loss)

    return loss, acc, auc

In [None]:
epochs = 50

last_auc = 0
# CPU copy of the weights from the most recent epoch whose validation AUC did
# not drop; restored after the loop so early stopping does not leave the
# degraded weights in `model`.
best_state = None

for epoch in range(epochs):
    loss, train_acc, train_auc = train_epoch(model, train_dataloader, optimizer, criterion, scheduler, device)
    print("\nepoch - {} train_loss - {:.3f} acc - {:.3f} auc - {:.4f}".format(epoch, loss, train_acc, train_auc))

    val_loss, val_acc, val_auc = validation(model, val_dataloader, criterion, device)
    print("\nepoch - {} vall_loss - {:.3f} acc - {:.3f} auc - {:.4f}".format(epoch, val_loss, val_acc, val_auc))

    # Stop at the first epoch whose validation AUC is lower than the previous one.
    if last_auc > val_auc:
        print("early stop epoch ", epoch)
        break

    last_auc = val_auc
    best_state = {name: tensor.detach().cpu().clone()
                  for name, tensor in model.state_dict().items()}

# Roll back to the best weights (a no-op when training ran to completion).
if best_state is not None:
    model.load_state_dict(best_state)

## Testing

In [None]:
class TestDataset(Dataset):
    """Build one model input per row of a test batch.

    For each test row, the user's stored history is right-aligned into a
    zero-padded window of ``max_seq`` interactions and the row's own question
    is appended as the final position to predict.

    Args:
        group: Object indexed by user id (must support ``group.index`` and
            ``group[user_id]``) whose values are
            ``(content_id, answered_correctly, part)`` array triples.
        test_df: Frame with at least ``user_id``, ``content_id`` and ``part``
            columns; one dataset item per row.
        n_skill: Stored on the instance; not otherwise used by this class.
        max_seq: History window length.
    """

    def __init__(self, group, test_df, n_skill=13523, max_seq=130):
        super(TestDataset, self).__init__()
        self.group = group
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.n_skill = n_skill
        self.max_seq = max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return ``(response, question, part)``, each of length ``max_seq - 1``."""
        test_info = self.test_df.iloc[index]
        user_id = test_info["user_id"]
        target_id = test_info["content_id"]

        # Zero-padded integer buffers; always filled by copy so every item has
        # the same dtype regardless of the history's storage dtype or length.
        content_id = np.zeros(self.max_seq, dtype=int)
        part = np.zeros(self.max_seq, dtype=int)
        correct = np.zeros(self.max_seq, dtype=int)

        if user_id in self.group.index:
            content_id_, correct_, part_ = self.group[user_id]
            seq_len = min(len(content_id_), self.max_seq)

            # Guard against an empty history: arr[-0:] would address the whole
            # buffer and the assignment would fail.
            if seq_len > 0:
                content_id[-seq_len:] = content_id_[-seq_len:]
                part[-seq_len:] = part_[-seq_len:]
                correct[-seq_len:] = correct_[-seq_len:]

        # Responses are offset one step behind the questions: the final
        # position pairs the last known response with the target question.
        response = correct[1:]

        question = np.append(content_id[2:], [target_id])
        part = np.append(part[2:], [test_info["part"]])

        return response, question, part

In [None]:
import riiideducation

# Create the competition environment object provided by the riiideducation module.
env = riiideducation.make_env()
# Iterator consumed by the inference loop; each item unpacks into
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
import ast

import psutil

prev_test_df = None

model.eval()
for (test_df, sample_prediction_df) in tqdm(iter_test):
    # Fold the previous batch (whose answers arrive with the current batch) into
    # the per-user history. The update is skipped while RAM usage is >= 90%.
    if prev_test_df is not None and psutil.virtual_memory().percent < 90:
        print(psutil.virtual_memory().percent)
        # The answers arrive as text (presumably a Python list literal — confirm
        # against the competition API). literal_eval parses it without executing
        # arbitrary code, unlike eval.
        prev_test_df['answered_correctly'] = ast.literal_eval(
            test_df['prior_group_answers_correct'].iloc[0])

        prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
        prev_test_df = pd.merge(prev_test_df, question_df[["question_id", "part"]],
                                left_on="content_id", right_on="question_id", how="left")

        prev_group = prev_test_df[['user_id', 'content_id', 'answered_correctly', 'part']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values,
            r['part'].values))

        for prev_user_id in prev_group.index:
            new_content, new_ac, new_part = prev_group[prev_user_id]

            # Known user: extend the stored history with the new interactions.
            if prev_user_id in group.index:
                old_content, old_ac, old_part = group[prev_user_id]
                new_content = np.append(old_content, new_content)
                new_ac = np.append(old_ac, new_ac)
                new_part = np.append(old_part, new_part)

            # Only the most recent max_seq interactions are ever fed to the
            # model, so older ones are dropped to bound memory.
            group[prev_user_id] = (new_content[-max_seq:],
                                   new_ac[-max_seq:],
                                   new_part[-max_seq:])

    # Keep an unfiltered copy: the revealed answers cover every row of the batch.
    prev_test_df = test_df.copy()

    test_df = test_df[test_df.content_type_id == False]

    test_df = pd.merge(test_df, question_df[["question_id", "part"]],
                       left_on="content_id", right_on="question_id", how="left")

    test_dataset = TestDataset(group, test_df, max_seq=max_seq)
    test_dataloader = DataLoader(test_dataset, batch_size=512, shuffle=False)

    outs = []

    for item in tqdm(test_dataloader):
        r = item[0].to(device).long()
        c = item[1].to(device).long()
        p = item[2].to(device).long()

        with torch.no_grad():
            output = model(r, c, p)

        # The last position corresponds to the test row's own question.
        output = torch.sigmoid(output)
        output = output[:, -1]

        outs.extend(output.view(-1).data.cpu().numpy())

    test_df['answered_correctly'] =  outs

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

In [None]:
import matplotlib.pyplot as plt

# Histogram of the predicted probabilities from the last processed test batch.
fig, ax = plt.subplots()
ax.hist(outs)
plt.show()