# Debugging the baseline SAKT model
- There is seemingly a bug in the `iter_env`.

### TO-DO:
features encoding:
- how to address the problem with previous answers correctly not uniformly predicted
- question tags 

In [1]:
import os
import gc
import sys

import pickle
from time import time

import datatable as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from torch import optim
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
import torch.optim as optim
from torch.optim import Optimizer
from torch.optim.lr_scheduler import (CosineAnnealingWarmRestarts, CyclicLR, OneCycleLR,
                                      ReduceLROnPlateau)
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

sns.set()
DEFAULT_FIG_WIDTH = 20
sns.set_context("paper", font_scale=1.2) 
# WORKSPACE_FOLDER=/home/scao/Documents/kaggle-riiid-test
# PYTHONPATH=${WORKSPACE_FOLDER}:${WORKSPACE_FOLDER}/sakt:${WORKSPACE_FOLDER}/transformer
HOME = os.path.abspath(os.path.join('.', os.pardir))
print(HOME, '\n\n')
# HOME = "/home/scao/Documents/kaggle-riiid-test/"
MODEL_DIR = os.path.join(HOME,  'model')
DATA_DIR = os.path.join(HOME,  'data')
sys.path.append(HOME) 
from utils import *
get_system()
from sakt import *
from iter_env import *

/home/scao/Documents/kaggle-riiid-test 


Physical cores    : 10
Total cores       : 20
Max Frequency    : 5200.00 Mhz
Min Frequency    : 800.00 Mhz
Current Frequency: 4800.14 Mhz
Total     : 62.62 GiB
Available : 23.84 GiB
Used      : 37.48 GiB
Python     : 3.8.3 (default, Jul  2 2020, 16:21:59) 
Numpy      : 1.18.5
Pandas     : 1.0.5
PyTorch    : 1.7.0
Device     : cuda:0
GeForce RTX 3090
Mem total      : 23.7 GB
Mem allocated  : 0.0 GB
Mem cached     : 0.0 GB


In [2]:
# set-up

DEBUG = True
TRAIN = False
PREPROCESS = False

TEST_SIZE = 0.05

NUM_SKILLS = 13523 # number of problems
MAX_SEQ = 180
ACCEPTED_USER_CONTENT_SIZE = 4
EMBED_SIZE = 128
NUM_HEADS = 8
BATCH_SIZE = 64
VAL_BATCH_SIZE = 2048
DEBUG_TEST_SIZE = 2500
DROPOUT = 0.1
SEED = 1127

get_seed(SEED)

'''
Columns placeholder and preprocessing params
'''
CONTENT_TYPE_ID = "content_type_id"
CONTENT_ID = "content_id"
TARGET = "answered_correctly"
USER_ID = "user_id"
PRIOR_QUESTION_TIME = 'prior_question_elapsed_time'

PRIOR_QUESTION_EXPLAIN = 'prior_question_had_explanation'
TASK_CONTAINER_ID = "task_container_id"
TIMESTAMP = "timestamp" 
ROW_ID = 'row_id'
FILLNA_VAL = 14_000 # for prior question elapsed time, rounded average in train
TIME_SCALING = 1000 # scaling down the prior question elapsed time

TRAIN_COLS = [TIMESTAMP, USER_ID, CONTENT_ID, CONTENT_TYPE_ID, TARGET]

TRAIN_DTYPES = {TIMESTAMP: 'int64', 
         USER_ID: 'int32', 
         CONTENT_ID: 'int16',
         CONTENT_TYPE_ID: 'bool',
         TARGET:'int8',
         PRIOR_QUESTION_TIME: np.float32,
         PRIOR_QUESTION_EXPLAIN: 'boolean'}


if DEBUG:
    NROWS_TEST = 25_000
    NROWS_TRAIN = 5_000_000
    NROWS_VAL = 500_000
else:
    NROWS_TEST = 250_000
    NROWS_TRAIN = 50_000_000
    NROWS_VAL = 2_000_000

In [3]:
if PREPROCESS:
    with timer("Loading train from parquet"):
        train_df = pd.read_parquet(os.path.join(DATA_DIR, 'cv2_train.parquet'),
                                columns=list(TRAIN_DTYPES.keys())).astype(TRAIN_DTYPES)
        valid_df = pd.read_parquet(os.path.join(DATA_DIR, 'cv2_valid.parquet'),
                                columns=list(TRAIN_DTYPES.keys())).astype(TRAIN_DTYPES)

    if DEBUG:
        train_df = train_df[:NROWS_TRAIN]
        valid_df = valid_df[:NROWS_VAL]

    with timer("Processing train"):
        train_group = preprocess(train_df)
        valid_group = preprocess(valid_df, train_flag=2)
else:
    with open(os.path.join(DATA_DIR, 'sakt_group_cv2.pickle'), 'rb') as f:
        group = pickle.load(f)
    train_group, valid_group = train_test_split(group, test_size = TEST_SIZE, random_state=SEED)


print(f"valid users: {len(valid_group.keys())}")
print(f"train users: {len(train_group.keys())}")

valid users: 18274
train users: 347191


In [4]:
class SAKTDataset(Dataset):
    def __init__(self, group, n_skill, max_seq=MAX_SEQ):
        super(SAKTDataset, self).__init__()
        self.samples, self.n_skill, self.max_seq = {}, n_skill, max_seq
        
        self.user_ids = []
        for i, user_id in enumerate(group.index):
            content_id, answered_correctly = group[user_id]
            if len(content_id) >= ACCEPTED_USER_CONTENT_SIZE:
                if len(content_id) > self.max_seq:
                    total_questions = len(content_id)
                    last_pos = total_questions // self.max_seq
                    for seq in range(last_pos):
                        index = f"{user_id}_{seq}"
                        self.user_ids.append(index)
                        start = seq * self.max_seq
                        end = (seq + 1) * self.max_seq
                        self.samples[index] = (content_id[start:end], 
                                               answered_correctly[start:end])
                    if len(content_id[end:]) >= ACCEPTED_USER_CONTENT_SIZE:
                        index = f"{user_id}_{last_pos + 1}"
                        self.user_ids.append(index)
                        self.samples[index] = (content_id[end:], 
                                               answered_correctly[end:])
                else:
                    index = f'{user_id}'
                    self.user_ids.append(index)
                    self.samples[index] = (content_id, answered_correctly)
                
                
    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        content_id, answered_correctly = self.samples[user_id]
        seq_len = len(content_id)
        
        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)
        if seq_len >= self.max_seq:
            content_id_seq[:] = content_id[-self.max_seq:]
            answered_correctly_seq[:] = answered_correctly[-self.max_seq:]
        else:
            content_id_seq[-seq_len:] = content_id
            answered_correctly_seq[-seq_len:] = answered_correctly
            
        target_id = content_id_seq[1:] # question till the current one
        label = answered_correctly_seq[1:]
        
        x = content_id_seq[:-1].copy() # question till the previous one
        # encoded answers till the previous one
        x += (answered_correctly_seq[:-1] == 1) * self.n_skill
        
        return x, target_id, label

train_dataset = SAKTDataset(train_group, n_skill=NUM_SKILLS, max_seq=MAX_SEQ)
train_dataloader = DataLoader(train_dataset, 
                                batch_size=BATCH_SIZE, 
                                shuffle=True, 
                                drop_last=True)


val_dataset = SAKTDataset(valid_group, n_skill=NUM_SKILLS, max_seq=MAX_SEQ)
val_dataloader = DataLoader(val_dataset, 
                            batch_size=VAL_BATCH_SIZE, 
                            shuffle=False)

sample_batch = next(iter(train_dataloader))
sample_batch[0].shape, sample_batch[1].shape, sample_batch[2].shape

(torch.Size([64, 179]), torch.Size([64, 179]), torch.Size([64, 179]))

In [5]:
class FFN(nn.Module):
    def __init__(self, state_size = MAX_SEQ, 
                    forward_expansion = 1, 
                    bn_size=MAX_SEQ - 1, 
                    dropout=0.2):
        super(FFN, self).__init__()
        self.state_size = state_size
        
        self.lr1 = nn.Linear(state_size, forward_expansion * state_size)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(bn_size)
        self.lr2 = nn.Linear(forward_expansion * state_size, state_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.relu(self.lr1(x))
        x = self.bn(x)
        x = self.lr2(x)
        return self.dropout(x)

def future_mask(seq_length):
    future_mask = (np.triu(np.ones([seq_length, seq_length]), k = 1)).astype('bool')
    return torch.from_numpy(future_mask)

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, 
                    heads = 8, 
                    dropout = DROPOUT, 
                    forward_expansion = 1):
        super(TransformerBlock, self).__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, 
                        num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, 
                    forward_expansion = forward_expansion, 
                    dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)
        

    def forward(self, value, key, query, att_mask):
        att_output, att_weight = self.multi_att(value, key, query, attn_mask=att_mask)
        att_output = self.dropout(self.layer_normal(att_output + value))
        att_output = att_output.permute(1, 0, 2) 
        # att_output: [s_len, bs, embed] => [bs, s_len, embed]
        x = self.ffn(att_output)
        x = self.dropout(self.layer_normal_2(x + att_output))
        return x.squeeze(-1), att_weight
    
class Encoder(nn.Module):
    def __init__(self, n_skill, max_seq=100, 
                 embed_dim=128, 
                 dropout = DROPOUT, 
                 forward_expansion = 1, 
                 num_layers=1, 
                 heads = 8):
        super(Encoder, self).__init__()
        self.n_skill, self.embed_dim = n_skill, embed_dim
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        self.layers = nn.ModuleList([TransformerBlock(embed_dim, heads=heads,
                forward_expansion = forward_expansion) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, question_ids):
        device = x.device
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)
        pos_x = self.pos_embedding(pos_id)
        x = self.dropout(x + pos_x)
        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = self.e_embedding(question_ids)
        e = e.permute(1, 0, 2)
        for layer in self.layers:
            att_mask = future_mask(e.size(0)).to(device)
            x, att_weight = layer(e, x, x, att_mask=att_mask)
            x = x.permute(1, 0, 2)
        x = x.permute(1, 0, 2)
        return x, att_weight

class SAKTModel(nn.Module):
    def __init__(self, 
                n_skill, 
                max_seq=MAX_SEQ, 
                embed_dim=EMBED_SIZE, 
                dropout = DROPOUT, 
                forward_expansion = 1, 
                enc_layers=1, 
                heads = NUM_HEADS):
        super(SAKTModel, self).__init__()
        self.encoder = Encoder(n_skill, 
                               max_seq, 
                               embed_dim, 
                               dropout, 
                               forward_expansion, 
                               num_layers=enc_layers,
                               heads=heads)
        self.pred = nn.Linear(embed_dim, 1)
        
    def forward(self, x, question_ids):
        x, att_weight = self.encoder(x, question_ids)
        x = self.pred(x)
        return x.squeeze(-1), att_weight


class TestDataset(Dataset):
    def __init__(self, samples, test_df, n_skill, max_seq=100):
        super(TestDataset, self).__init__()
        self.samples = samples
        self.user_ids = [x for x in test_df["user_id"].unique()]
        self.test_df = test_df
        self.n_skill, self.max_seq = n_skill, max_seq

    def __len__(self):
        return self.test_df.shape[0]
    
    def __getitem__(self, index):
        test_info = self.test_df.iloc[index]
        
        user_id = test_info['user_id']
        target_id = test_info['content_id']
        
        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)
        
        if user_id in self.samples.index:
            content_id, answered_correctly = self.samples[user_id]
            
            seq_len = len(content_id)
            
            if seq_len >= self.max_seq:
                content_id_seq = content_id[-self.max_seq:]
                answered_correctly_seq = answered_correctly[-self.max_seq:]
            else:
                content_id_seq[-seq_len:] = content_id
                answered_correctly_seq[-seq_len:] = answered_correctly
                
        x = content_id_seq[1:].copy()
        x += (answered_correctly_seq[1:] == 1) * self.n_skill
        
        questions = np.append(content_id_seq[2:], [target_id])
        
        return x, questions

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'\nUsing device: {device}')
model_file = MODEL_DIR+'sakt_seq_180_auc_0.7689.pth'
model = SAKTModel(n_skill=NUM_SKILLS, 
                  max_seq=MAX_SEQ, 
                  embed_dim=EMBED_SIZE, 
                  forward_expansion=1, 
                  enc_layers=1, 
                  heads=NUM_HEADS, 
                  dropout=DROPOUT)

n_params = get_num_params(model)
print(f"Current model has {n_params} parameters.")
        
model = model.to(device)
model.load_state_dict(torch.load(model_file, map_location=device))


Using device: cuda
Current model has 5316071 parameters.


<All keys matched successfully>

In [7]:
# mock test
with timer("Loading private simulated test set"):
    all_test_df = pd.read_parquet(DATA_DIR+'cv2_valid.parquet')
    all_test_df = all_test_df[:DEBUG_TEST_SIZE]

all_test_df['answer_correctly_true'] = all_test_df[TARGET]

predicted = []
def set_predict(df):
    predicted.append(df)

# reload all user group for cv2
with timer('loading cv2'):
    with open(os.path.join(DATA_DIR, 'sakt_group_cv2.pickle'), 'rb') as f:
        group = pickle.load(f)

[94mLoading private simulated test set: start at 1609943641.584812;[0m
[92mLOCAL RAM USAGE AT START: 2.92 GB[0m
[94mLoading private simulated test set:  done at 1609943641.6892245 (0.10 secs elapsed);[0m
[92mLOCAL RAM USAGE AT END:   3.20GB (+0.28GB)[0m


[94mloading cv2: start at 1609943641.6899252;[0m
[92mLOCAL RAM USAGE AT START: 3.20 GB[0m
[94mloading cv2:  done at 1609943642.4915078 (0.80 secs elapsed);[0m
[92mLOCAL RAM USAGE AT END:   3.72GB (+0.52GB)[0m




In [8]:
def iter_env_run(all_test_df, n_iter=1):
    '''
    Running mock test for n_iter iterations using tito's iter_env simulator and cv2_train user group.
    '''
    iter_test = Iter_Valid(all_test_df, max_user=1000)
    prev_test_df = None
    prev_group = None
    batch_user_ids = []

    # reload all user group for cv2
    with open(os.path.join(DATA_DIR, 'sakt_group_cv2.pickle'), 'rb') as f:
        group = pickle.load(f)

    for _ in range(n_iter):
    
        test_df, sample_prediction_df = next(iter_test)

        if prev_test_df is not None:
            prev_test_df['answered_correctly'] = eval(test_df['prior_group_answers_correct'].iloc[0])
            prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
            prev_group = prev_test_df[['user_id', 'content_id', 'answered_correctly']]\
                                    .groupby('user_id').apply(lambda r: (
                                        r['content_id'].values,
                                        r['answered_correctly'].values))
            for prev_user_id in prev_group.index:
                prev_group_content = prev_group[prev_user_id][0]
                prev_group_answered_correctly = prev_group[prev_user_id][1]
                if prev_user_id in group.index:
                    group[prev_user_id] = (np.append(group[prev_user_id][0], prev_group_content), 
                                        np.append(group[prev_user_id][1], prev_group_answered_correctly))
                else:
                    group[prev_user_id] = (prev_group_content, prev_group_answered_correctly)

                if len(group[prev_user_id][0]) > MAX_SEQ:
                    new_group_content = group[prev_user_id][0][-MAX_SEQ:]
                    new_group_answered_correctly = group[prev_user_id][1][-MAX_SEQ:]
                    group[prev_user_id] = (new_group_content, new_group_answered_correctly)

        prev_test_df = test_df.copy()
        test_df = test_df[test_df.content_type_id == False]

        batch_user_ids.append(test_df.user_id.unique())

        test_dataset = TestDataset(group, test_df, NUM_SKILLS, max_seq=MAX_SEQ)
        test_dataloader = DataLoader(test_dataset, batch_size=len(test_df), shuffle=False)

        item = next(iter(test_dataloader))
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()

        with torch.no_grad():
            output, _ = model(x, target_id)

        output = torch.sigmoid(output)
        preds = output[:, -1]
        test_df['answered_correctly'] = preds.cpu().numpy()
        set_predict(test_df.loc[test_df['content_type_id'] == 0, 
                                ['row_id', 'answered_correctly']])

    return test_df, prev_test_df, output, item, group, prev_group, batch_user_ids

## Debugging notes

Current set up, cv2_valid first 25k rows
first 4 batches common `user_id`: 143316232, 1089397940, 1140583044 (placeholder user?)


In [9]:
group[1089397940]

(array([5551, 5074,  296, 4554, 5192, 6445, 3565,  853,  675, 1315],
       dtype=int16),
 array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0], dtype=int8))

Iteration number 1 in the `iter_env`, the model gives the correct preds.

In [10]:
test_df, prev_test_df, output, item, group_updated, _, _ = iter_env_run(all_test_df, n_iter=1)
u_idx_loc = test_df.index.get_loc(test_df[test_df.user_id==1089397940].index[0])
print(f"local index of user 1089397940: {u_idx_loc}", '\n')
test_df.iloc[u_idx_loc]

local index of user 1089397940: 15 



virtual_timestamp                 68564308947
row_id                               51398657
timestamp                              404961
user_id                            1089397940
content_id                              10687
content_type_id                             0
task_container_id                          10
user_answer                                 1
answered_correctly                    0.90646
prior_question_elapsed_time             17000
prior_question_had_explanation           True
answer_correctly_true                       1
prior_group_responses                      []
prior_group_answers_correct                []
Name: 15, dtype: object

In [11]:
test_df

Unnamed: 0,virtual_timestamp,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answer_correctly_true,prior_group_responses,prior_group_answers_correct
0,68564290161,88274058,1101050579,1878177804,5589,0,149,2,0.662049,,,0,[],[]
1,68564290523,53804591,908442017,1140583044,773,0,65,3,0.413031,17000.0,True,0,[],[]
2,68564295570,90481687,1055420585,1921309771,1875,0,1034,1,0.80593,24333.0,True,1,[],[]
3,68564295570,90481688,1055420585,1921309771,1876,0,1034,0,0.560809,24333.0,True,0,[],[]
4,68564295570,90481689,1055420585,1921309771,1874,0,1034,2,0.747822,24333.0,True,1,[],[]
5,68564295694,30869366,4666240497,661776423,703,0,263,0,0.807548,19000.0,True,1,[],[]
6,68564297749,67419008,34211239598,1434239202,9611,0,573,2,0.790535,14000.0,True,1,[],[]
7,68564302590,99211180,409311647,2104957435,3992,0,70,0,0.901264,13000.0,True,1,[],[]
8,68564305121,79224568,61247394278,1686819041,9836,0,3304,2,0.225786,64000.0,True,0,[],[]
9,68564305686,89389150,1348681,1899558787,7216,0,15,0,0.249552,27000.0,False,0,[],[]


In [12]:
print(item[1][u_idx_loc, -12:]) # user 1089397940 first batch in example_test (question sequence)
print(item[0][u_idx_loc, -12:]) # user 1089397940 first batch in example_test: skill sequence = prev_content_id * (correct or not) + 13523
print(output[u_idx_loc, -12:].cpu().numpy(),'\n') # user 1089397940 probability prediction

print(group_updated[1089397940][0][:12]) # in the first iteration the length is only 11
print(group_updated[1089397940][1][:12])

tensor([    0,  5551,  5074,   296,  4554,  5192,  6445,  3565,   853,   675,
         1315, 10687])
tensor([    0,     0, 19074, 18597, 13819,  4554,  5192, 19968, 17088, 14376,
        14198,  1315])
[0.5443989  0.31052786 0.6453548  0.7576913  0.7659588  0.39360327
 0.5405494  0.6249994  0.3041437  0.6415931  0.28815264 0.90646034] 

[5551 5074  296 4554 5192 6445 3565  853  675 1315]
[1 1 1 0 0 1 1 1 1 0]


Iteration number 2

In [13]:
test_df, prev_test_df, output, item, group_updated, _, _ = iter_env_run(all_test_df, n_iter=2)
u_idx_loc = test_df.index.get_loc(test_df[test_df.user_id==1089397940].index[0])
print(f"local index of user 1089397940: {u_idx_loc}", '\n')
test_df.iloc[u_idx_loc]

local index of user 1089397940: 3 



virtual_timestamp                 68564332056
row_id                               51398658
timestamp                              428070
user_id                            1089397940
content_id                                412
content_type_id                             0
task_container_id                          11
user_answer                                 3
answered_correctly                   0.768928
prior_question_elapsed_time             13000
prior_question_had_explanation           True
answer_correctly_true                       1
prior_group_responses                      []
prior_group_answers_correct                []
Name: 43, dtype: object

In [15]:
test_df

Unnamed: 0,virtual_timestamp,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answer_correctly_true,prior_group_responses,prior_group_answers_correct
40,68564329112,53804592,908480606,1140583044,506,0,66,3,0.729357,19000.0,True,1,"[2,3,1,0,2,0,2,0,2,0,2,2,3,0,0,1,1,1,2,2,1,0,1...","[0,0,1,0,1,1,1,1,0,0,1,1,0,0,1,1,1,0,1,1,1,1,1..."
41,68564329267,95275628,29264804479,2021767860,410,0,991,3,0.8174,19000.0,True,1,[],[]
42,68564331305,6572819,26121829,143316232,9286,0,69,0,0.485461,37000.0,True,0,[],[]
43,68564332056,51398658,428070,1089397940,412,0,11,3,0.768928,13000.0,True,1,[],[]
44,68564332455,66597784,2677732279,1414737602,4749,0,1504,2,0.21525,30000.0,True,1,[],[]
45,68564333633,67419009,34211275482,1434239202,5609,0,574,3,0.901013,27000.0,True,1,[],[]
46,68564334407,3468134,20730160405,74557183,967,0,1102,1,0.433433,28666.0,True,1,[],[]
47,68564338410,28170520,1862294,601496437,11315,0,24,2,0.32771,41000.0,True,0,[],[]
48,68564341539,76708022,18482569556,1633230783,839,0,5041,0,0.69519,15000.0,True,1,[],[]
49,68564342822,50632497,17425525,1072148405,3944,0,39,0,0.384739,27000.0,True,1,[],[]


In [14]:
print(item[1][u_idx_loc, -12:]) # user 1089397940 2nd batch in example_test (question sequence)
print(item[0][u_idx_loc, -12:]) # user 1089397940 2nd batch in example_test: skill sequence = prev_content_id * (correct or not) + 13523
print(output[u_idx_loc, -12:].cpu().numpy(),'\n') # user 1089397940 probability prediction

print(group_updated[1089397940][0][:12]) # in the 2nd iteration the length is only 11
print(group_updated[1089397940][1][:12])

tensor([ 5551,  5074,   296,  4554,  5192,  6445,  3565,   853,   675,  1315,
        10687,   412])
tensor([    0, 19074, 18597, 13819,  4554,  5192, 19968, 17088, 14376, 14198,
         1315, 24210])
[0.3604449  0.60844517 0.7734691  0.771187   0.4806249  0.6075902
 0.5351341  0.28282395 0.6779909  0.33384073 0.8998479  0.7689281 ] 

[ 5551  5074   296  4554  5192  6445  3565   853   675  1315 10687]
[1 1 1 0 0 1 1 1 1 0 1]


Iteration number 3

In [39]:
test_df, prev_test_df, output, item, group_updated, _, _ = iter_env_run(all_test_df, n_iter=3)
u_idx_loc = test_df.index.get_loc(test_df[test_df.user_id==1089397940].index[0])
print(f"local index of user 1089397940: {u_idx_loc}", '\n')
test_df.iloc[u_idx_loc]

local index of user 1089397940: 2 



virtual_timestamp                 68564356750
row_id                               51398659
timestamp                              452764
user_id                            1089397940
content_id                                651
content_type_id                             0
task_container_id                          12
user_answer                                 3
answered_correctly                   0.374165
prior_question_elapsed_time             16000
prior_question_had_explanation           True
answer_correctly_true                       1
prior_group_responses                      []
prior_group_answers_correct                []
Name: 54, dtype: object

In [23]:
prev_test_df

Unnamed: 0,virtual_timestamp,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answer_correctly_true,prior_group_responses,prior_group_answers_correct
52,68564353140,53804593,908504634,1140583044,1238,0,67,1,1,15000.0,True,1,"[3,3,0,3,2,3,1,2,0,0,1,0]","[1,1,0,1,1,1,1,0,1,1,0,1]"
53,68564354871,6789101,208892916,147771339,9285,0,146,0,0,17000.0,True,0,[],[]
54,68564356750,51398659,452764,1089397940,651,0,12,3,1,16000.0,True,1,[],[]
55,68564357545,6572820,26148069,143316232,9874,0,70,3,0,13000.0,True,0,[],[]
56,68564357944,79224569,61247447101,1686819041,6325,0,3305,2,0,47000.0,True,0,[],[]
57,68564362534,52786660,95197287,1118911890,660,0,76,3,1,16000.0,True,1,[],[]
58,68564364043,6559032,31998443066,142958766,1265,0,423,3,0,9000.0,True,0,[],[]
59,68564367850,73310718,58181,1559547677,4216,0,1,2,0,9000.0,False,0,[],[]
60,68564369095,79509641,804851645,1693026219,4476,0,210,3,0,18000.0,True,0,[],[]
61,68564369521,30215621,11412935891,648079731,401,0,875,3,1,18000.0,True,1,[],[]


In [40]:
prev_test_df['prior_group_answers_correct'].iloc[0]

'[1,1,0,1,1,1,1,0,1,1,0,1]'

In [41]:
print(item[1][u_idx_loc, -12:]) # user 1089397940 first batch in example_test (question sequence)
print(item[0][u_idx_loc, -12:]) # user 1089397940 first batch in example_test: skill sequence = prev_content_id * (correct or not) + 13523
print(output[u_idx_loc, -12:].cpu().numpy(),'\n') # user 1089397940 probability prediction

print(group_updated[1089397940][0][:12]) # in the first iteration the length is only 11
print(group_updated[1089397940][1][:12])

tensor([ 5074,   296,  4554,  5192,  6445,  3565,   853,   675,  1315, 10687,
          412,   651])
tensor([19074, 18597, 13819,  4554,  5192, 19968, 17088, 14376, 14198,  1315,
        24210, 13935])
[0.6779202  0.75148046 0.7712332  0.4629782  0.6360581  0.52873796
 0.2487526  0.69588625 0.40744492 0.9152832  0.6593629  0.3741647 ] 

[ 5551  5074   296  4554  5192  6445  3565   853   675  1315 10687   412]
[1 1 1 0 0 1 1 1 1 0 1 1]


Iteration number 4

In [18]:
test_df, prev_test_df, output, item, group_updated, _, _ = iter_env_run(all_test_df, n_iter=4)
u_idx_loc = test_df.index.get_loc(test_df[test_df.user_id==1089397940].index[0])
print(f"local index of user 1089397940: {u_idx_loc}", '\n')
test_df.iloc[u_idx_loc]

local index of user 1089397940: 8 



virtual_timestamp                 68564391579
row_id                               51398660
timestamp                              487593
user_id                            1089397940
content_id                                665
content_type_id                             0
task_container_id                          13
user_answer                                 0
answered_correctly                   0.276526
prior_question_elapsed_time             17000
prior_question_had_explanation           True
answer_correctly_true                       0
prior_group_responses                      []
prior_group_answers_correct                []
Name: 71, dtype: object

In [19]:
test_df

Unnamed: 0,virtual_timestamp,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answer_correctly_true,prior_group_responses,prior_group_answers_correct
62,68564376280,53804594,908527774,1140583044,1296,0,68,0,0.650476,19000.0,True,1,"[1,0,3,3,2,3,3,2,3,3]","[1,0,1,0,0,1,0,0,0,1]"
63,68564376507,6572821,26167031,143316232,13407,0,71,3,0.301902,19000.0,True,1,[],[]
64,68564376553,100237380,34834345988,2127578322,551,0,806,0,0.81286,17000.0,True,1,[],[]
66,68564378665,79862225,32408560618,1700268996,2974,0,1494,1,0.969968,20000.0,True,1,[],[]
67,68564378665,79862226,32408560618,1700268996,2975,0,1494,2,0.635115,20000.0,True,1,[],[]
68,68564378665,79862227,32408560618,1700268996,2973,0,1494,3,0.920297,20000.0,True,1,[],[]
69,68564381106,89677238,1353720395,1905467642,9134,0,37,2,0.263514,31000.0,True,1,[],[]
70,68564389829,95275629,29264865041,2021767860,1226,0,992,3,0.709984,19000.0,True,1,[],[]
71,68564391579,51398660,487593,1089397940,665,0,13,0,0.276526,17000.0,True,0,[],[]
72,68564392581,50632498,17475284,1072148405,5213,0,40,3,0.446198,15000.0,True,1,[],[]


In [38]:
print(item[1][u_idx_loc, -12:]) # user 1089397940 first batch in example_test (question sequence)
print(item[0][u_idx_loc, -12:]) # user 1089397940 first batch in example_test: skill sequence = prev_content_id * (correct or not) + 13523
print(output[u_idx_loc, -12:].cpu().numpy(),'\n') # user 1089397940 probability prediction

print(group_updated[1089397940][0][:12]) # in the first iteration the length is only 11
print(group_updated[1089397940][1][:12])

tensor([  296,  4554,  5192,  6445,  3565,   853,   675,  1315, 10687,   412,
          651,   665])
tensor([18597, 13819,  4554,  5192, 19968, 17088, 14376, 14198,  1315, 24210,
        13935, 14174])
[0.7157869  0.74189687 0.4968931  0.6795694  0.53402317 0.31044167
 0.6519086  0.31791052 0.93494207 0.78832686 0.37740535 0.2967214 ] 

[ 5551  5074   296  4554  5192  6445  3565   853   675  1315 10687   412]
[1 1 1 0 0 1 1 1 1 0 1 1]


In [42]:
prev_test_df

Unnamed: 0,virtual_timestamp,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answer_correctly_true,prior_group_responses,prior_group_answers_correct
52,68564353140,53804593,908504634,1140583044,1238,0,67,1,1,15000.0,True,1,"[3,3,0,3,2,3,1,2,0,0,1,0]","[1,1,0,1,1,1,1,0,1,1,0,1]"
53,68564354871,6789101,208892916,147771339,9285,0,146,0,0,17000.0,True,0,[],[]
54,68564356750,51398659,452764,1089397940,651,0,12,3,1,16000.0,True,1,[],[]
55,68564357545,6572820,26148069,143316232,9874,0,70,3,0,13000.0,True,0,[],[]
56,68564357944,79224569,61247447101,1686819041,6325,0,3305,2,0,47000.0,True,0,[],[]
57,68564362534,52786660,95197287,1118911890,660,0,76,3,1,16000.0,True,1,[],[]
58,68564364043,6559032,31998443066,142958766,1265,0,423,3,0,9000.0,True,0,[],[]
59,68564367850,73310718,58181,1559547677,4216,0,1,2,0,9000.0,False,0,[],[]
60,68564369095,79509641,804851645,1693026219,4476,0,210,3,0,18000.0,True,0,[],[]
61,68564369521,30215621,11412935891,648079731,401,0,875,3,1,18000.0,True,1,[],[]
