# Transformer in PyTorch

This is a debugged version of https://www.kaggle.com/adityaecdrid/pytorch-demystifying-transformers with inference added. In the original kernel, the author mistakenly sorted the dataframe during feature engineering, which resulted in a reasonable CV score but only about 0.5 AUC (no better than random guessing) upon submission.

For inference, I added a function to add new users; however, it is somewhat slow.

A scheduler is also added to the template. There are changes here and there to fix the original kernel's hardcoded hyperparameters.

CV is fixed as well; however, there is apparently still some leakage, as the CV score is much higher than the LB score even in the debugged version.

Reference:

- CV: https://www.kaggle.com/marisakamozz/cv-strategy-in-the-kaggle-environment
- @adityaecdrid 's kernel: https://www.kaggle.com/adityaecdrid/pytorch-demystifying-transformers
- Host's arXiv preprints https://arxiv.org/abs/2002.07033, https://arxiv.org/abs/2010.12042


In [None]:
import numpy as np
import pandas as pd
import pickle
import gc

from time import time, sleep
import itertools

from collections import deque, Counter

from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from typing import List
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
gc.enable()

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from torch.autograd import Variable
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn.modules.dropout import Dropout
from torch.nn.modules.activation import MultiheadAttention
from torch.nn.modules.normalization import LayerNorm
from torch.utils.data import DataLoader, Dataset

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Column -> dtype table for the training data. In the code visible here only
# the keys are used (to build TRAIN_COLS); 'row_id' is commented out and is
# therefore not part of the selected columns.
TRAIN_DTYPES = {
    # 'row_id': np.uint32,
    'timestamp': np.uint64,
    'user_id': np.uint32,
    'content_id': np.uint16,
    'content_type_id': np.uint8,
    'task_container_id': np.uint16,
    'user_answer': np.int8,
    'answered_correctly': np.int8,
    'prior_question_elapsed_time': np.float32,
    'prior_question_had_explanation': 'boolean'
}
# Columns kept from the loaded train/valid frames.
TRAIN_COLS = list(TRAIN_DTYPES.keys())

# Locations of the competition input files.
DATA_DIR = Path('../input/riiid-test-answer-prediction')
TRAIN_PATH = DATA_DIR / 'train.csv'
QUESTIONS_PATH = DATA_DIR / 'questions.csv'
LECTURES_PATH = DATA_DIR / 'lectures.csv'

# Some global variables
# this parameter denotes how many last seen content_ids I am going to consider <aka the max_seq_len or the window size>.
MAX_SEQ = 100 
TQDM_INT = 4 # interval for tqdm to update the pbar
# Value appended to sequences shorter than MAX_SEQ when building features.
PAD = 0
# Fill value for missing prior_question_elapsed_time; it is applied before the
# column is integer-divided by 1000 in the feature functions.
FILLNA_VAL = 14_000
# NOTE(review): equals the num_embeddings hard-coded for the exercise embedding
# in TransformerModel; not referenced elsewhere in the visible code.
n_skill = 13523
EPOCHS = 20
BATCH_SIZE = 512
VAL_BATCH_SIZE = 2048
TEST_BATCH_SIZE = 4096

# DEBUG switches to smaller row caps for the train/valid frames.
DEBUG = False
NROWS_TRAIN, NROWS_VAL = (20_000_000, 200_000) if not DEBUG else (2_000_000, 20_000)
# Device used by the training / validation loops.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the Dataset

We load the train, cv split directly from @marisakamozz 's awesome kernel: https://www.kaggle.com/marisakamozz/cv-strategy-in-the-kaggle-environment

In [None]:
# Question metadata; QUESTIONS_PATH resolves to the same questions.csv file.
df_questions = pd.read_csv(QUESTIONS_PATH)

In [None]:
%%time
# Load the pre-split training fold, keep only the modelled columns and cap the
# number of rows at NROWS_TRAIN.
df_train = pd.read_parquet(
    '../input/cv-strategy-in-the-kaggle-environment/cv3_train.parquet'
)[TRAIN_COLS][:NROWS_TRAIN]
gc.collect();

In [None]:
%%time
# Load the pre-split validation fold, keep only the modelled columns and cap
# the number of rows at NROWS_VAL.
df_valid = pd.read_parquet(
    '../input/cv-strategy-in-the-kaggle-environment/cv3_valid.parquet'
)[TRAIN_COLS][:NROWS_VAL]
gc.collect();

In [None]:
def get_feats(data_df, question_df, max_seq=MAX_SEQ):
    '''
    Build one fixed-length feature record per user from an interaction table.

    Only question rows (content_type_id == 0) are used. For every user the
    last `max_seq` rows are kept, in their original row order, and
    right-padded with PAD up to `max_seq` entries.

    Parameters
    ----------
    data_df : DataFrame with at least user_id, content_id, content_type_id,
        task_container_id, answered_correctly, prior_question_elapsed_time
        and prior_question_had_explanation. It is not modified.
    question_df : DataFrame with question_id and part columns.
    max_seq : length of every per-user sequence.

    Returns
    -------
    (df, user_id_to_idx)
        df maps a running integer index to a dict holding "user_id", one
        deque(maxlen=max_seq) per feature, and a "padded" deque that is True
        at padded positions. user_id_to_idx maps user_id to that index.
        Users are indexed in order of first appearance (groupby sort=False).
    '''
    # Filter first and take an explicit copy so the column assignments below
    # act on an independent frame rather than on a slice of the caller's data.
    data_df = data_df.loc[data_df['content_type_id'] == 0].copy()

    data_df['prior_question_had_explanation'] = \
        data_df['prior_question_had_explanation'].astype(np.float16).fillna(0).astype(np.int8)

    part_ids_map = dict(zip(question_df['question_id'], question_df['part']))
    data_df['part_id'] = data_df['content_id'].map(part_ids_map)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that is chained assignment and does not update the
    # frame when pandas Copy-on-Write is active, leaving NaNs in the feature.
    # Missing values become FILLNA_VAL, then the column is integer-divided by 1000.
    data_df["prior_question_elapsed_time"] = \
        data_df["prior_question_elapsed_time"].fillna(FILLNA_VAL) // 1000

    # the sort needs to be False here for the test
    recent = data_df.groupby("user_id", sort=False).tail(max_seq)  # last max_seq rows of each user
    per_user = recent.groupby("user_id", sort=False).agg({
        "content_id": list,
        "answered_correctly": list,
        "task_container_id": list,
        "part_id": list,
        "prior_question_elapsed_time": list,
    }).reset_index()

    # Order of the sequence entries inside each record.
    sequence_cols = ["content_id", "answered_correctly", "task_container_id",
                     "prior_question_elapsed_time", "part_id"]

    df = {}
    user_id_to_idx = {}
    for idx, row in per_user.iterrows():
        seq_len = len(row["content_id"])
        # tail(max_seq) guarantees seq_len <= max_seq, so num_padding >= 0 and
        # a full-length user simply receives an empty padding list.
        num_padding = max_seq - seq_len
        padding = [PAD] * num_padding

        record = {"user_id": row["user_id"]}
        for col in sequence_cols:
            record[col] = deque(row[col] + padding, maxlen=max_seq)
        # True marks the padded tail, False the real interactions.
        record["padded"] = deque([False] * seq_len + [True] * num_padding, maxlen=max_seq)

        df[idx] = record
        user_id_to_idx[row["user_id"]] = idx
    return df, user_id_to_idx

In [None]:
%%time
print(f"train rows: {len(df_train)}    valid rows: {len(df_valid)} ")
d, user_id_to_idx_train = get_feats(df_train, df_questions)
d_val, _ = get_feats(df_valid, df_questions)
if not DEBUG:
    del df_train, df_valid
    gc.collect()

# A Minimal Transformer Model

In [None]:
class TransformerModel(nn.Module):
    '''
    Transformer encoder that emits two logits for every position of a
    question sequence.

    Each position is embedded as the sum of an exercise embedding, a learned
    positional embedding, a part embedding and (optionally) an embedding of
    the prior-question elapsed time. The sum is scaled by sqrt(ninp), passed
    through a TransformerEncoder and projected to 2 logits.
    '''

    def __init__(self, ninp:int=32, nhead:int=2, nhid:int=64, nlayers:int=2, dropout:float=0.3,
                 max_seq:int=None):
        '''
        ninp -> embedding dimension (d_model of the encoder).
        nhead -> number of heads in the transformer multi attention thing.
        nhid -> the number of hidden dimension neurons in the model.
        nlayers -> how many layers we want to stack.
        dropout -> dropout used inside the encoder layers.
        max_seq -> longest sequence the positional embedding must cover.
                   None keeps the previous table size of `ninp` rows so that
                   existing checkpoints still load; pass the real sequence
                   length whenever it can exceed `ninp`.
        '''
        super(TransformerModel, self).__init__()
        self.src_mask = None
        encoder_layers = TransformerEncoderLayer(d_model=ninp,
                                                 nhead=nhead,
                                                 dim_feedforward=nhid,
                                                 dropout=dropout,
                                                 activation='relu')
        self.transformer_encoder = TransformerEncoder(encoder_layer=encoder_layers, num_layers=nlayers)
        self.exercise_embeddings = nn.Embedding(num_embeddings=13523, embedding_dim=ninp) # exercise_id
        # The positional table needs one row per sequence position, not one
        # per embedding dimension.
        num_positions = ninp if max_seq is None else max_seq
        self.pos_embedding = nn.Embedding(num_positions, ninp) # positional embeddings
        self.part_embeddings = nn.Embedding(num_embeddings=7+1, embedding_dim=ninp) # part_id_embeddings
        self.prior_question_elapsed_time = nn.Embedding(num_embeddings=301, embedding_dim=ninp) # prior_question_elapsed_time
        # Kept for backward compatibility; forward() follows the input's device instead.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, 2)
        self.init_weights()
        self.name = 'transformer'

    def init_weights(self):
        '''Uniformly initialise the feature embeddings and the decoder.'''
        initrange = 0.1
        # init embeddings (pos_embedding keeps nn.Embedding's default init)
        self.exercise_embeddings.weight.data.uniform_(-initrange, initrange)
        self.part_embeddings.weight.data.uniform_(-initrange, initrange)
        self.prior_question_elapsed_time.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, content_id, part_id, prior_question_elapsed_time=None, mask_src=None):
        '''
        S is the sequence length, N the batch size and E the embedding dimension.

        content_id, part_id, prior_question_elapsed_time: (N, S) index tensors.
        mask_src: optional (N, S) boolean key-padding mask, True at padded positions.
        Returns the logits with shape (N, S, 2).

        Raises ValueError when S exceeds the size of the positional table.
        '''
        batch_size, seq_len = content_id.shape[0], content_id.shape[1]
        if seq_len > self.pos_embedding.num_embeddings:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional embedding size "
                f"{self.pos_embedding.num_embeddings}; build the model with max_seq >= {seq_len}"
            )

        # Create the position indices on the same device as the inputs so the
        # model also works when it lives on a different device than the one
        # guessed in __init__.
        positions = torch.arange(seq_len, device=content_id.device).unsqueeze(0).expand(batch_size, -1)

        embedded_src = self.exercise_embeddings(content_id) + \
            self.pos_embedding(positions) + \
            self.part_embeddings(part_id)  # (N, S, E)
        if prior_question_elapsed_time is not None:
            # The elapsed-time feature is optional, as the default argument suggests.
            embedded_src = embedded_src + self.prior_question_elapsed_time(prior_question_elapsed_time)
        embedded_src = embedded_src.transpose(0, 1)  # (S, N, E)

        _src = embedded_src * math.sqrt(self.ninp)

        output = self.transformer_encoder(src=_src, src_key_padding_mask=mask_src)
        output = self.decoder(output)
        output = output.transpose(1, 0)  # back to (N, S, 2)
        return output

In [None]:
class Riiid(Dataset):
    '''Dataset over per-user feature records keyed by integer index.'''

    # Record entries returned by __getitem__, in output order (after the index).
    _FIELDS = ("content_id", "task_container_id", "part_id",
               "prior_question_elapsed_time", "padded", "answered_correctly")

    def __init__(self, d):
        super(Riiid, self).__init__()
        self.d = d

    def __len__(self):
        return len(self.d)

    def __getitem__(self, idx):
        # Returns (idx, content_id, task_container_id, part_id,
        # prior_question_elapsed_time, padded, answered_correctly);
        # collate_fn relies on exactly this order.
        record = self.d[idx]
        return (idx, *(record[field] for field in self._FIELDS))

    
def collate_fn(batch):
    '''
    Stack a list of Riiid items into batch tensors.

    Returns (content_id, task_id, part_id, prior_question_elapsed_time,
    padded, labels): the first four are long tensors, `padded` is boolean
    and `labels` is a float tensor. The sample index is dropped.
    '''
    _, contents, tasks, parts, elapsed, pad_flags, targets = zip(*batch)

    def to_long(sequences):
        return torch.Tensor(sequences).long()

    # remember the order
    return (
        to_long(contents),
        to_long(tasks),
        to_long(parts),
        to_long(elapsed),
        torch.Tensor(pad_flags).bool(),
        torch.Tensor(targets),
    )


In [None]:
# Wrap the per-user feature dicts and build the loaders: training batches are
# shuffled, validation batches keep their order and the last partial batch.
dataset_train, dataset_val = Riiid(d=d), Riiid(d=d_val)

train_loader = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE,
                          shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(dataset=dataset_val, batch_size=VAL_BATCH_SIZE,
                        shuffle=False, drop_last=False, collate_fn=collate_fn)

print(f'Train iters: {len(train_loader)};  val iters: {len(val_loader)}')

In [None]:
# Pull a single batch from the training loader as a sanity check of the
# dataset / collate pipeline and print the resulting tensors.
sample = next(iter(train_loader)) # next returns the next element in an iterator
# dummy check
print(sample)

In [None]:
# creating the model

def get_num_params(model):
    '''
    Return the number of trainable scalar parameters of `model` as an int.

    Only parameters with requires_grad=True are counted. numel() is used
    instead of np.prod(p.size()): the latter evaluates to the float 1.0 for
    0-dim parameters and would turn the whole count into a float.
    '''
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Instantiate the network with a 128-dim embedding, 8 attention heads,
# a 128-unit feed-forward layer and 3 stacked encoder layers.
model = TransformerModel(ninp=128, nhead=8, nhid=128, nlayers=3, dropout=0.3)

# Report the trainable parameter count and the module structure.
print(f"Number of parameters: {get_num_params(model)} \n")
print(model) # look into it!

# Train models

In [None]:
def train_epoch(model, train_iterator, optimizer, criterion):
    '''
    Run one optimisation pass over `train_iterator`.

    Each batch is (content_id, task_id, part_id, prior_question_elapsed_time,
    padding mask, labels); task_id is ignored. The loss and all metrics are
    computed on real (non-padded) positions only.

    Returns (mean batch loss, accuracy, ROC AUC) over the epoch.
    '''
    model.train()

    train_loss = []
    num_corrects = 0
    num_total = 0
    label_all = []
    pred_all = []
    len_dataset = len(train_iterator)

    with tqdm(total=len_dataset) as pbar:
        for idx, batch in enumerate(train_iterator):
            content_id, _, part_id, prior_question_elapsed_time, mask, labels = batch
            # torch.autograd.Variable is a deprecated no-op wrapper; plain
            # tensors moved to the target device are sufficient.
            content_id = content_id.to(device)
            part_id = part_id.to(device)
            prior_question_elapsed_time = prior_question_elapsed_time.to(device)
            mask = mask.to(device)
            labels = labels.to(device).long()
            valid = ~mask  # True at real (non-padded) positions

            optimizer.zero_grad()

            with torch.set_grad_enabled(mode=True):
                output = model(content_id, part_id, prior_question_elapsed_time, mask)  # (N, S, 2)
                logits = output[valid]   # (M, 2)
                targets = labels[valid]  # (M,)
                # Padded positions carry the dummy label 0; leaving them in
                # the loss would train the model to predict "incorrect" on
                # padding and make the loss inconsistent with the metrics.
                loss = criterion(logits, targets)
                loss.backward()
                optimizer.step()

            train_loss.append(loss.item())

            # Metrics need no gradient tracking.
            with torch.no_grad():
                pred_probs = torch.softmax(logits.detach(), dim=1)
                pred = torch.argmax(pred_probs, dim=1)
            num_corrects += (pred == targets).sum().item()
            num_total += targets.numel()

            label_all.append(targets.cpu().numpy())
            pred_all.append(pred_probs[:, 1].cpu().numpy())  # use probability to do auc

            # Advance by one per batch so the bar ends exactly at `total`;
            # only the description is throttled to every TQDM_INT batches.
            pbar.update(1)
            if idx % TQDM_INT == 0:
                pbar.set_description(f'loss - {train_loss[-1]:.4f}')

    acc = num_corrects / num_total
    auc = roc_auc_score(np.concatenate(label_all), np.concatenate(pred_all))
    loss = np.mean(train_loss)

    return loss, acc, auc


def valid_epoch(model, valid_iterator, criterion):
    '''
    Evaluate `model` on `valid_iterator` without updating it.

    Batches have the same layout as in training. The loss and all metrics
    are computed on real (non-padded) positions only.

    Returns (mean batch loss, accuracy, ROC AUC).
    '''
    model.eval()
    valid_loss = []
    num_corrects = 0
    num_total = 0
    label_all = []
    pred_all = []

    for batch in valid_iterator:
        content_id, _, part_id, prior_question_elapsed_time, mask, labels = batch
        # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors
        # moved to the target device are sufficient.
        content_id = content_id.to(device)
        part_id = part_id.to(device)
        prior_question_elapsed_time = prior_question_elapsed_time.to(device)
        mask = mask.to(device)
        labels = labels.to(device).long()
        valid = ~mask  # True at real (non-padded) positions

        with torch.no_grad():
            output = model(content_id, part_id, prior_question_elapsed_time, mask)  # (N, S, 2)
            logits = output[valid]
            targets = labels[valid]
            # Exclude padded positions (dummy label 0) from the loss so it is
            # measured on the same positions as accuracy and AUC.
            loss = criterion(logits, targets)
            pred_probs = torch.softmax(logits, dim=1)
            pred = torch.argmax(pred_probs, dim=1)

        valid_loss.append(loss.item())
        num_corrects += (pred == targets).sum().item()
        num_total += targets.numel()
        label_all.append(targets.cpu().numpy())
        pred_all.append(pred_probs[:, 1].cpu().numpy())  # use probability to do auc

    acc = num_corrects / num_total
    auc = roc_auc_score(np.concatenate(label_all), np.concatenate(pred_all))
    loss = np.mean(valid_loss)

    return loss, acc, auc

In [None]:
# Training driver: trains for EPOCHS epochs, tracks metrics in `history`,
# checkpoints the model whenever the validation AUC improves and finally
# pickles the history.
losses = []
history = []
auc_max = 0
model.to(device)
# criterion = nn.BCEWithLogitsLoss().to(device)
criterion = nn.CrossEntropyLoss().to(device)
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The scheduler is stepped with the validation AUC, where higher is better,
# so it must run in 'max' mode. In 'min' mode it would cut the learning rate
# while the AUC is still improving.
scheduler = ReduceLROnPlateau(optimizer, 'max', patience=5, threshold=1e-5)

for epoch in range(EPOCHS):
    train_loss, train_acc, train_auc = train_epoch(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, valid_auc = valid_epoch(model, val_loader, criterion)

    # The first epoch is not reported to the scheduler.
    if epoch >= 1:
        scheduler.step(valid_auc)

    print(f"\n\n[Epoch {epoch}/{EPOCHS}]")
    print(f"\nTrain: loss - {train_loss:.4f} acc - {train_acc:.4f} auc - {train_auc:.4f}")
    print(f"\nValid: loss - {valid_loss:.4f} acc - {valid_acc:.4f} auc - {valid_auc:.4f}")
    # Record the learning rate after a possible scheduler update.
    lr = optimizer.param_groups[0]['lr']
    history.append({"epoch": epoch, "lr": lr,
                    "train_auc": train_auc, "train_acc": train_acc,
                    "valid_auc": valid_auc, "valid_acc": valid_acc})

    if valid_auc > auc_max:
        print(f"\n[Epoch {epoch}/{EPOCHS}] auc improved from {auc_max:.4f} to {valid_auc:.4f}")
        print("saving model ...")
        auc_max = valid_auc
        torch.save(model.state_dict(), f"{model.name}_auc_{valid_auc:.4f}.pt")

# Name the history file after the best validation AUC so it matches the saved
# checkpoint; the last epoch's AUC may be lower and is undefined if EPOCHS == 0.
with open(f'history_auc_{auc_max:.4f}.pickle', 'wb') as handle:
    pickle.dump(history, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Test

In [None]:
def get_feats_test(data_df, question_df, max_seq=MAX_SEQ):
    '''
    Build one fixed-length feature record per user from a test group.

    Works like the training feature builder but without labels: only
    question rows (content_type_id == 0) are used, the last `max_seq` rows of
    each user are kept in row order and right-padded with PAD.

    Returns
    -------
    (df, user_id_to_idx)
        df maps a running integer index to a dict holding "user_id", one
        deque(maxlen=max_seq) per feature, a "padded" deque (True at padded
        positions) and a "pred_mask" deque (True at positions that need a
        prediction, i.e. the real rows of this frame). user_id_to_idx maps
        user_id to that index. Users are indexed in order of first appearance.
    '''
    # Filter first and take an explicit copy so the column assignments below
    # act on an independent frame rather than on a slice of the caller's data.
    data_df = data_df.loc[data_df['content_type_id'] == 0].copy()

    data_df['prior_question_had_explanation'] = \
        data_df['prior_question_had_explanation'].astype(np.float16).fillna(0).astype(np.int8)

    part_ids_map = dict(zip(question_df['question_id'], question_df['part']))
    data_df['part_id'] = data_df['content_id'].map(part_ids_map)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that is chained assignment and does not update the
    # frame when pandas Copy-on-Write is active, leaving NaNs in the feature.
    # Missing values become FILLNA_VAL, then the column is integer-divided by 1000.
    data_df["prior_question_elapsed_time"] = \
        data_df["prior_question_elapsed_time"].fillna(FILLNA_VAL) // 1000

    # the sort needs to be False here for the test
    recent = data_df.groupby("user_id", sort=False).tail(max_seq)  # last max_seq rows of each user
    per_user = recent.groupby("user_id", sort=False).agg({
        "content_id": list,
        "task_container_id": list,
        "part_id": list,
        "prior_question_elapsed_time": list,
    }).reset_index()

    # Order of the sequence entries inside each record.
    sequence_cols = ["content_id", "task_container_id",
                     "prior_question_elapsed_time", "part_id"]

    df = {}
    user_id_to_idx = {}
    for idx, row in per_user.iterrows():
        seq_len = len(row["content_id"])
        # tail(max_seq) guarantees seq_len <= max_seq, so num_padding >= 0 and
        # a full-length user simply receives an empty padding list.
        num_padding = max_seq - seq_len
        padding = [PAD] * num_padding

        record = {"user_id": row["user_id"]}
        for col in sequence_cols:
            record[col] = deque(row[col] + padding, maxlen=max_seq)
        # True marks the padded tail ...
        record["padded"] = deque([False] * seq_len + [True] * num_padding, maxlen=max_seq)
        # ... and pred_mask is its complement: every real row needs a prediction.
        record["pred_mask"] = deque([True] * seq_len + [False] * num_padding, maxlen=max_seq)

        df[idx] = record
        user_id_to_idx[row["user_id"]] = idx
    return df, user_id_to_idx

class RiiidTest(Dataset):
    '''Dataset over per-user test records keyed by integer index.'''

    # Record entries returned by __getitem__, in output order (after the index).
    _FIELDS = ("content_id", "task_container_id", "part_id",
               "prior_question_elapsed_time", "padded", "pred_mask")

    def __init__(self, d):
        super(RiiidTest, self).__init__()
        self.d = d

    def __len__(self):
        return len(self.d)

    def __getitem__(self, idx):
        # Returns (idx, content_id, task_container_id, part_id,
        # prior_question_elapsed_time, padded, pred_mask);
        # collate_fn_test relies on exactly this order.
        record = self.d[idx]
        return (idx, *(record[field] for field in self._FIELDS))

def collate_fn_test(batch):
    '''
    Stack a list of RiiidTest items into batch tensors.

    Returns (content_id, task_id, part_id, prior_question_elapsed_time,
    padded, pred_mask): the first four are long tensors, the two masks are
    boolean tensors. The sample index is dropped.
    '''
    _, contents, tasks, parts, elapsed, pad_flags, predict_flags = zip(*batch)

    def to_long(sequences):
        return torch.Tensor(sequences).long()

    def to_bool(sequences):
        return torch.Tensor(sequences).bool()

    # remember the order
    return (
        to_long(contents),
        to_long(tasks),
        to_long(parts),
        to_long(elapsed),
        to_bool(pad_flags),
        to_bool(predict_flags),
    )

## Function for updating users
This is the function for updating the users; however, it is slow.

In [None]:
def update_users(d, d_new, uid_to_idx, uid_to_idx_new, test_flag=False, max_seq=None):
    '''
    Merge the sequences of users found in both `d` and `d_new` into `d_new`.

    Used in two ways during inference:
    1. test_flag=False: `d` holds the rows of the previous test group and
       `d_new` is the stored history. The rows of `d` are appended AFTER the
       existing history (they are newer) and the "padded" mask is refreshed.
    2. test_flag=True: `d` is the stored history and `d_new` the current test
       group. The history is placed BEFORE the rows to predict and both
       "padded" and "pred_mask" are rebuilt, with pred_mask marking only the
       rows that came from `d_new`.

    In both cases only the most recent `max_seq` entries are kept and shorter
    sequences are right-padded with 0.

    Parameters
    ----------
    d, d_new : dicts mapping an index to a feature record (dict of deques).
    uid_to_idx, uid_to_idx_new : user_id -> index maps for `d` and `d_new`.
    test_flag : selects the merge direction described above.
    max_seq : sequence length; defaults to the global MAX_SEQ.

    Returns
    -------
    `d_new`, which is updated in place.

    NOTE: users that only exist in `d` are not added to `d_new`.
    '''
    if max_seq is None:
        max_seq = MAX_SEQ

    feature_cols = ['content_id',
                    'task_container_id',
                    'prior_question_elapsed_time',
                    'part_id',
                    ]

    # Only users present in both maps can change; intersecting the key views
    # avoids walking every user of the larger dict for each test group.
    for uid in uid_to_idx.keys() & uid_to_idx_new.keys():
        rec_src = d[uid_to_idx[uid]]
        rec_dst = d_new[uid_to_idx_new[uid]]

        src_valid = ~np.array(rec_src['padded'], dtype=bool)
        dst_valid = ~np.array(rec_dst['padded'], dtype=bool)
        n_src = int(src_valid.sum())
        n_dst = int(dst_valid.sum())

        cols = list(feature_cols)
        # Keep the labels aligned with the features when both records carry
        # them (history updates); test records have no labels.
        if 'answered_correctly' in rec_src and 'answered_correctly' in rec_dst:
            cols.append('answered_correctly')

        for col in cols:
            src_vals = np.array(rec_src[col])[src_valid]
            dst_vals = np.array(rec_dst[col])[dst_valid]
            if test_flag:
                # stored history first, rows to predict last
                merged = np.append(src_vals, dst_vals)
            else:
                # existing history first, newly observed rows last
                merged = np.append(dst_vals, src_vals)
            merged = merged[-max_seq:]  # keep the most recent entries
            num_padding = max_seq - len(merged)
            rec_dst[col] = deque(np.append(merged, np.zeros(num_padding, dtype=int)),
                                 maxlen=max_seq)

        len_user = min(n_src + n_dst, max_seq)  # real entries after the merge
        num_padding = max_seq - len_user
        # The padding mask must follow the merged sequence in both modes;
        # otherwise appended rows would still be treated as padding.
        rec_dst['padded'] = deque([False] * len_user + [True] * num_padding, maxlen=max_seq)

        if test_flag:
            len_user_pred = n_dst
            assert max_seq >= len_user_pred
            # Only the trailing rows that came from d_new are predicted.
            rec_dst['pred_mask'] = deque(
                [False] * (len_user - len_user_pred) + [True] * len_user_pred + [False] * num_padding,
                maxlen=max_seq)
    return d_new


In [None]:
import riiideducation

# Create the competition environment and the test iterator; the inference
# loop below unpacks each yielded item as (test frame, prediction frame).
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
%%time
# Inference driver: for every test group, fold the previous group's answers
# into the stored history, build the current sequences, predict, and submit.
prev_test_df = None
model.eval()  # make sure dropout is disabled regardless of what ran before

for idx, (current_test, current_prediction_df) in enumerate(iter_test):

    if prev_test_df is not None:
        # Making use of answers to previous questions.
        # NOTE(security): eval() executes whatever expression the string
        # contains. These strings come from the competition API; prefer
        # ast.literal_eval if they can ever be untrusted.
        answers = eval(current_test["prior_group_answers_correct"].iloc[0])
        responses = eval(current_test["prior_group_responses"].iloc[0])
        prev_test_df['answered_correctly'] = answers
        prev_test_df['user_answer'] = responses

        prev_test_df = prev_test_df[prev_test_df['content_type_id'] == False]
        d_prev, user_id_to_idx_prev = get_feats(prev_test_df, df_questions)
        d = update_users(d_prev, d, user_id_to_idx_prev, user_id_to_idx_train)

    prev_test_df = current_test.copy()

    d_test, user_id_to_idx_test = get_feats_test(current_test, df_questions, max_seq=MAX_SEQ)
    d_test = update_users(d, d_test, user_id_to_idx_train, user_id_to_idx_test, test_flag=True)
    dataset_test = RiiidTest(d=d_test)
    test_loader = DataLoader(dataset=dataset_test,
                             batch_size=TEST_BATCH_SIZE,
                             collate_fn=collate_fn_test, shuffle=False, drop_last=False)

    output_all = []
    for batch in test_loader:
        content_id, _, part_id, prior_question_elapsed_time, mask, pred_mask = batch

        # Use the shared `device` instead of a hard-coded .cuda() so the cell
        # also runs on CPU-only machines.
        content_id = content_id.to(device)
        part_id = part_id.to(device)
        prior_question_elapsed_time = prior_question_elapsed_time.to(device)
        mask = mask.to(device)
        pred_mask = pred_mask.to(device)

        with torch.no_grad():
            output = model(content_id, part_id, prior_question_elapsed_time, mask)
            pred_probs = torch.softmax(output[pred_mask], dim=1)
        output_all.extend(pred_probs[:, 1].cpu().numpy())

    # Predictions come out grouped per user (users in order of first
    # appearance, rows in original order within a user, at most MAX_SEQ rows
    # each) and only for question rows. Rebuild that layout as row positions
    # so every prediction lands on the row it belongs to, even when a user's
    # rows are interleaved with other users' rows or with lecture rows.
    is_question = (current_test['content_type_id'] == 0).to_numpy()
    question_pos = np.flatnonzero(is_question)
    layout = pd.DataFrame({'user_id': current_test['user_id'].to_numpy()[question_pos],
                           'pos': question_pos})
    layout = layout.groupby('user_id', sort=False).tail(MAX_SEQ)
    user_rank = pd.factorize(layout['user_id'])[0]
    pred_pos = layout['pos'].to_numpy()[np.argsort(user_rank, kind='stable')]

    # Rows without a model output (lectures, rows beyond MAX_SEQ) fall back to 0.5.
    predictions = np.full(len(current_test), 0.5, dtype=np.float32)
    predictions[pred_pos] = output_all

    current_test['answered_correctly'] = predictions
    # Lecture rows have no prediction target, so only question rows are submitted.
    env.predict(current_test.loc[is_question, ['row_id', 'answered_correctly']])

In [None]:
# Plot the distribution of the submitted probabilities as a quick sanity check.
sns.set()
sub = pd.read_csv('../working/submission.csv')
sub['answered_correctly'].hist()

In [None]:
# Debug helper: runs the inference steps for a single test group.
if DEBUG:
    (current_test, current_prediction_df) = next(iter_test)
    model.eval()  # make sure dropout is disabled regardless of what ran before

    if prev_test_df is not None:
        # Making use of answers to previous questions.
        # NOTE(security): eval() executes whatever expression the string
        # contains. These strings come from the competition API; prefer
        # ast.literal_eval if they can ever be untrusted.
        answers = eval(current_test["prior_group_answers_correct"].iloc[0])
        responses = eval(current_test["prior_group_responses"].iloc[0])
        prev_test_df['answered_correctly'] = answers
        prev_test_df['user_answer'] = responses

        prev_test_df = prev_test_df[prev_test_df['content_type_id'] == False]
        d_prev, user_id_to_idx_prev = get_feats(prev_test_df, df_questions)
        d = update_users(d_prev, d, user_id_to_idx_prev, user_id_to_idx_train)

    prev_test_df = current_test.copy()

    d_test, user_id_to_idx_test = get_feats_test(current_test, df_questions, max_seq=MAX_SEQ)
    d_test = update_users(d, d_test, user_id_to_idx_train, user_id_to_idx_test, test_flag=True)
    dataset_test = RiiidTest(d=d_test)
    test_loader = DataLoader(dataset=dataset_test,
                             batch_size=TEST_BATCH_SIZE,
                             collate_fn=collate_fn_test, shuffle=False, drop_last=False)

    output_all = []
    for batch in test_loader:
        content_id, _, part_id, prior_question_elapsed_time, mask, pred_mask = batch

        # Use the shared `device` instead of a hard-coded .cuda() so the cell
        # also runs on CPU-only machines.
        content_id = content_id.to(device)
        part_id = part_id.to(device)
        prior_question_elapsed_time = prior_question_elapsed_time.to(device)
        mask = mask.to(device)
        pred_mask = pred_mask.to(device)

        with torch.no_grad():
            output = model(content_id, part_id, prior_question_elapsed_time, mask)
            pred_probs = torch.softmax(output[pred_mask], dim=1)
        output_all.extend(pred_probs[:, 1].cpu().numpy())

    # Predictions come out grouped per user (users in order of first
    # appearance, rows in original order within a user, at most MAX_SEQ rows
    # each) and only for question rows. Rebuild that layout as row positions
    # so every prediction lands on the row it belongs to, even when a user's
    # rows are interleaved with other users' rows or with lecture rows.
    is_question = (current_test['content_type_id'] == 0).to_numpy()
    question_pos = np.flatnonzero(is_question)
    layout = pd.DataFrame({'user_id': current_test['user_id'].to_numpy()[question_pos],
                           'pos': question_pos})
    layout = layout.groupby('user_id', sort=False).tail(MAX_SEQ)
    user_rank = pd.factorize(layout['user_id'])[0]
    pred_pos = layout['pos'].to_numpy()[np.argsort(user_rank, kind='stable')]

    # Rows without a model output (lectures, rows beyond MAX_SEQ) fall back to 0.5.
    predictions = np.full(len(current_test), 0.5, dtype=np.float32)
    predictions[pred_pos] = output_all

    current_test['answered_correctly'] = predictions
    # Lecture rows have no prediction target, so only question rows are submitted.
    env.predict(current_test.loc[is_question, ['row_id', 'answered_correctly']])