In [None]:
import gc
import os
import time
import json
import psutil
import numpy as np
import pandas as pd
import riiideducation

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# --- Sequence / feature caps -------------------------------------------------
MAX_SEQ = 100                 # number of interactions fed to the model per sample
MAX_LAG_TIME = 2160           # largest lag-time bucket produced by TestDataset
MAX_PREV_ELAPSE_TIME = 600    # cap applied to prior_question_elapsed_time

# --- Embedding vocabulary sizes ----------------------------------------------
n_questions = 13523
n_parts = 7                                  # KTModel allocates n_parts + 1 rows
n_responses = 3                              # 0 / 1 answers plus the start token 2
n_lagtimes = MAX_LAG_TIME + 1                # buckets 0..MAX_LAG_TIME
n_prev_elapsed = MAX_PREV_ELAPSE_TIME + 1    # values 0..MAX_PREV_ELAPSE_TIME

# --- Model hyper-parameters ----------------------------------------------------
d_model = 160
nhead = 8
dim_feedforward = 250
max_lr = 0.0025               # not referenced by the inference code in this notebook

# Dataset

In [None]:
class TestDataset(torch.utils.data.Dataset):
    """Builds one fixed-length model input per row of a test batch.

    Each row of ``test_df`` must provide ``content_id``, ``part``,
    ``timestamp``, ``prior_question_elapsed_time`` and ``prev_seq``, where
    ``prev_seq`` is a 5-tuple of equally long arrays
    ``(content_ids, parts, responses, timestamps, prior_elapsed_times)``
    holding the user's earlier interactions.

    Args:
        test_df: frame with the columns listed above.
        max_seq: length of every returned sequence.
        max_lag_time: upper bound for the lag-time bucket. ``None`` (default)
            uses the notebook-level ``MAX_LAG_TIME`` constant.
    """

    def __init__(self, test_df, max_seq=100, max_lag_time=None):
        self.test_df = test_df
        self.max_seq = max_seq
        self.max_lag_time = MAX_LAG_TIME if max_lag_time is None else max_lag_time

    def __len__(self):
        return len(self.test_df)

    def __getitem__(self, idx):
        """Return ``(q, p, r, lag, prev_elapsed, label_mask)`` for row ``idx``.

        The first five items are ``torch.long`` tensors of length ``max_seq``;
        ``label_mask`` is the position of the current question inside them.
        """
        row = self.test_df.iloc[idx]

        content_id = row.content_id
        part = row.part
        timestamp = row.timestamp
        prior_question_elapsed_time = int(row.prior_question_elapsed_time)

        # Keep only the newest max_seq history entries so that an over-long
        # history cannot overflow the fixed-size buffers below.
        q_, p_, r_, lag_, prev_elapsed_ = (
            torch.tensor(values[-self.max_seq:], dtype=torch.long)
            for values in row.prev_seq
        )
        seq_len = q_.numel()

        q = torch.zeros(self.max_seq, dtype=torch.long)
        p = torch.zeros(self.max_seq, dtype=torch.long)
        r = torch.zeros(self.max_seq, dtype=torch.long)
        lag = torch.zeros(self.max_seq, dtype=torch.long)
        prev_elapsed = torch.zeros(self.max_seq, dtype=torch.long)

        if seq_len < self.max_seq:
            # Room left: history first, current question right after it.
            # Responses are shifted one step right behind the start token 2.
            label_mask = seq_len
            q[:seq_len] = q_
            p[:seq_len] = p_
            prev_elapsed[:seq_len] = prev_elapsed_
            lag[:seq_len] = lag_
            r[0] = 2
            r[1:seq_len + 1] = r_
        else:
            # Buffer full: drop the oldest question to make room. All stored
            # responses are kept, so r[i] still lines up with question i-1.
            label_mask = self.max_seq - 1
            q[:-1] = q_[1:]
            p[:-1] = p_[1:]
            prev_elapsed[:-1] = prev_elapsed_[1:]
            lag[:-1] = lag_[1:]
            r[:] = r_

        q[label_mask] = content_id
        p[label_mask] = part
        prev_elapsed[label_mask] = prior_question_elapsed_time
        lag[label_mask] = timestamp  # raw timestamp; converted to a lag below

        # Timestamps -> difference to the previous step, scaled by 1/60000.
        lag[1:] = lag[1:] - lag[:-1]
        lag = lag / 60000
        # Values above 1440 are compressed by a further factor of 60.
        beyond = lag > 1440
        lag[beyond] = 1441 + (lag[beyond] - 1440) / 60
        lag[0] = 0                       # first step has no predecessor
        lag[lag < 0] = 0                 # padding positions produce negatives
        lag[lag > self.max_lag_time] = self.max_lag_time
        lag = lag.type(torch.long)

        return (q, p, r, lag, prev_elapsed, label_mask)

# Model

In [None]:
%%time
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    Maps the last dimension ``d_model -> dim_feedforward -> d_model``.
    """

    def __init__(self, d_model=80, dim_feedforward=512, dropout=0.1):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.fc1 = nn.Linear(d_model, dim_feedforward)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(dim_feedforward, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        expanded = self.relu(self.fc1(x))
        return self.dropout(self.fc2(expanded))
    
class TransformerLayer(nn.Module):
    """One attention + feed-forward layer with two residual connections.

    Only the query is layer-normalised before attention; keys and values are
    used exactly as passed in. The residual paths start from the normalised
    tensors, not from the raw inputs.
    """

    def __init__(self, d_model=80, nhead=5, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(
            d_model, num_heads=nhead, dropout=dropout
        )
        self.ffn = FFN(d_model, dim_feedforward, dropout)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask=None):
        """Attend from ``Q`` over ``K``/``V`` and return the layer output."""
        query = self.layernorm1(Q)
        context, _ = self.multihead_attn(query, K, V, attn_mask=attn_mask)

        # First residual: normalised query + attention output, then normalise.
        hidden = self.layernorm2(query + context)

        # Second residual around the feed-forward block.
        return hidden + self.ffn(hidden)


class Transformer(nn.Module):
    """Thin wrapper around a single ``TransformerLayer``.

    ``num_layers`` is accepted but not used: exactly one layer is created.
    """

    def __init__(self, d_model=80, nhead=5, dim_feedforward=512,
                 num_layers=1, dropout=0.1):
        super().__init__()
        self.transformer_layer = TransformerLayer(
            d_model, nhead, dim_feedforward, dropout
        )

    def forward(self, Q, K, V, attn_mask):
        """Delegate to the wrapped layer and return its output unchanged."""
        return self.transformer_layer(Q, K, V, attn_mask)
    
    
class KTModel(nn.Module):
    """Knowledge-tracing model built from three ``TransformerLayer`` blocks.

    Question and part embeddings go through an "encoder" self-attention
    layer; response, lag-time and prior-elapsed-time embeddings go through a
    "decoder" self-attention layer; a third layer attends from the decoder
    output over the encoder output. All three use the same causal mask.

    Args:
        n_questions, n_responses, n_lagtimes, n_prev_elapsed: number of rows
            of the matching embedding tables.
        n_parts: the part table gets ``n_parts + 1`` rows.
        MAX_SEQ: number of positional embeddings (longest supported sequence).
        d_model, nhead, dim_feedforward, dropout: forwarded to every layer.
        num_layers: accepted for signature compatibility; not used.
        device: device on which the position ids are created and the default
            device for ``get_attention_mask``.
    """

    def __init__(self,
                 n_questions,
                 n_parts,
                 n_responses,
                 n_lagtimes=14400,
                 n_prev_elapsed=121,
                 MAX_SEQ=100,

                 d_model=80,
                 nhead=5,
                 dim_feedforward=512,
                 num_layers=1,
                 dropout=0.1,
                 device='cpu'
                ):
        super(KTModel, self).__init__()
        self.device = device
        # Kept as a plain attribute (not a buffer) so the state_dict layout
        # stays compatible with existing checkpoints.
        self.pos_ids = torch.arange(MAX_SEQ, device=device)

        self.pos_embedding = nn.Embedding(MAX_SEQ, d_model)
        self.q_embedding = nn.Embedding(n_questions, d_model)
        self.p_embedding = nn.Embedding(n_parts + 1, d_model)
        self.r_embedding = nn.Embedding(n_responses, d_model)
        self.lag_embedding = nn.Embedding(n_lagtimes, d_model)
        self.prev_elapsed_embedding = nn.Embedding(n_prev_elapsed, d_model)

        self.encoder_transformer_layer = TransformerLayer(d_model, nhead, dim_feedforward, dropout)
        self.decoder_transformer_layer = TransformerLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_layer = TransformerLayer(d_model, nhead, dim_feedforward, dropout)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, 1)

    def get_attention_mask(self, sz, device=None):
        """Return a ``(sz, sz)`` bool mask that is True strictly above the diagonal.

        Args:
            sz: sequence length.
            device: where to create the mask; defaults to ``self.device``.
        """
        if device is None:
            device = self.device
        steps = torch.arange(sz, device=device)
        # mask[i, j] is True when j > i, i.e. position i may not look ahead.
        return steps.unsqueeze(0) > steps.unsqueeze(1)

    def _position_ids(self, ref):
        """Position ids for ``ref`` of shape (batch, seq), on ``ref``'s device."""
        return self.pos_ids[:ref.size(1)].to(ref.device)

    def get_encoder_inputs(self, q, p):
        """Sum of position, question and part embeddings (each after dropout)."""
        pos_embedd = self.dropout(self.pos_embedding(self._position_ids(q)))
        q_embedd = self.dropout(self.q_embedding(q))
        p_embedd = self.dropout(self.p_embedding(p))
        return pos_embedd + q_embedd + p_embedd

    def get_decoder_inputs(self, lag_times, prev_elapsed, r):
        """Sum of position, response, lag and prior-elapsed embeddings (each after dropout)."""
        pos_embedd = self.dropout(self.pos_embedding(self._position_ids(r)))
        r_embedd = self.dropout(self.r_embedding(r))
        lag_embedd = self.dropout(self.lag_embedding(lag_times))
        prev_elapsed_embedd = self.dropout(self.prev_elapsed_embedding(prev_elapsed))
        return pos_embedd + r_embedd + lag_embedd + prev_elapsed_embedd

    def forward(self, q, p, lag_times, prev_elapsed, r):
        """Return one logit per position, shape (batch, seq).

        All inputs are integer index tensors of shape (batch, seq) with
        ``seq <= MAX_SEQ``.
        """
        # Build the mask on the inputs' device so the model keeps working
        # after being moved with .to(...).
        attn_mask = self.get_attention_mask(q.size(1), device=q.device)
        encoder_in = self.get_encoder_inputs(q, p)
        decoder_in = self.get_decoder_inputs(lag_times, prev_elapsed, r)

        # nn.MultiheadAttention is used in its default (seq, batch, dim) layout.
        encoder_in = encoder_in.permute(1, 0, 2)
        decoder_in = decoder_in.permute(1, 0, 2)

        encoder_out = self.encoder_transformer_layer(encoder_in, encoder_in, encoder_in, attn_mask=attn_mask)
        decoder_out = self.decoder_transformer_layer(decoder_in, decoder_in, decoder_in, attn_mask=attn_mask)

        y = self.transformer_layer(decoder_out, encoder_out, encoder_out, attn_mask=attn_mask)
        y = y.permute(1, 0, 2)
        yout = self.out(y).squeeze(-1)
        return yout

In [None]:
# Build the network with the notebook-level hyper-parameters and place it on
# the selected device.
model = KTModel(
    n_questions,
    n_parts,
    n_responses,
    n_lagtimes=n_lagtimes,
    n_prev_elapsed=n_prev_elapsed,
    MAX_SEQ=MAX_SEQ,
    d_model=d_model,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    device=device,
).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can also be restored on a CPU-only machine.
state_dict = torch.load('../input/saint-v2/sakt_saint (2).pth', map_location=device)
model.load_state_dict(state_dict)

In [None]:
def update_group(test_df, prev_test_df):
    """Fold the previous test batch, now that its answers are known, into ``group``.

    ``group`` is the notebook-level mapping ``user_id -> (content_ids, parts,
    answers, timestamps, prior_elapsed_times)``. Each user's history is capped
    at the newest ``MAX_SEQ`` entries.

    Args:
        test_df: current batch; the first value of its
            ``prior_group_answers_correct`` column is a string holding the
            list of answers for the previous batch.
        prev_test_df: previous batch (already preprocessed), or ``None`` on
            the first call.

    Returns:
        None. The update is skipped entirely when there is no previous batch
        or when system memory usage is at 90% or above.
    """
    if prev_test_df is None or (psutil.virtual_memory().percent >= 90):
        return

    # The value arrives from outside the notebook, so parse it as data with
    # json.loads instead of executing it with eval().
    prev_answered_correctly = json.loads(test_df.prior_group_answers_correct.values[0])

    # assign() returns a new frame, leaving the caller's prev_test_df untouched.
    answered = prev_test_df.assign(answered_correctly=prev_answered_correctly)
    answered = answered[answered.content_type_id == 0]

    prev_group = answered.groupby('user_id').apply(
        lambda rows: (rows.content_id.values[-MAX_SEQ:],
                      rows.part.values[-MAX_SEQ:],
                      rows.answered_correctly.values[-MAX_SEQ:],
                      rows.timestamp.values[-MAX_SEQ:],
                      rows.prior_question_elapsed_time.values[-MAX_SEQ:]))

    for user_id in prev_group.index.values:
        history = prev_group[user_id]
        if user_id in group.index:
            # Known user: append the new interactions to the stored ones.
            history = tuple(np.append(stored, new)
                            for stored, new in zip(group[user_id], history))
        if len(history[0]) > MAX_SEQ:
            # Keep only the newest MAX_SEQ interactions.
            history = tuple(column[-MAX_SEQ:] for column in history)
        group[user_id] = history

# Load Group Data

In [None]:
%%time
print('Load Group Data')

# Per-user interaction histories prepared offline; update_group() and the
# inference loop read and extend this object.
# NOTE: read_pickle should only ever be pointed at trusted files.
group = pd.read_pickle('../input/saint-group-submission/saint_group.pkl')

# Question metadata; the key column is renamed so it can be merged with the
# test batches on 'content_id'.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv'
).rename(columns={'question_id': 'content_id'})

# Inference

In [None]:
# Create the competition environment; predictions are handed back to it via
# env.predict() inside the inference loop.
env = riiideducation.make_env()
# Iterator over the test batches; each item is unpacked below as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()

In [None]:
%%time
prev_test_df = None
model.eval()  # inference only; no need to re-set the mode for every batch

for (test_df, sample_prediction_df) in iter_test:
    test_df = test_df[['row_id', 'user_id', 'content_id', 'timestamp',
                       'content_type_id', 'prior_question_elapsed_time',
                       'prior_group_answers_correct']].merge(
        questions_df[['content_id', 'part']], how='left', on='content_id')

    # The answers for the previous batch arrive with this one: store them
    # before building the inputs for the current batch.
    update_group(test_df, prev_test_df)

    # Assign the results back explicitly. A chained `.fillna(..., inplace=True)`
    # on a column accessor is not guaranteed to write into the frame.
    test_df['part'] = test_df['part'].fillna(5)
    # NOTE(review): the fill value is applied before the division by 1000, so
    # a missing elapsed time ends up as MAX_PREV_ELAPSE_TIME / 1000. Kept as in
    # the original; presumably this mirrors training - confirm.
    test_df['prior_question_elapsed_time'] = (
        test_df['prior_question_elapsed_time'].fillna(MAX_PREV_ELAPSE_TIME) / 1000
    )
    test_df.loc[test_df.prior_question_elapsed_time > MAX_PREV_ELAPSE_TIME,
                'prior_question_elapsed_time'] = MAX_PREV_ELAPSE_TIME
    prev_test_df = test_df.copy()

    # Only rows with content_type_id == 0 are scored. Take a real copy so that
    # adding columns below does not write into a slice of the merged frame.
    test_df = test_df[test_df.content_type_id == 0].copy()
    test_df['prev_seq'] = test_df.user_id.apply(
        lambda user_id: group[user_id] if user_id in group
        else (np.array([]), np.array([]), np.array([]), np.array([]), np.array([])))

    test_dataset = TestDataset(test_df, max_seq=MAX_SEQ)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=1024,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=4)

    y_answered = []
    with torch.no_grad():
        for (q, p, r, lag, prev_elapsed, label_mask) in test_dataloader:
            q = q.to(device)
            p = p.to(device)
            r = r.to(device)
            lag = lag.to(device)
            prev_elapsed = prev_elapsed.to(device)

            y = model(q, p, lag, prev_elapsed, r)

            # Pick, for every sample, the logit at the position of the current
            # question and transfer the whole batch to the host in one go.
            sample_idx = torch.arange(y.size(0), device=y.device)
            target_logits = y[sample_idx, label_mask.to(y.device)]
            y_answered.extend(torch.sigmoid(target_logits).cpu().tolist())

    test_df['answered_correctly'] = y_answered
    env.predict(test_df[['row_id', 'answered_correctly']])
    del test_df
    gc.collect()