In [None]:
import gc
import os
import time
import random
import numpy as np 
import pandas as pd

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from torch.optim.lr_scheduler import OneCycleLR, StepLR
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

This notebook is an attempt to incorporate SAINT-style embeddings into a single transformer block.

Thanks to:
1. https://www.kaggle.com/wangsg/a-self-attentive-model-for-knowledge-tracing/execution
2. https://www.kaggle.com/leadbest/sakt-with-randomization-state-updates
3. https://www.kaggle.com/gilfernandes/riiid-self-attention-transformer/data


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The value stays a plain string because later cells compare it with == 'cuda'.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

# Constants

In [None]:
# --- Sequence / feature limits -------------------------------------------
MAX_SEQ = 100                  # window length handed to Dataset and KTModel
MAX_LAG_TIME = 2160            # upper clamp applied to the lag feature in Dataset.__getitem__
MAX_PREV_ELAPSE_TIME = 600     # NOTE(review): not referenced by any other cell in this notebook

# --- Embedding table sizes -----------------------------------------------
n_questions = 13523
n_parts = 7                    # KTModel allocates n_parts + 1 rows for this embedding
n_responses = 3                # presumably 0/1 answers plus the start token 2 written by Dataset
n_lagtimes = MAX_LAG_TIME + 1            # one row per clamped lag value 0..MAX_LAG_TIME
n_prev_elapsed = MAX_PREV_ELAPSE_TIME + 1

# --- Model / optimiser hyper-parameters ----------------------------------
d_model = 160
nhead = 8
dim_feedforward = 250
max_lr = 0.0025                # NOTE(review): the training cell passes literal learning rates instead
num_epochs = 1                 # NOTE(review): the training cell passes literal epoch counts instead

# --- Batch sizes ----------------------------------------------------------
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 512

In [None]:
%%time
# Load the pre-built group object and hold out 20% of its entries for validation
# (fixed random_state so the split is reproducible).
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
group = pd.read_pickle('../input/riiiidgroupdataset/group.pkl')
train, val = train_test_split(
    group,
    test_size=0.2,
    random_state=42,
)

# Dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Fixed-length interaction windows cut from per-user sequences.

    ``group`` must expose ``.index`` (iterable of user ids) and ``group[user_id]``
    returning a 6-tuple ``(q, p, r, lag, prev_elapsed, _)`` of sliceable sequences;
    the sixth entry is ignored.

    Users with more than ``max_seq`` interactions are cut into consecutive
    ``max_seq``-long windows; a trailing remainder is kept only when it holds more
    than 5 interactions. Shorter users are stored as a single window.
    """

    def __init__(self,
                 group,
                 max_seq=100
                ):
        super().__init__()
        self.max_seq = max_seq
        self.group = group
        self.user_ids = []      # window keys in insertion order, "<user_id>-<window_no>"
        self.split_group = {}   # window key -> (q, p, r, lag, prev_elapsed)

        for user_id in self.group.index:
            q, p, r, lag, prev_elapsed, _ = self.group[user_id]
            features = (q, p, r, lag, prev_elapsed)
            n_items = len(q)

            # Short users fit into one window and are stored unchanged.
            if n_items <= self.max_seq:
                key = '{}-0'.format(user_id)
                self.split_group[key] = features
                self.user_ids.append(key)
                continue

            # Full windows of exactly max_seq interactions.
            n_full = n_items // self.max_seq
            for window_no in range(n_full):
                lo = window_no * self.max_seq
                hi = lo + self.max_seq
                key = '{}-{}'.format(user_id, window_no)
                self.split_group[key] = tuple(seq[lo:hi] for seq in features)
                self.user_ids.append(key)

            # Trailing remainder: kept only if it has more than 5 interactions.
            # Its key uses n_full + 1, so window number n_full is skipped.
            tail_start = n_full * self.max_seq
            if len(q[tail_start:]) > 5:
                key = '{}-{}'.format(user_id, n_full + 1)
                self.user_ids.append(key)
                self.split_group[key] = tuple(seq[tail_start:] for seq in features)

        # The raw group is no longer needed once the windows are built.
        del self.group
        gc.collect()

    def __len__(self):
        """Number of stored windows."""
        return len(self.split_group)

    def __getitem__(self, idx):
        """Return ``(q, p, r, lag, prev_elapsed, y, label_mask)`` for window ``idx``.

        Every tensor has length ``max_seq`` and is zero-padded on the right.
        ``y`` holds the responses, ``r`` holds the responses shifted right by one
        with the start token 2 at position 0, and ``label_mask`` is True on the
        real (non-padded) positions.
        """
        key = self.user_ids[idx]
        raw_q, raw_p, raw_r, raw_lag, raw_elapsed = self.split_group[key]

        seq_len = len(raw_q)
        raw_q = torch.as_tensor(raw_q, dtype=int)
        raw_p = torch.as_tensor(raw_p, dtype=int)
        raw_r = torch.as_tensor(raw_r, dtype=int)
        raw_lag = torch.as_tensor(raw_lag, dtype=int)
        raw_elapsed = torch.as_tensor(raw_elapsed, dtype=int)

        q, p, r, y, lag, prev_elapsed = (
            torch.zeros(self.max_seq, dtype=int) for _ in range(6)
        )

        label_mask = torch.zeros(self.max_seq, dtype=bool)
        label_mask[:seq_len] = True

        # Number of real positions that fit into the window.
        used = min(seq_len, self.max_seq)
        q[:used] = raw_q[:used]
        p[:used] = raw_p[:used]
        y[:used] = raw_r[:used]
        lag[:used] = raw_lag[:used]
        prev_elapsed[:used] = raw_elapsed[:used]

        # Previous response as input: start token first, then responses shifted by one.
        r[0] = 2
        r[1:used] = raw_r[:used - 1]

        if seq_len > 1:
            # Turn the stored values into differences between consecutive positions.
            lag[1:] = lag[1:] - lag[:-1]
            # presumably milliseconds -> minutes (division by 60000) - TODO confirm units
            lag = lag / 60000
            # Beyond 1440 the excess is further divided by 60 and offset by 1441.
            beyond = lag > 1440
            lag[beyond] = 1441 + (lag[beyond] - 1440) / 60

        # First position has no predecessor; negatives (e.g. the first padded slot) become 0.
        lag[0] = 0
        lag[lag < 0] = 0
        lag[lag > MAX_LAG_TIME] = MAX_LAG_TIME
        lag = lag.long()
        return (q, p, r, lag, prev_elapsed, y, label_mask)

In [None]:
%%time
# Window the train / validation splits into fixed-length sequences.
train_dataset = Dataset(train, max_seq=MAX_SEQ)
val_dataset = Dataset(val, max_seq=MAX_SEQ)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
# Both loaders use pinned memory and 8 worker processes.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    pin_memory=True,
    num_workers=8,
)

val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=False,
    pin_memory=True,
    num_workers=8,
)

# Model

In [None]:
%%time
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    Maps the last dimension ``d_model -> dim_feedforward -> d_model``.
    """

    def __init__(self, d_model=80, dim_feedforward=512, dropout=0.1):
        super().__init__()
        # Sub-modules are created in the same order as before (fc1, then fc2)
        # so that seeded weight initialisation is unchanged.
        self.fc1 = nn.Linear(d_model, dim_feedforward)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(dim_feedforward, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to ``x`` (last dimension must be ``d_model``)."""
        hidden = self.relu(self.fc1(x))
        return self.dropout(self.fc2(hidden))
    
class TransformerLayer(nn.Module):
    """One attention + feed-forward layer with layer norm applied before each step.

    Only the query is layer-normalised before attention; keys and values are used
    as passed in. Both residual connections add onto the *normalised* tensors.
    """

    def __init__(self,
                 d_model=80,
                 nhead=5,
                 dim_feedforward=512,
                 dropout=0.1
                ):
        super().__init__()
        self.multihead_attn = nn.MultiheadAttention(
            d_model,
            num_heads=nhead,
            dropout=dropout,
        )
        self.ffn = FFN(d_model, dim_feedforward, dropout)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, Q, K, V, attn_mask=None):
        """Attend from ``Q`` over ``K``/``V`` and run the feed-forward block.

        ``attn_mask`` is forwarded unchanged to ``nn.MultiheadAttention``.
        Returns a tensor shaped like the attention output.
        """
        normed_q = self.layernorm1(Q)
        context, _ = self.multihead_attn(normed_q, K, V, attn_mask=attn_mask)

        # Residual around attention, then normalise again before the FFN.
        hidden = self.layernorm2(normed_q + context)
        return hidden + self.ffn(hidden)


class Transformer(nn.Module):
    """Thin wrapper that owns a single ``TransformerLayer``.

    NOTE: ``num_layers`` is accepted for signature compatibility but is not
    used - exactly one layer is built regardless of its value.
    """

    def __init__(self,
                 d_model=80,
                 nhead=5,
                 dim_feedforward=512,
                 num_layers=1,
                 dropout=0.1
                ):
        super().__init__()
        self.transformer_layer = TransformerLayer(
            d_model, nhead, dim_feedforward, dropout
        )

    def forward(self, Q, K, V, attn_mask):
        """Delegate to the wrapped layer and return its output."""
        return self.transformer_layer(Q, K, V, attn_mask)
    
    
class KTModel(nn.Module):
    """Knowledge-tracing model with separate exercise and response streams.

    The "encoder" stream embeds question id + part id, the "decoder" stream
    embeds previous response + lag time + previous elapsed time; both add a
    shared learned positional embedding. Each stream passes through its own
    causally-masked ``TransformerLayer``; a third layer then attends from the
    decoder output (query) over the encoder output (key/value). A final linear
    layer produces one logit per position.

    Args:
        n_questions, n_responses, n_lagtimes, n_prev_elapsed: number of rows of
            the corresponding embedding tables.
        n_parts: the part embedding gets ``n_parts + 1`` rows.
        MAX_SEQ: number of positional embeddings, i.e. the longest supported
            sequence. Shorter sequences are accepted.
        d_model, nhead, dim_feedforward, dropout: forwarded to every
            ``TransformerLayer``; ``dropout`` is also applied to each embedding.
        num_layers: accepted but unused.
        device: stored on ``self.device`` and used to allocate ``self.pos_ids``.
    """

    def __init__(self,
                 n_questions,
                 n_parts,
                 n_responses,
                 n_lagtimes=14400,
                 n_prev_elapsed=121,
                 MAX_SEQ=100,

                 d_model=80,
                 nhead=5,
                 dim_feedforward=512,
                 num_layers=1,
                 dropout=0.1,
                 device='cpu'
                ):
        super(KTModel, self).__init__()
        self.device = device
        self.pos_ids = torch.arange(MAX_SEQ, device=device)

        self.pos_embedding = nn.Embedding(MAX_SEQ, d_model)
        self.q_embedding = nn.Embedding(n_questions, d_model)
        self.p_embedding = nn.Embedding(n_parts + 1, d_model)
        self.r_embedding = nn.Embedding(n_responses, d_model)
        self.lag_embedding = nn.Embedding(n_lagtimes, d_model)
        self.prev_elapsed_embedding = nn.Embedding(n_prev_elapsed, d_model)

        self.encoder_transformer_layer = TransformerLayer(d_model, nhead, dim_feedforward, dropout)
        self.decoder_transformer_layer = TransformerLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_layer = TransformerLayer(d_model, nhead, dim_feedforward, dropout)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, 1)

    def _module_device(self):
        """Device the embedding weights currently live on (follows ``.to(...)``)."""
        return self.pos_embedding.weight.device

    def _position_embedding(self, seq_len):
        """Dropout-regularised positional embeddings for the first ``seq_len`` positions."""
        # Slice to the real sequence length so inputs shorter than MAX_SEQ broadcast
        # correctly, and follow the module's device rather than the constructor string.
        pos_ids = self.pos_ids[:seq_len].to(self._module_device())
        return self.dropout(self.pos_embedding(pos_ids))

    def get_attention_mask(self, sz):
        """Boolean ``(sz, sz)`` causal mask: True strictly above the diagonal.

        True entries mark key positions a query may NOT attend to (future steps).
        """
        idx = torch.arange(sz, device=self._module_device())
        # mask[i, j] is True when j > i.
        return idx.unsqueeze(0) > idx.unsqueeze(1)

    def get_encoder_inputs(self, q, p):
        """Sum of position, question and part embeddings; ``q``/``p`` are (batch, seq)."""
        pos_embedd = self._position_embedding(q.size(1))
        q_embedd = self.dropout(self.q_embedding(q))
        p_embedd = self.dropout(self.p_embedding(p))
        encoder_in = pos_embedd + q_embedd + p_embedd
        return encoder_in

    def get_decoder_inputs(self, lag_times, prev_elapsed, r):
        """Sum of position, response, lag and previous-elapsed embeddings; inputs are (batch, seq)."""
        pos_embedd = self._position_embedding(r.size(1))
        r_embedd = self.dropout(self.r_embedding(r))
        lag_embedd = self.dropout(self.lag_embedding(lag_times))
        prev_elapsed_embedd = self.dropout(self.prev_elapsed_embedding(prev_elapsed))
        decoder_in = pos_embedd + r_embedd + lag_embedd + prev_elapsed_embedd
        return decoder_in

    def forward(self, q, p, lag_times, prev_elapsed, r):
        """Return per-position logits of shape (batch, seq).

        All five inputs are integer index tensors of shape (batch, seq) with
        ``seq <= MAX_SEQ``.
        """
        attn_mask = self.get_attention_mask(q.size(1))
        encoder_in = self.get_encoder_inputs(q, p)
        decoder_in = self.get_decoder_inputs(lag_times, prev_elapsed, r)

        # nn.MultiheadAttention (default layout) expects (seq, batch, d_model).
        encoder_in = encoder_in.permute(1, 0, 2)
        decoder_in = decoder_in.permute(1, 0, 2)

        encoder_out = self.encoder_transformer_layer(encoder_in, encoder_in, encoder_in, attn_mask=attn_mask)
        decoder_out = self.decoder_transformer_layer(decoder_in, decoder_in, decoder_in, attn_mask=attn_mask)

        # Cross-attention: response stream queries the exercise stream.
        y = self.transformer_layer(decoder_out, encoder_out, encoder_out, attn_mask=attn_mask)
        y = y.permute(1, 0, 2)
        yout = self.out(y).squeeze(-1)
        return yout

In [None]:
%%time
def train_epoch(model, optim, criterion, schedular=None):
    """Run one optimisation pass over the global ``train_dataloader``.

    Args:
        model: called as ``model(q, p, lag, prev_elapsed, r)``; must return one
            logit per position.
        optim: optimizer stepped once per batch.
        criterion: loss applied to the masked logits and float targets.
        schedular: optional LR scheduler, stepped once per batch when given.

    Returns:
        Mean of the per-batch loss values.
    """
    train_loss = []
    model.train()
    for q, p, r, lag, prev_elapsed, y, label_mask in train_dataloader:
        q = q.to(device)
        p = p.to(device)
        lag = lag.to(device)
        prev_elapsed = prev_elapsed.to(device)
        r = r.to(device)
        y = y.to(device)
        label_mask = label_mask.to(device)

        optim.zero_grad()
        yout = model(q, p, lag, prev_elapsed, r)

        # Keep only real (non-padded) positions. `.float()` converts the targets
        # in place on whatever device they already occupy, instead of picking a
        # legacy tensor type from a string comparison with 'cuda'.
        y = torch.masked_select(y, label_mask).float()
        yout = torch.masked_select(yout, label_mask)

        loss_ = criterion(yout, y)
        loss_.backward()
        optim.step()
        if schedular is not None:
            schedular.step()
        train_loss.append(loss_.item())
    return np.mean(train_loss)

def val_epoch(model, criterion):
    """Evaluate ``model`` on the global ``val_dataloader`` without gradients.

    Args:
        model: called as ``model(q, p, lag, prev_elapsed, r)``; must return one
            logit per position.
        criterion: loss applied to the masked logits and float targets.

    Returns:
        ``(mean_batch_loss, roc_auc)`` where the ROC-AUC is computed over all
        non-padded positions of the whole validation set.
    """
    val_loss = []
    ytrue = []
    ypred = []

    model.eval()
    for (q, p, r, lag, prev_elapsed, y, label_mask) in val_dataloader:
        q = q.to(device)
        p = p.to(device)
        r = r.to(device)
        lag = lag.to(device)
        prev_elapsed = prev_elapsed.to(device)
        y = y.to(device)
        # The mask must sit on the same device as the tensors it selects from;
        # leaving it on the CPU breaks masked_select when running on GPU.
        label_mask = label_mask.to(device)

        with torch.no_grad():
            yout = model(q, p, lag, prev_elapsed, r)
            # Keep only real (non-padded) positions; targets become float for the loss.
            y = torch.masked_select(y, label_mask).float()
            yout = torch.masked_select(yout, label_mask)
            loss_ = criterion(yout, y)
            val_loss.append(loss_.item())

            ytrue.extend(y.cpu().numpy())
            # Logits -> probabilities for the ROC computation.
            ypred.extend(torch.sigmoid(yout).cpu().numpy())
    roc_score = roc_auc_score(ytrue, ypred)
    return (np.mean(val_loss), roc_score)

def train_model(num_epochs, max_lr, model):
    """Train ``model`` for ``num_epochs`` epochs with Adam and a one-cycle LR schedule.

    A fresh optimizer, loss and scheduler are created on every call. After each
    epoch the model is validated; whenever the validation loss improves on the
    best value seen *within this call*, the weights are written to
    ``'sakt_saint.pth'`` (so a later call overwrites the file from an earlier one).

    Args:
        num_epochs: number of epochs; also sizes the one-cycle schedule.
        max_lr: Adam learning rate and peak learning rate of the schedule.
        model: the module to train (modified in place).

    Returns:
        ``(epoch_train_loss, epoch_val_loss)`` - two lists with one mean loss per epoch.
    """
    best_loss = None
    epoch_train_loss = []
    epoch_val_loss = []

    optim = torch.optim.Adam(model.parameters(), lr=max_lr)
    criterion = torch.nn.BCEWithLogitsLoss().to(device)

    # The scheduler is stepped once per batch inside train_epoch, hence steps_per_epoch.
    schedular = OneCycleLR(optim,
                           max_lr=max_lr,
                           steps_per_epoch=len(train_dataloader),
                           epochs=num_epochs)

    for i in range(num_epochs):
        start_time = time.time()
        train_loss_ = train_epoch(model, optim, criterion, schedular)
        (val_loss_, roc_score) = val_epoch(model, criterion)

        epoch_train_loss.append(train_loss_)
        epoch_val_loss.append(val_loss_)

        # Checkpoint on improvement of the validation loss.
        if best_loss is None or best_loss > val_loss_:
            best_loss = val_loss_
            torch.save(model.state_dict(), 'sakt_saint.pth')

        end_time = time.time()
        print("--------------------------------")
        # All three metrics use 4 decimals ('{:4f}' was a width, not a precision).
        print("Epoch:{} | Train Loss:{:.4f} | Val Loss:{:.4f} | Val ROC-Score:{:.4f}".format(i, train_loss_, val_loss_, roc_score))
        print('Epoch Time: {:.4f}'.format(end_time - start_time))
        print()
        gc.collect()
    return (epoch_train_loss, epoch_val_loss)

# Training Model

In [None]:
%%time

# Build the model from the notebook-level hyper-parameters and place it on `device`.
model = KTModel(
    n_questions,
    n_parts,
    n_responses,
    n_lagtimes=n_lagtimes,
    n_prev_elapsed=n_prev_elapsed,
    MAX_SEQ=MAX_SEQ,
    d_model=d_model,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    device=device,
).to(device)

# Two consecutive one-cycle runs on the same model:
# 15 epochs at max_lr=0.0025, then 5 more epochs at max_lr=0.002.
epoch_train_loss1, epoch_val_loss1 = train_model(num_epochs=15, max_lr=0.0025, model=model)
epoch_train_loss2, epoch_val_loss2 = train_model(num_epochs=5, max_lr=0.002, model=model)

In [None]:
# Loss curves of the first training run (one point per epoch).
plt.title('Train Loss')
plt.plot(epoch_train_loss1)  # was plotting the validation series under the train title
plt.show()

plt.title('Val Loss')
plt.plot(epoch_val_loss1)
plt.show()
print('Exit...')

In [None]:
# Loss curves of the second training run (one point per epoch).
plt.title('Train Loss')
plt.plot(epoch_train_loss2)  # was plotting the validation series under the train title
plt.show()

plt.title('Val Loss')
plt.plot(epoch_val_loss2)
plt.show()
print('Exit...')