In [None]:
import gc
import os
import time
import random
import numpy as np 
import pandas as pd

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

This notebook attempts to incorporate SAINT-style embeddings into a single transformer block.

Thanks to the following notebooks:
1.  https://www.kaggle.com/wangsg/a-self-attentive-model-for-knowledge-tracing/execution
2. https://www.kaggle.com/leadbest/sakt-with-randomization-state-updates

In [None]:
%%time
# Load the pre-pickled interaction log.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
train = pd.read_pickle('../input/riiid-trainpkl/riiid_train.pkl.gzip')

# Keep the rows whose content_type_id is False and only the three columns used
# below (presumably these are the question interactions, given the join with
# questions.csv that follows - confirm against the dataset description).
train = train.loc[
    train.content_type_id == False,
    ['user_id', 'content_id', 'answered_correctly'],
].copy()

# Attach the `part` of every question; questions.csv calls the key `question_id`.
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
questions_df = questions_df.rename(columns={'question_id': 'content_id'})
train = train.merge(questions_df[['content_id', 'part']], on='content_id', how='inner')

# The lookup table is no longer needed; release it before the heavy group-by.
del questions_df
gc.collect()

In [None]:
# Re-read the question metadata and display how many rows it contains.
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
questions_df.shape[0]

In [None]:
# Sizes of the embedding tables used by the model.
# They are derived from the largest id, not from the number of distinct ids:
# an embedding of size N only accepts indices 0..N-1, so counting distinct
# values under-sizes the table whenever the ids present in `train` are not
# contiguous (e.g. when working on a subsample).
n_questions = int(train.content_id.max()) + 1  # valid question indices: 0..max id
n_parts = int(train.part.max())                # the model allocates n_parts + 1 rows
n_responses = 3                                # 0, 1 and the value 2 written at sequence start

In [None]:
# Collapse the interaction log into one entry per user:
# (content ids, parts, answers) as three aligned numpy arrays.
group = train.groupby('user_id').apply(
    lambda user_rows: (
        user_rows.content_id.values,
        user_rows.part.values,
        user_rows.answered_correctly.values,
    )
)

# Drop users with 10 or fewer interactions.
group = group[group.apply(lambda sequences: len(sequences[0]) > 10)]

# The flat frame is not needed any more; free it.
del train
gc.collect()

In [None]:
class SAKTDataset(torch.utils.data.Dataset):
    """Fixed-length per-user interaction windows.

    ``group`` maps a user id to a tuple ``(content_ids, parts, answers)`` of
    aligned sequences and must expose ``.index`` (the user ids) and ``len()``.

    Every item is a tuple ``(q, p, r, y, label_mask)`` of 1-D tensors of
    length ``max_seq``:

    * ``q`` / ``p`` - content ids and parts of the window,
    * ``y``         - the answers of the window (the targets),
    * ``r``         - the answers shifted right by one step; the first slot
                      holds ``2`` when the window starts at the user's first
                      interaction,
    * ``label_mask``- ``True`` on positions that hold real data.

    Sequences shorter than ``max_seq`` are right-padded with zeros. Longer
    sequences are cropped: with probability 0.9 to a uniformly random window,
    otherwise to the first ``max_seq`` interactions.
    """

    def __init__(self,
                 group,
                 max_seq=100):
        super().__init__()
        self.max_seq = max_seq
        self.group = group
        self.user_ids = list(self.group.index)

    def __len__(self):
        return len(self.group)

    def __getitem__(self, idx):
        user_id = self.user_ids[idx]
        questions, parts, answers = self.group[user_id]

        seq_len = len(questions)
        window = self.max_seq

        questions = torch.as_tensor(questions, dtype=int)
        parts = torch.as_tensor(parts, dtype=int)
        answers = torch.as_tensor(answers, dtype=int)

        q, p, r, y = (torch.zeros(window, dtype=int) for _ in range(4))

        label_mask = torch.zeros(window, dtype=bool)
        label_mask[:seq_len] = True

        # Pick where the window starts. The random draws happen only for
        # sequences longer than the window, and in the order random() then
        # randint(), so a seeded run yields the same crops.
        if seq_len > window and random.random() >= 0.1:
            start = random.randint(0, seq_len - window)
        else:
            start = 0

        stop = min(start + window, seq_len)
        n_used = stop - start

        q[:n_used] = questions[start:stop]
        p[:n_used] = parts[start:stop]
        y[:n_used] = answers[start:stop]

        if start == 0:
            # No previous answer exists for the very first interaction.
            r[0] = 2
            r[1:n_used] = answers[:n_used - 1]
        else:
            # start > 0 only happens for long sequences, so the window is full.
            r[:] = answers[start - 1:stop - 1]

        return (q, p, r, y, label_mask)

In [None]:
# Hold out 20% of the per-user sequences for validation. The seed is fixed so
# that the same users land in the validation set on every run; without it the
# reported validation loss is not comparable between runs.
train, val = train_test_split(group, test_size=0.2, random_state=42)
print(train.shape, val.shape)

In [None]:
# Wrap both splits in datasets, then in batched loaders.
train_dataset = SAKTDataset(train)
val_dataset = SAKTDataset(val)

# Training batches are reshuffled every epoch.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2048,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)

# Validation batches keep the dataset order.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=2048,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)

# Model

In [None]:
class FFN(nn.Module):
    """Feed-forward sub-layer: Linear -> PReLU -> Dropout -> Linear -> Dropout.

    Input and output share the same last dimension ``emb_dim``.
    """

    def __init__(self, emb_dim):
        super().__init__()

        # Attribute names and creation order are kept as they are: they define
        # the state_dict keys and the order in which parameters are initialised.
        self.fc1 = nn.Linear(emb_dim, emb_dim)
        self.relu1 = nn.PReLU()  # despite the name, this is a PReLU activation
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(emb_dim, emb_dim)
        self.dropout2 = nn.Dropout(0.2)

    def forward(self, x):
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        return self.dropout2(self.fc2(hidden))


class SAKT(nn.Module):
    """Single-block self-attentive model over question / part / response sequences.

    The attention query is built from the question and part embeddings; keys
    and values additionally include the position and the (shifted) response
    embeddings. A causal mask keeps every position from attending to later
    positions.

    Args:
        n_questions: number of rows of the question embedding table.
        n_parts: largest part id; the part table has ``n_parts + 1`` rows.
        n_responses: number of rows of the response embedding table.
        device: default device for the position indices and for
            ``get_attention_mask`` when no device is passed explicitly.
        emb_dim: embedding width.
        model_dim: attention / feed-forward width. The embeddings are fed to
            the attention layer directly, so this must equal ``emb_dim``.
        num_heads: number of attention heads.
        max_seq: longest sequence the position embedding supports.
    """

    def __init__(self,
                 n_questions,
                 n_parts,
                 n_responses,
                 device='cpu',
                 emb_dim=128,
                 model_dim=128,
                 num_heads=8,
                 max_seq=100):

        super(SAKT, self).__init__()

        self.pos_idx = torch.arange(max_seq).to(device)
        self.n_questions = n_questions
        self.n_parts = n_parts
        self.n_responses = n_responses
        self.max_seq = max_seq
        self.device = device

        self.emb_dim = emb_dim
        self.model_dim = model_dim

        self.pos_embedding = nn.Embedding(max_seq, emb_dim)
        self.q_embedding = nn.Embedding(n_questions, emb_dim)
        self.p_embedding = nn.Embedding(n_parts + 1, emb_dim)
        self.r_embedding = nn.Embedding(n_responses, emb_dim)

        self.multihead_attn = nn.MultiheadAttention(model_dim, num_heads=num_heads, dropout=0.2)
        self.layernorm1 = nn.LayerNorm(model_dim)

        # dropout1 / dropout2 are not applied in forward(); they are kept so
        # the module layout stays unchanged.
        self.dropout1 = nn.Dropout(0.2)

        self.ffn = FFN(model_dim)
        self.layernorm2 = nn.LayerNorm(model_dim)

        self.dropout2 = nn.Dropout(0.2)
        self.out = nn.Linear(model_dim, 1)

    def get_attention_mask(self, s, device=None):
        """Return an ``(s, s)`` boolean causal mask.

        Entries above the main diagonal are ``True``, which
        ``nn.MultiheadAttention`` treats as "not allowed to attend".

        Args:
            s: sequence length.
            device: where to create the mask; defaults to ``self.device``.
        """
        if device is None:
            device = self.device
        # Built directly on the target device; triu is taken on a float tensor
        # and then cast, which works across PyTorch versions.
        return torch.ones(s, s, device=device).triu(diagonal=1).bool()

    def forward(self, q, p, r):
        """Score every position of a batch of sequences.

        Args:
            q, p, r: integer tensors of shape ``(batch, seq)`` with question
                ids, part ids and shifted responses; ``seq <= max_seq``.

        Returns:
            Tensor of shape ``(batch, seq)`` with one raw score (no sigmoid
            applied) per position.

        Raises:
            ValueError: if the sequence is longer than ``max_seq``.
        """
        seq_len = q.size(1)
        if seq_len > self.max_seq:
            raise ValueError(
                'sequence length {} exceeds max_seq={}'.format(seq_len, self.max_seq)
            )

        # Use only as many positions as the batch has, on the batch's device,
        # so shorter sequences and relocated models both work.
        positions = self.pos_idx[:seq_len].to(q.device)

        pos_embedd = self.pos_embedding(positions)
        q_embedd = self.q_embedding(q)
        p_embedd = self.p_embedding(p)
        r_embedd = self.r_embedding(r)

        query = q_embedd + p_embedd
        x = pos_embedd + q_embedd + p_embedd + r_embedd
        attn_mask = self.get_attention_mask(seq_len, device=q.device)

        # nn.MultiheadAttention expects (seq, batch, dim) here.
        query = query.permute(1, 0, 2)
        x = x.permute(1, 0, 2)

        attn_output, attn_weights = self.multihead_attn(query, x, x, attn_mask=attn_mask)
        attn_output = self.layernorm1(query + attn_output)

        ffn_out = self.ffn(attn_output)
        y = self.layernorm2(attn_output + ffn_out)

        # Back to (batch, seq, dim) before the per-position output layer.
        y = y.permute(1, 0, 2)
        yout = self.out(y).squeeze(-1)
        return yout

# Train the model

In [None]:
# Training schedule.
num_epochs = 35   # upper bound on the number of epochs
early_stop = 4    # patience counter consumed by the training loop below

# Model, optimiser (Adam with default settings) and loss on raw scores.
model = SAKT(n_questions, n_parts, n_responses, device=device)
model = model.to(device)
optim = torch.optim.Adam(params=model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

In [None]:
def train_epoch(model, optim, criterion):
    """Run one optimisation pass over ``train_dataloader``.

    Reads the module-level ``train_dataloader`` and ``device``.

    Args:
        model: callable as ``model(q, p, r)`` returning one score per position.
        optim: optimiser over the model's parameters.
        criterion: loss taking ``(scores, float targets)``.

    Returns:
        Mean of the per-batch losses.
    """
    train_loss = []
    model.train()
    for (q, p, r, y, label_mask) in train_dataloader:
        q = q.to(device)
        p = p.to(device)
        r = r.to(device)
        y = y.to(device)
        label_mask = label_mask.to(device)

        optim.zero_grad()
        yout = model(q, p, r)

        # Keep only the positions flagged by label_mask. `.float()` converts
        # the targets on whatever device they live on, so the CPU fallback
        # works as well as CUDA.
        y = torch.masked_select(y, label_mask).float()
        yout = torch.masked_select(yout, label_mask)

        loss_ = criterion(yout, y)
        loss_.backward()
        optim.step()
        train_loss.append(loss_.item())
    return np.mean(train_loss)

In [None]:
def val_epoch(model, criterion):
    """Evaluate the model on ``val_dataloader`` without updating it.

    Reads the module-level ``val_dataloader`` and ``device``.

    Args:
        model: callable as ``model(q, p, r)`` returning one score per position.
        criterion: loss taking ``(scores, float targets)``.

    Returns:
        Mean of the per-batch losses.
    """
    val_loss = []
    model.eval()
    for (q, p, r, y, label_mask) in val_dataloader:
        q = q.to(device)
        p = p.to(device)
        r = r.to(device)
        y = y.to(device)
        # The mask must sit on the same device as the tensors it selects from.
        label_mask = label_mask.to(device)

        with torch.no_grad():
            yout = model(q, p, r)
            # `.float()` converts on the current device, so this also runs on CPU.
            y = torch.masked_select(y, label_mask).float()
            yout = torch.masked_select(yout, label_mask)
            loss_ = criterion(yout, y)
            val_loss.append(loss_.item())
    return np.mean(val_loss)

In [None]:
%%time
# Per-epoch loss histories (plotted after training).
train_loss = []
val_loss = []

# Best validation loss seen so far; None until the first epoch finishes.
best_loss = None

for i in range(num_epochs):
    start_time = time.time()

    # `early_stop` is the remaining patience: stop once it is used up.
    if early_stop == 0:
        print('Early Stopping on the epoch:{}'.format(i))
        break

    train_loss_ = train_epoch(model, optim, criterion)
    val_loss_ = val_epoch(model, criterion)

    train_loss.append(train_loss_)
    # Record this epoch's value (not the history list itself).
    val_loss.append(val_loss_)

    if best_loss is None or val_loss_ < best_loss:
        # Improvement: reset the patience and checkpoint the weights.
        early_stop = 4
        best_loss = val_loss_
        torch.save(model.state_dict(), 'sakt.pth')
    else:
        # No improvement (this also covers a NaN validation loss).
        early_stop -= 1

    end_time = time.time()
    gc.collect()

    print('Epoch Time:', (end_time - start_time))
    print("Epoch:{} | Train Loss:{:.4f}".format(i, train_loss_))
    print("Epoch:{} | Val Loss:{:.4f}".format(i, val_loss_))
    print('-------')
    print()

In [None]:
# One figure per loss history, shown in order: training first, then validation.
for _title, _history in (('Train Loss', train_loss), ('Val Loss', val_loss)):
    plt.title(_title)
    plt.plot(_history)
    plt.show()