In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules import Linear
from torch.nn.parameter import Parameter
from torch.nn.init import xavier_uniform_
from torch.nn.init import constant_
from torch.nn.init import xavier_normal_
from typing import Optional, Tuple
import numpy as np
import sys
import regex as re
import io
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import math
import tensorflow as tf
import h5py
import copy
from torch.utils.data import DataLoader, Dataset
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from scipy.special import softmax
from copy import deepcopy
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

LOCAL = False
BASE_DIR = '../'

if not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'

sys.path.append(BASE_DIR + 'lib')
sys.path.append(BASE_DIR + 'config')
from preprocessing import process_sakt, save_h5
import dataset_parameters as params

# DATASET = 'akribian'
DATASET = 'assistments_2009'
# DATASET = 'junyi_academy'
# DATASET = 'ednet'

INPUT_DIR = BASE_DIR + 'data/' + DATASET + '/raw/'
OUTPUT_DIR = BASE_DIR + 'data/' + DATASET + '/processed/sakt/'

TIME_STEPS = params.time_steps_dict[DATASET]

IN_DATA_PATH = BASE_DIR + 'data/' + DATASET + '/processed/sakt/' 
MODEL_PATH = BASE_DIR + 'models/sakt/' 

# Training parameters
BATCH_SIZE = 64
LATENT_DIM = 256 # latent dimension
PADDING = 0
SHUFFLE = True
TIME_STEPS = params.time_steps_dict[DATASET]
#EPOCHS = params.epochs_dict[DATASET]
EPOCHS = 1
NUMBER_OF_EXERCISES = params.exercise_dict[DATASET]
ROWS_PER_READ = 1000000

EARLY_STOPPING_TOLERANCE = 3

# File paths
INPUT_PATH = IN_DATA_PATH + "processed.h5"

Mounted at /content/drive


## Self attention module

In [2]:
def future_mask(seq_length):
    future_mask = np.triu(np.ones((1, seq_length, seq_length)), k=1).astype('bool')
    return torch.from_numpy(future_mask)


def clone(module, num):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num)])

# def relative_attention(query, key, value, rel, l1, l2, pos_key_embeds, pos_value_embeds, mask=None, dropout=None):
def relative_attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot product attention with relative position embeddings.
    (https://arxiv.org/pdf/1803.02155.pdf)
    """
#    assert pos_key_embeds.num_embeddings == pos_value_embeds.num_embeddings

    print("query")
    print(query.shape)
    print("key")
    print(key.shape)    
    print("key transposed")
    print(key.transpose(-2, -1).shape)    
    scores = torch.matmul(query, key.transpose(-2, -1))
    print("scores")
    print(scores.shape)    

    # idxs = torch.arange(scores.size(-1))
    # if query.is_cuda:
    #     idxs = idxs.cuda()
    # idxs = idxs.view(-1, 1) - idxs.view(1, -1)
    # idxs = torch.clamp(idxs, 0, pos_key_embeds.num_embeddings - 1)

    # pos_key = pos_key_embeds(idxs).transpose(-2, -1)
    # pos_scores = torch.matmul(query.unsqueeze(-2), pos_key)
    scores = scores.unsqueeze(-2)
    scores = scores / math.sqrt(query.size(-1))

    # pos_value = pos_value_embeds(idxs)
    value = value.unsqueeze(-3)

    print("scores")
    print(scores.shape)
    print("mask")
    print(mask.shape)
    if mask is not None:
        scores = scores.masked_fill(mask.unsqueeze(-2), -1e9)
    prob_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        prob_attn = dropout(prob_attn)

    output = torch.matmul(prob_attn, value).unsqueeze(-2)
    prob_attn = prob_attn.unsqueeze(-2)
    return output, prob_attn

In [3]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, head_size, num_heads, drop_prob):
        super(MultiHeadedAttention, self).__init__()
        self.head_size = head_size
        self.num_heads = num_heads
        # W^Q, W^K, W^V
        self.linear_layers = clone(nn.Linear(head_size, head_size), 3)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, query, key, value, mask=None):
        batch_size = query.shape[1]
        seq_length = query.shape[0]
        print("batch size %i time steps %i" % (batch_size, seq_length))
        print("query looks like this")
        print(query.shape)
        print(query)

        # Apply mask to all heads
        if mask is not None:
            mask = mask.unsqueeze(1)

        # Project inputs
        #rel = rel.unsqueeze(1).repeat(1,self.num_heads,1,1)
        #timestamp = timestamp.unsqueeze(1).repeat(1,self.num_heads,1,1)
        query, key, value = [l(x).view(batch_size, seq_length, self.num_heads, self.head_size).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # Apply attention
        out, self.prob_attn = relative_attention(query, key, value, mask, self.dropout)

        out = out.transpose(1, 2).contiguous().view(batch_size, seq_length, self.head_size)
        return out, self.prob_attn

# Model class

In [4]:
class sakt(nn.Module):  
    def __init__(self , ex_total , seq_len, dim, heads, dout ):
        super(sakt, self).__init__()
        self.seq_len = seq_len
        self.dim = dim
        embedding_dim = dim

        self.embd_in = nn.Embedding( 2*ex_total+1, embedding_dim  = embedding_dim ).cuda()         # Interaction embedding
        self.embd_ex = nn.Embedding( ex_total+1 , embedding_dim = embedding_dim ).cuda()       # Excercise embedding
        self.embd_pos = nn.Embedding( seq_len , embedding_dim = embedding_dim ).cuda()

        self.linear = nn.ModuleList( [nn.Linear(in_features= embedding_dim , out_features= dim ).cuda() for x in range(3)] ).cuda()   # Linear projection for each embedding
        self.attn = MultiHeadedAttention(head_size= dim , num_heads= heads, drop_prob= dout ).cuda()
        self.ffn = nn.ModuleList([nn.Linear(in_features= dim , out_features=dim, bias= True).cuda() for x in range(2)]).cuda()  # feed forward layers post attention

        self.linear_out = nn.Linear(in_features= dim , out_features= 1 , bias=True).cuda()
        self.layer_norm1 = nn.LayerNorm( dim ).cuda()
        self.layer_norm2 = nn.LayerNorm( dim ).cuda()                           # output with correctnness prediction 
        self.drop = nn.Dropout(dout).cuda()

    def forward( self , input_in , input_ex):

        ## positional embedding
        pos_in = self.embd_pos( torch.arange(self.seq_len).unsqueeze(0) )         #making a tensor of 12 numbers, .unsqueeze(0) for converting to 2d, so as to get a 3d output #print('pos embd' , pos_in.shape)

        ## get the interaction embedding output
        out_in = self.embd_in( input_in )                         # (b, n) --> (b,n,d)
        out_in = out_in + pos_in

        ## split the interaction embeding into v and k ( needs to verify if it is slpited or not)
        value_in = out_in
        key_in   = out_in                                         #print('v,k ', value_in.shape)
        
        ## get the excercise embedding output
        query_ex = self.embd_ex( input_ex )                       # (b,n) --> (b,n,d) #print(query_ex.shape)
        
        ## Linearly project all the embedings
        value_in = self.linear[0](value_in).permute(1,0,2)        # (b,n,d) --> (n,b,d)
        key_in = self.linear[1](key_in).permute(1,0,2)
        query_ex =  self.linear[2](query_ex).permute(1,0,2)

        ## pass through multihead attention
                       #forward(query, key, value, rel, l1, l2, timestamp, encode_pos, pos_key_embeds, pos_value_embeds, mask=None):
        atn_out , _ = self.attn(query_ex , key_in, value_in, mask= torch.from_numpy( np.triu(np.ones((self.seq_len ,self.seq_len)), k=1).astype('bool')).cuda() )      # lower triangular mask, bool, torch    (n,b,d)
        atn_out = query_ex + atn_out                                  # Residual connection ; added excercise embd as residual because previous ex may have imp info, suggested in paper.
        atn_out = self.layer_norm1( atn_out )                          # Layer norm                        #print('atn',atn_out.shape) #n,b,d = atn_out.shape

        #take batch on first axis 
        atn_out = atn_out.permute(1,0,2)                              #  (n,b,d) --> (b,n,d)
        
        ## FFN 2 layers
        ffn_out = self.drop(self.ffn[1]( nn.ReLU()( self.ffn[0]( atn_out ) )))   # (n,b,d) -->    .view([n*b ,d]) is not needed according to the kaggle implementation
        ffn_out = self.layer_norm2( ffn_out + atn_out )                # Layer norm and Residual connection

        ## sigmoid
        ffn_out = torch.sigmoid(self.linear_out( ffn_out )  )

        return ffn_out
          
def randomdata():
    input_in = torch.randint( 0 , 49 ,(64 , 25) )
    return input_in, input_in



In [5]:
print("CUDA available: %i" % torch.cuda.is_available())
cuda = torch.device("cuda:0")
torch.set_default_tensor_type('torch.cuda.FloatTensor')

CUDA available: 1


# Model training

In [6]:
## Training the model
print_freq = 10
graph_freq = 10

model = sakt( ex_total= NUMBER_OF_EXERCISES, seq_len= TIME_STEPS, dim= LATENT_DIM, heads= 8, dout= 0.2 )
model = model.cuda()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

losses_history = []

val_aucs = []

previous_auc = 0.0

no_improvement_streak = 0
best_auc = 0.0

h5_file = h5py.File(INPUT_PATH,'r')

# labels_val = torch.from_numpy(h5_file['labels_val'][:]).flatten()

number_of_train_windows = h5_file['exercises_val'].shape[0]
number_of_val_windows = h5_file['exercises_val'].shape[0]

for epoch in range(EPOCHS):
  print("Epoch %i" % (epoch+1))
  total_loss_print = 0.0
  total_loss_graph = 0.0

  for read_index in range(0, number_of_train_windows, ROWS_PER_READ):

    exercises_train = h5_file['exercises_train'][read_index:read_index+ROWS_PER_READ]
    interactions_train = h5_file['interactions_train'][read_index:read_index+ROWS_PER_READ]
    labels_train = h5_file['labels_train'][read_index:read_index+ROWS_PER_READ]

    for idx in range(0, len(exercises_train), BATCH_SIZE):
      # Retrieve data
      ex_samples = torch.from_numpy(exercises_train[idx:idx + BATCH_SIZE]).long().cuda()
      in_samples = torch.from_numpy(interactions_train[idx:idx + BATCH_SIZE]).long().cuda()
      label = torch.from_numpy(labels_train[idx:idx + BATCH_SIZE]).long().cuda()
      label = torch.unsqueeze(label, 2)

      # Reset optimizer
      optimizer.zero_grad()

      # Predict
      out = model(in_samples, ex_samples)

      # Calculate loss 
      loss = criterion(out, label.float())
      total_loss_print += loss.item()
      total_loss_graph += loss.item()

      # Optimize
      loss.backward()
      optimizer.step()

      idx_discrete = int(idx / BATCH_SIZE)

      # Print status
      if (idx_discrete % print_freq == print_freq - 1):
        avg_loss = total_loss_print/print_freq
        sys.stdout.write("\rIteration %i, avg loss %f" % (idx+1, avg_loss))
        sys.stdout.flush()
        total_loss_print = 0.0

      # Append to graph
      if (idx_discrete % graph_freq == graph_freq - 1):
        avg_loss = total_loss_graph/graph_freq
        losses_history.append(avg_loss)
        total_loss_graph = 0.0
          
  # Validation step
  # Set evaluation mode
  outs_preds = np.array([])
  model.eval()
  # Turn off grad calculation
  with torch.no_grad():
    # Iterate through val data
    for read_index in range(0, number_of_val_windows, ROWS_PER_READ):
      exercises_val = h5_file['exercises_val'][read_index:read_index+ROWS_PER_READ]
      interactions_val = h5_file['interactions_val'][read_index:read_index+ROWS_PER_READ]
      for idx in range(0, len(exercises_val), BATCH_SIZE):
        in_samples = torch.from_numpy(interactions_val[idx:idx + BATCH_SIZE]).long().cuda()
        ex_samples = torch.from_numpy(exercises_val[idx:idx + BATCH_SIZE]).long().cuda()
        outs_pred = model(in_samples, ex_samples)
        outs_pred_flattened = torch.flatten(outs_pred)
        outs_pred_flattened = outs_pred_flattened.cpu().detach().numpy()
        outs_preds = np.append(outs_preds, outs_pred_flattened)
  # Revert to training mode
  model.train()

  labels_val = h5_file['labels_val'][:].flatten()
  auc_score = roc_auc_score(labels_val, outs_preds)
  val_aucs.append(auc_score)
  print("\nValidation AUC: %f" % auc_score)
  auc_diff = auc_score - previous_auc
  if epoch != 0:
    print("AUC difference: %f" % auc_diff)

  if auc_score < best_auc:
    no_improvement_streak += 1
  else:
    no_improvement_streak = 0
    best_auc = auc_score
    torch.save(model.state_dict(), MODEL_PATH + DATASET + ".torch")
    print("Model saved")

  if no_improvement_streak == EARLY_STOPPING_TOLERANCE:
    break

  previous_auc = auc_score

h5_file.close()

Epoch 1
batch size 64 time steps 120
query looks like this
torch.Size([120, 64, 256])
tensor([[[ 0.6667, -0.0163, -0.4275,  ..., -0.4787,  0.3237,  0.0768],
         [-0.2487,  0.2518,  0.2651,  ..., -0.3347, -0.2110,  0.6179],
         [-0.6471,  0.7694,  0.2249,  ..., -0.0469, -0.0738, -1.1035],
         ...,
         [-0.6471,  0.7694,  0.2249,  ..., -0.0469, -0.0738, -1.1035],
         [ 0.0825, -0.1178,  0.5553,  ...,  0.3893,  0.3784,  0.3777],
         [-0.3111,  0.0278, -1.1743,  ..., -0.0267,  0.0021,  0.8743]],

        [[ 0.6667, -0.0163, -0.4275,  ..., -0.4787,  0.3237,  0.0768],
         [-0.2487,  0.2518,  0.2651,  ..., -0.3347, -0.2110,  0.6179],
         [-0.6471,  0.7694,  0.2249,  ..., -0.0469, -0.0738, -1.1035],
         ...,
         [-0.6261, -0.7074, -0.4572,  ..., -0.0521,  0.6354, -0.2275],
         [ 0.0825, -0.1178,  0.5553,  ...,  0.3893,  0.3784,  0.3777],
         [-0.3111,  0.0278, -1.1743,  ..., -0.0267,  0.0021,  0.8743]],

        [[ 0.6667, -0.0163, -0

RuntimeError: ignored

In [None]:
plt.figure(1)
plt.plot(losses_history)

plt.figure(2)
plt.plot(val_aucs)

In [None]:
outs_preds = np.array([])

# predict iteratively in batches to avoid overloading VRAM
h5_file = h5py.File(INPUT_PATH,'r')
number_of_test_windows = h5_file['exercises_test'].shape[0]
for read_index in range(0, number_of_test_windows, ROWS_PER_READ):
  exercises_test = h5_file['exercises_test'][read_index:read_index+ROWS_PER_READ]
  interactions_test = h5_file['interactions_test'][read_index:read_index+ROWS_PER_READ]
  for idx in range(0, len(exercises_test), BATCH_SIZE):
    ex_samples = torch.from_numpy(exercises_test[idx:idx + BATCH_SIZE]).long().cuda()
    in_samples = torch.from_numpy(interactions_test[idx:idx + BATCH_SIZE]).long().cuda()
    outs_pred = model(in_samples, ex_samples)
    outs_pred = outs_pred.flatten().cpu().detach().numpy()
    outs_preds = np.append(outs_preds, outs_pred)
outs_test_flattened = h5_file['labels_test'][:].flatten()
h5_file.close()
print(outs_preds.shape)
print(outs_test_flattened.shape)

In [None]:
fpr_keras, tpr_keras, thresholds_keras = roc_curve(outs_test_flattened, outs_preds)
auc_keras = roc_auc_score(outs_test_flattened, outs_preds)
#auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(3, figsize=(12,8), dpi=80)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='ROC (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()


In [None]:
h5_file = h5py.File(INPUT_PATH,'r')


index = 80
img_width = TIME_STEPS
max_width = 50

ex_samples = h5_file['exercises_test'][index]
ground_truth = outs_test_flattened[index*TIME_STEPS:(index+1)*TIME_STEPS]
pred = outs_preds[index*TIME_STEPS:(index+1)*TIME_STEPS]


# Define image
img = np.ndarray((3, img_width))
img = img[:, :max_width]

# Set category color
ex_color = ex_samples
color_dict = dict(enumerate(list(set(ex_color))))
color_dict = {v:k for k,v in color_dict.items()}
colors = list(map(lambda x: (color_dict[x] + 1)/len(color_dict.keys()), ex_color))
colors = colors[:max_width]
img[0,:] = colors

# Set ground truth color
ground_truth = ground_truth[:max_width]
img[1,:] = ground_truth.flatten()

# Set prediction color
pred = pred[:max_width]
img[2,:] = pred

# Show the figure
plt.figure(4, figsize=(12,8), dpi=100)
plt.imshow(img)
plt.text(max_width, 0.15, "Category")
plt.text(max_width, 1.15, "Ground truth")
plt.text(max_width, 2.15, "Prediction")

# Index
print("index index: %i" % index)

h5_file.close()