In [1]:
# Upgrade scikit-learn to the newest release available on PyPI.
# NOTE(review): the version is not pinned; the recorded run below resolved to 0.24.0 (not 0.23).
!pip install -U scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/30/aa/db462d385c56905b731403885454188683f63c86ea68900f6f7e7558b5fa/scikit_learn-0.24.0-cp36-cp36m-manylinux2010_x86_64.whl (22.2MB)
[K     |████████████████████████████████| 22.2MB 1.4MB/s 
[?25hCollecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.0 threadpoolctl-2.1.0


In [2]:
# Imports
import pandas as pd
import numpy as np
import os
import shutil
from collections import Counter
import matplotlib.pyplot as plt
import time
import gc
import sys
import random
import pickle

from scipy.stats import iqr
from sklearn import preprocessing

import datetime
import logging

# Deep learning
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

from IPython.display import display
# Set pandas options
pd.options.display.max_rows = 2000

In [3]:
# Global variables 
# THR_E is the window length: sequences are cut/padded to it further down and it is
# stored in the model config as "window_size".
THR_E = 100 # interaction threshold of exercises (E) for user
BATCH_SIZE = 128
TRAIN_FRACTION = 0.95 # share of users (not rows) that goes to the training split

AUTO = tf.data.experimental.AUTOTUNE
EPOCHS = 180
N_SELECT_PER_EPOCH = 100000 # Random select N samples for each epoch from train/val set
VAL_EVERY_N_EPOCHS = 5 # validation and checkpointing only happen on every N-th epoch
PRINT_EVERY_N_BATCHES = 50 
zero_task_etc = True # forwarded as `zero_task` to return_etc and recorded in the config

SAVE_DICTS = True # when True, the last THR_E entries of each selected feature are pickled

In [4]:
# SEED THE EXPERIMENTS
# NumPy drives the per-epoch user sampling, but Python's `random` and TensorFlow
# (e.g. the random window offset drawn with tf.random.uniform) use their own
# generators, so all three are seeded to make runs repeatable.
SEED = 18
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
# Output folder is named "<day>-<month abbreviation><experiment tag>", e.g. 29-Dec-riiid-1
experiment = "-riiid-1"
date = datetime.datetime.today().strftime("%d-%b")
OUTPUT_FOLDER = "".join([date, experiment])
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [6]:
# Selected features
# The exercise ids ("E") and the responses ("r") must come first, in that order:
# the training loop reads them positionally.
selections = ["E", "r", "etc", "ltg", "at", "ra"]
assert selections[0] == "E" and selections[1] == "r"

In [7]:
# Choose model setting
# These four lists are handed to the model through `config`
# ("enc_emb", "enc_dense", "dec_emb", "dec_dense").
# NOTE(review): the integers look like positions into `selections`
# (0="E", 4="at" for the encoder; 1="r", 2="etc", 3="ltg", 5="ra" for the decoder)
# — confirm against the model notebook.
ENC_EMB, ENC_DENSE = [0, 4], [] 
DEC_EMB, DEC_DENSE = [1, 2, 3, 5], []

In [8]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

FOLDER_FEATHER = "/content/drive/My Drive/kaggle-riiid/feather-files"
# The notebook paths are consumed by `%run $VAR`, which needs the spaces
# backslash-escaped. Raw strings keep that backslash literally instead of relying
# on the invalid "\ " escape sequence (a DeprecationWarning on Python 3.6+).
PREPROCESS_FILE = r"/content/drive/My\ Drive/Colab\ Notebooks/demo-riiid-preprocessing.ipynb"
MODEL_FILE = r"/content/drive/My\ Drive/Colab\ Notebooks/demo-riiid-transformer.ipynb" # https://stackoverflow.com/questions/57464810/how-to-run-a-jupyter-notebook-with-space-in-relative-path-from-another-notebook
# MODEL_FILE = r"/content/drive/My\ Drive/Colab\ Notebooks/riiid-functional-transformer.ipynb" # https://stackoverflow.com/questions/57464810/how-to-run-a-jupyter-notebook-with-space-in-relative-path-from-another-notebook


Mounted at /content/drive


In [9]:
# Add functions to preprocess data
# NOTE(review): helpers used below but not defined in this notebook (create_logging,
# read_df_print and the return_* feature builders) presumably come from this run — confirm.
%run $PREPROCESS_FILE

In [10]:
# Create logger
# NOTE(review): this rebinds the name `logging`, shadowing the stdlib module imported
# at the top; later cells call `logging.info(...)` on the object returned here.
logging = create_logging(OUTPUT_FOLDER)

In [11]:
%%time
# Read all dataframes from feather files and print out
train = read_df_print(os.path.join(FOLDER_FEATHER, "train.feather")) 
questions = read_df_print(os.path.join(FOLDER_FEATHER, "questions.feather"))
lectures = read_df_print(os.path.join(FOLDER_FEATHER, "lectures.feather"))

(101230332, 10)


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False


(13523, 5)


Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92


(418, 4)


Unnamed: 0,lecture_id,tag,part,type_of
0,89,24584,5,concept
1,100,22243,1,concept
2,185,7035,6,concept


CPU times: user 1.62 s, sys: 1.92 s, total: 3.54 s
Wall time: 29.3 s


In [12]:
train["content_type_id"].sum()

1959032

In [13]:
# Preprocess for lectures
if ("v" in selections) or ("l" in selections) or ("vc" in selections):
  
  # Re-index the raw lecture ids to dense category codes
  lectures["lecture_id"] = pd.Categorical(lectures["lecture_id"])
  lectures["lec_id"] = lectures["lecture_id"].cat.codes
  lec_map = dict(zip(lectures["lecture_id"], lectures["lec_id"]))

  train.loc[train["content_type_id"]==1, "content_id"] = train.loc[train["content_type_id"]==1, "content_id"].map(lec_map)

  # "v": was the previous interaction a lecture?  "l": which lecture was it?
  # The shift has to stay inside each user: a frame-wide shift(1) hands the last row
  # of one user to the first row of the next user.
  # 418 is used as the "no lecture" code (same value as N_l, set in a later cell).
  train["v"] = train.groupby("user_id")["content_type_id"].shift(1).fillna(0).astype(np.int8)
  train["l"] = train.groupby("user_id")["content_id"].shift(1).fillna(418).astype(np.int32)
  train.loc[train["v"]==0, "l"] = 418
  # "vc": running count per user of interactions that directly followed a lecture
  train["vc"] = train.groupby("user_id")["v"].cumsum()

In [14]:
# Lecture rows are no longer needed: keep the question interactions only
train = train[train["content_type_id"].eq(0)]

29-12 18:24 numexpr.utils INFO     NumExpr defaulting to 4 threads.


In [15]:
# Save space by deleting some columns
del train["user_answer"]

In [16]:
# Sizes taken from the question metadata
N_questions = questions["question_id"].nunique()
N_parts = questions["part"].nunique()

# Hand-set sizes; they feed the padding values and vocabulary sizes defined further down
N_response = 2
N_task = 2
N_lag = 1440
N_et = 301
N_groups = 10
N_attempt = 8
N_avg = 101 # N_avg = 100 for regular?
N_ltg = 400
N_l = 418

In [17]:
# Number of rows per user, listed in the order users first appear in `train`. USED FOR TRAINING
users = train["user_id"].unique()
seq_len_dict = dict(train["user_id"].value_counts())
seq_lens = [seq_len_dict[user_id] for user_id in users]

In [18]:
# Train/valid split: the first TRAIN_FRACTION of `users` trains, the remainder validates
train_len = int(TRAIN_FRACTION * len(users))
val_len = len(users) - train_len
train_users = users[:train_len]
val_users = users[train_len:]

# Sampling probability of each user, proportional to its sequence length
seq_len_train = seq_lens[:train_len]
seq_len_val = seq_lens[train_len:]
PROBS_TRAIN = np.asarray(seq_len_train) / np.sum(seq_len_train)
PROBS_VAL = np.asarray(seq_len_val) / np.sum(seq_len_val)

In [19]:
len(train_users), len(val_users)

(373973, 19683)

In [20]:
train["prior_question_had_explanation"].isnull().sum() # Less than users - why?

392506

## Add all inputs/outputs

In [21]:
# Every optional feature starts out as its own empty list, so the lookup tables
# further down can be built even for features that are not selected.
(p_lists, lt_lists, et_lists, tag_lists, task_lists,
 quantile_transformer_et, lt_cat, quantile_transformer_lt, et_cat,
 ltc, etc, ex, etg, ra, ca, at, r_dup,
 ltg, ltg_bins, v_lists, l_lists) = ([] for _ in range(21))

In [22]:
if "ra" in selections:
    %time ra = return_r_avg(add_start_token=True, N_avg = N_avg)

CPU times: user 33.2 s, sys: 606 ms, total: 33.8 s
Wall time: 33 s


In [24]:
# Add exercises
%time E_lists = return_E()

CPU times: user 22.1 s, sys: 1.26 s, total: 23.3 s
Wall time: 23.3 s


In [25]:
# Add results, with or without start token
%time r_lists = return_r(add_start_token=True)

CPU times: user 26.8 s, sys: 253 ms, total: 27 s
Wall time: 27 s


In [26]:
if "l" in selections:
  %time l_lists = return_l()
  del train["l"], train["v"], train["vc"]
if "v" in selections:
    %time v_lists = return_v()
    del train["l"], train["v"], train["vc"]

In [27]:
if "at" in selections:
    %time at = return_attempt()

CPU times: user 56.8 s, sys: 5.66 s, total: 1min 2s
Wall time: 1min 2s


In [28]:
# Lag time grouped
if "ltg" in selections:
    %time ltg, ltg_bins, N_ltg = return_ltg(N_ltg)    
print(N_ltg)

CPU times: user 39.7 s, sys: 255 ms, total: 39.9 s
Wall time: 39.3 s
173


In [29]:
if "r_dup" in selections:
   %time r_dup = r_lists.apply(lambda x: x[:-1])

In [30]:
if "ca" in selections:
    %time ca = return_ca()

In [31]:
if "ex" in selections:
    %time ex = return_ex()

In [32]:
bins = list(range(30)) + [32, 35, 40, 48, 60, 80, 120, 299, 300]
if "etg" in selections:
    %time etg = return_etg(bins)
    # %time etg = return_etg(N_groups)


In [33]:
if "ltc" in selections:
    %time ltc = return_ltc()

In [34]:
if "lt" in selections:
    %time lt_lists, quantile_transformer_lt = return_lt()

In [35]:
if "etc" in selections:
    %time etc = return_etc(zero_task=zero_task_etc)

CPU times: user 34 s, sys: 604 ms, total: 34.6 s
Wall time: 34.4 s


In [36]:
if "p" in selections:
    %time p_lists = return_p()

In [37]:
if "et" in selections:
    %time et_lists, quantile_transformer_et = return_et()

In [38]:
if "tag" in selections:
    %time tag_lists = return_N_highest_tags() # TODO: specify N tags

In [39]:
if "task" in selections:
    %time task_lists = return_task_binary()

In [40]:
# Lookup tables keyed by feature name; only the keys listed in `selections` are read later.
# All values are evaluated right here, so e.g. N_ltg must already hold the value
# returned by return_ltg when this cell runs.

# feature name -> per-user sequences built above (unselected features keep their empty-list placeholder)
feature_mapping = {"E": E_lists, "r": r_lists, "p": p_lists, "et": et_lists, "lt": lt_lists, "tag": tag_lists, "task": task_lists, "et_std": et_lists, "ltc": ltc, "etc": etc, "ex": ex, "etg": etg, "ra": ra, "at": at, "ca": ca, "r_dup": r_dup, "v": v_lists, "ltg": ltg, "l": l_lists}
# feature name -> tensor dtype (collected into OUTPUT_TYPES for tf.data)
type_mapping = {"E": tf.int32, "r": tf.int32, "p": tf.int32, "et": tf.float32, "et_std": tf.float32, "lt": tf.float32, "tag": tf.float32, "task": tf.float32, "ltc":tf.int32, "etc": tf.int32, "ex": tf.int32, "etg": tf.int32, "ra": tf.int32, "at": tf.int32, "ca": tf.int32, "r_dup": tf.int32, "v": tf.int32, "ltg": tf.int32, "l": tf.int32}
# feature name -> value used to pad short sequences (collected into PADDING_VALUES)
pad_mapping = {"E": N_questions, "r": N_response, "p": 0, "et": 0.5, "et_std": 0.0, "lt": 0.5, "tag": 2.0, "task": float(N_task), "ltc": N_lag+1, "etc": N_et+1, "ex": 2, "etg": len(bins)+1, "ra": N_avg+1, "at": N_attempt, "ca": N_avg+1, "r_dup": N_response, "v": N_response, "ltg": N_ltg+1, "l": N_l+1}
# feature name -> vocabulary size passed to the model as config["vocab_sizes"]
vocab_mapping = {"E": N_questions+1, "r": N_response+2, "et":None, "p": N_parts+1, "lt": None, "tag": 3.0, "task": float(N_task+1), "et_std": float(300), "ltc": N_lag+2, "etc": N_et+2, "ex": 4, "etg": len(bins)+2, "ra": N_avg+2, "at": N_attempt+1, "ca": N_avg+2, "r_dup": N_response+2, "v": N_response+1, "ltg": N_ltg+2, "l": N_l+2}
# feature name -> padded sequence length; "r" is one longer because it is built with a start token
pad_shapes = {"E": [THR_E], "r": [THR_E+1], "p": [THR_E], "et": [THR_E], "et_std": [THR_E],  "lt": [THR_E], "tag": [THR_E], "task": [THR_E], "ltc": [THR_E], "etc": [THR_E], "ex": [THR_E], "etg": [THR_E], "ra": [THR_E], "at": [THR_E], "ca": [THR_E], "r_dup": [THR_E], "v": [THR_E], "ltg": [THR_E], "l": [THR_E]}

In [41]:
def save_as_dict(df, filename):
    """Pickle ``df.to_dict()`` into the folder named by the global ``OUTPUT_FOLDER_DICT``.

    Args:
        df: object exposing ``to_dict()`` (the feature Series in this notebook).
        filename: file name to create inside ``OUTPUT_FOLDER_DICT``.
    """
    payload = df.to_dict()
    target_path = os.path.join(OUTPUT_FOLDER_DICT, filename)
    with open(target_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
# Persist the most recent THR_E steps of every selected feature, one pickle per feature.
if SAVE_DICTS:
  OUTPUT_FOLDER_DICT = f"seq-dict-{THR_E}"
  os.makedirs(OUTPUT_FOLDER_DICT, exist_ok=True)

  for feature in selections:
      feat_df = feature_mapping[feature]
      feat_df = feat_df.apply(lambda x: x[-THR_E:]) # Take last THR_E in history 
      filename = f"{feature}.pickle"
      save_as_dict(feat_df, filename)

  # shutil.move refuses to overwrite an existing destination directory, so a re-run
  # of this cell used to fail; drop the stale copy from a previous run first.
  moved_dict_folder = os.path.join(OUTPUT_FOLDER, OUTPUT_FOLDER_DICT)
  if os.path.isdir(moved_dict_folder):
      shutil.rmtree(moved_dict_folder)
  shutil.move(OUTPUT_FOLDER_DICT, OUTPUT_FOLDER)

In [43]:
# # Add start token!
# # Answers
# r_lists = r_lists.apply(lambda x: [3] + x) # Start token = 3
# feature_mapping["r"] = r_lists

In [44]:
# Running answers
# NOTE(review): this cell is not idempotent — each execution strips one more trailing
# element from every `ra` sequence, so it must run exactly once per kernel session.
# The pickles written by the SAVE_DICTS cell above were taken before this trim.
if "ra" in selections:
    ra = ra.apply(lambda x: x[:-1]) # For training => we don't have last value of average
    feature_mapping["ra"] = ra

In [45]:
# Resolve the selected feature names against the lookup tables, keeping `selections` order
vocab_sizes = [vocab_mapping[name] for name in selections]
feature_lists = [feature_mapping[name] for name in selections]

In [46]:
# Per-feature tf.data settings, aligned position by position with `selections`
PADDING_VALUES = tuple(pad_mapping[name] for name in selections)
OUTPUT_TYPES = tuple(type_mapping[name] for name in selections)
PADDED_SHAPES = tuple(pad_shapes[name] for name in selections)

In [47]:
feature_lists

[user_id
 115           [5692, 5716, 128, 7860, 7922, 156, 51, 50, 789...
 124           [7900, 7876, 175, 1278, 2064, 2063, 2065, 3364...
 2746          [5273, 758, 5976, 236, 404, 382, 405, 873, 531...
 5382          [5000, 3944, 217, 5844, 5965, 4990, 5235, 6050...
 8623          [3915, 4750, 6456, 3968, 6104, 5738, 6435, 549...
                                     ...                        
 2147470770    [7900, 7876, 175, 1278, 2064, 2065, 2063, 3363...
 2147470777    [7900, 7876, 175, 1278, 2065, 2064, 2063, 3365...
 2147481750    [4137, 1270, 9261, 8201, 367, 378, 214, 6071, ...
 2147482216    [3748, 4765, 5474, 9261, 4665, 5987, 6666, 561...
 2147482888    [6147, 4792, 5738, 6102, 4748, 7956, 6435, 928...
 Name: content_id, Length: 393656, dtype: object, user_id
 115           [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, ...
 124           [3, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, ...
 2746          [3, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, ...
 5382          [3, 1, 0

In [48]:
# Free the raw dataframe; the cells below only use the per-user feature lists.
try:
  del train
except NameError:
  # Only a missing name (cell executed twice) is expected; any other error should surface.
  print("train already deleted")
gc.collect(), gc.collect()

(60, 0)

# Create train/val datasets

In [49]:
# One numpy array per selected feature, cut at the train/val user boundary
train_list, val_list = [], []
for features in feature_lists:
    per_user = features.values
    train_list.append(per_user[:train_len])
    val_list.append(per_user[train_len:])

In [50]:
def select_window_size(*x):
    """Cut the feature sequences of one sample down to at most THR_E steps.

    Args:
        *x: the aligned feature sequences of one sample; x[0] determines the length.

    Returns:
        The inputs unchanged when x[0] has at most THR_E entries; otherwise a tuple
        holding the same randomly placed THR_E-long slice of every sequence.
    """
    series_len = len(x[0]) # Length of series
    if series_len <= THR_E: # Just return the sequence!
        return x
    else: # Random select from sequence
        max_select = series_len - THR_E
        # For integer dtypes `maxval` is exclusive, so +1 is needed to make the start
        # offset `max_select` (the window ending at the newest interaction) drawable.
        random_select = tf.random.uniform(shape=(), minval=0, maxval=max_select + 1, dtype=tf.int32)
        x = [i[random_select:random_select+THR_E] for i in x]
        x = tuple(x)
        return x

In [51]:
train_list

[array([list([5692, 5716, 128, 7860, 7922, 156, 51, 50, 7896, 7863, 152, 104, 108, 7900, 7901, 7971, 25, 183, 7926, 7927, 4, 7984, 45, 185, 55, 7876, 6, 172, 7898, 175, 100, 7859, 57, 7948, 151, 167, 7897, 7882, 7962, 1278, 2065, 2064, 2063, 3363, 3365, 3364]),
        list([7900, 7876, 175, 1278, 2064, 2063, 2065, 3364, 3365, 3363, 2948, 2947, 2946, 2595, 2593, 2594, 4492, 4120, 4696, 6116, 6173, 6370, 6909, 6910, 6908, 6911, 7218, 7216, 7217, 7219]),
        list([5273, 758, 5976, 236, 404, 382, 405, 873, 531, 775, 294, 714, 297, 297, 775, 1295, 10684, 1014, 484]),
        ...,
        list([7900, 7876, 175, 1278, 2064, 2063, 2065, 3364, 3365, 3363, 2947, 2948, 2946, 2593, 2595, 2594, 4492, 4120, 4696, 6116, 6173, 6370, 6878, 6880, 6879, 6877, 7219, 7218, 7217, 7216, 6106, 4755, 9313, 3586, 4476, 6432, 5845, 9094, 6191, 5437]),
        list([7900, 7876, 175, 1278, 2064, 2063, 2065, 3364, 3363, 3365, 2946, 2948, 2947, 2594, 2595, 2593, 4492, 4120, 4696, 6116, 6173, 6370, 6879, 6880, 6

In [52]:
def create_train_dataset(N_training_per_epoch, probs_train):
  """Build the training tf.data pipeline for one epoch.

  Args:
    N_training_per_epoch: number of training users to draw (with replacement).
    probs_train: sampling probability for each of the `train_len` users.

  Returns:
    A dataset of padded batches (BATCH_SIZE) with one tensor per selected feature.
  """
  # Weighted draw with replacement doubles as shuffling
  sampled_idx = np.random.choice(train_len, N_training_per_epoch, replace=True, p=probs_train)
  sampled = [feature[sampled_idx] for feature in train_list]

  def sample_generator():
    # Emits one tuple per drawn user, holding that user's sequence for each feature
    return iter(zip(*sampled))

  dataset = tf.data.Dataset.from_generator(sample_generator, output_types=OUTPUT_TYPES)
  dataset = dataset.map(select_window_size)
  dataset = dataset.padded_batch(batch_size=BATCH_SIZE, padded_shapes=PADDED_SHAPES, padding_values=PADDING_VALUES)
  return dataset.prefetch(AUTO)

In [53]:
def create_val_dataset(N_valid_per_epoch, probs_val):
  """Build the validation tf.data pipeline.

  Args:
    N_valid_per_epoch: number of validation users to draw (with replacement).
    probs_val: sampling probability for each of the `val_len` users.

  Returns:
    A cached dataset of padded batches (BATCH_SIZE) with one tensor per selected feature.
  """
  # Weighted draw with replacement doubles as shuffling
  sampled_idx = np.random.choice(val_len, N_valid_per_epoch, replace=True, p=probs_val)
  sampled = [feature[sampled_idx] for feature in val_list]

  def sample_generator():
    # Emits one tuple per drawn user, holding that user's sequence for each feature
    return iter(zip(*sampled))

  dataset = tf.data.Dataset.from_generator(sample_generator, output_types=OUTPUT_TYPES)
  dataset = dataset.map(select_window_size)
  dataset = dataset.padded_batch(batch_size=BATCH_SIZE, padded_shapes=PADDED_SHAPES, padding_values=PADDING_VALUES)
  # Cache sits after windowing and batching, so the batches are stored as produced
  dataset = dataset.cache()
  return dataset.prefetch(AUTO)

In [54]:
val_dataset = create_val_dataset(N_SELECT_PER_EPOCH, PROBS_VAL)
a = iter(val_dataset) # Small check

In [55]:
next(a)

(<tf.Tensor: shape=(128, 100), dtype=int32, numpy=
 array([[ 1726,  1725,  3115, ...,  2250,  2249,  2122],
        [ 5257,  4405,  4240, ...,  4414,  2946,  2947],
        [   54,    34,   124, ...,  1525,  1523,  2544],
        ...,
        [  208,  1118,   937, ...,  1386,   254,   587],
        [ 5195,  5621,   454, ...,  4108,   756,  1223],
        [ 8249,   249,  6099, ..., 13523, 13523, 13523]], dtype=int32)>,
 <tf.Tensor: shape=(128, 101), dtype=int32, numpy=
 array([[1, 1, 1, ..., 1, 1, 2],
        [1, 1, 1, ..., 1, 0, 2],
        [1, 1, 1, ..., 1, 0, 2],
        ...,
        [0, 1, 1, ..., 1, 1, 2],
        [0, 1, 1, ..., 1, 1, 2],
        [3, 0, 1, ..., 2, 2, 2]], dtype=int32)>,
 <tf.Tensor: shape=(128, 100), dtype=int32, numpy=
 array([[  0,   0,  22, ...,   0,   0,  26],
        [ 24,  22,  10, ...,  13,   4,   0],
        [ 16,  23,  23, ...,   0,   0,   4],
        ...,
        [ 22,  19,  20, ...,  17,  17,  18],
        [  9,  10,  11, ...,  13,   6,  18],
        [30

## Get model and set hyperparameters

In [56]:
%run $MODEL_FILE

In [57]:
# Archive the model notebook that was just run. MODEL_FILE escapes its spaces for
# `%run`; shutil needs the plain path, so strip the backslashes instead of repeating
# a second hard-coded copy of the path that can drift from MODEL_FILE.
shutil.copy(MODEL_FILE.replace("\\ ", " "), OUTPUT_FOLDER)

'29-Dec-riiid-1/demo-riiid-transformer.ipynb'

In [58]:
# Hyperparameters main model
# Everything needed to rebuild the model and the input pipeline goes into one dict,
# which is pickled next to the other experiment outputs and written to the log.
config = dict(
    features=selections,
    n_features=len(selections),
    vocab_sizes=vocab_sizes,
    enc_emb=ENC_EMB,
    enc_dense=ENC_DENSE,
    dec_emb=DEC_EMB,
    dec_dense=DEC_DENSE,
    window_size=THR_E,
    enc_num_layers=2,
    dec_num_layers=2,
    d_model=256,
    dff=512,
    num_heads=8,
    dropout_rate=0.1,
    pos_encoding=True,
    padding_values=PADDING_VALUES,
    output_types=OUTPUT_TYPES,
    padded_shapes=PADDED_SHAPES,
    quantile_transformer_et=quantile_transformer_et,
    quantile_transformer_lt=quantile_transformer_lt,
    ltg_bins=ltg_bins,
    zero_task_etc=zero_task_etc,
)

config_path = os.path.join(OUTPUT_FOLDER, "config.pickle")
with open(config_path, 'wb') as handle:
  pickle.dump(config, handle, protocol=pickle.HIGHEST_PROTOCOL)

logging.info('config: %s', config)

29-12 18:30 root         INFO     config: {'features': ['E', 'r', 'etc', 'ltg', 'at', 'ra'], 'n_features': 6, 'vocab_sizes': [13524, 4, 303, 175, 9, 103], 'enc_emb': [0, 4], 'enc_dense': [], 'dec_emb': [1, 2, 3, 5], 'dec_dense': [], 'window_size': 100, 'enc_num_layers': 2, 'dec_num_layers': 2, 'd_model': 256, 'dff': 512, 'num_heads': 8, 'dropout_rate': 0.1, 'pos_encoding': True, 'padding_values': (13523, 2, 302, 174, 8, 102), 'output_types': (tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32), 'padded_shapes': ([100], [101], [100], [100], [100], [100]), 'quantile_transformer_et': [], 'quantile_transformer_lt': [], 'ltg_bins': array([0.00000000e+00, 6.00000000e+00, 9.00000000e+00, 1.00000000e+01,
       1.10000000e+01, 1.20000000e+01, 1.30000000e+01, 1.40000000e+01,
       1.50000000e+01, 1.60000000e+01, 1.70000000e+01, 1.80000000e+01,
       1.90000000e+01, 2.00000000e+01, 2.10000000e+01, 2.20000000e+01,
       2.30000000e+01, 2.40000000e+01, 2.50000000e+01, 2.60000000e+01,
  

In [59]:
transformer = create_model_separate_input(config)
# tf.keras.utils.plot_model(transformer, os.path.join(OUTPUT_FOLDER, "model_plot.png"), show_shapes=True)

## Optimizer

In [60]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  Args:
    d_model: model width; scales the whole schedule by d_model^-0.5.
    warmup_steps: value at which the two terms of the min() meet.
  """
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # Plain value kept separately so get_config does not have to return a tensor
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # The optimizer may hand over an integer step; rsqrt and the float
    # multiplication below need a floating-point tensor.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}

In [61]:
# Adam driven by the warm-up schedule defined above
learning_rate = CustomSchedule(config["d_model"])
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-9,
)

## Loss and metrics

Since the target sequences are padded, it is important to apply a padding mask when calculating the loss.

In [62]:
# Loss and metric
# Unreduced (per-position) loss so padded targets can be masked out afterwards
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction=tf.keras.losses.Reduction.NONE,
)

accuracy_object = tf.keras.metrics.sparse_categorical_accuracy

# Separate streaming AUC trackers for the training and validation passes
train_auc = tf.keras.metrics.AUC()
val_auc = tf.keras.metrics.AUC()

In [63]:
def loss_function(real, pred): # batch_size x seq_size x 1 vs  batch_size x seq_size x n_classes
    """Mean cross-entropy over the non-padded target positions of a batch.

    Args:
        real: integer targets with a trailing singleton axis; positions equal to
            N_response are padding and are excluded.
        pred: logits for the same positions.

    Returns:
        Scalar tensor: summed masked loss divided by the number of unmasked positions.
    """
    # Squeeze the trailing axis only; an axis-less squeeze would also drop a batch
    # (or sequence) dimension of size 1 and lose the static shape.
    mask = tf.math.logical_not(tf.math.equal(tf.squeeze(real, axis=-1), N_response)) # batch_size x seq_size
    loss_ = loss_object(real, pred) # batch_size x seq_size
    mask = tf.cast(mask, dtype=loss_.dtype)

    loss_ *= mask
    loss_ = tf.reduce_sum(loss_)/tf.reduce_sum(mask) #loss becomes one value! (from all batches)
    return loss_

In [64]:
def metric_function(real, pred, auc_object): # batch_size x seq_size x 1 vs  batch_size x seq_size x n_classes
    """Masked accuracy (in percent) of one batch; also feeds the batch into `auc_object`.

    Args:
        real: integer targets with a trailing singleton axis; positions equal to
            N_response are padding and are excluded.
        pred: logits for the same positions.
        auc_object: stateful AUC metric updated as a side effect.

    Returns:
        Scalar tensor with the accuracy in percent over the unmasked positions.
    """
    # Predict accuracy
    # Squeeze the trailing axis only; an axis-less squeeze would also drop a batch
    # (or sequence) dimension of size 1.
    mask = tf.math.logical_not(tf.math.equal(tf.squeeze(real, axis=-1), N_response)) # batch_size x seq_size
    accuracy = accuracy_object(real, pred) # batch_size x seq_size
    mask = tf.cast(mask, dtype=accuracy.dtype)

    accuracy *= mask
    accuracy = 100*tf.reduce_sum(accuracy)/tf.reduce_sum(mask)

    # A work-around to predict AUC => is it stable?
    pred = tf.nn.softmax(pred)
    pred = pred[:,:,1] # pred that answer is correct
    real = tf.keras.backend.flatten(real)
    pred = tf.keras.backend.flatten(pred)

    # Drop the padded positions before they reach the AUC tracker
    idxs = tf.math.logical_not(tf.math.equal(real, N_response))
    real = real[idxs]
    pred = pred[idxs]
    # Called only for its state update; the running AUC is read by the caller
    # through auc_object.result().
    auc_object(real, pred)
    return accuracy

In [65]:
# Running means of the per-batch loss and accuracy, one pair per split
train_loss, train_accuracy = (
    tf.keras.metrics.Mean(name='train_loss'),
    tf.keras.metrics.Mean(name='train_accuracy'),
)

val_loss, val_accuracy = (
    tf.keras.metrics.Mean(name='val_loss'),
    tf.keras.metrics.Mean(name='val_accuracy'),
)

## Training and checkpointing

Create the checkpoint path and the checkpoint manager. This will be used to save checkpoints every `n` epochs.

In [66]:
# Checkpoints live in "<OUTPUT_FOLDER>/checkpoints/"; only the newest one is kept
checkpoint_path = os.path.join(OUTPUT_FOLDER, "checkpoints", "")
ckpt = tf.train.Checkpoint(optimizer=optimizer, transformer=transformer)

ckpt_manager = tf.train.CheckpointManager(
    checkpoint=ckpt,
    directory=checkpoint_path,
    max_to_keep=1,
)

# https://stackoverflow.com/questions/62919208/how-to-restore-a-specific-checkpoint-in-tensorflow2-to-implement-early-stopping

# # if a checkpoint exists, restore the latest checkpoint.
# if ckpt_manager.latest_checkpoint:
#   ckpt.restore(ckpt_manager.latest_checkpoint)
#   print ('Latest checkpoint restored!!')

In [67]:
# train_step_signature = [
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
#     tf.TensorSpec(shape=(None, THR_E), dtype=tf.int32),
# ]

In [68]:
@tf.function()
def train_step(tar_real, *inputs):
    """Run one optimisation step on a batch and update the running training metrics.

    Args:
        tar_real: integer targets of shape (batch, seq).
        *inputs: forwarded to the model exactly as received. The training loop passes
            one list of tensors, so the model sees a 1-tuple wrapping that list.
    """
    tar_real = tf.expand_dims(tar_real, -1) # IMPORTANT! DOESN'T WORK WITHOUT IT. ALWAYS GIVES ERROR INCOMPATIBLE SHAPE. E.G. (32,169) vs. (32,169,3)

    # Only the forward pass and the loss need to be recorded for differentiation
    with tf.GradientTape() as tape: 
        predictions = transformer(inputs, training=True)
        loss = loss_function(tar_real, predictions)

    # Accuracy/AUC are bookkeeping: computing them outside the tape keeps their ops
    # from being recorded for a gradient that is never taken.
    accuracy = metric_function(tar_real, predictions, auc_object = train_auc)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(accuracy)

@tf.function()
def val_step(tar_real, *inputs):
    """Score one validation batch and update the running validation metrics.

    Args:
        tar_real: integer targets of shape (batch, seq).
        *inputs: forwarded to the model exactly as received.
    """
    logits = transformer(inputs, training=False)
    # Trailing axis is required by the loss/metric functions (shape mismatch otherwise)
    targets = tf.expand_dims(tar_real, -1)

    batch_loss = loss_function(targets, logits)
    batch_accuracy = metric_function(targets, logits, auc_object = val_auc)

    val_loss(batch_loss)
    val_accuracy(batch_accuracy)

In [None]:
# Main training loop.
# Every epoch draws a fresh random training sample; every VAL_EVERY_N_EPOCHS epochs the
# (fixed) validation set is scored and a checkpoint is written if the validation AUC improved.
best_auc = 0
for epoch in range(EPOCHS):
    train_dataset = create_train_dataset(N_SELECT_PER_EPOCH, PROBS_TRAIN) # create train for each epoch
    start = time.time()

    # All running metrics restart from zero at the beginning of each epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    train_auc.reset_states()
    
    val_loss.reset_states()
    val_accuracy.reset_states()
    val_auc.reset_states()

    # Train
    for (batch, (features)) in enumerate(train_dataset):
        # Features arrive in `selections` order: exercises first, responses second
        E, tar, rest = features[0], features[1], features[2:] 
        # Teacher forcing: the model reads the responses shifted by one step
        # and is scored against the unshifted ones
        tar_inp = tar[:, :-1]
        tar_real = tar[:, 1:]

        inputs = [E, tar_inp] + list(rest)        
        train_step(tar_real, inputs)
        
        if batch % PRINT_EVERY_N_BATCHES == 0:
            print ('Epoch {} TRAIN Batch {} Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(
              epoch + 1, batch, train_loss.result(), train_accuracy.result(), train_auc.result()))

    logging.info('Epoch TRAIN {} Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result(),
                                                train_auc.result()
                                                ))
    
    # Validate and save model
    if (epoch + 1) % VAL_EVERY_N_EPOCHS == 0: 
      for (batch, (features)) in enumerate(val_dataset):
          E, tar, rest = features[0], features[1], features[2:] 
          tar_inp = tar[:, :-1]
          tar_real = tar[:, 1:]

          inputs = [E, tar_inp] + list(rest)          
          val_step(tar_real, inputs)

          if batch % PRINT_EVERY_N_BATCHES == 0:
              print ('Epoch {} VAL Batch {} Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(
                epoch + 1, batch, val_loss.result(), val_accuracy.result(), val_auc.result()))
      
      logging.info('Epoch VAL {} Loss {:.4f} Accuracy {:.4f} AUC {:.4f}'.format(epoch + 1, 
                                          val_loss.result(), 
                                          val_accuracy.result(),
                                          val_auc.result()
                                          ))    
       
      # Keep a checkpoint only when the validation AUC beats the best seen so far
      if val_auc.result() > best_auc:
          best_auc = val_auc.result()
          ckpt_save_path = ckpt_manager.save()
          logging.info('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                            ckpt_save_path))
        
    # The reported time includes the validation pass on epochs where it runs
    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 TRAIN Batch 0 Loss 1.4967 Accuracy 20.5670 AUC 0.5070
Epoch 1 TRAIN Batch 50 Loss 0.9553 Accuracy 51.6246 AUC 0.5023
Epoch 1 TRAIN Batch 100 Loss 0.8247 Accuracy 56.8592 AUC 0.5018
Epoch 1 TRAIN Batch 150 Loss 0.7742 Accuracy 59.1481 AUC 0.5029
Epoch 1 TRAIN Batch 200 Loss 0.7459 Accuracy 60.5054 AUC 0.5044
Epoch 1 TRAIN Batch 250 Loss 0.7270 Accuracy 61.5125 AUC 0.5064
Epoch 1 TRAIN Batch 300 Loss 0.7132 Accuracy 62.2572 AUC 0.5094
Epoch 1 TRAIN Batch 350 Loss 0.7025 Accuracy 62.8574 AUC 0.5137
Epoch 1 TRAIN Batch 400 Loss 0.6935 Accuracy 63.3138 AUC 0.5207
Epoch 1 TRAIN Batch 450 Loss 0.6859 Accuracy 63.6857 AUC 0.5288
Epoch 1 TRAIN Batch 500 Loss 0.6793 Accuracy 64.0464 AUC 0.5361
Epoch 1 TRAIN Batch 550 Loss 0.6740 Accuracy 64.3311 AUC 0.5420
Epoch 1 TRAIN Batch 600 Loss 0.6693 Accuracy 64.5741 AUC 0.5477
Epoch 1 TRAIN Batch 650 Loss 0.6652 Accuracy 64.8014 AUC 0.5526
Epoch 1 TRAIN Batch 700 Loss 0.6615 Accuracy 65.0126 AUC 0.5574
Epoch 1 TRAIN Batch 750 Loss 0.6584 Accurac

29-12 18:32 root         INFO     Epoch TRAIN 1 Loss 0.6565 Accuracy 65.2813 AUC 0.5639


Time taken for 1 epoch: 157.54356837272644 secs

Epoch 2 TRAIN Batch 0 Loss 0.6167 Accuracy 67.1481 AUC 0.6298
Epoch 2 TRAIN Batch 50 Loss 0.6113 Accuracy 67.6672 AUC 0.6302
Epoch 2 TRAIN Batch 100 Loss 0.6103 Accuracy 67.7147 AUC 0.6334
Epoch 2 TRAIN Batch 150 Loss 0.6096 Accuracy 67.7917 AUC 0.6335
Epoch 2 TRAIN Batch 200 Loss 0.6086 Accuracy 67.8490 AUC 0.6351
Epoch 2 TRAIN Batch 250 Loss 0.6080 Accuracy 67.8392 AUC 0.6368
Epoch 2 TRAIN Batch 300 Loss 0.6074 Accuracy 67.8460 AUC 0.6384
Epoch 2 TRAIN Batch 350 Loss 0.6069 Accuracy 67.8558 AUC 0.6393
Epoch 2 TRAIN Batch 400 Loss 0.6064 Accuracy 67.8579 AUC 0.6404
Epoch 2 TRAIN Batch 450 Loss 0.6059 Accuracy 67.8831 AUC 0.6411
Epoch 2 TRAIN Batch 500 Loss 0.6057 Accuracy 67.8782 AUC 0.6418
Epoch 2 TRAIN Batch 550 Loss 0.6052 Accuracy 67.8917 AUC 0.6431
Epoch 2 TRAIN Batch 600 Loss 0.6046 Accuracy 67.9091 AUC 0.6442
Epoch 2 TRAIN Batch 650 Loss 0.6043 Accuracy 67.9317 AUC 0.6449
Epoch 2 TRAIN Batch 700 Loss 0.6039 Accuracy 67.9392 AUC 0

29-12 18:35 root         INFO     Epoch TRAIN 2 Loss 0.6032 Accuracy 67.9691 AUC 0.6474


Time taken for 1 epoch: 151.29352068901062 secs

Epoch 3 TRAIN Batch 0 Loss 0.5990 Accuracy 67.5816 AUC 0.6791
Epoch 3 TRAIN Batch 50 Loss 0.5963 Accuracy 68.2894 AUC 0.6667
Epoch 3 TRAIN Batch 100 Loss 0.5845 Accuracy 69.2022 AUC 0.6891
Epoch 3 TRAIN Batch 150 Loss 0.5710 Accuracy 70.2766 AUC 0.7112
Epoch 3 TRAIN Batch 200 Loss 0.5634 Accuracy 70.8532 AUC 0.7229
Epoch 3 TRAIN Batch 250 Loss 0.5583 Accuracy 71.2279 AUC 0.7306
Epoch 3 TRAIN Batch 300 Loss 0.5540 Accuracy 71.5423 AUC 0.7365
Epoch 3 TRAIN Batch 350 Loss 0.5515 Accuracy 71.7176 AUC 0.7401
Epoch 3 TRAIN Batch 400 Loss 0.5491 Accuracy 71.8816 AUC 0.7431
Epoch 3 TRAIN Batch 450 Loss 0.5470 Accuracy 72.0199 AUC 0.7458
Epoch 3 TRAIN Batch 500 Loss 0.5453 Accuracy 72.1382 AUC 0.7480
Epoch 3 TRAIN Batch 550 Loss 0.5442 Accuracy 72.2057 AUC 0.7496
Epoch 3 TRAIN Batch 600 Loss 0.5430 Accuracy 72.2959 AUC 0.7511
Epoch 3 TRAIN Batch 650 Loss 0.5421 Accuracy 72.3502 AUC 0.7522
Epoch 3 TRAIN Batch 700 Loss 0.5412 Accuracy 72.4163 AUC 0

29-12 18:37 root         INFO     Epoch TRAIN 3 Loss 0.5399 Accuracy 72.5104 AUC 0.7548


Time taken for 1 epoch: 150.39379000663757 secs

Epoch 4 TRAIN Batch 0 Loss 0.5178 Accuracy 74.1474 AUC 0.7655
Epoch 4 TRAIN Batch 50 Loss 0.5301 Accuracy 73.1268 AUC 0.7660
Epoch 4 TRAIN Batch 100 Loss 0.5292 Accuracy 73.2018 AUC 0.7672
Epoch 4 TRAIN Batch 150 Loss 0.5294 Accuracy 73.2234 AUC 0.7672
Epoch 4 TRAIN Batch 200 Loss 0.5292 Accuracy 73.2426 AUC 0.7676
Epoch 4 TRAIN Batch 250 Loss 0.5291 Accuracy 73.2470 AUC 0.7677
Epoch 4 TRAIN Batch 300 Loss 0.5288 Accuracy 73.2624 AUC 0.7681
Epoch 4 TRAIN Batch 350 Loss 0.5288 Accuracy 73.2556 AUC 0.7682
Epoch 4 TRAIN Batch 400 Loss 0.5286 Accuracy 73.2717 AUC 0.7685
Epoch 4 TRAIN Batch 450 Loss 0.5284 Accuracy 73.2814 AUC 0.7686
Epoch 4 TRAIN Batch 500 Loss 0.5285 Accuracy 73.2689 AUC 0.7687
Epoch 4 TRAIN Batch 550 Loss 0.5285 Accuracy 73.2756 AUC 0.7688
Epoch 4 TRAIN Batch 600 Loss 0.5283 Accuracy 73.2865 AUC 0.7689
Epoch 4 TRAIN Batch 650 Loss 0.5282 Accuracy 73.2942 AUC 0.7690
Epoch 4 TRAIN Batch 700 Loss 0.5280 Accuracy 73.3099 AUC 0

29-12 18:40 root         INFO     Epoch TRAIN 4 Loss 0.5280 Accuracy 73.3145 AUC 0.7692


Time taken for 1 epoch: 154.8648931980133 secs

Epoch 5 TRAIN Batch 0 Loss 0.5281 Accuracy 73.5089 AUC 0.7727
Epoch 5 TRAIN Batch 50 Loss 0.5282 Accuracy 73.2543 AUC 0.7690
Epoch 5 TRAIN Batch 100 Loss 0.5270 Accuracy 73.3625 AUC 0.7696
Epoch 5 TRAIN Batch 150 Loss 0.5268 Accuracy 73.3952 AUC 0.7701
Epoch 5 TRAIN Batch 200 Loss 0.5269 Accuracy 73.3946 AUC 0.7700
Epoch 5 TRAIN Batch 250 Loss 0.5271 Accuracy 73.3625 AUC 0.7699
Epoch 5 TRAIN Batch 300 Loss 0.5267 Accuracy 73.3943 AUC 0.7702
Epoch 5 TRAIN Batch 350 Loss 0.5267 Accuracy 73.3991 AUC 0.7703
Epoch 5 TRAIN Batch 400 Loss 0.5267 Accuracy 73.3968 AUC 0.7703
Epoch 5 TRAIN Batch 450 Loss 0.5266 Accuracy 73.4068 AUC 0.7706
Epoch 5 TRAIN Batch 500 Loss 0.5264 Accuracy 73.4218 AUC 0.7706
Epoch 5 TRAIN Batch 550 Loss 0.5264 Accuracy 73.4210 AUC 0.7707
Epoch 5 TRAIN Batch 600 Loss 0.5264 Accuracy 73.4155 AUC 0.7707
Epoch 5 TRAIN Batch 650 Loss 0.5263 Accuracy 73.4255 AUC 0.7709
Epoch 5 TRAIN Batch 700 Loss 0.5261 Accuracy 73.4438 AUC 0.

29-12 18:42 root         INFO     Epoch TRAIN 5 Loss 0.5259 Accuracy 73.4570 AUC 0.7713


Epoch 5 VAL Batch 0 Loss 0.5201 Accuracy 73.6955 AUC 0.7725
Epoch 5 VAL Batch 50 Loss 0.5195 Accuracy 73.8914 AUC 0.7760
Epoch 5 VAL Batch 100 Loss 0.5205 Accuracy 73.7977 AUC 0.7754
Epoch 5 VAL Batch 150 Loss 0.5202 Accuracy 73.8286 AUC 0.7755
Epoch 5 VAL Batch 200 Loss 0.5205 Accuracy 73.7813 AUC 0.7754
Epoch 5 VAL Batch 250 Loss 0.5202 Accuracy 73.8173 AUC 0.7757
Epoch 5 VAL Batch 300 Loss 0.5204 Accuracy 73.7989 AUC 0.7756
Epoch 5 VAL Batch 350 Loss 0.5204 Accuracy 73.8037 AUC 0.7755
Epoch 5 VAL Batch 400 Loss 0.5205 Accuracy 73.7952 AUC 0.7755
Epoch 5 VAL Batch 450 Loss 0.5205 Accuracy 73.8008 AUC 0.7757


In [None]:
from IPython.display import FileLink, FileLinks
FileLinks(OUTPUT_FOLDER) #lists all downloadable files on server

## Add output to Drive

In [None]:
OUTPUT_DRIVE = "/content/drive/My Drive/kaggle-riiid/subs"

In [None]:
# https://stackoverflow.com/questions/15034151/copy-directory-contents-into-a-directory-with-python
# Mirror the whole experiment folder to Drive as <OUTPUT_DRIVE>/<OUTPUT_FOLDER>.
# NOTE(review): distutils is deprecated and no longer part of the stdlib from Python 3.12;
# shutil.copytree(..., dirs_exist_ok=True) is the replacement on Python >= 3.8.
from distutils.dir_util import copy_tree
copy_tree(OUTPUT_FOLDER, os.path.join(OUTPUT_DRIVE, OUTPUT_FOLDER))

## Add output to Kaggle

In [None]:
KAGGLE_JSON = "/content/drive/My\ Drive/kaggle-riiid/kaggle.json"
!mkdir -p ~/.kaggle
!cp $KAGGLE_JSON ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets init -p {OUTPUT_FOLDER}

In [None]:
import json

with open(f'{OUTPUT_FOLDER}/dataset-metadata.json', 'r+') as f:
    data = json.load(f)
    data['title'] = OUTPUT_FOLDER
    data['id'] = f'rafiko1/{OUTPUT_FOLDER}'
    f.seek(0)
    json.dump(data, f, indent=4)
    f.truncate()

!cat {OUTPUT_FOLDER}/dataset-metadata.json

In [None]:
!kaggle datasets create -p {OUTPUT_FOLDER} -q -r zip 

In [None]:
os.listdir(OUTPUT_FOLDER)

In [None]:
ltg_bins

## Summary

In this tutorial, you learned about positional encoding, multi-head attention, the importance of masking and how to create a transformer.

Try using a different dataset to train the transformer. You can also create the base transformer or transformer XL by changing the hyperparameters above. You can also use the layers defined here to create [BERT](https://arxiv.org/abs/1810.04805) and train state-of-the-art models. Furthermore, you can implement beam search to get better predictions.

# Older code

In [None]:
# # From https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
# class DataGenerator(keras.utils.Sequence):
#     'Generates data for Keras'
#     def __init__(self, df, shuffle=True):
#         'Initialization'
        
#         self.df = df
#         self.users = self.df["user_id"].nunique()
#         self.batch_size = BATCH_SIZE
#         self.shuffle = shuffle
#         self.on_epoch_end()

#     def __len__(self):
#         'Denotes the number of batches per epoch'
#         return int(np.floor(len(self.list_IDs) / self.batch_size))

#     def __getitem__(self, index):
#         'Generate one batch of data'
#         # Generate indexes of the batch
#         indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

#         # Find list of IDs
#         list_IDs_temp = [self.list_IDs[k] for k in indexes]

#         # Generate data
#         X, y = self.__data_generation(list_IDs_temp)

#         return X, y

#     def on_epoch_end(self):
#         'Updates indexes after each epoch'
#         self.indexes = np.arange(len(self.list_IDs))
#         if self.shuffle == True:
#             np.random.shuffle(self.indexes)

#     def __data_generation(self, list_IDs_temp):
#         'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
#         # Initialization
#         X = np.empty((self.batch_size, *self.dim, self.n_channels))
#         y = np.empty((self.batch_size), dtype=int)

#         # Generate data
#         for i, ID in enumerate(list_IDs_temp):
#             # Store sample
#             X[i,] = np.load('data/' + ID + '.npy')

#             # Store class
#             y[i] = self.labels[ID]

#         return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [None]:
# def save_dict(seq_dict, filename):
#     with open(os.path.join(OUTPUT_FOLDER, filename), 'wb') as handle:
#       pickle.dump(seq_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# # Save last N dict to memory
# E_dict = E_lists.apply(lambda x: x[-200:]).to_dict()
# r_dict = r_lists.apply(lambda x: x[-200:]).to_dict()
# et_dict = et_lists.apply(lambda x: x[-200:]).to_dict()

In [None]:
# save_dict(E_dict, "E.pkl")
# save_dict(r_dict, "r.pkl")
# save_dict(et_dict, "et.pkl")