In [1]:
# Imports
import pandas as pd
import numpy as np
import os
import shutil
from collections import Counter
import matplotlib.pyplot as plt
import time
import gc
import sys
import random
import pickle

from scipy.stats import iqr
from sklearn import preprocessing

import datetime
import logging

# Deep learning
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

from IPython.display import display
# Let pandas show up to 2000 rows before it truncates a displayed frame/series
pd.set_option("display.max_rows", 2000)

In [2]:
# Notebook-wide switches
COLAB = True  # True: mount Google Drive and read inputs from it; False: use the local "../input" folder
THR_E = 200  # interaction threshold of exercises (E) for user

In [3]:
# Features to build; later cells test membership of keys such as
# "lt", "et", "et_std", "p", "tag" and "task" in this list.
selections = [
    "E",
    "r",
    "p",
    "et_std",
]

In [4]:
# Resolve input locations for the current runtime.
if COLAB:
  # On Colab the inputs live on the mounted Google Drive.
  FOLDER_FEATHER = "/content/drive/My Drive/kaggle-riiid/feather-files"
  FOLDER_PICKLE = "/content/drive/My Drive/kaggle-riiid/sequences-dict-pickle"
  # Raw strings: the backslash before each space is meant literally (the value is
  # handed to the `%run $VAR` magic). In a normal string literal "\ " is an invalid
  # escape sequence that newer Python versions warn about; the resulting value is
  # identical either way.
  MODEL_FILE = r"/content/drive/My\ Drive/Colab\ Notebooks/riiid-functional-transformer.ipynb" # https://stackoverflow.com/questions/57464810/how-to-run-a-jupyter-notebook-with-space-in-relative-path-from-another-notebook
  PREPROCESS_FILE = r"/content/drive/My\ Drive/Colab\ Notebooks/riiid-preprocessing.ipynb"
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
else:
  FOLDER_FEATHER = "../input/riiid-feather-files/"
  # NOTE(review): FOLDER_PICKLE, MODEL_FILE and PREPROCESS_FILE are not defined on
  # this branch, so the later `%run $PREPROCESS_FILE` cell cannot work when
  # COLAB is False — add the local paths before running outside Colab.

Mounted at /content/drive


In [5]:
# Add functions to preprocess data
# Runs the preprocessing notebook referenced by PREPROCESS_FILE. The helpers used
# below (read_df_print, split_into_more_users, return_E, return_r, ...) are not
# defined in this notebook, so they are presumably defined by that file — TODO confirm.
%run $PREPROCESS_FILE

In [6]:
%%time
# Read all dataframes from feather files and print out
# read_df_print is not defined in this notebook (presumably loaded by the %run
# cell); judging by the cell output it prints each frame's shape and first rows.
train = read_df_print(os.path.join(FOLDER_FEATHER, "train.feather")) 
questions = read_df_print(os.path.join(FOLDER_FEATHER, "questions.feather"))
lectures = read_df_print(os.path.join(FOLDER_FEATHER, "lectures.feather"))

(101230332, 10)


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,False,1,3,1,,
1,1,56943,115,5716,False,2,2,1,37000.0,False
2,2,118363,115,128,False,0,0,1,55000.0,False


(13523, 5)


Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92


(418, 4)


Unnamed: 0,lecture_id,tag,part,type_of
0,89,24584,5,concept
1,100,22243,1,concept
2,185,7035,6,concept


CPU times: user 1.55 s, sys: 2.29 s, total: 3.84 s
Wall time: 29.7 s


In [7]:
# Keep only the rows whose content_type_id equals 0
# (presumably the question interactions, dropping lectures — TODO confirm).
train = train.loc[train["content_type_id"].eq(0)]

In [8]:
# train = train.groupby("user_id").tail(THR_E+1)
# print(train.shape)

In [9]:
# Save space by deleting some columns
# These three columns are not referenced by any later cell of this notebook; the
# helpers loaded from the preprocessing notebook are assumed not to need them
# either — TODO confirm. `del` removes them in place, without copying the frame.
del train["row_id"], train["user_answer"], train["prior_question_had_explanation"]

In [10]:
# Number of distinct question ids and distinct parts in the questions table
N_questions = questions["question_id"].nunique()
N_parts = questions["part"].nunique()

# Fixed sizes used for the response and task inputs
N_response = 2
N_task = 2

In [11]:
%%time 
# Split users into multiple parts based on THR_E
# split_into_more_users is not defined in this notebook (presumably loaded by the
# %run cell). Its result carries a `new_user_id` column, which is counted below.
# NOTE(review): the feature cells that follow group by "user_id", not
# "new_user_id" — confirm that this is intended.
train = split_into_more_users(train)
print(f"Number of new_user_ids: {train.new_user_id.nunique()}")

Number of new_user_ids: 775226
CPU times: user 11.7 s, sys: 182 ms, total: 11.9 s
Wall time: 11.5 s


## Add all inputs/outputs

In [12]:
# Add exercises
# return_E comes from outside this notebook; its result is later sliced element by
# element and converted with to_dict(), so it is presumably a Series holding one
# exercise sequence per user_id — TODO confirm.
%time E_lists = return_E(groupby_id="user_id")

CPU times: user 29.2 s, sys: 1.46 s, total: 30.6 s
Wall time: 30.6 s


In [13]:
# Add results, without a start token (add_start_token=False)
%time r_lists = return_r(groupby_id="user_id", add_start_token=False)

# The target column is not referenced by any later cell of this notebook, so it
# is deleted to make space
del train["answered_correctly"]

CPU times: user 34.1 s, sys: 99.4 ms, total: 34.2 s
Wall time: 34.2 s


In [14]:
# Optional feature: only built when "lt" is listed in `selections`
# (return_lt is defined outside this notebook).
if "lt" in selections:
    %time lt_lists = return_lt(groupby_id="user_id")

In [15]:
# Optional feature: only built when "et" is listed in `selections`.
# NOTE: the "et_std" cell assigns the same name `et_lists`; if both keys are
# selected, the later cell overwrites this result.
if "et" in selections:
    %time et_lists = return_et(groupby_id="user_id")

In [16]:
# Optional feature: only built when "et_std" is listed in `selections`.
# return_et_std returns two values; only the first is kept (as `et_lists`, the
# same name the "et" cell uses) and the second is discarded.
if "et_std" in selections:
    %time et_lists, _ = return_et_std(groupby_id="user_id")

CPU times: user 1min 12s, sys: 2.9 s, total: 1min 15s
Wall time: 1min 14s


In [17]:
# Optional feature: only built when "p" is listed in `selections`
# (return_p is defined outside this notebook).
if "p" in selections:
    %time p_lists = return_p(groupby_id="user_id")

CPU times: user 29.5 s, sys: 138 ms, total: 29.6 s
Wall time: 29.6 s


In [18]:
# Optional feature: only built when "tag" is listed in `selections`
# (return_N_highest_tags is defined outside this notebook).
if "tag" in selections:
    %time tag_lists = return_N_highest_tags(groupby_id="user_id") # TODO: specify N tags

In [19]:
# Optional feature: only built when "task" is listed in `selections`
# (return_task_binary is defined outside this notebook).
if "task" in selections:
    %time task_lists = return_task_binary(groupby_id="user_id")

In [33]:
# Quick look at the elapsed-time sequences (one list per user_id).
# NOTE(review): this cell was executed out of order (In [33]) and fails when
# neither "et" nor "et_std" is selected — consider removing it or moving it.
et_lists

user_id
115           [0.0, 0.8513513803482056, 0.9429429173469543, ...
124           [0.0, 0.6721721887588501, 0.7412412166595459, ...
2746          [0.0, 0.7202202081680298, 0.3233233094215393, ...
5382          [0.0, 0.6131131052970886, 0.8303303122520447, ...
8623          [0.0, 0.27077075839042664, 0.8068068027496338,...
                                    ...                        
2147470770    [0.7870376706123352, 0.7870376706123352, 0.787...
2147470777    [0.7057057023048401, 0.5600600838661194, 0.049...
2147481750    [0.0, 0.4234234094619751, 0.3763763904571533, ...
2147482216    [0.5425425171852112, 0.27077075839042664, 0.32...
2147482888    [0.0, 0.22422422468662262, 0.3763763904571533,...
Name: et_std, Length: 393656, dtype: object

## Truncate the sequences and save them as dictionaries

In [20]:
# Keep only the last THR_E entries of every sequence. The length is taken from
# the config constant rather than a repeated literal 200.
E_lists = E_lists.apply(lambda seq: seq[-THR_E:])
r_lists = r_lists.apply(lambda seq: seq[-THR_E:])


In [21]:
# p_lists only exists when "p" was selected, so guard the truncation the same
# way its creation is guarded. Keep the last THR_E entries of every sequence.
if "p" in selections:
    p_lists = p_lists.apply(lambda seq: seq[-THR_E:])

In [23]:
# et_lists is produced by either the "et" or the "et_std" cell; skip the
# truncation when neither was selected. Keep the last THR_E entries per sequence.
if "et" in selections or "et_std" in selections:
    et_lists = et_lists.apply(lambda seq: seq[-THR_E:])

In [24]:
# Local folder that collects the pickled sequences; it is also used as the
# Kaggle dataset title/slug further down.
OUTPUT_FOLDER = "riiid-seq-200"
# Create it if needed; an already existing folder is fine
os.makedirs(name=OUTPUT_FOLDER, exist_ok=True)

In [25]:
def save_as_dict(df, filename, folder=None):
    """Pickle a pandas object as a plain dictionary.

    Parameters
    ----------
    df : object exposing ``to_dict()``
        The notebook passes the per-user sequence Series here.
    filename : str
        Name of the pickle file to create inside the target folder.
    folder : str, optional
        Directory to write into. Defaults to the notebook-level
        ``OUTPUT_FOLDER`` so existing two-argument calls keep working.

    Returns
    -------
    None
    """
    target_dir = OUTPUT_FOLDER if folder is None else folder
    # Create the directory on demand so the helper also works for a fresh folder
    os.makedirs(target_dir, exist_ok=True)
    seq_dict = df.to_dict()
    with open(os.path.join(target_dir, filename), 'wb') as handle:
        pickle.dump(seq_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# Persist the exercise and response sequences
save_as_dict(df=E_lists, filename="E.pickle")
save_as_dict(df=r_lists, filename="r.pickle")

In [27]:
# p_lists only exists when "p" was selected; skip the export otherwise
if "p" in selections:
    save_as_dict(p_lists, "p.pickle")

In [28]:
# et_lists only exists when "et" or "et_std" was selected; skip the export otherwise
if "et" in selections or "et_std" in selections:
    save_as_dict(et_lists, "et.pickle")

## Add output to Kaggle

In [29]:
KAGGLE_JSON = "/content/drive/My\ Drive/kaggle-riiid/kaggle.json"
!mkdir -p ~/.kaggle
!cp $KAGGLE_JSON ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [30]:
# Write a dataset-metadata.json template into OUTPUT_FOLDER
!kaggle datasets init -p {OUTPUT_FOLDER}

Data package template written to: riiid-seq-200/dataset-metadata.json


In [31]:
import json

# Fill in the title and id of the generated metadata template.
# Read first, then rewrite the whole file with the updated content.
metadata_path = f'{OUTPUT_FOLDER}/dataset-metadata.json'
with open(metadata_path, 'r') as f:
    data = json.load(f)

data.update({'title': OUTPUT_FOLDER, 'id': f'rafiko1/{OUTPUT_FOLDER}'})

with open(metadata_path, 'w') as f:
    json.dump(data, f, indent=4)

# Print the edited metadata file for a visual check
!cat {OUTPUT_FOLDER}/dataset-metadata.json

{
    "licenses": [
        {
            "name": "CC0-1.0"
        }
    ],
    "id": "rafiko1/riiid-seq-200",
    "title": "riiid-seq-200"
}

In [32]:
# Upload the contents of OUTPUT_FOLDER as a new version of the Kaggle dataset.
# NOTE(review): `version` presumably requires the dataset to exist already (a
# first upload would use `kaggle datasets create`) — confirm for new slugs.
!kaggle datasets version -p {OUTPUT_FOLDER} -m "New version"

Starting upload for file et.pickle
100% 280M/280M [00:13<00:00, 21.8MB/s]
Upload successful: et.pickle (280MB)
Starting upload for file r.pickle
100% 64.8M/64.8M [00:06<00:00, 10.1MB/s]
Upload successful: r.pickle (65MB)
Starting upload for file p.pickle
100% 64.8M/64.8M [00:05<00:00, 12.4MB/s]
Upload successful: p.pickle (65MB)
Starting upload for file E.pickle
100% 94.3M/94.3M [00:08<00:00, 11.0MB/s]
Upload successful: E.pickle (94MB)
Dataset version is being created. Please check progress at https://www.kaggle.com/rafiko1/riiid-seq-200


## Summary

In this tutorial, you learned about positional encoding, multi-head attention, the importance of masking and how to create a transformer.

Try using a different dataset to train the transformer. You can also create the base transformer or transformer XL by changing the hyperparameters above. You can also use the layers defined here to create [BERT](https://arxiv.org/abs/1810.04805) and train state-of-the-art models. Furthermore, you can implement beam search to get better predictions.