Reference:

1. https://arxiv.org/abs/2002.07033
2. https://www.kaggle.com/claverru/demystifying-transformers-let-s-make-it-public/execution

In [1]:
# Constants to build data frame.

dev = True

path_questions = "../input/riiid-test-answer-prediction/questions.csv"
path_train = "../input/riiid-test-answer-prediction/train.csv"

dtype_questions = {
    "question_id": "int32",
    # 'bundle_id': 'int32',
    # 'correct_answer': 'int8',
    "part": "int8",
    # 'tags': 'object',
}
dtype_train = {
    "answered_correctly": "int8",
    # 'row_id': 'int64',
    # 'timestamp': 'int64',
    "user_id": "int32",
    "content_id": "int16",
    # 'content_type_id': 'int8',
    "task_container_id": "int16",
    # 'user_answer': 'int8',
    "prior_question_elapsed_time": "float32",
    # 'prior_question_had_explanation': 'boolean'
}

In [2]:
# Functions to build data frame.

import numpy as np
import pandas as pd


def get_question_df(path_questions, dtype_questions):
    questions = pd.read_csv(
        path_questions,
        dtype=dtype_questions,
        usecols=dtype_questions.keys(),
        index_col="question_id",
    )
    return questions


def get_train_df(path_train, dtype_train, dev):
    if dev:
        df = pd.read_csv(path_train, usecols=dtype_train.keys(),
                         dtype=dtype_train, nrows=10**6)
        df = df[df.answered_correctly != -1]
        df = df.groupby("user_id").head(1500)
    else:
        df = pd.read_csv(path_train, usecols=dtype_train.keys(),
                         dtype=dtype_train, nrows=10 * 10**6)
        df = df[df.answered_correctly != -1]
        # TODO(songzy): find a better estimation of head number.
        df = df.groupby("user_id").head(1500)
    return df


def transform_questions(questions):
    part_ids = questions.part.max() + 1
    return questions, part_ids


def transform_df(df, questions):
    df["prior_question_elapsed_time"] = (
        df["prior_question_elapsed_time"].fillna(0).astype(np.float32) / 300000
    )
    content_ids = questions.index.max() + 2
    df = df.join(questions, on="content_id")
    df["content_id"] += 1
    df["task_container_id"] += 1
    task_container_ids = 10001
    return df, content_ids, task_container_ids

In [3]:
# Builds data frame.

questions = get_question_df(path_questions, dtype_questions)
df = get_train_df(path_train, dtype_train, dev)

questions, part_ids = transform_questions(questions)
df, content_ids, task_container_ids = transform_df(df, questions)

df = {uid: u.drop(columns="user_id") for uid, u in df.groupby("user_id")}

In [4]:
import tensorflow as tf


def rolling_window(a, w):
    # [1,2,3,4] --- w = 2 --[[1,2], [2,3], [3,4]] but 2D to 3D
    s0, s1 = a.strides
    m, n = a.shape
    return np.lib.stride_tricks.as_strided(
        a, shape=(m - w + 1, w, n), strides=(s0, s0, s1)
    )


def make_time_series(x, windows_size):
    x = np.pad(x, [[windows_size - 1, 0], [0, 0]], constant_values=0)
    x = rolling_window(x, windows_size)
    return x


def add_features_to_user(user):
    # We add one to the column in order to have zeros as padding values
    # Start Of Sentence (SOS) token will be 3.
    user["answered_correctly"] = user["answered_correctly"].shift(
        fill_value=2) + 1
    return user


class RiiidSequence(tf.keras.utils.Sequence):
    def __init__(self, users, windows_size, batch_size=256, start=0, end=None):
        self.users = users  # {'user_id': user_df, ...}
        self.windows_size = windows_size
        # to convert indices to our keys
        self.mapper = dict(zip(range(len(users)), users.keys()))
        # start and end to easy generate training and validation
        self.start = start
        self.end = end if end else len(users)
        # To know where the answered_correctly_column is
        self.answered_correctly_index = list(self.user_example().columns).index(
            "answered_correctly"
        )

    def __len__(self):
        return self.end - self.start

    def __getitem__(self, idx):
        uid = self.mapper[idx + self.start]
        user = self.users[uid].copy()
        y = user["answered_correctly"].to_numpy().copy()
        x = add_features_to_user(user)
        return make_time_series(x, self.windows_size), y

    def user_example(self):
        """Just to check what we have till' now."""
        uid = self.mapper[self.start]
        return add_features_to_user(self.users[uid].copy())

    # INFERENCE PART
    def get_user_for_inference(self, user_row):
        """Picks a new user row and concats it to previous interactions
        if it was already stored.

        Maybe the biggest trick in the notebook is here. We reuse the user_id column to
        insert the answered_correctly SOS token because we previously placed the column
        there on purpose.

        After it, we roll that column and then crop it if it was bigger than the window
        size, making the SOS token disapear if out of the sequence.

        If the sequence if shorter than the window size, then we pad it.
        """
        uid = user_row[self.answered_correctly_index]
        user_row[self.answered_correctly_index] = 2  # SOS token
        user_row = user_row[np.newaxis, ...]
        if uid in self.users:
            x = np.concatenate([self.users[uid], user_row])
            # same as in training, we need to add one!!!
            x[:, self.answered_correctly_index] = (
                np.roll(x[:, self.answered_correctly_index], 1) + 1
            )
        else:
            x = user_row

        if x.shape[0] < self.windows_size:
            return np.pad(x, [[self.windows_size - x.shape[0], 0], [0, 0]])
        elif x.shape[0] > self.windows_size:
            return x[-self.windows_size:]
        else:
            return x

    def update_user(self, uid, user):
        """Concat the new user's interactions to the old ones if already stored."""
        if uid in self.users:
            self.users[uid] = np.concatenate([self.users[uid], user])[
                -self.windows_size:
            ]
        else:
            self.users[uid] = user

In [5]:
# Builds model input.

train_idx = int(len(df) * 0.8)
windows_size = 64

s_train = RiiidSequence(df, windows_size, start=0, end=train_idx)
s_val = RiiidSequence(df, windows_size, start=train_idx)

In [6]:
sample = s_train[2]
x = sample[0]
elapsed_time = x[..., 3]
x.shape[-1]

5

In [7]:
elapsed_time_embeddings = tf.keras.layers.Dense(
    512, use_bias=False)(
    elapsed_time
)
elapsed_time_embeddings.shape

TensorShape([19, 512])

In [8]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis], np.arange(
            d_model)[np.newaxis, :], d_model
    )
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += mask * -1e9
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)
    return output, attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask
        )

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(
            scaled_attention, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights


def point_wise_feed_forward_network(d_model, d_ff):
    return tf.keras.Sequential(
        [tf.keras.layers.Dense(d_ff, activation="relu"),
         tf.keras.layers.Dense(d_model)]
    )


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, d_ff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, mask):

        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2


def create_padding_mask(seqs):
    # We mask only those vectors of the sequence in which we have all zeroes
    # (this is more scalable for some situations).
    mask = tf.cast(tf.reduce_all(tf.math.equal(seqs, 0), axis=-1), tf.float32)
    # (batch_size, 1, 1, seq_len)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [9]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, d_ff, content_ids, task_container_ids, part_ids,
                 windows_size, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # TODO(songzy): maybe make the dimension from (1, windows_size, d_model) -> (batch_size, windows_size, d_model)
        # See: https://www.tensorflow.org/tutorials/customization/custom_layers#implementing_custom_layers.
        self.pos_encoding = positional_encoding(windows_size,
                                                d_model)
        self.content_embeddings = tf.keras.layers.Embedding(
            content_ids, d_model)
        self.task_embeddings = tf.keras.layers.Embedding(
            task_container_ids, d_model)
        self.answered_correctly_embeddings = tf.keras.layers.Embedding(
            4, d_model)
        # TODO(songzy): maybe also use Embedding(1, d_model)
        self.elapsed_time_embeddings = tf.keras.layers.Dense(
            d_model, use_bias=False)
        self.part_embeddings = tf.keras.layers.Embedding(part_ids, d_model)

        self.added = tf.keras.layers.Add()

        self.enc_layers = [EncoderLayer(d_model, num_heads, d_ff, rate)
                           for _ in range(num_layers)]

        self.pooling = tf.keras.layers.GlobalAveragePooling1D()
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, mask):
        # Divide branches
        content_id = inputs[..., 0]
        task_container_id = inputs[..., 1]
        answered_correctly = inputs[..., 2]
        elapsed_time = inputs[..., 3]
        part = inputs[..., 4]

        # Create embeddings
        content_embeddings = self.content_embeddings(content_id)
        task_embeddings = self.task_embeddings(
            task_container_id
        )
        answered_correctly_embeddings = self.answered_correctly_embeddings(
            answered_correctly
        )
        # Continuous! Only a learnable layer for it.
        elapsed_time_embeddings = self.elapsed_time_embeddings(
            elapsed_time
        )
        part_embeddings = self.part_embeddings(part)

        # Add embeddings
        x = self.added(
            [
                self.pos_encoding,
                content_embeddings,
                task_embeddings,
                answered_correctly_embeddings,
                elapsed_time_embeddings,
                part_embeddings,
            ]
        )

        for enc_layer in self.enc_layers:
            x = enc_layer(x, mask)

        x = self.pooling(x)
        x = self.dropout(x)
        return x  # (batch_size, input_seq_len, d_model)

In [10]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, d_ff, content_ids, task_container_ids, part_ids, windows_size, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, d_ff,
                               content_ids, task_container_ids, part_ids, windows_size, rate)
        self.final_layer = tf.keras.layers.Dense(
            1, activation="sigmoid", name="output")

    def call(self, inp):
        enc_padding_mask = create_padding_mask(inp)
        # (batch_size, inp_seq_len, d_model)
        enc_output = self.encoder(inp, enc_padding_mask)
        # (batch_size, tar_seq_len, target_vocab_size)
        final_output = self.final_layer(enc_output)
        return final_output

In [11]:
# Builds model.

tf.keras.backend.clear_session()

model = Transformer(
    num_layers=2,
    d_model=24,
    num_heads=4,
    d_ff=96,
    content_ids=content_ids, task_container_ids=task_container_ids, part_ids=part_ids,
    windows_size=windows_size
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.AUC(name="AUC"),
        tf.keras.metrics.BinaryAccuracy(name="acc"),
    ],
)

# tf.keras.utils.plot_model(model)

In [12]:
# TODO(songzy): find a better way to specify the input shape.
sample_input = tf.ones((1, 64, 5))
model(sample_input)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.09976396]], dtype=float32)>

In [13]:
model.summary()

Model: "transformer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            multiple                  580872    
_________________________________________________________________
output (Dense)               multiple                  25        
Total params: 580,897
Trainable params: 580,897
Non-trainable params: 0
_________________________________________________________________


In [14]:
import datetime

print(datetime.datetime.now())

2021-01-10 20:49:00.130932


In [15]:
# Trains model.

if dev:
    epochs = 1
else:
    epochs = 300
patience = 2

model.fit(
    s_train,
    validation_data=s_val,
    epochs=epochs,
    workers=4,
    shuffle=True,
    use_multiprocessing=True,
    callbacks=tf.keras.callbacks.EarlyStopping(
        patience=patience, monitor="val_AUC", mode="max", restore_best_weights=True
    ),
    verbose=1,
)
model.save_weights("model.h5")



In [16]:
import datetime

print(datetime.datetime.now())

2021-01-10 21:00:31.384120


In [17]:
import gc

del s_val
gc.collect()

91

In [18]:
import riiideducation

env = riiideducation.make_env()
iter_test = env.iter_test()

ModuleNotFoundError: No module named 'riiideducation'

In [None]:
columns = list(RiiidSequence(df, windows_size).user_example().columns)
columns[columns.index('answered_correctly')] = 'user_id'
columns = [c for c in columns if c not in questions.columns] + ['row_id']

In [None]:
for test, sample_prediction in iter_test:
    try:
        prior_correct = eval(test['prior_group_answers_correct'].iloc[0])
        prior_correct = [a for a in prior_correct if a != -1]
    except:
        prior_correct = []

    # Add prior correct to test and update stored users
    if prior_correct:
        prior_test.insert(s_train.answered_correctly_index,
                          'answered_correctly', prior_correct)
        for uid, user in prior_test.groupby('user_id'):
            s_train.update_user(
                uid, user.drop(columns='user_id').to_numpy())

    # Filter test
    test = test.loc[
        test['content_type_id'] == 0,
        columns
    ]

    # Add global features
    test, _, _ = transform_df(test, questions)

    # Save test for later
    prior_test = test.drop(columns='row_id').copy()

    # Make x
    x = np.apply_along_axis(
        s_train.get_user_for_inference,
        1,
        test.drop(columns='row_id').to_numpy()
    )

    # Predict
    test['answered_correctly'] = model.predict(x, batch_size=x.shape[0])

    env.predict(test[['row_id', 'answered_correctly']])