In [None]:
import numpy as np
import pandas as pd
import random
import gc

from tqdm.notebook import tqdm


random.seed(1)

In [None]:
!pip install adabelief-tf

# Data preparation

In [None]:
# Read only four columns of the interaction log (selected by position) and pin narrow
# dtypes on the three named ones to limit memory.
# NOTE(review): positions 2, 3, 4, 6 presumably correspond to user_id, content_id,
# content_type_id and user_answer — confirm against the CSV header.
train_df = pd.read_csv("../input/riiid-test-answer-prediction/train.csv",
                       usecols=[2, 3, 4, 6],
                       dtype={
                              'user_id': 'int32',
                              'content_id': 'int16',
                              'user_answer': 'int8',
                              }
                      )
# Metadata tables for lectures and questions, read with pandas' default dtypes.
lectures_df = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')

In [None]:
# Re-index questions to 1..N and lectures to N+1..N+M so both share a single content-id
# space; id 0 is never assigned by these maps.
question_id_map = {id_: i+1 for i, id_ in enumerate(questions_df["question_id"])}
lecture_id_map = {id_: i+questions_df.shape[0]+1 for i, id_ in enumerate(lectures_df["lecture_id"])}
questions_df["content_id"] = questions_df["question_id"].map(question_id_map)
lectures_df["content_id"] = lectures_df["lecture_id"].map(lecture_id_map)

# Rows with user_answer == -1 are remapped with the lecture map, every other row with the
# question map.
train_df.loc[train_df["user_answer"] != -1, "content_id"] = train_df.loc[train_df["user_answer"] != -1, "content_id"].map(question_id_map)
train_df.loc[train_df["user_answer"] == -1, "content_id"] = train_df.loc[train_df["user_answer"] == -1, "content_id"].map(lecture_id_map)

# choice_id packs (content, answer) into one integer: content_id * 4 + user_answer, where a
# negative user_answer contributes 0 because it is multiplied by the (user_answer >= 0) flag.
train_df["choice_id"] = train_df["content_id"].astype(np.int32)*4 + train_df["user_answer"].astype(np.int32) * (train_df["user_answer"] >= 0)


In [None]:
import tensorflow as tf
# Per-choice lookup tables indexed by choice_id (content_id * 4 + answer slot), sized to
# cover every content id up to the largest lecture content_id.
choice_parts = [0] * ((lectures_df["content_id"].max()+1) * 4)
choice_tags = [[] for _ in range((lectures_df["content_id"].max()+1) * 4)]
# NOTE: "correcr_answers" (sic) is the spelling other cells of this notebook use as well.
correcr_answers = [-1] * ((lectures_df["content_id"].max()+1) * 4)

for i, row in questions_df.iterrows():
    # A missing "tags" value becomes an empty list; otherwise tags are whitespace-separated ints.
    tags = [] if pd.isna(row["tags"]) else list(map(int, row["tags"].split()))
    # All four answer slots of a question share its tags, part and correct answer.
    # (The inner loop reuses the name `i`; the outer row index is not used.)
    for i in range(4):
        choice_tags[i + row["content_id"]*4] = [t for t in tags]
        choice_parts[i + row["content_id"]*4] = row["part"]
        correcr_answers[i + row["content_id"]*4] = row["correct_answer"]
        
# Lectures carry a single tag; their correcr_answers slots keep the initial -1.
for i, row in lectures_df.iterrows():
    tags = [row["tag"]]
    for i in range(4):
        choice_tags[i + row["content_id"]*4] = [t for t in tags]
        choice_parts[i + row["content_id"]*4] = row["part"]
        
choice_parts = tf.constant(choice_parts)
correcr_answers = tf.constant(correcr_answers)
# Right-pad the tag lists with -1, then shift by +1 so padding becomes 0 and real tags start at 1.
choice_tags = tf.keras.preprocessing.sequence.pad_sequences(choice_tags, dtype="int16", value=-1, padding="post") + 1

In [None]:
# Id of the synthetic start-of-sequence token: one more than the largest choice_id that the
# content_id * 4 + answer encoding can produce for the known contents.
start_of_records = (lectures_df["content_id"].max() + 1) * 4
records = []      # one int32 array per user: [start token, choice_id, choice_id, ...]
records_ixs = {}  # user_id -> position of that user's array in `records`
for i, (user_id, df) in tqdm(enumerate(train_df.groupby("user_id")), total=train_df["user_id"].nunique()):
    # Choice ids are taken in the row order of train_df within the user's group.
    records.append(np.int32(np.concatenate([[start_of_records], df["choice_id"].values])))
    records_ixs[user_id] = i

In [None]:
from gc import collect
del train_df
collect()

In [None]:
from sklearn.model_selection import train_test_split
train_user, test_user = train_test_split(list(records_ixs.keys()), test_size=.1, random_state=2021, shuffle=True)

# Model definition

In [None]:
from random import randint
class Sampler():
    """Draws one random (history window, next choice) training pair per user."""

    def __init__(self, records, records_ixs):
        # records: per-user choice-id sequences; records_ixs: user id -> index into records.
        self.records = records
        self.records_ixs = records_ixs

    def stream(self, raw_users, window=32, batch_size=64):
        """Yield ``(X, y)`` batches of exactly ``batch_size`` samples.

        Users are visited in a freshly shuffled order. For each user a target position
        is drawn uniformly from 1..len-1; ``X`` receives up to ``window`` items that
        precede it and ``y`` the item at that position. Samples left over when the
        users run out (fewer than ``batch_size``) are not yielded.
        """
        order = np.copy(raw_users)
        np.random.shuffle(order)

        histories, targets = [], []
        for uid in order:
            sequence = self.records[self.records_ixs[uid]]
            pos = randint(1, len(sequence) - 1)
            begin = pos - window if pos > window else 0

            histories.append(sequence[begin:pos])
            targets.append(sequence[pos])

            if len(histories) != batch_size:
                continue
            yield histories, targets
            histories, targets = [], []
                

In [None]:
class Tokenizer(tf.keras.Model):
    """Maps choice ids to learned content embeddings.

    The input is integer-divided by 4 before the lookup, so all four answer slots of
    one content share a single embedding row. Index 0 is treated as padding
    (``mask_zero=True``).

    Args:
        emb_dim: width of each embedding vector.
        num_contents: number of rows in the embedding table. Defaults to 13943, the
            value this notebook previously hard-coded.
            NOTE(review): presumably padding + questions + lectures + start token —
            confirm against the data.
    """

    def __init__(self, emb_dim, num_contents=13943):
        super(Tokenizer, self).__init__()
        self.content_embedding = tf.keras.layers.Embedding(num_contents, emb_dim, mask_zero=True)

    def call(self, choice_ids):
        """Return the embedding of the content that each choice id belongs to."""
        return self.content_embedding(choice_ids//4)

In [None]:
class Encoder(tf.keras.Model):
    """Stack of `num_layer` blocks, each a residual LSTM followed by masked self-attention."""

    def __init__(self, num_layer, emb_dim):
        super(Encoder, self).__init__()
        self.num_layer = num_layer
        
        # One Dense (attention output projection) and one LSTM per block.
        self.dense_layers = []
        self.lstm_layers = []
        
        for i in range(num_layer):
            self.dense_layers.append(tf.keras.layers.Dense(emb_dim))
            self.lstm_layers.append(tf.keras.layers.LSTM(emb_dim, return_sequences=True))


    def create_mask(self, window, batch_size):
        """
        Build the two attention masks, each of shape (batch_size, window, window).

        mask: additive mask that keeps a position from attending to itself and to any
            later record.
        start_mask: multiplicative mask; the very first record has nothing to attend
            to, so this zeroes its attention weight.
        """
        
        # row[i][j] == i and col[i][j] == j.
        row = tf.reshape(tf.repeat(tf.range(window), window), (window, window))
        col = tf.transpose(row)
        # Entries with j >= i get a huge negative value. The diagonal additionally gets a
        # smaller positive bump, so it stays hugely negative but is the largest entry of
        # row 0 — row 0's softmax mass therefore appears to land on [0, 0].
        raw_mask = tf.expand_dims(tf.where(row <= col, tf.float32.min / 100, 0.) + tf.eye(window, dtype="float64") * tf.float32.max / 1000, axis=0)
        mask = tf.concat([raw_mask for _ in range(batch_size)], axis=0)
        mask = tf.cast(mask, "float32")
        
        # 1 everywhere except element [0, 0] (flat index 0), which is 0.
        start_mask = tf.where(tf.reshape(tf.range(window**2), (window, window)) > 0, 1., 0.)
        start_mask = tf.concat([tf.expand_dims(start_mask, axis=0) for _ in range(batch_size)], axis=0)
        
        return mask, start_mask

    @tf.function
    def call(self, X, batch_size, window):
        """Run every block over X and return a tensor of the same shape.

        `batch_size` and `window` size the attention masks.
        NOTE(review): they presumably must equal X's first two dimensions — confirm.
        """
        mask, start_mask = self.create_mask(window, batch_size)
        for i in range(self.num_layer):
            # Residual recurrent step.
            X = X + self.lstm_layers[i](X)
            
            # Query is X scaled to unit L2 norm along the feature axis; the norm is
            # excluded from the gradient. Key and value are X itself.
            query = X / tf.stop_gradient(tf.norm(X, axis=2, keepdims=True))
            key = X
            value = X
            
            logit = tf.matmul(query, key, transpose_b=True) + mask
            weight = tf.math.softmax(logit) * start_mask
            attention = tf.matmul(weight, value)
            # Residual attention step through this block's Dense projection.
            X = X + self.dense_layers[i](attention)
        return X

In [None]:
class TopModel(tf.keras.Model):
    """Collapses an encoded sequence into a single vector per sample.

    The masked mean of the sequence is multiplied by a Dense projection of the
    concatenated LSTM summary and max-pooled 1x1 convolution features, and the mean
    is then added back.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.lstm_layer = tf.keras.layers.LSTM(emb_dim)
        self.cross_layer = tf.keras.layers.Dense(emb_dim)
        self.conv_layer = tf.keras.layers.Conv1D(emb_dim, 1, activation="relu")
        self.gmp_layer = tf.keras.layers.GlobalMaxPooling1D()

        self.gap = tf.keras.layers.GlobalAveragePooling1D()

    @tf.function
    def call(self, X, exist_mask):
        """Return ``mean * cross(concat(lstm, conv_max)) + mean``.

        exist_mask is passed to the average pooling as its mask argument.
        """
        pooled = self.gap(X, exist_mask)
        recurrent = self.lstm_layer(X)
        conv_peak = self.gmp_layer(self.conv_layer(X))

        gate = self.cross_layer(tf.concat([recurrent, conv_peak], axis=-1))
        return pooled * gate + pooled

In [None]:
class Model(tf.keras.Model):
    """Scores each user representation against the target embeddings of its own chunk."""

    def __init__(self, tokenizer, encoder, top_model):
        super().__init__()
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.top_model = top_model

    @tf.function
    def call(self, X, y, exist_mask, target_mask, window, batch_size, num_split):
        """Return raw logits (no softmax) with `target_mask` added.

        The batch is cut into `num_split` equal chunks along axis 0. Within a chunk,
        every user vector is dot-multiplied with every target embedding of that chunk,
        and the per-chunk score matrices are stacked back along axis 0.
        """
        user_vec = self.tokenizer(X)
        user_vec = self.encoder(user_vec, window=window, batch_size=batch_size)
        user_vec = self.top_model(user_vec, exist_mask)

        target_vec = self.tokenizer(y)

        user_chunks = tf.split(user_vec, num_or_size_splits=num_split, axis=0)
        target_chunks = tf.split(target_vec, num_or_size_splits=num_split, axis=0)
        scores = [
            tf.matmul(users, targets, transpose_b=True)
            for users, targets in zip(user_chunks, target_chunks)
        ]

        return tf.keras.activations.linear(tf.concat(scores, axis=0) + target_mask)
        

# training

In [None]:
emb_dim = 64  # embedding width shared by the tokenizer, encoder and top model

# Build the components and wire them into one model; the sampler reads the per-user
# records and the user-id index built during data preparation.
sampler = Sampler(records, records_ixs)
tokenizer = Tokenizer(emb_dim=emb_dim)
encoder = Encoder(num_layer=8, emb_dim=emb_dim)
top_model = TopModel(emb_dim=emb_dim)
model = Model(tokenizer, encoder, top_model)

In [None]:
from adabelief_tf import AdaBeliefOptimizer
# The loss is configured with from_logits=True, i.e. it expects unnormalised scores.
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = AdaBeliefOptimizer(learning_rate=1e-3, weight_decay=1e-4) 

# Running means of the per-batch values; the training loop resets them after every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_rank = tf.keras.metrics.Mean(name="test_rank")

In [None]:
len(train_user)

In [None]:
len(test_user)

In [None]:
from time import time

window = 32          # maximum history length taken per sample
batch_size = 2**13   # samples per training batch
num_split = 2**3     # each training batch is scored in this many independent chunks
start_time = time()  # NOTE(review): not read again in the visible part of the notebook
# Per-epoch metric histories, appended to by the training loop.
train_losses = []
test_losses = []
test_ranks = []

def padding_sequence(X, y):
    """Left-pad the histories in X with 0 into an int32 matrix; return y as an int32 tensor."""
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        X, padding="pre", value=0, dtype="int32"
    )
    return padded, tf.constant(y, dtype="int32")

@tf.function
def preprocess(X, y, num_split):
    """Derive masks and one-hot targets for chunked in-batch scoring.

    Returns ``(X, y, target, exist_mask, target_mask)``:
      * exist_mask is True where X is non-zero (non-padding),
      * target stacks one identity matrix per chunk of y,
      * target_mask stacks, per chunk, a matrix that is hugely negative on
        off-diagonal cells where two samples of the chunk share the same y value,
        and 0 elsewhere.
    """
    exist_mask = X > 0
    big_negative = tf.float32.min / 100

    targets, target_masks = [], []
    for chunk in tf.split(y, num_or_size_splits=num_split, axis=0):
        size = len(chunk)
        identity = tf.eye(size)
        by_row = tf.reshape(tf.repeat(chunk, size), (size, size))
        by_col = tf.transpose(by_row)
        # Same target value at another position of the chunk: must not count as a negative.
        duplicate = tf.logical_and(identity != 1., by_row == by_col)

        targets.append(identity)
        target_masks.append(tf.cast(tf.where(duplicate, big_negative, 0), "float32"))

    return X, y, tf.concat(targets, axis=0), exist_mask, tf.concat(target_masks, axis=0)

@tf.function
def train_step(X, y, exist_mask, target_mask, window, batch_size, num_split):
    """Run one optimisation step of `model` and return the batch loss.

    The one-hot target (sample i must pick column i of its own chunk) is built here
    from `batch_size` and `num_split`. It is no longer read from a notebook-level
    `target` variable, which made the traced function depend on hidden global state.

    Args:
        X, y: padded histories and target choice ids.
        exist_mask, target_mask: masks forwarded to the model.
        window, batch_size, num_split: Python ints forwarded to the model;
            batch_size must be divisible by num_split.
    """
    # Stack one identity matrix per chunk: shape (batch_size, batch_size // num_split).
    chunk_size = batch_size // num_split
    target = tf.concat([tf.eye(chunk_size)] * num_split, axis=0)

    with tf.GradientTape() as tape:
        pred = model(X, y, exist_mask, target_mask, window, batch_size, num_split)
        loss = loss_object(target, pred)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Metrics of the last completed epoch, shown in the progress bar while the next one runs.
last_train_loss = np.inf
last_test_loss = np.inf
last_test_rank = np.inf

with tqdm(total=128) as pbar:
    for epoch in range(128):
        # Training pass: one randomly positioned sample per training user.
        for i, (X, y) in enumerate(sampler.stream(train_user, window=window, batch_size=batch_size)):
            X, y = padding_sequence(X, y)
            X, y, target, exist_mask, target_mask = preprocess(X, y, num_split)
            
            loss = train_step(X, y, exist_mask, target_mask, window, batch_size, num_split)

            # metric display
            train_loss(loss)
            learning_text = "[{}/{}] ".format(str(i).zfill(3), len(train_user)//batch_size)
            progress_text = "train | Loss: {:.5f} test| Loss{: .5f} rank{: .5f}".format(train_loss.result(), last_test_loss, last_test_rank)
            pbar.set_postfix_str(learning_text + progress_text)
            
        last_train_loss = train_loss.result()
        
        # Embeddings of content ids 0..13942 (turned into choice ids by *4), used for ranking.
        content_embeddings = model.tokenizer(np.arange(13943, dtype=np.uint16)*4)
        # Evaluation pass on held-out users: smaller batches, scored as a single chunk.
        for X, y in sampler.stream(test_user, window=window, batch_size=batch_size//num_split):
            X, y = padding_sequence(X, y)
            X, y, target, exist_mask, target_mask = preprocess(X, y, 1)
            
            pred = model(X, y, exist_mask, target_mask, window, batch_size//num_split, 1)
            loss = loss_object(target, pred)    

            # Recompute the user vectors and score them against every content embedding;
            # mean_rank is the average 0-based position of the true content in the
            # descending ranking.
            X_emb = tokenizer(X)
            X_emb = encoder(X_emb, window=window, batch_size=batch_size//num_split)
            X_emb = top_model(X_emb, exist_mask)
            logit = tf.matmul(X_emb, content_embeddings, transpose_b=True) 
            arg = tf.argsort(logit, axis=1, direction="DESCENDING")
            mean_rank = tf.reduce_mean(tf.cast(tf.where(arg == tf.reshape(tf.cast(y, "int32")//4, (-1, 1)))[:, 1], "float32"))
            test_rank(mean_rank)
            
            # metric display
            test_loss(loss)
            progress_text = "train | Loss: {:.5f} test| Loss{: .5f} rank{: .5f}".format(last_train_loss, test_loss.result(), test_rank.result())
            pbar.set_postfix_str(progress_text)

        last_test_loss = test_loss.result()
        last_test_rank = test_rank.result()
        pbar.update(1)

        # Record this epoch's averages, then clear the running metrics for the next epoch.
        train_losses.append(train_loss.result())
        test_losses.append(test_loss.result())
        test_ranks.append(test_rank.result())

        train_loss.reset_states()
        test_loss.reset_states()
        test_rank.reset_states()

model.save_weights("./embedding_model.model")

## plot loss

In [None]:
import matplotlib.pyplot as plt
plt.plot(train_losses)
plt.plot(test_losses)

In [None]:
plt.plot(test_ranks)

# Visualization of embedded representation

## item embedding

In [None]:
import umap
# Embedding of every content id (0..13942); *4 converts a content id to its first choice id.
embs = model.tokenizer(np.arange(13943, dtype=np.uint16)*4).numpy()

# Fit UMAP on a shuffled copy, then project the embeddings in their original order so
# row k of umap_emb belongs to content id k.
ixs = np.arange(13943)
np.random.shuffle(ixs)

mapper = umap.UMAP(n_neighbors=15, n_components=2, metric="cosine", verbose=True).fit(embs[ixs])

import matplotlib.pyplot as plt
umap_emb = mapper.transform(embs)

# choice_parts holds four identical entries per content; every 4th entry gives one part
# per content id. Converted to numpy once instead of once per loop iteration.
content_parts = choice_parts.numpy()[::4]
for part in range(1, 8):
    # Plain 1-D index array, so the selected coordinates are 1-D as well.
    ix = np.where(content_parts == part)[0]
    plt.scatter(umap_emb[ix, 0], umap_emb[ix, 1], s=5, label=part)


plt.legend()
plt.tight_layout()

# Visualization of user embedding by problem

In [None]:
def gather_question(question_id, window):
    """Collect, for every test user who met `question_id`, the history before it.

    For each user in `test_user`, the first record whose content id (choice id // 4)
    equals `question_id` is located. X gets up to `window` records that precede it and
    y gets that record itself. Users who never met the question are skipped.

    Returns:
        (X, y): list of 1-D history slices and list of matching choice ids.
    """
    histories, choices = [], []
    for user in test_user:
        sequence = records[records_ixs[user]]
        hits = np.where(sequence//4 == question_id)[0]
        if not len(hits):
            continue
        first = hits[0]
        begin = max(0, first-window)
        histories.append(sequence[begin:first])
        choices.append(sequence[first])
    return histories, choices

In [None]:
question_id = 3365 # count: 172574 mean_accuracy: 0.547162
print(question_id)
X, y = gather_question(question_id, window)
assert len(y) > 0, "no test user has answered question {}".format(question_id)

# Pad once, before batching, and force the width to `window`: the encoder builds a
# (window, window) attention mask, so a narrower input would not line up with it.
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=window, dtype="int32", value=0, padding="pre")
y = np.asarray(y, dtype=np.int32)

batch_size = 1024
user_emb = []
# Stepping by batch_size never produces an empty trailing batch, which the previous
# `len(y)//batch_size + 1` loop did whenever len(y) was an exact multiple.
for start in range(0, len(y), batch_size):
    X_batch = tf.constant(X[start:start + batch_size])
    exist_mask = X_batch > 0  # True on non-padding positions
    X_emb = tokenizer(X_batch)
    X_emb = encoder(X_emb, batch_size=int(X_batch.shape[0]), window=window)
    X_emb = tf.keras.layers.GlobalAveragePooling1D()(X_emb, exist_mask)
    user_emb.append(X_emb.numpy())

user_emb = np.vstack(user_emb)
# A choice is correct when its answer slot (choice_id % 4) equals the stored correct answer.
is_correct = (y % 4 == correcr_answers.numpy()[y])

# Fit on a shuffled copy, then project in the original order so rows stay aligned with y.
ixs = np.arange(len(y))
np.random.shuffle(ixs)

mapper = umap.UMAP(n_neighbors=30, n_components=2, metric="cosine", verbose=True).fit(user_emb[ixs])
umap_emb = mapper.transform(user_emb)

# Colour encodes correctness. No legend call: the scatter carries no labelled artists.
plt.scatter(umap_emb[:, 0], umap_emb[:, 1], c=is_correct, s=10, alpha=0.3)
plt.tight_layout()

In [None]:
question_id = 7218 # count: 160300, mean_accuracy: 0.501142
print(question_id)
X, y = gather_question(question_id, window)
assert len(y) > 0, "no test user has answered question {}".format(question_id)

# Pad once, before batching, and force the width to `window`: the encoder builds a
# (window, window) attention mask, so a narrower input would not line up with it.
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=window, dtype="int32", value=0, padding="pre")
y = np.asarray(y, dtype=np.int32)

batch_size = 1024
user_emb = []
# Stepping by batch_size never produces an empty trailing batch, which the previous
# `len(y)//batch_size + 1` loop did whenever len(y) was an exact multiple.
for start in range(0, len(y), batch_size):
    X_batch = tf.constant(X[start:start + batch_size])
    exist_mask = X_batch > 0  # True on non-padding positions
    X_emb = tokenizer(X_batch)
    X_emb = encoder(X_emb, batch_size=int(X_batch.shape[0]), window=window)
    X_emb = tf.keras.layers.GlobalAveragePooling1D()(X_emb, exist_mask)
    user_emb.append(X_emb.numpy())

user_emb = np.vstack(user_emb)
# A choice is correct when its answer slot (choice_id % 4) equals the stored correct answer.
is_correct = (y % 4 == correcr_answers.numpy()[y])

# Fit on a shuffled copy, then project in the original order so rows stay aligned with y.
ixs = np.arange(len(y))
np.random.shuffle(ixs)

mapper = umap.UMAP(n_neighbors=30, n_components=2, metric="cosine", verbose=True).fit(user_emb[ixs])
umap_emb = mapper.transform(user_emb)

# Colour encodes correctness. No legend call: the scatter carries no labelled artists.
plt.scatter(umap_emb[:, 0], umap_emb[:, 1], c=is_correct, s=10, alpha=0.3)
plt.tight_layout()

In [None]:
question_id = randint(1, questions_df.shape[0]-1)
print(question_id)
X, y = gather_question(question_id, window)
# A randomly drawn question may not occur among the test users at all; fail with a clear message.
assert len(y) > 0, "no test user has answered question {}".format(question_id)

# Pad once, before batching, and force the width to `window`: the encoder builds a
# (window, window) attention mask, so a narrower input would not line up with it.
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=window, dtype="int32", value=0, padding="pre")
y = np.asarray(y, dtype=np.int32)

batch_size = 1024
user_emb = []
# Stepping by batch_size never produces an empty trailing batch, which the previous
# `len(y)//batch_size + 1` loop did whenever len(y) was an exact multiple.
for start in range(0, len(y), batch_size):
    X_batch = tf.constant(X[start:start + batch_size])
    exist_mask = X_batch > 0  # True on non-padding positions
    X_emb = tokenizer(X_batch)
    X_emb = encoder(X_emb, batch_size=int(X_batch.shape[0]), window=window)
    X_emb = tf.keras.layers.GlobalAveragePooling1D()(X_emb, exist_mask)
    user_emb.append(X_emb.numpy())

user_emb = np.vstack(user_emb)
# A choice is correct when its answer slot (choice_id % 4) equals the stored correct answer.
is_correct = (y % 4 == correcr_answers.numpy()[y])

# Fit on a shuffled copy, then project in the original order so rows stay aligned with y.
ixs = np.arange(len(y))
np.random.shuffle(ixs)

mapper = umap.UMAP(n_neighbors=30, n_components=2, metric="cosine", verbose=True).fit(user_emb[ixs])
umap_emb = mapper.transform(user_emb)

# Colour encodes correctness. No legend call: the scatter carries no labelled artists.
plt.scatter(umap_emb[:, 0], umap_emb[:, 1], c=is_correct, s=10, alpha=0.3)
plt.tight_layout()

# fine tuning

In [None]:
class TransformerModel(tf.keras.Model):
    """Feature extractor that joins a pooled history encoding with the target embedding.

    Note that the forward pass is defined by overriding ``__call__`` directly.
    """

    def __init__(self, tokenizer, encoder, window):
        super().__init__()
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.gap = tf.keras.layers.GlobalAvgPool1D()
        self.window = window

    @tf.function
    def __call__(self, X, y, exist_mask):
        """Return ``concat([masked mean of encoder(tokenizer(X)), tokenizer(y)], axis=1)``."""
        history = self.tokenizer(X)
        history = self.encoder(history, batch_size=len(y), window=self.window)
        pooled_history = self.gap(history, exist_mask)
        target_emb = self.tokenizer(y)
        return tf.concat([pooled_history, target_emb], axis=1)

In [None]:
class RiiidTop(tf.keras.Model):
    """MLP head: ReLU Dense layers of the given widths, then one sigmoid output unit."""

    def __init__(self, layer_dims=[128, 128]):
        super().__init__()
        self.denses = [tf.keras.layers.Dense(units, activation="relu") for units in layer_dims]
        self.out = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, emb):
        """Pass `emb` through every hidden layer in order and return the sigmoid output."""
        hidden = emb
        for dense in self.denses:
            hidden = dense(hidden)
        return self.out(hidden)

In [None]:
class RiiidModel(tf.keras.Model):
    """Chains a feature extractor (`input_layer`) with a prediction head (`top_model`)."""

    def __init__(self, input_layer, top_model):
        super().__init__()
        self.input_layer = input_layer
        self.top_model = top_model

    def call(self, X, y, exist_mask):
        """Return the head's output for the features extracted from (X, y, exist_mask)."""
        features = self.input_layer(X, y, exist_mask)
        return self.top_model(features)

In [None]:
window = 32
# Reuse the existing tokenizer and encoder objects as a feature extractor, mark the
# extractor non-trainable, and put a fresh RiiidTop head on top of it.
transformer_layer = TransformerModel(tokenizer, encoder, window=window)
transformer_layer.trainable = False
transformer_riiid_model = RiiidModel(transformer_layer, RiiidTop([128, 128]))

In [None]:
from adabelief_tf import AdaBeliefOptimizer
# These rebind the notebook-level loss_object / optimizer / metric names for the
# fine-tuning stage; the head ends in a sigmoid, so the loss is used with its defaults.
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = AdaBeliefOptimizer(learning_rate=1e-3, weight_decay=1e-4) 

# Running metrics; the fine-tuning loop resets them after every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_auc = tf.keras.metrics.AUC(name="test_auc")

In [None]:
from time import time

window = 32          # maximum history length taken per sample
batch_size = 2**13   # samples per batch (re-set here; an earlier cell assigned 1024)
start_time = time()  # NOTE(review): not read again in the visible part of the notebook
# Per-epoch metric histories, appended to by the fine-tuning loop.
train_losses = []
test_losses = []
test_aucs = []

# The former two-argument `preprocess(X, y)` that stood here was removed: it was
# redefined a few lines further down in the same cell, so it could never be called.

def padding_sequence(X, y):
    """Left-pad the histories in X with 0 into an int32 matrix; return y as an int32 tensor."""
    X = tf.keras.preprocessing.sequence.pad_sequences(X, dtype="int32", value=0, padding="pre")
    y = tf.constant(y, dtype="int32")
    return X, y

@tf.function
def preprocess(X, y, num_split):   
    """Derive masks and one-hot targets for chunked in-batch scoring.

    Returns ``(X, y, target, exist_mask, target_mask)``: exist_mask is True where X is
    non-zero; target stacks one identity matrix per chunk of y; target_mask stacks, per
    chunk, a matrix that is hugely negative on off-diagonal cells whose two samples
    share the same y value and 0 elsewhere.
    """
    exist_mask = X > 0

    target_masks = []
    targets = []
    for y_split in tf.split(y, num_or_size_splits=num_split, axis=0):
        target = tf.eye(len(y_split))
        # row[i][j] == y_split[i], col[i][j] == y_split[j]
        row = tf.reshape(tf.repeat(y_split, len(y_split)), (len(y_split), len(y_split)))
        col = tf.transpose(row)
        target_mask = tf.cast(tf.where(tf.logical_and(target != 1., row == col), tf.float32.min / 100, 0), "float32")
        
        targets.append(target)
        target_masks.append(target_mask)
    
    target = tf.concat(targets, axis=0)
    target_mask = tf.concat(target_masks, axis=0)
    
    return X, y, target, exist_mask, target_mask

@tf.function
def train_step(X, y, exist_mask, target_mask, window, batch_size, num_split):
    """Run one optimisation step of `transformer_riiid_model` and return the loss.

    The label is 1.0 where the answer slot of y (y % 4) equals the stored correct
    answer for that choice id, else 0.0. `target_mask`, `window`, `batch_size` and
    `num_split` are accepted but not used by this step.
    """
    answered_correctly = y%4 == tf.gather(correcr_answers, y)
    label = tf.reshape(tf.cast(answered_correctly, "float32"), (-1, 1))

    with tf.GradientTape() as tape:
        loss = loss_object(label, transformer_riiid_model(X, y, exist_mask))

    variables = transformer_riiid_model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
    return loss

# Metrics of the last completed epoch, shown in the progress bar while the next one runs.
last_train_loss = np.inf
last_test_loss = np.inf
last_test_auc = np.inf

with tqdm(total=64) as pbar:
    for epoch in range(64):
        # Training pass: one randomly positioned sample per training user.
        for i, (X, y) in enumerate(sampler.stream(train_user, window=window, batch_size=batch_size)):
            X, y = padding_sequence(X, y)
            X, y, target, exist_mask, target_mask = preprocess(X, y, num_split)
            loss = train_step(X, y, exist_mask, target_mask, window, batch_size, num_split)
            
            train_loss(loss)
            learning_text = "[{}/{}] ".format(str(i).zfill(3), len(train_user)//batch_size)
            progress_text = "train | Loss: {:.5f} test| Loss{: .5f} auc{: .5f}".format(train_loss.result(), last_test_loss, last_test_auc)
            pbar.set_postfix_str(learning_text + progress_text)
            
        last_train_loss = train_loss.result()
        
        # Evaluation pass on held-out users. The per-epoch `content_embeddings` lookup
        # that used to precede this loop was dropped because nothing here read it.
        for i, (X, y) in enumerate(sampler.stream(test_user, window=window, batch_size=batch_size)):
            X, y = padding_sequence(X, y)
            X, y, target, exist_mask, target_mask = preprocess(X, y, num_split)
            # 1.0 where the chosen answer slot matches the stored correct answer.
            label = tf.reshape(tf.cast(y%4 == tf.gather(correcr_answers, y), "float32"), (-1, 1))
    
            pred = transformer_riiid_model(X, y, exist_mask)
            loss = loss_object(label, pred)
            
            # metric display
            test_loss(loss)
            test_auc(label, pred)
            progress_text = "train | Loss: {:.5f} test| Loss{: .5f} auc{: .5f}".format(last_train_loss, test_loss.result(), test_auc.result())
            pbar.set_postfix_str(progress_text)

        last_test_loss = test_loss.result()
        last_test_auc = test_auc.result()
        pbar.update(1)

        # Record this epoch's values, then clear the running metrics for the next epoch.
        train_losses.append(train_loss.result())
        test_losses.append(test_loss.result())
        test_aucs.append(test_auc.result())

        train_loss.reset_states()
        test_loss.reset_states()
        test_auc.reset_states()

In [None]:
import matplotlib.pyplot as plt
plt.plot(train_losses)
plt.plot(test_losses)

In [None]:
plt.plot(test_aucs)

# appendix: fine tuning with word2vec

In [None]:
# NOTE(review): the file has a .npy extension but is loaded with read_pickle. Unpickling
# can execute arbitrary code, so only load this from a trusted source.
w2v_vec = pd.read_pickle("../input/gensim-word2vec/word2vec_weight.npy")
w2v_tokenizer = Tokenizer(emb_dim=64)
w2v_tokenizer.compile()
# Presumably a dummy forward pass so the embedding weights exist before set_weights — confirm.
w2v_tokenizer(0)
# Replace the embedding table with the loaded weights and mark the tokenizer non-trainable.
w2v_tokenizer.content_embedding.set_weights([w2v_vec])
w2v_tokenizer.trainable = False


In [None]:
class W2VModel(tf.keras.Model):
    """Feature extractor that joins the mean history embedding with the target embedding.

    Note that the forward pass is defined by overriding ``__call__`` directly.
    """

    def __init__(self, w2v_tokenizer):
        super().__init__()
        self.w2v_tokenizer = w2v_tokenizer
        self.gap = tf.keras.layers.GlobalAvgPool1D()

    @tf.function
    def __call__(self, X, y, exist_mask):
        """Return ``concat([masked mean of tokenizer(X), tokenizer(y)], axis=1)``."""
        pooled_history = self.gap(self.w2v_tokenizer(X), exist_mask)
        target_emb = self.w2v_tokenizer(y)
        return tf.concat([pooled_history, target_emb], axis=1)

In [None]:
# Wrap the word2vec tokenizer as a non-trainable feature extractor and put a fresh
# RiiidTop head on top of it.
w2v_layer = W2VModel(w2v_tokenizer)
w2v_layer.trainable = False
w2v_riiid_model = RiiidModel(w2v_layer, RiiidTop([128, 128]))

In [None]:
from adabelief_tf import AdaBeliefOptimizer
# These rebind the notebook-level loss_object / optimizer / metric names for the word2vec
# fine-tuning stage; the head ends in a sigmoid, so the loss is used with its defaults.
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = AdaBeliefOptimizer(learning_rate=1e-3, weight_decay=1e-4) 

# Running metrics; the loop resets them after every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_auc = tf.keras.metrics.AUC(name="test_auc")

In [None]:
from time import time

window = 32          # maximum history length taken per sample
batch_size = 2**13   # samples per batch
start_time = time()  # NOTE(review): not read again in the visible part of the notebook
# Per-epoch metric histories, appended to by the loop in this cell.
train_losses = []
test_losses = []
test_aucs = []

# The former two-argument `preprocess(X, y)` that stood here was removed: it was
# redefined a few lines further down in the same cell, so it could never be called.

def padding_sequence(X, y):
    """Left-pad the histories in X with 0 into an int32 matrix; return y as an int32 tensor."""
    X = tf.keras.preprocessing.sequence.pad_sequences(X, dtype="int32", value=0, padding="pre")
    y = tf.constant(y, dtype="int32")
    return X, y

@tf.function
def preprocess(X, y, num_split):   
    """Derive masks and one-hot targets for chunked in-batch scoring.

    Returns ``(X, y, target, exist_mask, target_mask)``: exist_mask is True where X is
    non-zero; target stacks one identity matrix per chunk of y; target_mask stacks, per
    chunk, a matrix that is hugely negative on off-diagonal cells whose two samples
    share the same y value and 0 elsewhere.
    """
    exist_mask = X > 0

    target_masks = []
    targets = []
    for y_split in tf.split(y, num_or_size_splits=num_split, axis=0):
        target = tf.eye(len(y_split))
        # row[i][j] == y_split[i], col[i][j] == y_split[j]
        row = tf.reshape(tf.repeat(y_split, len(y_split)), (len(y_split), len(y_split)))
        col = tf.transpose(row)
        target_mask = tf.cast(tf.where(tf.logical_and(target != 1., row == col), tf.float32.min / 100, 0), "float32")
        
        targets.append(target)
        target_masks.append(target_mask)
    
    target = tf.concat(targets, axis=0)
    target_mask = tf.concat(target_masks, axis=0)
    
    return X, y, target, exist_mask, target_mask

@tf.function
def train_step(X, y, exist_mask, target_mask, window, batch_size, num_split):
    """Run one optimisation step of `w2v_riiid_model` and return the loss.

    The label is 1.0 where the answer slot of y (y % 4) equals the stored correct
    answer for that choice id, else 0.0. `target_mask`, `window`, `batch_size` and
    `num_split` are accepted but not used by this step.
    """
    answered_correctly = y%4 == tf.gather(correcr_answers, y)
    label = tf.reshape(tf.cast(answered_correctly, "float32"), (-1, 1))

    with tf.GradientTape() as tape:
        loss = loss_object(label, w2v_riiid_model(X, y, exist_mask))

    variables = w2v_riiid_model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
    return loss

# Metrics of the last completed epoch, shown in the progress bar while the next one runs.
last_train_loss = np.inf
last_test_loss = np.inf
last_test_auc = np.inf

with tqdm(total=64) as pbar:
    for epoch in range(64):
        # Training pass: one randomly positioned sample per training user.
        for i, (X, y) in enumerate(sampler.stream(train_user, window=window, batch_size=batch_size)):
            X, y = padding_sequence(X, y)
            X, y, target, exist_mask, target_mask = preprocess(X, y, num_split)
            loss = train_step(X, y, exist_mask, target_mask, window, batch_size, num_split)
            
            train_loss(loss)
            learning_text = "[{}/{}] ".format(str(i).zfill(3), len(train_user)//batch_size)
            progress_text = "train | Loss: {:.5f} test| Loss{: .5f} auc{: .5f}".format(train_loss.result(), last_test_loss, last_test_auc)
            pbar.set_postfix_str(learning_text + progress_text)
            
        last_train_loss = train_loss.result()
        
        # Evaluation pass on held-out users. The per-epoch `content_embeddings` lookup
        # that used to precede this loop was dropped because nothing here read it.
        for i, (X, y) in enumerate(sampler.stream(test_user, window=window, batch_size=batch_size)):
            X, y = padding_sequence(X, y)
            X, y, target, exist_mask, target_mask = preprocess(X, y, num_split)
            # 1.0 where the chosen answer slot matches the stored correct answer.
            label = tf.reshape(tf.cast(y%4 == tf.gather(correcr_answers, y), "float32"), (-1, 1))
    
            pred = w2v_riiid_model(X, y, exist_mask)
            loss = loss_object(label, pred)
            
            # metric display
            test_loss(loss)
            test_auc(label, pred)
            progress_text = "train | Loss: {:.5f} test| Loss{: .5f} auc{: .5f}".format(last_train_loss, test_loss.result(), test_auc.result())
            pbar.set_postfix_str(progress_text)

        last_test_loss = test_loss.result()
        last_test_auc = test_auc.result()
        pbar.update(1)

        # Record this epoch's values, then clear the running metrics for the next epoch.
        train_losses.append(train_loss.result())
        test_losses.append(test_loss.result())
        test_aucs.append(test_auc.result())

        train_loss.reset_states()
        test_loss.reset_states()
        test_auc.reset_states()

In [None]:
import matplotlib.pyplot as plt
plt.plot(train_losses)
plt.plot(test_losses)

In [None]:
plt.plot(test_aucs)