In [None]:
import ast
import os

import numpy as np
import pandas as pd
import riiideducation
import sklearn
from tqdm import tqdm

# Record which scikit-learn build this kernel is running.
print(sklearn.__version__)

# Per-user history store keyed by user_id; entries are created by
# read_user_cache and extended by update_user_cache_q / update_ethe.
user_cache = {}

# Competition API handle; its iter_test() / predict() drive the inference loop.
env = riiideducation.make_env()

In [None]:
# Question metadata, indexed by question_id.
questions_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv")
questions_df = questions_df.set_index("question_id")

# One past the largest question id: the first id not used by any question.
qid_max = 1 + questions_df.index.max()

# Displayed as the cell output: number of questions per part.
questions_df["part"].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Lecture metadata. The encoder is fitted on the raw lecture ids, but its
# transform is only referenced from the commented-out part of encode_cid.
lectures_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/lectures.csv")
le = LabelEncoder().fit(lectures_df["lecture_id"].values)

def encode_cid(x):
    """Map lecture ids into the content-id space shared with questions.

    The per-lecture offset is disabled, so the argument is ignored and every
    lecture receives the single id `qid_max` (the first id after the
    questions).
    """
    shared_lecture_id = qid_max  # + le.transform(x)
    return shared_lecture_id
# Lectures get a dedicated part value of 8.
lectures_df["part"] = 8

# Re-key lectures by their encoded content id (a single shared id, see
# encode_cid), then derive the size of the combined content-id space.
lectures_df["lecture_id"] = encode_cid(lectures_df["lecture_id"])
lectures_df = lectures_df.set_index("lecture_id")
lid_max = 1 + lectures_df.index.max()

# Displayed as the cell output.
lectures_df["part"].value_counts()

In [None]:
# content id -> part lookups for questions and for (encoded) lectures.
qpart_dict = questions_df["part"].to_dict()
lpart_dict = lectures_df["part"].to_dict()

# Displayed as the cell output: number of distinct keys in each lookup.
(len(lpart_dict), len(qpart_dict))

In [None]:
# Correct answer of every question as a flat array. It is indexed with
# content_id elsewhere, which assumes question ids are 0..N-1 in row order
# (TODO confirm against questions.csv).
CA = questions_df["correct_answer"].to_numpy()
CA

In [None]:
import torch.nn as nn
import torch
from collections import OrderedDict


# Default width of RidModel's part embedding (its `emb_dim` argument).
EMB_DIM = 8


def l2norm(q):
    """Scale every vector along dim 2 of `q` to unit Euclidean length.

    The norm is detached, so gradients treat the scale factor as a constant.
    No epsilon is added: an all-zero vector yields NaN.
    """
    scale = torch.norm(q, p=2, dim=2, keepdim=True).detach()
    # keepdim=True lets the division broadcast over the last axis.
    return q / scale


class RidModel(nn.Module):
    """Scores one target content item against a user's interaction history.

    The history is embedded step by step, passed through a gated projection
    and two stacked (reversed GRU -> forward GRU) stages; the last time step
    is combined with the target's own features and fed to an MLP that emits
    a single un-activated score per sample (the sigmoid is applied by
    RidEnsembleModel.forward).

    Attribute names double as checkpoint state-dict keys, so they must not be
    renamed without re-exporting the weights.
    """
    def __init__(self, gru_dim=128, emb_dim=EMB_DIM):
        """Create all layers.

        Args:
            gru_dim: output width of the gated projection and hidden size of
                every GRU.
            emb_dim: width of the part embedding.
        """
        super(RidModel, self).__init__()
        # 2-wide per-content table. At inference time
        # RidEnsembleModel.update_difficulty_emb adds per-content outcome
        # counts to these weights.
        self.content_difficulty_emb = nn.Embedding(lid_max, embedding_dim=2)
        # Content and answer embeddings share width 4 so they can be compared
        # with a dot product in forward().
        self.content_answer_emb = nn.Embedding(lid_max, embedding_dim=4)
        self.answer_emb = nn.Embedding(5, embedding_dim=4)
        
        self.num_heads = 8
        self.content_emb_size = 184
        # Width of [content embedding ; part embedding].
        total_emb_size = self.content_emb_size + emb_dim
        self.content_emb = nn.Embedding(lid_max, embedding_dim=self.content_emb_size)
        # One linear projection per similarity head (despite the name these
        # are nn.Linear layers, not convolutions).
        self.sim_convs = nn.ModuleList([nn.Linear(total_emb_size, 32) for i in range(self.num_heads)])
        
        self.part_emb = nn.Embedding(lectures_df.part.max() + 1, embedding_dim=emb_dim)
        # Per-step input width: 10 = 2 (difficulty) + 3 (ethe_hist columns)
        # + 5 scalar channels, plus one similarity per head and the
        # content/part embedding.
        self.hidden = nn.Sequential(nn.Linear(10 + self.num_heads + total_emb_size, gru_dim), nn.Tanh())
        self.gate = nn.Sequential(nn.Linear(10 + self.num_heads + total_emb_size, gru_dim), nn.Sigmoid())
        
        # Stage 1: GRU over the reversed sequence, then a GRU over
        # [input ; re-flipped reversed output] (hence the 2*gru_dim input).
        self.inv_gru = nn.GRU(gru_dim, gru_dim, batch_first=True)
        self.gru = nn.GRU(2*gru_dim, gru_dim, batch_first=True)
        
        # Stage 2: same layout applied to the output of stage 1.
        self.inv_gru2 = nn.GRU(gru_dim, gru_dim, batch_first=True)
        self.gru2 = nn.GRU(2*gru_dim, gru_dim, batch_first=True)
        # Head input: 2 (target difficulty) + gru_dim (last step) +
        # total_emb_size (target content/part embedding) + 3 (numeric).
        self.final = nn.Sequential(nn.Linear(2 + gru_dim + total_emb_size + 3, 128),
                                   nn.BatchNorm1d(128),
                                   nn.ReLU(),
                                   nn.Linear(128, 32),
                                   nn.BatchNorm1d(32),
                                   nn.ReLU(),
                                   nn.Linear(32, 1))
        
    def forward(self, inputs):
        """Compute one un-activated score per sample.

        Args:
            inputs: 11-tuple in exactly the order produced by
                RidInferenceDataset.__getitem__ (after batching). The
                `*_hist` entries are assumed to be (batch, seq_len[, ...])
                and `content` / `part` (batch, 1) — TODO confirm shapes
                against the training pipeline.

        Returns:
            Tensor with one column (output of the final Linear(32, 1)).
        """
        target_hist, ct_hist, content_hist, part_hist, time_hist, tcid_hist, answer_hist, ethe_hist, content, part, numeric = inputs
        
        # NOTE(review): RidInferenceDataset builds tcid_hist as
        # (history container id - current container id); negative values are
        # clamped to 0 here before the log — confirm the sign is intended.
        tcid_hist = torch.log(1 + torch.clamp(tcid_hist, 0, None))
        
        # Difficulty values are floored at 3 before taking the log.
        content_difficulty_hist = self.content_difficulty_emb(content_hist)
        content_difficulty_hist = torch.log(torch.clamp(content_difficulty_hist, 3, None))
        part_hist = self.part_emb(part_hist)
        
        content_difficulty = self.content_difficulty_emb(content).squeeze(1)
        content_difficulty = torch.log(torch.clamp(content_difficulty, 3, None))
        part = self.part_emb(part)
        
        
        # [content embedding ; part embedding] for history steps and target.
        content_similarity_hist = torch.cat([self.content_emb(content_hist), part_hist], -1)
        content_similarity = torch.cat([self.content_emb(content), part], -1)
        
        
        # Per head: project history steps and the target with the same layer,
        # unit-normalize both, and take their dot product (cosine similarity)
        # for every history step.
        sim_features = []
        for i in range(self.num_heads):
            a = l2norm(self.sim_convs[i](content_similarity_hist))
            b = l2norm(self.sim_convs[i](content_similarity)).repeat(1, content_similarity_hist.shape[1], 1)
            sim_features.append((a*b).sum(axis=2))
        
        # Cosine similarity between each past content's "answer" embedding and
        # the embedding of the answer the user actually gave.
        ca_hist = l2norm(self.content_answer_emb(content_hist))
        a_hist = l2norm(self.answer_emb(answer_hist))
        ca_hist = (ca_hist*a_hist).sum(axis=2)
        
        # Assemble the per-step feature vector; scalar channels get a trailing
        # axis so everything concatenates along dim 2.
        x = torch.cat([content_difficulty_hist, ethe_hist, content_similarity_hist] + 
                      [x.unsqueeze(-1) for x in [target_hist, ct_hist, time_hist, ca_hist, tcid_hist] + sim_features], 
                      axis=2)

        # Gated projection: tanh branch modulated by a sigmoid gate.
        x = self.hidden(x)*self.gate(x)
        # Stage 1: run a GRU over the time-reversed sequence, flip its output
        # back, and feed [x ; reversed-pass output] to the forward GRU.
        x_inv, _ = self.inv_gru(torch.flip(x, (1,)))
        x = torch.cat([x, torch.flip(x_inv, (1,))], -1)
        x, _ = self.gru(x)
        # Stage 2: same pattern on top of stage 1.
        x_inv, _ = self.inv_gru2(torch.flip(x, (1,)))
        x = torch.cat([x, torch.flip(x_inv, (1,))], -1)
        x, _ = self.gru2(x)
        # Keep only the last time step and append the target's own features.
        x = torch.cat([x[:, -1], content_difficulty, content_similarity.squeeze(1), numeric], axis=1)
        x = self.final(x)
        return x
    
    
class RidEnsembleModel(nn.Module):
    """Averages the sigmoid outputs of several RidModel checkpoints.

    Args:
        versions: iterable of checkpoint names; each is loaded from
            ``/kaggle/input/riiid-models/<name>.pth``.
    """

    def __init__(self, versions):
        super(RidEnsembleModel, self).__init__()
        # Prefix found on keys of checkpoints whose modules were saved under
        # a `module.` wrapper.
        wrapper_prefix = "module."
        members = []
        for version in versions:
            member = RidModel()
            # Load onto CPU regardless of the device the checkpoint was saved
            # from; the caller moves the whole ensemble afterwards.
            state_dict = torch.load(f"/kaggle/input/riiid-models/{version}.pth", map_location="cpu")
            cleaned_state = OrderedDict()
            for key, tensor in state_dict.items():
                # Strip the prefix only when it is really there, so a
                # checkpoint saved without the wrapper is not corrupted.
                if key.startswith(wrapper_prefix):
                    key = key[len(wrapper_prefix):]
                cleaned_state[key] = tensor
            member.load_state_dict(cleaned_state)
            members.append(member)
        self.models = nn.ModuleList(members)

    def update_difficulty_emb(self, emb):
        """Add `emb` in place to every member's content_difficulty_emb weights.

        Args:
            emb: array-like with the same shape as the embedding weight
                (one row per content id, two columns).

        The increment is converted to each weight's own dtype and device, and
        the update runs with autograd disabled, so it works on CPU or GPU and
        with or without an enclosing ``torch.no_grad()``.
        """
        with torch.no_grad():
            for member in self.models:
                weight = member.content_difficulty_emb.weight
                weight += torch.as_tensor(emb, dtype=weight.dtype, device=weight.device)

    def forward(self, x):
        """Return the mean of the members' sigmoid-activated predictions."""
        member_probs = [member(x).sigmoid() for member in self.models]
        return sum(member_probs) / len(self.models)
    
# Build the five-checkpoint v56 ensemble and move it to the GPU.
model = RidEnsembleModel(["v56_0", "v56_1", "v56_2", "v56_3", "v56_4"]).cuda()

# Switch to inference mode (affects the BatchNorm layers in each member);
# as the last expression this also displays the module tree.
model.eval()

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


class RidInferenceDataset(Dataset):
    """Builds model inputs for each row of a test batch from the cached user history.

    Reads the module-level `user_cache`, `qpart_dict` and `CA`; every user in
    `test_df` must already have an entry in `user_cache`.
    """

    def __init__(self, test_df, max_seq_len=256):
        """
        Args:
            test_df: rows to score; __getitem__ reads `user_id`, `content_id`,
                `timestamp` and `task_container_id` from each row.
            max_seq_len: fixed history length every `*_hist` output is
                padded / truncated to.
        """
        self.test_df = test_df
        #self.test_df["prior_question_elapsed_time"].fillna(-1000, inplace=True)
        #self.test_df["prior_question_had_explanation"].fillna(False, inplace=True)
        self.max_seq_len = max_seq_len
        
    def _pad(self, array, pad_val):
        """Left-pad `array` along axis 0 to `max_seq_len`, or keep its last `max_seq_len` items.

        The padded result is float32; the truncation branch returns a slice
        of the input with its original dtype.
        """
        if len(array) >= self.max_seq_len:
            return array[-self.max_seq_len:]
        shape = list(array.shape)
        shape[0] = self.max_seq_len
        x = np.ones(shape, dtype=np.float32)*pad_val
        if len(array) > 0:
            # Right-align so the most recent entries sit at the end.
            x[-len(array):] = array
        return x

    def __getitem__(self, idx):
        """Return the 11 input tensors for row `idx`, in the order RidModel.forward unpacks them."""
        row = self.test_df.iloc[idx].copy()
        
        user_df = dict(user_cache[row["user_id"]]) # copy
            
        row["part"] = qpart_dict[row["content_id"]]

        #df.sort_values("timestamp", inplace=True)
        
        # Users without cached history have ethe_hist = None; fall back to a
        # single all-zero row.
        if user_df["ethe_hist"] is not None:
            ethe_hist = np.log1p(user_df["ethe_hist"])
        else:
            ethe_hist = np.log1p(np.zeros((1, 3), dtype=np.float32))
        
        # log1p of the gap between this row and each past interaction,
        # with timestamps scaled by 1e-6.
        times = np.log1p((row["timestamp"] - user_df["timestamp"])*1e-6)
        
        # Past container id minus the current one.
        tcid_hist = user_df["task_container_id"] - row["task_container_id"]
        
        content_type_hist = user_df["content_type_id"]
        # Answers are shifted by +1 on both sides of the comparison below;
        # presumably so a -1 "no answer" code maps to index 0 of the 5-row
        # answer embedding — TODO confirm.
        user_answer = user_df["user_answer"] + 1
        correct_answer = CA[row["content_id"]] + 1
        
        numeric = np.zeros(3)
        # Question answers only, most recent first.
        valid_answers = user_answer[content_type_hist == 0][::-1]
        # numeric[0]: how many answers ago the user last chose the option that
        # is correct for this question (history length if never).
        last_time_same_answer = np.where(valid_answers == correct_answer)[0]
        if len(last_time_same_answer) > 0:
            last_time_same_answer = last_time_same_answer[0]
        else:
            last_time_same_answer = len(valid_answers)
        numeric[0] = last_time_same_answer
        
        # numeric[1] / numeric[2]: length of the current run of identical
        # answers, stored in slot 1 when that repeated answer equals this
        # question's correct answer and in slot 2 otherwise.
        if len(valid_answers) > 0:
            last_answer = valid_answers[0]
            seq_len = np.where(valid_answers != last_answer)[0]
            if len(seq_len) > 0:
                seq_len = seq_len[0]
            else:
                # NOTE(review): when every past answer is identical the run
                # length is recorded as 1 rather than len(valid_answers) —
                # confirm this matches the training-time feature.
                seq_len = 1
            if last_answer == correct_answer:
                numeric[1] = seq_len
            else:
                numeric[2] = seq_len
                
        numeric = np.log1p(numeric)
        
        # Insertion order matters: the values are returned positionally and
        # unpacked in this order by RidModel.forward.
        # target_hist is +1 / -1 for question rows and 0 where
        # content_type_hist is 1.
        outputs = {"target_hist": (1 - content_type_hist)*(2*user_df["answered_correctly"] - 1),
                   "content_type_hist": content_type_hist,
                   "content_hist": user_df["content_id"],
                   "part_hist": user_df["part"],
                   "time_hist": times,
                   "tcid_hist": tcid_hist,
                   "answer_hist": user_answer,
                   "ethe_hist": ethe_hist,
                   "content": [row["content_id"]],
                   "part": [row["part"]],
                   "numeric": numeric
                  }
        
        # History arrays are zero-padded to max_seq_len; embedding indices
        # become LongTensors, everything else FloatTensors.
        for key in outputs.keys():
            if "hist" in key:
                outputs[key] = self._pad(outputs[key], 0)
            if key in {"content_hist", "content", "part_hist", "part", "answer_hist"}:
                outputs[key] = torch.LongTensor(outputs[key])
            else:
                outputs[key] = torch.FloatTensor(outputs[key])
        
        return tuple(o for o in outputs.values())


    def __len__(self):
        """Number of rows in the test batch."""
        return len(self.test_df)
    
#example_dataset = RidInferenceDataset(test_df)
#example_dataset[0]

In [None]:
# Per-interaction fields kept in every user's cached history, with the
# dtype each one is stored as.
columns = {
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32",
    "prior_question_had_explanation": "boolean",
}

def concat(arr1, arr2):
    """Append `arr2` to `arr1`, treating a missing or empty `arr1` as "no history".

    Returns `arr2` itself (not a copy) when `arr1` is None or has length 0;
    otherwise returns the concatenation of the two arrays.
    """
    has_history = arr1 is not None and len(arr1) > 0
    return np.concatenate([arr1, arr2]) if has_history else arr2


def read_ethe(user_dict):
    """Rebuild the per-interaction (elapsed time, had explanation, time gap) history.

    The `prior_question_*` values stored on a row describe the previous task
    container, so the history is walked from newest to oldest and each row
    receives the values read from the following container. Rows inside one
    container share the same triple. The most recent row (and any older rows
    in its container) stays NaN until update_ethe fills it.

    Args:
        user_dict: mapping with equally long arrays under "content_id",
            "task_container_id", "timestamp", "prior_question_elapsed_time"
            and "prior_question_had_explanation"; must be non-empty.

    Returns:
        float32 array of shape (n_rows, 3): elapsed time * 1e-6,
        explanation flag as 0/1, and timestamp gap (in timestamp * 1e-6
        units) to the following container.
    """
    containers = user_dict["task_container_id"]
    elapsed_hist = user_dict["prior_question_elapsed_time"]
    explained_hist = user_dict["prior_question_had_explanation"]

    n_rows = len(user_dict["content_id"])
    ethe_hist = np.full((n_rows, 3), np.nan, dtype=np.float32)

    # Values carried over from the container that follows the current row.
    next_elapsed = elapsed_hist[-1]
    next_explained = explained_hist[-1]
    current_container = containers[-1]

    seconds_hist = user_dict["timestamp"] * 1e-6
    next_seconds = seconds_hist[-1]

    # Negative indices -2, -3, ..., -n_rows: newest-but-one down to oldest.
    for ix in range(-2, -n_rows - 1, -1):
        if containers[ix] == current_container:
            # Same container as the row after it: copy that row's triple.
            ethe_hist[ix] = ethe_hist[ix + 1]
            continue

        current_container = containers[ix]
        if pd.isna(next_elapsed):
            next_elapsed = 0.0
        if pd.isna(next_explained):
            next_explained = False

        ethe_hist[ix, 0] = next_elapsed * 1e-6
        ethe_hist[ix, 1] = 1.0 * next_explained
        ethe_hist[ix, 2] = next_seconds - seconds_hist[ix]

        # This row's own values describe the container before it.
        next_elapsed = elapsed_hist[ix]
        next_explained = explained_hist[ix]
        next_seconds = seconds_hist[ix]

    return ethe_hist
    

def update_ethe(df, users):
    """Fill the still-unknown tail of each user's `ethe_hist` from the new batch.

    For every user with both cached `ethe_hist` and rows in `df`, the
    trailing run of rows whose third column is NaN receives the
    `prior_question_*` values of the user's first new row and the timestamp
    gap to that row. Users without cached history or without rows in `df`
    are left untouched. Mutates the module-level `user_cache`.

    Args:
        df: current batch (question rows).
        users: user ids to process; each must be a key of `user_cache`.
    """
    for user in users:
        pending = user_cache[user]["ethe_hist"]
        rows = df[df["user_id"] == user].copy()

        if pending is None or rows.shape[0] == 0:
            continue

        # Only the user's first row in the batch is consulted.
        elapsed = (rows["prior_question_elapsed_time"].fillna(0.0)*1e-6).values[0]
        explained = (rows["prior_question_had_explanation"].fillna(False)*1.0).values[0]

        past_ts = user_cache[user]["timestamp"]
        first_ts = rows["timestamp"].values[0]

        # Walk backwards from the newest cached row until an already
        # filled row is reached.
        for ix in range(-1, -pending.shape[0] - 1, -1):
            if not np.isnan(pending[ix, 2]):
                break
            pending[ix, 0] = elapsed
            pending[ix, 1] = explained
            pending[ix, 2] = 1e-6*(first_ts - past_ts[ix])

        user_cache[user]["ethe_hist"] = pending


def update_user_cache_q(df, users):
    """Append the previous batch's answered questions to the user cache.

    For each user with rows in `df`, every tracked column (plus the derived
    "part") is appended to the cached history, and NaN placeholder rows are
    appended to "ethe_hist" for update_ethe to fill later. Finally the
    per-content (answered_correctly == 0, == 1) counts of the batch are
    added to the ensemble's difficulty embedding.

    Args:
        df: question rows of the previous batch, including "user_answer" and
            "answered_correctly".
        users: user ids to process; users with rows in `df` must already be
            keys of `user_cache`.
    """
    outcome_counts = np.zeros((lid_max, 2))
    tracked_keys = list(columns.keys()) + ["part"]

    for user in users:
        rows = df[df["user_id"] == user].copy()
        if rows.shape[0] == 0:
            continue

        history = user_cache[user]

        content_id = rows["content_id"].values
        target = rows["answered_correctly"].values

        rows["part"] = np.array([qpart_dict[cid] for cid in content_id])
        rows["content_id"] = content_id

        for key in tracked_keys:
            history[key] = concat(history[key], rows[key].values)

        # Placeholders for the new rows; their values are not known yet.
        unfilled = np.zeros((len(content_id), 3), dtype=np.float32)*np.nan
        history["ethe_hist"] = concat(history["ethe_hist"], unfilled)

        user_cache[user] = history

        # Column index is the outcome itself (0 or 1).
        for cid, outcome in zip(content_id, target):
            outcome_counts[cid, outcome] += 1

    with torch.no_grad():
        model.update_difficulty_emb(outcome_counts)

        
def hash_folder(x, k=16):
    """Return the three-level directory prefix for id `x`.

    The levels are the three least-significant base-`k` digits of `x`,
    lowest digit first, each followed by a slash (e.g. "4/3/2/").
    """
    levels = []
    for _ in range(3):
        x, digit = divmod(x, k)
        levels.append(f"{digit}/")
    return "".join(levels)

def read_user_cache(users):
    """Load the stored history of every not-yet-cached user into `user_cache`.

    Each user's history is read from a per-user .npz file; if it cannot be
    loaded the user is treated as new and starts with empty arrays. Only
    question rows (content_type_id == 0) are kept. The entry also gets a
    derived "part" array and an "ethe_hist" array (None for users without
    history).

    Args:
        users: iterable of user ids; ids already in `user_cache` are skipped.
    """
    for user in users:
        if user in user_cache:
            continue

        folder = f"/kaggle/input/riiid-partitioned/npdata/{hash_folder(user)}"
        filename = f"{folder}{user}.npz"

        user_dict = dict()

        try:
            # NOTE(security): allow_pickle=True unpickles object arrays and
            # can execute code from the file; only safe for trusted data.
            # The context manager closes the archive, so handles do not
            # pile up as more users are loaded.
            with np.load(filename, allow_pickle=True) as user_data:
                for key in columns.keys():
                    user_dict[key] = user_data[key]
        except Exception:
            # Best effort: a missing or unreadable file means "new user".
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            for key in columns.keys():
                user_dict[key] = np.array([])

        # Keep question rows only.
        questions = np.where(user_dict["content_type_id"] == 0)[0]

        for key in columns.keys():
            user_dict[key] = user_dict[key][questions]

        user_dict["part"] = np.array([qpart_dict[x] for x in user_dict["content_id"]])

        if len(user_dict["part"]) > 0:
            user_dict["ethe_hist"] = read_ethe(user_dict)
        else:
            user_dict["ethe_hist"] = None
        user_cache[user] = user_dict

In [None]:
def read_data(data):
    """Move every element of a batch to the GPU and return them as a tuple."""
    on_gpu = []
    for item in data:
        on_gpu.append(item.cuda())
    return tuple(on_gpu)

def predict(eval_loader):
    """Run the global `model` over every batch and return a flat prediction array.

    Args:
        eval_loader: iterable of batches accepted by `read_data`.

    Returns:
        1-D NumPy array with one prediction per sample, in loader order.
        An empty float32 array is returned when the loader yields no
        batches (e.g. a test group that contains no question rows), instead
        of letting `np.concatenate` fail on an empty list.
    """
    preds = []

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for data in eval_loader:
            inputs = read_data(data)
            pred = model(inputs)
            preds.append(pred.detach().cpu().numpy().ravel())

    if not preds:
        return np.empty(0, dtype=np.float32)

    return np.concatenate(preds)

In [None]:
TARGET = "answered_correctly"
BS = 64  # DataLoader batch size
NW = 1   # DataLoader worker processes


# Batch scored in the previous iteration; its labels only arrive with the
# next batch.
previous_test_df = None

iter_test = tqdm(env.iter_test())
for (test_df, sample_prediction_df) in iter_test:

    if previous_test_df is not None:
        # The first row of the new batch carries the previous batch's
        # outcomes and responses as list literals. They are parsed with
        # ast.literal_eval so that only Python literals are accepted
        # (eval would execute arbitrary expressions).
        previous_test_df[TARGET] = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
        previous_test_df["user_answer"] = ast.literal_eval(test_df["prior_group_responses"].iloc[0])
        # `relevant_users` still holds the users of the previous batch here.
        update_user_cache_q(previous_test_df[previous_test_df['content_type_id'] == 0].reset_index(drop=True), relevant_users)

    relevant_users = np.unique(test_df["user_id"].values)
    read_user_cache(relevant_users)

    # Only question rows are scored.
    eval_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    update_ethe(eval_df, relevant_users)

    eval_dataset = RidInferenceDataset(eval_df)
    eval_loader = DataLoader(eval_dataset, batch_size=BS, shuffle=False, num_workers=NW, pin_memory=False, drop_last=False)

    eval_df[TARGET] = predict(eval_loader)

    env.predict(eval_df[['row_id', TARGET]])

    previous_test_df = test_df.copy()