In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for folder, _, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in files]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
import gc
import random
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import os
def seed_everything(seed = 42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed passed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``; its string form is also stored in the
            ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    # `torch.set_deterministic` was superseded by
    # `torch.use_deterministic_algorithms` and no longer exists in newer
    # PyTorch releases, so resolve whichever name this installation provides
    # instead of hard-coding the old one.
    set_deterministic = (
        getattr(torch, 'use_deterministic_algorithms', None)
        or getattr(torch, 'set_deterministic', None)
    )
    if set_deterministic is not None:
        # Same argument as before: deterministic-only algorithms stay disabled.
        set_deterministic(False)
seed_everything()

# Compact dtypes for the train.csv columns that are read below.
dtype = {'timestamp': 'int64', 'user_id': 'int32' ,'content_id': 'int16','content_type_id': 'int8','answered_correctly':'int8'}

# Columns are selected by position. Judging from the dtype keys and the column
# names used later in the notebook they are expected to be timestamp, user_id,
# content_id, content_type_id, task_container_id and answered_correctly
# -- TODO confirm against the CSV header.
train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', usecols=[1, 2, 3, 4, 5, 7], dtype=dtype)

# Keep only the rows whose content_type_id is 0/False, then drop that column
# since it is constant afterwards.
train_df = train_df[train_df.content_type_id == False]
del train_df['content_type_id']
gc.collect()
# Global chronological order; the per-user grouping later inherits this order.
train_df = train_df.sort_values(['timestamp'], ascending=True).reset_index(drop = True)

questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
# Give every distinct raw `tags` value an integer id in order of first
# appearance. `unique()` already yields distinct values, so no membership
# check is needed while building the mapping.
dict_tags = {tags: i for i, tags in enumerate(questions['tags'].unique())}
# Assign the result back explicitly: calling an inplace method on
# `questions.tags` is a chained operation that does not update the frame
# under pandas copy-on-write.
questions['tags'] = questions['tags'].replace(dict_tags)
questions['part'] = questions['part'].astype(np.int8)
questions['tags'] = questions['tags'].astype(np.int16)
questions = questions.rename(columns = {'question_id': 'content_id'})
# Attach the part and the encoded tags of each question to its training rows.
train_df = train_df.merge(questions[['content_id', 'part', 'tags']], on='content_id', how='left')
# One "skill" per distinct question id (computed once; the original computed
# and printed the same value a second time before the rename).
skills = questions["content_id"].unique()
n_skill = len(skills)
print("number skills", len(skills))

In [None]:
%%time
# Columns collected for each user, in the order in which the resulting tuple
# is unpacked by the rest of the notebook.
_HISTORY_COLUMNS = (
    'content_id',
    'answered_correctly',
    'task_container_id',
    'timestamp',
    'part',
    'tags',
)


def _user_history(rows):
    """Return one user's rows as a tuple holding the value array of each history column."""
    return tuple(rows[column].values for column in _HISTORY_COLUMNS)


# Series indexed by user_id whose values are the per-user history tuples.
group = train_df.groupby('user_id').apply(_user_history)
# The flat frame is no longer needed once the histories are built.
del train_df
gc.collect()

In [None]:
# Sequence length; used as the default `max_seq` of BERTModel and TestDataset
# and as the cap on the per-user history kept during inference.
MAX_SEQ = 180
# Not referenced anywhere else in the visible notebook.
ACCEPTED_USER_CONTENT_SIZE = 4
# Not referenced in the visible notebook; BERTModel's `embed_dim` default is a
# separate literal 128.
EMBED_SIZE = 128
# Not referenced in the visible notebook; the inference DataLoader passes its
# own literal batch size.
BATCH_SIZE = 64
# Not referenced in the visible notebook; the layers use a literal 0.2.
DROPOUT = 0.1

In [None]:
class FFN(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout(0.2).

    Both linear layers map ``state_size`` features to ``state_size`` features,
    so the output has the same shape as the input.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        # Attribute names and creation order are left untouched so that
        # previously saved modules of this class keep working.
        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Apply the two linear layers with a ReLU in between, then dropout."""
        hidden = self.relu(self.lr1(x))
        return self.dropout(self.lr2(hidden))

def future_mask(shape):
    """Return a boolean tensor of the given 2-D shape that is True strictly above the main diagonal."""
    strictly_upper = np.triu(np.ones(shape), k=1)
    return torch.from_numpy(strictly_upper.astype('bool'))

class SubLayer(nn.Module):
    """Masked multi-head attention followed by a feed-forward block.

    Both stages use a residual connection and share the single LayerNorm
    instance ``layer_normal``. ``self.dropout`` is created but not used in
    ``forward``.
    """

    def __init__(self,embed_dim):
        super().__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)

    def forward(self, e, x):
        """Attend from ``e`` (query) over ``x`` (key and value).

        Args:
            e: Query tensor, laid out as [s_len, bs, embed].
            x: Key/value tensor with the same layout.

        Returns:
            Tuple ``(output, att_weight)`` where ``output`` is laid out as
            [bs, s_len, embed] and ``att_weight`` is the weight tensor returned
            by ``nn.MultiheadAttention``.
        """
        # Place the mask on the query's own device rather than on a
        # module-level `device` global, so the layer works wherever its inputs
        # live and does not depend on a variable defined in another cell.
        att_mask = future_mask(shape=(e.size(0), x.size(0))).to(e.device)
        att_output, att_weight = self.multi_att(e, x, x, attn_mask=att_mask)
        att_output = self.layer_normal(att_output + e)
        att_output = att_output.permute(1, 0, 2) # att_output: [s_len, bs, embed] => [bs, s_len, embed]

        x = self.ffn(att_output)
        x = self.layer_normal(x + att_output)
        return x, att_weight
class BERTModel(nn.Module):
    """Single-layer attention model producing one logit per sequence position.

    Every input is a tensor of integer indices laid out as [bs, s_len] and is
    looked up in its own embedding table. The answer embedding plus a learned
    position embedding is added to each of the five feature embeddings before
    they are concatenated and projected back to ``embed_dim``.

    Attribute names (including the ``bacth_norm`` spelling) are kept as they
    are because previously saved modules of this class rely on them.
    """

    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=128):
        super().__init__()
        # Embedding tables, one per input feature.
        self.pos_embedding = nn.Embedding(max_seq, embed_dim)
        self.embedding = nn.Embedding(n_skill+1, embed_dim)
        self.ans_embedding = nn.Embedding(3, embed_dim)
        self.time_embedding = nn.Embedding(10000, embed_dim)
        self.lag_time_embedding = nn.Embedding(3600, embed_dim)
        self.elapsed_time_embedding = nn.Embedding(1520, embed_dim)
        self.part_embedding = nn.Embedding(8, embed_dim)

        # Attention + feed-forward block.
        self.sub1 = SubLayer(embed_dim)

        # `fc1` fuses the five concatenated embeddings; `fc` widens the
        # attention output ahead of the prediction head.
        self.fc = nn.Linear(embed_dim, embed_dim*2)
        self.fc1 = nn.Linear(embed_dim*5, embed_dim)

        # BatchNorm1d is sized by `max_seq`, so the sequence axis is treated as
        # the channel axis. `bacth_norm1` is not used in `forward`.
        self.bacth_norm = nn.BatchNorm1d(max_seq)
        self.bacth_norm1 = nn.BatchNorm1d(max_seq)

        self.pred = nn.Linear(embed_dim*2, 1)

    def forward(self, history_question, history_answer, time, lag_time, part, elapsed_time):
        """Compute per-position logits.

        Returns:
            Tuple ``(logits, att_weight)``: ``logits`` has the trailing
            singleton dimension squeezed away, ``att_weight`` is the attention
            weight tensor returned by the sub-layer.
        """
        answer_emb = self.ans_embedding(history_answer)
        question_emb = self.embedding(history_question)

        # Position ids 0 .. s_len-1, created on the same device as the inputs.
        positions = torch.arange(question_emb.size(1)).unsqueeze(0).to(history_question.device)
        position_emb = self.pos_embedding(positions)

        time_emb = self.time_embedding(time)
        lag_emb = self.lag_time_embedding(lag_time)
        part_emb = self.part_embedding(part)
        elapsed_emb = self.elapsed_time_embedding(elapsed_time)

        # Answer + position context shared by every feature embedding.
        context = answer_emb + position_emb
        fused = torch.cat(
            [
                question_emb + context,
                time_emb + context,
                lag_emb + context,
                part_emb + context,
                elapsed_emb + context,
            ],
            dim=-1,
        )
        x = self.fc1(fused)

        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        x, att_weight = self.sub1(x, x)

        x = self.bacth_norm(self.fc(x))
        x = self.pred(x)

        return x.squeeze(-1), att_weight

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Map the checkpoint onto `device`. The previous map_location always moved
# storages to cuda:0, which contradicted the CPU fallback above and failed on
# hosts without a GPU.
# NOTE(review): this file is loaded as a whole pickled object (it is called as
# a model below), so only load checkpoints from a trusted source. Recent
# PyTorch releases default `torch.load` to `weights_only=True`, which rejects
# such files -- confirm the installed version before upgrading.
model = torch.load('/kaggle/input/v9-test/7901_bert_v4.pth', map_location=device)

In [None]:
class TestDataset(Dataset):
    """Builds one fixed-length model input per row of a test batch.

    ``samples`` maps a user id to that user's history tuple
    ``(content_id, answered_correctly, task_container_id, timestamp, part, tags)``;
    ``test_df`` holds the rows to predict. Each item is the user's most recent
    history, left-padded with zeros to ``max_seq``, with the oldest entry
    dropped and the row to predict appended at the end.
    """

    def __init__(self, samples, test_df, skills, max_seq=MAX_SEQ):
        super().__init__()
        self.samples = samples
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        self.skills = skills
        self.n_skill = len(skills)
        self.max_seq = max_seq

    def __len__(self):
        return len(self.test_df)

    def _fit_window(self, values, seq_len):
        """Return the last ``max_seq`` values, or the values left-padded with zeros to ``max_seq``."""
        if seq_len >= self.max_seq:
            return values[-self.max_seq:]
        padded = np.zeros(self.max_seq, dtype=int)
        padded[-seq_len:] = values
        return padded

    def __getitem__(self, index):
        """Return ``(q, qa, task, lag_time, part, tags)`` arrays of length ``max_seq`` for row ``index``."""
        row = self.test_df.iloc[index]

        user_id = row["user_id"]
        target_id = row["content_id"]
        target_task = row["task_container_id"]
        target_timestamp = row["timestamp"]
        target_part = row['part']
        target_tags = row['tags']

        if user_id in self.samples.index:
            q_hist, qa_hist, task_hist, ts_hist, part_hist, tags_hist = self.samples[user_id]

            # Compute lag_time: whole seconds since the previous interaction
            # (0 for the first one), capped at 3599.
            lag_hist = np.append([0], (np.diff(ts_hist)/1000).astype(int))
            lag_hist[lag_hist>3599] = 3599

            seq_len = len(q_hist)
            q, qa, task, lag_time, part, tags = (
                self._fit_window(values, seq_len)
                for values in (q_hist, qa_hist, task_hist, lag_hist, part_hist, tags_hist)
            )

            # Lag between the last known interaction and the row to predict.
            target_lag_time = min(int((target_timestamp - ts_hist[-1])/1000), 3599)
        else:
            # Unknown user: all-zero history and zero lag.
            q, qa, task, lag_time, part, tags = (
                np.zeros(self.max_seq, dtype=int) for _ in range(6)
            )
            target_lag_time = 0

        # Shift every window left by one and append the target row; its answer
        # slot gets the placeholder value 2.
        return (
            np.append(q[1:], [target_id]),
            np.append(qa[1:], [2]),
            np.append(task[1:], [target_task]),
            np.append(lag_time[1:], [target_lag_time]),
            np.append(part[1:], target_part),
            np.append(tags[1:], target_tags),
        )

In [None]:
import riiideducation

# Create the competition environment object provided by the `riiideducation` module.
env = riiideducation.make_env()
# Iterator consumed by the inference loop below: each item is unpacked there as
# a (test_df, sample_prediction_df) pair, and predictions are handed back
# through `env.predict`.
iter_test = env.iter_test()

In [None]:
import ast
import psutil
model.eval()

# Full previous test batch (questions and other rows); its answers only become
# available with the next batch, so it is folded into `group` one step late.
prev_test_df = None

for (test_df, sample_prediction_df) in tqdm(iter_test):
    # ---- 1. Fold the previous batch into the per-user histories -------------
    # Skipped when memory use reaches 90% so the kernel is not pushed over the limit.
    if prev_test_df is not None and psutil.virtual_memory().percent < 90:
        # The first row carries the answers of the previous batch as text,
        # expected to be the textual form of a list. `ast.literal_eval` parses
        # it without executing arbitrary code (the original used `eval`).
        prev_test_df['answered_correctly'] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
        prev_group = prev_test_df[['timestamp', 'user_id', 'content_id', 'answered_correctly','task_container_id', 'part', 'tags']].groupby('user_id').apply(
            lambda r:
            (
            r['content_id'].values,
            r['answered_correctly'].values,
            r['task_container_id'].values,
            r['timestamp'].values,
            r['part'].values,
            r['tags'].values
            ))
        # Iterate over the index (not `.items()`), so an empty result simply
        # yields no iterations.
        for prev_user_id in prev_group.index:
            new_history = prev_group[prev_user_id]
            if prev_user_id in group.index:
                # Known user: append the new interactions to each stored array.
                merged = tuple(
                    np.append(old_values, new_values)
                    for old_values, new_values in zip(group[prev_user_id], new_history)
                )
            else:
                merged = tuple(new_history)
            # Only the most recent MAX_SEQ interactions are kept.
            if len(merged[0]) > MAX_SEQ:
                merged = tuple(values[-MAX_SEQ:] for values in merged)
            group[prev_user_id] = merged

    # ---- 2. Prepare the current batch ---------------------------------------
    test_df = test_df.merge(questions[['content_id', 'part', 'tags']], on='content_id', how='left')
    prev_test_df = test_df.copy()
    # `.copy()` so the prediction column added below is written to an
    # independent frame instead of a slice of the merged one.
    test_df = test_df[test_df.content_type_id == False].copy()

    test_dataset = TestDataset(group, test_df, skills)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)

    # ---- 3. Predict ----------------------------------------------------------
    outs = []

    for item in test_dataloader:
        q = item[0].to(device).long()
        qa = item[1].to(device).long()
        # item[2] (task container ids) is produced by the dataset but is not
        # fed to the model.
        lag_time = item[3].to(device).long()
        part = item[4].to(device).long()
        tags = item[5].to(device).long()
        # The answer slot of the row being predicted always holds the placeholder 2.
        qa[:, -1] = 2
        with torch.no_grad():
            # NOTE(review): `part` is passed for both the `time` and `part`
            # parameters and `tags` for `elapsed_time`. Left unchanged because
            # the loaded weights may have been trained with exactly this
            # wiring -- confirm against the training notebook.
            output, _att_weight = model(q, qa, part, lag_time, part, tags)
            # Probability for the last sequence position, i.e. the target row.
            output = torch.sigmoid(output)[:, -1]

        outs.extend(output.view(-1).cpu().numpy())

    test_df['answered_correctly'] =  outs

    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])