In [1]:
# !pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [2]:
# import sys
# sys.path.append('../input/riiid-test-answer-prediction')

In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict
# import datatable as dt
import lightgbm as lgb
from matplotlib import pyplot as plt
import riiideducation
import random
from sklearn.metrics import roc_auc_score
import gc

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

_ = np.seterr(divide='ignore', invalid='ignore')

In [4]:
# Columns handed to the LightGBM booster at prediction time (`test_df[features]`).
# The order of this list is kept exactly as it was; only the layout changed.
features = [
    # 'user_id', 'timestamp',   (disabled)
    # -- time since the user's previous interaction
    'lagtime', 'lagtime_mean',
    # -- raw identifiers
    'content_id', 'task_container_id',
    # -- lecture counters per user
    'user_lecture_cumsum', 'user_lecture_mean',
    # -- elapsed time on the previous question
    'prior_question_elapsed_time', 'delta_prior_question_elapsed_time',
    # -- per-user answer statistics
    'user_correctness', 'user_correct_cumcount', 'user_correct_cumsum',
    # -- per-content answer statistics
    'content_correctness', 'content_correctness_std', 'content_count', 'content_sum',
    # -- per-task-container answer statistics
    'task_container_correctness', 'task_container_std', 'task_container_sum',
    'bundle_correctness',
    'attempt_no',
    # -- question metadata (joined from questions_df)
    'part', 'part_correctness_mean', 'part_correctness_std',
    'tags1', 'tags1_correctness_mean', 'tags1_correctness_std',
    'tags2', 'tags2_correctness_mean', 'tags2_correctness_std',
    'tags3', 'tags3_correctness_mean', 'tags3_correctness_std',
    'tags4', 'tags5', 'tags6',
    'bundle_id',
    # -- explanation statistics per user
    'explanation_mean', 'explanation_cumsum',
    'prior_question_had_explanation',
]

# Subset of the columns above that hold category codes rather than magnitudes.
categorical_columns = [
    'content_id',
    'task_container_id',
    'bundle_id',
    'part',
    *['tags%d' % tag_no for tag_no in range(1, 7)],
    'prior_question_had_explanation',
]

# Name of the label / prediction column.
target = 'answered_correctly'

# Load the feature dictionaries used for state updates

In [5]:
import pickle
# Restore the dictionary of running feature counters that the inference loop
# reads and updates (keys such as 'user_sum_dict', 'attempt_no_sum_dict', ...).
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('../input/riiid-train-features/feature_dict.pickle', 'rb') as feature_file:
    feature_dict = pickle.load(feature_file)

In [6]:
# Value written into missing `prior_question_elapsed_time` entries of each test batch.
# Presumably the training-set mean of that column — TODO confirm against the training notebook.
prior_question_elapsed_time_mean = 13005.0810546875

In [7]:
# Per-question feature table that is joined onto every test batch by content id.
questions_df = pd.read_parquet('../input/riiid-train-features/question_features_full.parquet')

# Downcast float32 columns to float16 to save memory.
# `astype` with a column -> dtype mapping is used instead of
# `questions_df.loc[:, mask] = ...astype('float16')`: on pandas >= 2.0 that
# assignment writes the values in place and keeps the float32 dtype, so the
# downcast would silently do nothing.
float32_columns = questions_df.columns[questions_df.dtypes == 'float32']
questions_df = questions_df.astype({column: 'float16' for column in float32_columns})

In [17]:
def get_max_attempt(user_id,content_id):
    """Register one more attempt of `content_id` by `user_id` and return the new total.

    The counter lives in ``feature_dict['attempt_no_sum_dict']`` keyed by the
    ``(user_id, content_id)`` tuple. A pair that has not been seen before
    starts at 1; otherwise the stored value is incremented by one.
    """
    attempts = feature_dict['attempt_no_sum_dict']
    pair = (user_id, content_id)
    attempts[pair] = attempts.get(pair, 0) + 1
    return attempts[pair]

## SAKT model definition

In [19]:
# Distinct question ids; their count sizes the SAKT embedding tables.
skills = questions_df["content_id"].unique()
n_skill = len(skills)

# Per-user interaction history for SAKT. Downstream code uses `group` like a
# pandas Series indexed by user_id whose values are
# (content_id array, answered_correctly array) tuples.
# pickle requires a binary handle: the previous text mode ('r') made
# pickle.load fail before reading anything.
# NOTE(review): this path points at the question-feature *parquet* file, which
# is not a pickle. The correct history pickle is not visible in this notebook —
# replace the path with the file that was produced at training time.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('../input/riiid-train-features/question_features_full.parquet', 'rb') as f:
    group = pickle.load(f)

In [9]:
# Length of the per-user interaction window: default `max_seq` of SAKTModel and
# TestDataset, and the cap applied when stored user histories are extended.
MAX_SEQ = 100

class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> Dropout(p=0.2).

    Both linear layers map ``state_size`` features to ``state_size`` features,
    so the output has the same shape as the input.
    """

    def __init__(self, state_size=200):
        super().__init__()
        self.state_size = state_size

        # Attribute names become state_dict keys of any checkpoint that
        # contains this module, so they are kept as-is.
        self.lr1 = nn.Linear(state_size, state_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(state_size, state_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Apply the two linear layers with a ReLU in between, then dropout."""
        hidden = self.relu(self.lr1(x))
        return self.dropout(self.lr2(hidden))

def future_mask(seq_length):
    """Return a (seq_length, seq_length) boolean tensor that is True strictly above the diagonal.

    Passed as ``attn_mask`` to the attention layer, the True entries mark the
    positions j > i that query position i must not attend to.
    """
    above_diagonal = np.triu(np.ones((seq_length, seq_length), dtype=bool), k=1)
    return torch.from_numpy(above_diagonal)


class SAKTModel(nn.Module):
    """Single-block self-attentive knowledge-tracing model.

    Past interactions are embedded (plus a learned position embedding) and used
    as keys/values; the ids of the questions to predict are embedded and used
    as queries. A causal mask hides future positions. The attended output goes
    through a residual + LayerNorm, a feed-forward block with a second
    residual + LayerNorm (the same LayerNorm module is applied both times),
    and a linear head that emits one logit per position.

    Args:
        n_skill: number of distinct question ids.
        max_seq: window length; the position table has ``max_seq - 1`` rows.
        embed_dim: embedding / attention width.
    """

    def __init__(self, n_skill, max_seq=MAX_SEQ, embed_dim=128): #HDKIM 100
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim

        # Interaction ids are `question_id + n_skill * is_correct`, hence 2*n_skill (+1) rows.
        self.embedding = nn.Embedding(2*n_skill+1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq-1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)

        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=8, dropout=0.2)

        self.dropout = nn.Dropout(0.2)
        self.layer_normal = nn.LayerNorm(embed_dim)

        self.ffn = FFN(embed_dim)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return ``(logits, attention_weights)``.

        Args:
            x: integer tensor of past interaction ids, indexed as [batch, seq].
            question_ids: integer tensor of query question ids, same layout as `x`.
        """
        device = x.device

        # Keys / values: interaction embedding plus learned position embedding.
        interactions = self.embedding(x)
        positions = torch.arange(interactions.size(1), device=device).unsqueeze(0)
        interactions = interactions + self.pos_embedding(positions)

        # Queries: embedding of the question to answer at each step.
        queries = self.e_embedding(question_ids)

        # The attention layer is fed [s_len, bs, embed], so swap the first two axes.
        interactions = interactions.permute(1, 0, 2)
        queries = queries.permute(1, 0, 2)

        causal_mask = future_mask(interactions.size(0)).to(device)
        attended, att_weight = self.multi_att(queries, interactions, interactions, attn_mask=causal_mask)
        attended = self.layer_normal(attended + queries)
        attended = attended.permute(1, 0, 2)  # back to [bs, s_len, embed]

        out = self.ffn(attended)
        out = self.layer_normal(out + attended)
        logits = self.pred(out)

        return logits.squeeze(-1), att_weight
    
class TestDataset(Dataset):
    """Builds one SAKT input per row of a test batch.

    Args:
        samples: mapping with an ``.index`` (used like a pandas Series) from
            user_id to a ``(question_ids, answers)`` pair of arrays.
        test_df: batch to predict; must provide ``user_id`` and ``content_id``.
        skills: collection of known question ids (only its length is used here).
        max_seq: window length of the history that is fed to the model.
    """

    def __init__(self, samples, test_df, skills, max_seq=MAX_SEQ): #HDKIM 100
        super().__init__()
        self.samples = samples
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        self.skills = skills
        self.n_skill = len(skills)
        self.max_seq = max_seq

    def __len__(self):
        """One item per row of the batch."""
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return ``(x, questions)`` for the batch row at position `index`.

        ``x`` holds the last ``max_seq - 1`` interaction ids of the user
        (question id, shifted by ``n_skill`` when the answer was correct);
        ``questions`` holds the question ids shifted by one step with the
        row's own ``content_id`` appended as the final query.
        """
        row = self.test_df.iloc[index]
        user_id = row["user_id"]
        target_id = row["content_id"]

        # Zero-filled windows; users without stored history keep them as-is.
        q = np.zeros(self.max_seq, dtype=int)
        qa = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            history_q, history_qa = self.samples[user_id]
            history_len = len(history_q)

            if history_len < self.max_seq:
                # Short history: right-align it, leaving zeros in front.
                q[-history_len:] = history_q
                qa[-history_len:] = history_qa
            else:
                # Long history: keep only the most recent max_seq entries.
                q = history_q[-self.max_seq:]
                qa = history_qa[-self.max_seq:]

        x = q[1:].copy()
        x += (qa[1:] == 1) * self.n_skill

        questions = np.append(q[2:], [target_id])

        return x, questions

# Load the models and run inference

In [10]:
# Create the competition environment and the iterator that yields test batches.
env = riiideducation.make_env()
iter_test = env.iter_test()
# Previous batch, kept so that its answers (delivered together with the next
# batch) can be applied to the running state; None until a batch has been seen.
prior_test_df = None

In [11]:
# Load the trained LightGBM booster from its text dump.
lgbm_model = lgb.Booster(model_file='../input/riiid-train-features/model_better.txt')

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on CPU-only sessions instead of failing on "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the SAKT architecture and restore its trained weights.
# map_location="cpu" lets a checkpoint that was saved from a GPU be
# deserialised on any host; the model is moved to `device` afterwards.
# NOTE: torch.load unpickles the file; only load trusted checkpoints.
sakt_model = SAKTModel(n_skill, embed_dim=128)
sakt_model.load_state_dict(
    torch.load("../input/riiid-train-features/sakt_model.pkl", map_location="cpu")
)
sakt_model.to(device)
# Inference mode: disables dropout.
sakt_model.eval()

In [12]:
# Create (or truncate) an empty submission file up front. Running this cell once
# lets the competition detect a submission file without first running and saving
# the whole notebook.
open('./submission.csv', 'w+').close()

In [18]:
%%time
import psutil

for (test_df, sample_prediction_df) in iter_test:
    # ------------------------------------------------------------------
    # 1. Fold the previous batch's answers into the running state.
    #    The answers arrive with the current batch, in the first row of
    #    `prior_group_answers_correct`. The update is skipped when RAM usage
    #    is at or above 90%.
    # ------------------------------------------------------------------
    if prior_test_df is not None and (psutil.virtual_memory().percent < 90):
        # NOTE(security): eval() executes whatever the string contains. The
        # value comes from the competition API here; ast.literal_eval would be
        # the safer parser if this code is ever pointed at another source.
        prior_test_df[target] = eval(test_df['prior_group_answers_correct'].iloc[0])
        # Rows labelled -1 carry no answer and are dropped.
        prior_test_df = prior_test_df[prior_test_df[target] != -1].reset_index(drop=True)

        # for sakt: append the answered questions to each user's history.
        # (This previously read the undefined name `prev_test_df`.)
        prev_group = prior_test_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            if prev_user_id in group.index:
                # Known user: extend the history and keep the last MAX_SEQ entries.
                group[prev_user_id] = (
                    np.append(group[prev_user_id][0], prev_group[prev_user_id][0])[-MAX_SEQ:],
                    np.append(group[prev_user_id][1], prev_group[prev_user_id][1])[-MAX_SEQ:]
                )
            else:
                # New user: start the history from this batch.
                group[prev_user_id] = (
                    prev_group[prev_user_id][0],
                    prev_group[prev_user_id][1]
                )

        # for lgbm: update the per-user correctness / explanation counters.
        prior_test_df['prior_question_had_explanation'] = prior_test_df['prior_question_had_explanation'].fillna(False)
        prior_test_df['prior_question_had_explanation'] = prior_test_df['prior_question_had_explanation'].astype('int8')

        user_ids = prior_test_df['user_id'].values
        prior_question_had_explanations = prior_test_df['prior_question_had_explanation'].values
        targets = prior_test_df[target].values

        for user_id, prior_question_had_explanation, answered_correctly in zip(user_ids, prior_question_had_explanations, targets):
            feature_dict['user_sum_dict'][user_id] += answered_correctly
            feature_dict['user_count_dict'][user_id] += 1
            feature_dict['explanation_sum_dict'][user_id] += prior_question_had_explanation
            feature_dict['explanation_count_dict'][user_id] += 1

    # Keep the unfiltered batch: the answer list delivered with the next batch
    # is assigned to it as a whole column.
    prior_test_df = test_df.copy()
    lecture_test_df = test_df[test_df['content_type_id'] == 1].reset_index(drop=True)

    # ------------------------------------------------------------------
    # 2. SAKT predictions.
    #    Only question rows are scored. Building the dataset from the full
    #    batch (lectures included) produced a prediction vector longer than
    #    the question-only frame used for the ensemble below, and fed lecture
    #    ids into the question embedding.
    # ------------------------------------------------------------------
    sakt_test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_dataset = TestDataset(group, sakt_test_df, skills)
    test_dataloader = DataLoader(test_dataset, batch_size=51200, shuffle=False)

    sakt_predictions = []

    for item in test_dataloader:
        x = item[0].to(device).long()
        target_id = item[1].to(device).long()

        with torch.no_grad():
            output, att_weight = sakt_model(x, target_id)
        # Only the last position of each sequence is the prediction for the row.
        sakt_predictions.extend(torch.sigmoid(output)[:, -1].view(-1).data.cpu().numpy())

    # ------------------------------------------------------------------
    # 3. LightGBM features.
    # ------------------------------------------------------------------
    # Lecture rows only update the per-user lecture counters.
    for user_id, content_type_id in zip(lecture_test_df['user_id'].values, lecture_test_df['content_type_id'].values):
        feature_dict['user_lecture_sum_dict'][user_id] += content_type_id
        feature_dict['user_lecture_count_dict'][user_id] += 1

    # From here on `test_df` holds question rows only (same rows and order as
    # `sakt_test_df`, so both prediction vectors line up).
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)

    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(False)
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].astype('int8')
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    n_rows = len(test_df)

    user_lecture_sum = np.zeros(n_rows, dtype=np.int16)
    user_lecture_count = np.zeros(n_rows, dtype=np.int16)

    user_sum = np.zeros(n_rows, dtype=np.int16)
    user_count = np.zeros(n_rows, dtype=np.int16)
    content_sum = np.zeros(n_rows, dtype=np.int32)
    content_count = np.zeros(n_rows, dtype=np.int32)
    task_container_sum = np.zeros(n_rows, dtype=np.int32)
    task_container_count = np.zeros(n_rows, dtype=np.int32)
    task_container_std = np.zeros(n_rows, dtype=np.float16)
    explanation_sum = np.zeros(n_rows, dtype=np.int32)
    explanation_count = np.zeros(n_rows, dtype=np.int32)
    delta_prior_question_elapsed_time = np.zeros(n_rows, dtype=np.int32)

    # NOTE(review): lag values are stored as int32; a gap larger than 2**31 - 1
    # timestamp units would not fit. Left unchanged so the features keep the
    # dtype the model was presumably trained with — confirm.
    lagtime = np.zeros(n_rows, dtype=np.int32)
    lagtime_mean = np.zeros(n_rows, dtype=np.int32)

    for i, (user_id, prior_question_had_explanation, content_type_id, prior_question_elapsed_time, timestamp, content_id, task_container_id) in enumerate(zip(test_df['user_id'].values, test_df['prior_question_had_explanation'].values, test_df['content_type_id'].values, test_df['prior_question_elapsed_time'].values, test_df['timestamp'].values, test_df['content_id'].values, test_df['task_container_id'].values)):

        # content_type_id is 0 for these rows, so only the count grows.
        feature_dict['user_lecture_sum_dict'][user_id] += content_type_id
        feature_dict['user_lecture_count_dict'][user_id] += 1

        user_lecture_sum[i] = feature_dict['user_lecture_sum_dict'][user_id]
        user_lecture_count[i] = feature_dict['user_lecture_count_dict'][user_id]

        user_sum[i] = feature_dict['user_sum_dict'][user_id]
        user_count[i] = feature_dict['user_count_dict'][user_id]
        content_sum[i] = feature_dict['content_sum_dict'][content_id]
        content_count[i] = feature_dict['content_count_dict'][content_id]
        task_container_sum[i] = feature_dict['task_container_sum_dict'][task_container_id]
        task_container_count[i] = feature_dict['task_container_count_dict'][task_container_id]
        task_container_std[i] = feature_dict['task_container_std_dict'][task_container_id]

        explanation_sum[i] = feature_dict['explanation_sum_dict'][user_id]
        explanation_count[i] = feature_dict['explanation_count_dict'][user_id]

        # Lag since the user's last stored timestamp; the stored value is then advanced.
        if user_id in feature_dict['max_timestamp_u_dict']['max_time_stamp'].keys():
            lagtime[i] = timestamp - feature_dict['max_timestamp_u_dict']['max_time_stamp'][user_id]
            feature_dict['max_timestamp_u_dict']['max_time_stamp'][user_id] = timestamp
            lagtime_mean[i] = (feature_dict['lagtime_mean_dict'][user_id] + lagtime[i]) / 2
        else:
            # First time this user is seen: no lag yet, seed both dictionaries.
            lagtime[i] = 0
            feature_dict['max_timestamp_u_dict']['max_time_stamp'].update({user_id: timestamp})
            feature_dict['lagtime_mean_dict'].update({user_id: timestamp})
            lagtime_mean[i] = (feature_dict['lagtime_mean_dict'][user_id] + lagtime[i]) / 2

        # Change in prior_question_elapsed_time relative to the user's last stored value.
        if user_id in feature_dict['user_prior_question_elapsed_time_dict']['prior_question_elapsed_time'].keys():
            delta_prior_question_elapsed_time[i] = prior_question_elapsed_time - feature_dict['user_prior_question_elapsed_time_dict']['prior_question_elapsed_time'][user_id]
            feature_dict['user_prior_question_elapsed_time_dict']['prior_question_elapsed_time'][user_id] = prior_question_elapsed_time
        else:
            delta_prior_question_elapsed_time[i] = 0
            feature_dict['user_prior_question_elapsed_time_dict']['prior_question_elapsed_time'].update({user_id: prior_question_elapsed_time})

    # Join the static per-question features.
    # NOTE(review): combining `on=` with `right_index=True` is rejected by some
    # pandas versions; left untouched because the replacement depends on how
    # questions_df is indexed — confirm before changing.
    test_df = test_df.merge(questions_df.loc[questions_df.index.isin(test_df['content_id'])],
                            how='left', on='content_id', right_index=True)

    # Ratios of running sums to counts; 0/0 yields NaN (numpy warnings are
    # silenced at import time) and is filled further down.
    test_df['user_lecture_mean'] = user_lecture_sum / user_lecture_count
    test_df['user_lecture_cumsum'] = user_lecture_sum
    test_df['user_correctness'] = user_sum / user_count
    test_df['user_correct_cumcount'] = user_count
    test_df['user_correct_cumsum'] = user_sum

    test_df['content_correctness'] = content_sum / content_count
    test_df['content_count'] = content_count
    test_df['content_sum'] = content_sum

    test_df['task_container_correctness'] = task_container_sum / task_container_count
    test_df['task_container_sum'] = task_container_sum
    test_df['task_container_std'] = task_container_std

    test_df['explanation_mean'] = explanation_sum / explanation_count
    test_df['explanation_cumsum'] = explanation_sum

    test_df['delta_prior_question_elapsed_time'] = delta_prior_question_elapsed_time

    # Row-wise on purpose: get_max_attempt also increments the stored counter.
    test_df["attempt_no"] = test_df[["user_id", "content_id"]].apply(lambda row: get_max_attempt(row["user_id"], row["content_id"]), axis=1)
    test_df["lagtime"] = lagtime
    test_df["lagtime_mean"] = lagtime_mean

    # Users without history get correctness 1 and attempt 1; every other gap becomes 0.
    test_df['user_correctness'] = test_df['user_correctness'].fillna(1)
    test_df['attempt_no'] = test_df['attempt_no'].fillna(1)
    test_df = test_df.fillna(0)

    # ------------------------------------------------------------------
    # 4. Ensemble (0.6 SAKT + 0.4 LightGBM) and submit the batch.
    #    (The LightGBM result was previously stored as `lgbm_preds` but read
    #    back as the undefined `lgbm_predictions`.)
    # ------------------------------------------------------------------
    lgbm_predictions = lgbm_model.predict(test_df[features])
    test_df[target] = 0.6 * np.array(sakt_predictions) + 0.4 * lgbm_predictions
    env.predict(test_df[['row_id', target]])

CPU times: user 1.9 s, sys: 62.2 ms, total: 1.96 s
Wall time: 1.23 s
