In [None]:
import gc

import numpy as np
import pandas as pd

import json
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import time
import warnings

import riiideducation
import pickle


# for SAKT
import random
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

import joblib

XGBoost Predictor

In [None]:
"""This file is to serve XGBoost model trained in BQML."""

import glob
import json
import os
import re
import numpy as np
import xgboost as xgb


class Predictor(object):
  """Class to feed input data into XGBoost model.

  It performs both preprocessing and postprocessing on the input and output.
  Categorical vocabularies are keyed by the position of the feature in the
  `feature_names` list of the model metadata.
  """

  def __init__(self, model, model_metadata, ohe_categorical_index_vocab,
               mhe_categorical_index_vocab):
    """Initializes a Predictor for XGBoost model serving.

    Args:
      model: XGBoost model.
      model_metadata: The metadata of the model.
      ohe_categorical_index_vocab: Feature index to the vocabulary dictionary
        for one hot encoded features.
      mhe_categorical_index_vocab: Feature index to the vocabulary dictionary
        for multi hot encoded features.

    Returns:
      A 'Predictor' instance.
    """
    self._model = model
    self._model_metadata = model_metadata
    self._ohe_categorical_index_vocab = ohe_categorical_index_vocab
    self._mhe_categorical_index_vocab = mhe_categorical_index_vocab
    self._model_type = None
    self._label_col = None
    self._feature_name_to_index_map = {}
    # This is to keep the order of features used in training.
    self._feature_names = []
    self._class_names = []

  def _extract_model_metadata(self):
    """Extracts info from model metadata and fills member variables.

    Safe to call repeatedly; every call re-derives the same values.

    Raises:
      ValueError: An error occurred when:
        1. Invalid model type.
        2. Label not found.
        3. Features or feature names not found.
        4. Class names not found for boosted_tree_classifier.
        5. Feature index mismatch.
        6. Invalid encode type for categorical features.
    """
    metadata = self._model_metadata
    if metadata.get('model_type') not in (
        'boosted_tree_regressor', 'boosted_tree_classifier'):
      raise ValueError('Invalid model_type in model_metadata')
    self._model_type = metadata['model_type']
    if 'label_col' not in metadata:
      raise ValueError('label_col not found in model_metadata')
    self._label_col = metadata['label_col']
    # .get() so a missing key is reported as ValueError like the other checks
    # instead of leaking a bare KeyError.
    if not metadata.get('features'):
      raise ValueError('No feature found in model_metadata')
    if 'feature_names' not in metadata:
      raise ValueError('No feature_names found in model_metadata')
    self._feature_names = metadata['feature_names']
    if self._model_type == 'boosted_tree_classifier':
      if not metadata.get('class_names'):
        raise ValueError('No class_names found in model_metadata')
      self._class_names = metadata['class_names']
    for feature_index, feature_name in enumerate(self._feature_names):
      self._feature_name_to_index_map[feature_name] = feature_index
      feature_metadata = metadata['features'][feature_name]
      encode_type = feature_metadata.get('encode_type')
      if not encode_type:
        # Numerical feature: no vocabulary expected.
        continue
      elif encode_type == 'ohe':
        if feature_index not in self._ohe_categorical_index_vocab:
          raise ValueError(
              'feature_index %d missing in _ohe_categorical_index_vocab' %
              feature_index)
      elif encode_type == 'mhe':
        if feature_index not in self._mhe_categorical_index_vocab:
          raise ValueError(
              'feature_index %d missing in _mhe_categorical_index_vocab' %
              feature_index)
      else:
        raise ValueError('Invalid encode_type %s for feature %s' %
                         (encode_type, feature_name))

  def _preprocess(self, data):
    """Preprocesses raw input data for prediction.

    One-hot features are label encoded as the float position of the value in
    the vocabulary (None for an unseen value), multi-hot features expand to one
    0.0/1.0 column per vocabulary entry, and everything else is cast to float.

    Args:
      data: Raw input as a sequence of {feature name: value} mappings.

    Returns:
      Preprocessed data in 2d array (list of lists).

    Raises:
      ValueError: An error occurred when features in a data row are different
      from the features in the model, when a multi-hot value is not iterable,
      or when a numerical value cannot be converted to float.
    """
    self._extract_model_metadata()
    # Loop invariant: the model's feature list does not change per row.
    sorted_model_feature_names = sorted(self._feature_names)
    preprocessed_data = []
    for row_index, row in enumerate(data):
      sorted_data_feature_names = sorted(row.keys())
      if sorted_data_feature_names != sorted_model_feature_names:
        raise ValueError(
            'Row %d has different features %s than the model features %s' %
            (row_index, ','.join(sorted_data_feature_names),
             ','.join(sorted_model_feature_names)))
      encoded_row = []
      for feature_name in self._feature_names:
        col = row[feature_name]
        feature_index = self._feature_name_to_index_map[feature_name]
        if feature_index in self._ohe_categorical_index_vocab:
          # Label encoding.
          vocab = self._ohe_categorical_index_vocab[feature_index]
          col_value = str(col)
          if col_value in vocab:
            encoded_row.append(float(vocab.index(col_value)))
          else:
            # unseen category.
            encoded_row.append(None)
        elif feature_index in self._mhe_categorical_index_vocab:
          # Multihot encoding.
          vocab = self._mhe_categorical_index_vocab[feature_index]
          mhe_list = [0.0] * len(vocab)
          try:
            for item in col:
              item_value = str(item)
              if item_value in vocab:
                mhe_list[vocab.index(item_value)] = 1.0
          except (TypeError, ValueError):
            # Iterating a scalar raises TypeError, not ValueError.
            raise ValueError('The feature %s in row %d is not an array' %
                             (feature_name, row_index))
          encoded_row.extend(mhe_list)
        else:
          # Numerical feature.
          try:
            encoded_row.append(float(col))
          except (TypeError, ValueError):
            # float(None) raises TypeError; report it the same way as an
            # unparsable string.
            raise ValueError(
                'The feature %s in row %d cannot be converted to float' %
                (feature_name, row_index))
      preprocessed_data.append(encoded_row)
    return preprocessed_data

  def predict(self, instances, **kwargs):
    """Performs prediction.

    Args:
      instances: A list of prediction input instances.
      **kwargs: A dictionary of keyword args provided as additional fields on
        the predict request body. Ignored.

    Returns:
      For a classifier, a list with one dict per instance holding the
      predicted class, the class names and the per-class outputs. For a
      regressor, a single dict mapping 'predicted_<label>' to the list of
      predictions.
    """
    del kwargs
    encoded = self._preprocess(instances)
    # We have to convert encoded from list to numpy array, otherwise xgb will
    # take 0s as missing values. The explicit float dtype turns the None
    # placeholders of unseen categories into NaN instead of producing an
    # object array.
    prediction_input = xgb.DMatrix(
        np.array(encoded, dtype=float).reshape((len(instances), -1)),
        missing=None)
    if self._model_type == 'boosted_tree_classifier':
      outputs = self._model.predict(prediction_input) # ntree_limit: MAX=600
      final_outputs = []
      for np_output in outputs:
        output = np_output.tolist()
        final_output = {}
        final_output['predicted_{}'.format(
            self._label_col)] = self._class_names[output.index(max(output))]
        final_output['{}_values'.format(self._label_col)] = self._class_names
        final_output['{}_probs'.format(self._label_col)] = output
        final_outputs.append(final_output)
      return final_outputs
    else:
      # Boosted tree regressor.
      return {
          'predicted_' + self._label_col:
              self._model.predict(prediction_input).tolist()
      }

  @classmethod
  def from_path(cls, model_dir, model_name="model.bst", meta_name="model_metadata.json"):
    """Creates an instance of Predictor using the given path.

    Args:
      model_dir: The local directory that contains the trained XGBoost model and
        the assets including vocabularies and model metadata.
      model_name: File name of the XGBoost model inside model_dir.
      meta_name: File name of the model metadata JSON inside model_dir.

    Returns:
      An instance of 'Predictor'.
    """
    # Keep model name the same as ml::kXgboostFinalModelFilename.
    model_path = os.path.join(model_dir, model_name)
    model = xgb.Booster(model_file=model_path)
    # Assets are read from model_dir itself rather than an 'assets' subfolder.
    assets_path = model_dir
    model_metadata_path = os.path.join(assets_path, meta_name)
    with open(model_metadata_path) as f:
      model_metadata = json.load(f)
    txt_list = glob.glob(assets_path + '/*.txt')
    # Match on the file name only, with the dot escaped and the pattern
    # anchored at the end, so digits elsewhere in the path cannot be mistaken
    # for a feature index.
    ohe_pattern = re.compile(r'(\d+)\.txt$')
    mhe_pattern = re.compile(r'(\d+)_array\.txt$')
    ohe_categorical_index_vocab = {}
    mhe_categorical_index_vocab = {}
    for txt_file in txt_list:
      file_name = os.path.basename(txt_file)
      ohe_feature_found = ohe_pattern.search(file_name)
      mhe_feature_found = mhe_pattern.search(file_name)
      if ohe_feature_found:
        feature_index = int(ohe_feature_found.group(1))
        with open(txt_file) as f:
          ohe_categorical_index_vocab[feature_index] = f.read().splitlines()
      elif mhe_feature_found:
        feature_index = int(mhe_feature_found.group(1))
        with open(txt_file) as f:
          mhe_categorical_index_vocab[feature_index] = f.read().splitlines()
    return cls(model, model_metadata, ohe_categorical_index_vocab,
               mhe_categorical_index_vocab)

simulator

In [None]:
class Iter_Valid(object):
    """Iterator that replays a validation frame in batches for local testing.

    Each step yields ``(df, sample_df)``: ``df`` is a contiguous slice of the
    input rows and ``sample_df`` holds ``row_id`` plus a placeholder
    ``answered_correctly`` column for the question rows of that slice. The first
    row of every batch carries the answers of the previous batch in
    ``prior_group_responses`` / ``prior_group_answers_correct`` as bracketed,
    comma-separated strings.

    Args:
        df: Frame with at least ``row_id``, ``user_id``, ``task_container_id``,
            ``content_type_id``, ``user_answer`` and ``answered_correctly``.
        max_user: Maximum number of distinct users collected into one batch.
    """

    def __init__(self, df, max_user=1000):
        # reset_index returns a new frame, so the caller's frame is not modified
        # and labels equal positions from here on.
        df = df.reset_index(drop=True)
        self.df = df
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        # Explicit copy: the next line adds a column and must not write into a
        # filtered view of df.
        self.sample_df = df[df['content_type_id'] == 0][['row_id']].copy()
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self

    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        """Build the batch covering rows ``pre_start`` .. ``self.current - 1``.

        Writes the previous batch's answers into the first row of the new batch
        and remembers the given lists for the following batch.

        Returns:
            Tuple ``(df, sample_df)`` for the batch.
        """
        df = self.df[pre_start:self.current].copy()
        # sample_df keeps df's integer labels but has the lecture rows removed,
        # so it must be sliced by label; a positional slice drifts out of step
        # with df as soon as a lecture row has been seen.
        sample_df = self.sample_df.loc[pre_start:self.current - 1].copy()
        df.loc[pre_start, 'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start, 'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1
        pre_content_type_id = -1
        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_user_id in added_user and (crr_user_id != pre_added_user or (crr_task_container_id != pre_task_container_id and crr_content_type_id == 0 and pre_content_type_id == 0)):
                # Known user that is either not the previous row's user, or is
                # moving on to a different task container with both rows being
                # questions: close the batch before this row.
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                # User limit reached: only keep extending the batch while the
                # same user stays in the same task container or the row is a
                # lecture.
                if  crr_user_id == pre_added_user and (crr_task_container_id == pre_task_container_id or crr_content_type_id == 1):
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            pre_content_type_id = crr_content_type_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        if pre_start < self.current:
            # Flush the final, partially filled batch.
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

SAKT

In [None]:
# Hyper-parameters for the SAKT model definition and its inference inputs.
# The number in each trailing comment looks like a reference/previous value
# for that setting — TODO confirm against the training notebook.
# MAX_SEQ: length of the per-user history window; also feeds the default
# BatchNorm size of FFN/FFN0 (MAX_SEQ - 1).
MAX_SEQ = 240 # 210
# Not referenced elsewhere in the visible part of this file.
ACCEPTED_USER_CONTENT_SIZE = 2 # 2
# Embedding width passed to SAKTModel as embed_dim.
EMBED_SIZE = 256 # 256
# Not referenced elsewhere in the visible part of this file.
BATCH_SIZE = 64+32 # 96
# Default dropout probability of TransformerBlock / Encoder / SAKTModel.
DROPOUT = 0.1 # 0.1

class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> BatchNorm1d -> Linear -> Dropout.

    Args:
        state_size: Size of the last input dimension (and of the output).
        forward_expansion: Multiplier for the hidden width.
        bn_size: Number of features given to BatchNorm1d; for 3-D input this
            must equal the size of dimension 1.
        dropout: Dropout probability applied to the output.
    """

    def __init__(self, state_size = 200, forward_expansion = 1, bn_size = MAX_SEQ - 1, dropout=0.2):
        super().__init__()
        self.state_size = state_size
        hidden_size = forward_expansion * state_size

        # Attribute names are part of the checkpoint keys; keep them as-is.
        self.lr1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(bn_size)
        self.lr2 = nn.Linear(hidden_size, state_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.lr1(x)
        hidden = self.relu(hidden)
        hidden = self.bn(hidden)
        projected = self.lr2(hidden)
        return self.dropout(projected)
    
class FFN0(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> LayerNorm -> Dropout.

    Args:
        state_size: Size of the last input dimension (and of the output).
        forward_expansion: Multiplier for the hidden width.
        bn_size: Accepted for signature parity with FFN; not used here.
        dropout: Accepted but not used — see the note in __init__.
    """

    def __init__(self, state_size = 200, forward_expansion = 1, bn_size = MAX_SEQ - 1, dropout=0.2):
        super().__init__()
        self.state_size = state_size
        hidden_size = forward_expansion * state_size

        self.lr1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(hidden_size, state_size)
        self.layer_normal = nn.LayerNorm(state_size)
        # NOTE(review): the dropout probability is fixed at 0.2 and the
        # `dropout` argument is ignored. Left unchanged on purpose so the
        # block keeps the configuration the saved weights were trained with.
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        hidden = self.relu(self.lr1(x))
        normalized = self.layer_normal(self.lr2(hidden))
        return self.dropout(normalized)
    
def future_mask(seq_length):
    """Return a (seq_length, seq_length) bool tensor, True strictly above the diagonal.

    Used as an attention mask so that position i cannot attend to positions > i.
    """
    # np.tri marks the diagonal and everything below it; inverting leaves
    # exactly the strictly-upper triangle.
    strictly_upper = ~np.tri(seq_length, dtype=bool)
    return torch.from_numpy(strictly_upper)

class TransformerBlock(nn.Module):
    """Multi-head attention followed by two parallel feed-forward branches.

    Args:
        embed_dim: Embedding width.
        heads: Number of attention heads.
        dropout: Dropout probability for attention, the residual paths and FFN.
        forward_expansion: Hidden width multiplier of the feed-forward branches.
    """

    def __init__(self, embed_dim, heads = 8, dropout = DROPOUT, forward_expansion = 1):
        super().__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.ffn0  = FFN0(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)

    def forward(self, value, key, query, att_mask):
        """Run the block on [s_len, bs, embed] inputs.

        NOTE: nn.MultiheadAttention takes (query, key, value) positionally, so
        the argument named `value` here is what the attention layer uses as
        its query, and `query` is used as its value.

        Returns:
            Tuple of the block output in [bs, s_len, embed] layout and the
            attention weights.
        """
        attended, att_weight = self.multi_att(value, key, query, attn_mask=att_mask)
        # Residual connection around the attention, then switch layout:
        # [s_len, bs, embed] => [bs, s_len, embed]
        residual = self.layer_normal(attended + value)
        residual = self.dropout(residual).permute(1, 0, 2)
        branch_bn = self.ffn(residual)
        branch_ln = self.ffn0(residual)
        combined = self.layer_normal_2(branch_bn + branch_ln + residual)
        combined = self.dropout(combined)
        return combined.squeeze(-1), att_weight
    
class Encoder(nn.Module):
    """Embeds past interactions and target questions and runs the attention blocks.

    Args:
        n_skill: Number of distinct question ids.
        max_seq: Sequence window length; positions 0 .. max_seq - 2 are embedded.
        embed_dim: Embedding width.
        dropout: Dropout applied to the summed interaction/position embedding.
        forward_expansion: Hidden width multiplier passed to each block.
        num_layers: Number of TransformerBlock layers.
        heads: Accepted but not forwarded — see the note in __init__.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, num_layers=1, heads = 8):
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim
        # Interaction ids combine question id and correctness, hence 2 * n_skill + 1.
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        # NOTE(review): `heads` and `dropout` are not passed on, so every block
        # uses TransformerBlock's own defaults. Left as-is to match the saved
        # weights.
        blocks = [TransformerBlock(embed_dim, forward_expansion = forward_expansion) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, question_ids):
        """Encode interaction ids `x` against target `question_ids`.

        Returns:
            Tuple of the encoded sequence in [bs, s_len, embed] layout and the
            attention weights of the last block.
        """
        device = x.device
        interaction = self.embedding(x)
        positions = torch.arange(interaction.size(1)).unsqueeze(0).to(device)
        interaction = self.dropout(interaction + self.pos_embedding(positions))
        # [bs, s_len, embed] => [s_len, bs, embed]
        interaction = interaction.permute(1, 0, 2)
        exercise = self.e_embedding(question_ids).permute(1, 0, 2)
        # The mask depends only on the sequence length, so build it once.
        att_mask = future_mask(exercise.size(0)).to(device)
        for block in self.layers:
            interaction, att_weight = block(exercise, interaction, interaction, att_mask=att_mask)
            # Blocks return [bs, s_len, embed]; go back to [s_len, bs, embed]
            # for the next block.
            interaction = interaction.permute(1, 0, 2)
        return interaction.permute(1, 0, 2), att_weight

class SAKTModel(nn.Module):
    """Encoder plus a linear head producing one logit per sequence position.

    Args:
        n_skill: Number of distinct question ids.
        max_seq: Sequence window length.
        embed_dim: Embedding width.
        dropout: Dropout probability passed to the encoder.
        forward_expansion: Hidden width multiplier passed to the encoder.
        enc_layers: Number of encoder layers.
        heads: Accepted but not forwarded to the encoder.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, enc_layers=1, heads = 8):
        super().__init__()
        # NOTE(review): `heads` is intentionally left unused here to keep the
        # architecture identical to the one the checkpoints were saved from.
        self.encoder = Encoder(n_skill, max_seq, embed_dim, dropout, forward_expansion, num_layers=enc_layers)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return (logits with the trailing size-1 axis removed, attention weights)."""
        encoded, att_weight = self.encoder(x, question_ids)
        logits = self.pred(encoded).squeeze(-1)
        return logits, att_weight
    
class TestDataset(Dataset):
    """Builds SAKT inputs for every row of a test batch.

    Args:
        samples: Mapping-like object with an ``index`` of user ids whose items
            are ``(content_ids, answered_correctly)`` array pairs holding each
            user's history.
        test_df: Rows to predict; needs ``user_id`` and ``content_id``.
        n_skill: Number of distinct question ids; added to the content id of
            correctly answered interactions.
        max_seq: Length of the (left zero-padded) history window.
    """

    def __init__(self, samples, test_df, n_skill, max_seq=100):
        super(TestDataset, self).__init__()
        self.samples = samples
        self.user_ids = list(test_df["user_id"].unique())
        self.test_df = test_df
        self.n_skill, self.max_seq = n_skill, max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return ``(x, questions)`` for the row at position ``index``.

        ``x`` holds the last ``max_seq - 1`` interaction ids (content id, plus
        ``n_skill`` when answered correctly) and ``questions`` the matching
        next-question ids, ending with the row's own ``content_id``.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info['user_id']
        target_id = test_info['content_id']

        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            content_id, answered_correctly = self.samples[user_id]

            seq_len = len(content_id)

            if seq_len >= self.max_seq:
                content_id_seq = content_id[-self.max_seq:]
                answered_correctly_seq = answered_correctly[-self.max_seq:]
            elif seq_len > 0:
                # Guarded on seq_len > 0: with an empty history `[-0:]` would
                # address the whole buffer and the assignment would fail.
                content_id_seq[-seq_len:] = content_id
                answered_correctly_seq[-seq_len:] = answered_correctly

        # .copy() so the in-place add below never touches the stored history.
        x = content_id_seq[1:].copy()
        x += (answered_correctly_seq[1:] == 1) * self.n_skill

        questions = np.append(content_id_seq[2:], [target_id])

        return x, questions


モデル読み込み

In [None]:
!ls /kaggle/input/riiid-xgboost-model-and-features | grep xgb

In [None]:
class config:
    """Notebook-wide settings for model loading and run mode."""
    # Fold index; not referenced elsewhere in the visible part of this file
    # (the fold also appears in MODEL_NAME as the "f0" suffix — presumably
    # kept in sync by hand, TODO confirm).
    FOLD = 0
    # Dataset directory holding the XGBoost model and the feature files.
    ROOT_PATH = "/kaggle/input/riiid-xgboost-model-and-features"
    # Used both as the model sub-directory name and as the file-name prefix.
    MODEL_NAME = "xgb_v17_06_f0"
    # True: replay a local validation pickle through Iter_Valid;
    # False: use the riiideducation competition environment.
    validaten_flg = False
    # Delta degrees of freedom passed to every std computation of the
    # window features.
    DDOF = 1

In [None]:
# Locate the exported XGBoost model; directory and file names share the
# MODEL_NAME prefix.
model_path = f"{config.ROOT_PATH}/{config.MODEL_NAME}/{config.MODEL_NAME}"
model_name = f"{config.MODEL_NAME}_model.bst"
model_meta = f"{config.MODEL_NAME}_assets_model_metadata.json"
model = Predictor.from_path(
    model_path,
    model_name=model_name,
    meta_name=model_meta,
)

# Parse the metadata once up front so the training feature order is available.
model._extract_model_metadata()
feature_names = model._feature_names

# Names of the one-hot (label encoded) features, looked up by feature index.
categorical_features = [
    feature_names[feature_index]
    for feature_index in model._ohe_categorical_index_vocab
]

print("features:", len(feature_names))
categorical_features

# The pipeline below does not build multi-hot inputs, so the model must not
# expect any.
print(model._mhe_categorical_index_vocab)
assert not model._mhe_categorical_index_vocab

In [None]:
# Per-user question history and the number of distinct questions, as saved by
# the SAKT training notebook.
group = joblib.load("/kaggle/input/riiid-sakt-model/group.pkl.zip")
n_skill = joblib.load("/kaggle/input/riiid-sakt-model/skills.pkl.zip")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_model():
    """Build an untrained SAKTModel with the architecture of the saved checkpoints.

    NOTE(review): `heads=4` is accepted by SAKTModel but not forwarded to its
    encoder in this file, so the attention blocks use TransformerBlock's
    default number of heads.
    """
    return SAKTModel(n_skill, max_seq=MAX_SEQ, embed_dim=EMBED_SIZE, forward_expansion=1, enc_layers=1, heads=4, dropout=0.1)

# map_location makes the checkpoints loadable on the CPU fallback device even
# when they were saved from a GPU.
sakt_model = create_model()
sakt_model.load_state_dict(torch.load("/kaggle/input/riiid-sakt-model/sakt_model.pt", map_location=device))
sakt_model.to(device)
sakt_model.eval()

sakt_model_b = create_model()
sakt_model_b.load_state_dict(torch.load("/kaggle/input/riiid-sakt-model/best_sakt_model_1.pt", map_location=device))
sakt_model_b.to(device)
sakt_model_b.eval()

print("all model loaded")

コンテンツ特徴量読み込み

In [None]:
# Load the per-content aggregates and the one-hot tag tables.
content_agg_feats = pd.read_csv(f"{config.ROOT_PATH}/content_agg_feats.csv")
question_tags_ohe = pd.read_csv(f"{config.ROOT_PATH}/question_tags_ohe.csv")
lecture_tags_ohe = pd.read_csv(f"{config.ROOT_PATH}/lecture_tags_ohe.csv")
questions = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv")

# Give every table the same join key.
question_tags_ohe = question_tags_ohe.rename(columns={'question_id':'content_id'})
lecture_tags_ohe = lecture_tags_ohe.rename(columns={'lecture_id':'content_id'})
questions = questions.rename(columns={'question_id':'content_id'}).drop(["bundle_id", "correct_answer", "tags"], axis=1)
content_agg_feats = content_agg_feats.merge(question_tags_ohe, how="left", on="content_id")
content_agg_feats = content_agg_feats.merge(questions, how="left", on="content_id")
# Outer join: lecture ids that have no aggregate row still get a row.
content_agg_feats = content_agg_feats.merge(lecture_tags_ohe, how="outer", on="content_id")

content_agg_feats = content_agg_feats.fillna(0)

# Column 0 is used as the lookup key by get_content_feature; the remaining
# columns are the feature values.
content_agg_feats_v = content_agg_feats.values
content_agg_feats_c = content_agg_feats.columns.values[1:]

# Tag vectors keyed by content_id. Question vectors are keyed by the id
# itself (like the lecture vectors) rather than by row position, so the lookup
# stays correct even if the CSV is not sorted by id.
q_tag_table = question_tags_ohe.set_index("content_id")
q_ohe_dic = dict(zip(q_tag_table.index, q_tag_table.values))
l_ohe_dic = {i: row.values for i, row in lecture_tags_ohe.drop("type_of", axis=1).set_index("content_id").iterrows()}

del lecture_tags_ohe, question_tags_ohe, questions, content_agg_feats, q_tag_table
gc.collect()

ユーザー特徴量読み込み

In [None]:
# The per-user aggregates are stored in two files; stack them (even file
# first) and keep only the raw value matrix. Column 0 is used as the user id
# by the lookup helpers.
user_agg_feats_v = pd.concat(
    [
        pd.read_csv(f"{config.ROOT_PATH}/user_agg_feat_even.csv"),
        pd.read_csv(f"{config.ROOT_PATH}/user_agg_feat_odd.csv"),
    ]
).values

# The intermediate frames are unreferenced by now; reclaim their memory.
gc.collect()

In [None]:
# Map each user to a timestamp taken from the two-column CSV
# (first column -> key, second column -> value).
user_last_timestamp = pd.read_csv(f"{config.ROOT_PATH}/user_last_timestamp.csv")
last_timestamp_dic = dict(user_last_timestamp.values)
del user_last_timestamp
gc.collect()

Window特徴量

In [None]:
# Directory holding the pickled per-user state. An alternative source is the
# dataset "/kaggle/input/riiid-create-data-for-transformer-train-on-gpu".
WINDOW = config.ROOT_PATH


def _load_window_pickle(file_name):
    """Unpickle one state file from the WINDOW directory."""
    with open(f"{WINDOW}/{file_name}", "rb") as f:
        return pickle.load(f)


# Per-user running counters.
user_all_count = _load_window_pickle("user_all_count.pkl")
# Per-user sliding windows and counts consumed by the feature helpers.
user_correct_window_200 = _load_window_pickle("user_correct_window_200.pkl")
prior_question_elapsed_time_window_dict = _load_window_pickle("prior_question_elapsed_time_window_dict.pkl")
prior_question_had_explanation_count = _load_window_pickle("prior_question_had_explanation_count.pkl")
prior_question_had_explanation_window_dict = _load_window_pickle("prior_question_had_explanation_window_dict.pkl")
timediff_window_dict = _load_window_pickle("timediff_window_dict.pkl")

カラムの設定

In [None]:
# Column names for the per-user tag aggregates, one name per tag index
# 0..187: col1 -> worked questions, col2 -> cumulative correct answers,
# col3 -> watched lectures; rate_col names the derived correct-rate columns.
col1, col2, col3, rate_col = [], [], [], []
for i in range(188):
    col1.append(f"work_q_tag_{i}_v3")
    col2.append(f"cumsum_q_tag_{i}_v3")
    col3.append(f"work_l_tag_{i}_v2")
    rate_col.append(f"correct_rate_q_tag_{i}")

# Same order as the value columns of user_agg_feats_v after the user id.
user_agg_feats_c = [*col1, *col2, *col3]

学習中の集計関数

In [None]:
def get_content_feature(_content_id):
    """Return the feature values (all columns after the id) for a content id.

    Raises:
        IndexError: If the id is not present in content_agg_feats_v.
    """
    matching_rows = np.flatnonzero(content_agg_feats_v[:, 0] == _content_id)
    first_match = matching_rows[0]
    return content_agg_feats_v[first_match, 1:].tolist()

def get_user_feature(_user_id):
    """Return the aggregate feature values for a user as a list.

    Unknown users get a list of zeros of the same length, so the return type
    no longer depends on whether the user has been seen before.
    """
    matching_rows = np.flatnonzero(user_agg_feats_v[:, 0] == _user_id)
    if len(matching_rows) == 0:
        return np.zeros(user_agg_feats_v.shape[1] - 1).tolist()
    return user_agg_feats_v[matching_rows[0], 1:].tolist()
    
def get_timediff(row):
    """Return the row's timestamp minus the user's last stored timestamp.

    Users without a stored timestamp yield 0.
    """
    current_timestamp = row["timestamp"]
    user = row["user_id"]
    try:
        previous_timestamp = last_timestamp_dic[user]
    except KeyError:
        return 0
    else:
        return current_timestamp - previous_timestamp
    
    
def get_lgbm_window_feat(_user_id):
    """Return window statistics for a user as a 7-element list.

    Order: elapsed-time std and mean (NaN entries removed first),
    had-explanation std and mean, timediff std and mean, had-explanation count.
    If the user is missing from any of the four state dicts, all seven values
    are 0.
    """
    try:
        elapsed_window = prior_question_elapsed_time_window_dict[_user_id]
        explanation_window = prior_question_had_explanation_window_dict[_user_id]
        timediff_window = timediff_window_dict[_user_id]
        explanation_count = prior_question_had_explanation_count[_user_id]
    except KeyError:
        return [0] * 7

    # Elapsed time may contain NaN entries; drop them before aggregating.
    elapsed = np.array(elapsed_window)[~np.isnan(elapsed_window)]
    return [
        elapsed.std(ddof=config.DDOF),
        elapsed.mean(),
        np.std(explanation_window, ddof=config.DDOF),
        np.mean(explanation_window),
        np.std(timediff_window, ddof=config.DDOF),
        np.mean(timediff_window),
        explanation_count,
    ]

学習中の状態更新関数

In [None]:
def update_infomation(row):
    """Fold one answered row into the global per-user state.

    Updates, for the row's user: the last timestamp, the tag aggregate matrix
    `user_agg_feats_v` (appending a row for a new user), the overall counters
    and the sliding windows.

    Args:
        row: Mapping/Series with `user_id`, `timestamp`, `content_id`,
            `answered_correctly`, `content_type_id`,
            `prior_question_had_explanation` and `prior_question_elapsed_time`.
    """
    global user_agg_feats_v

    _user_id = row["user_id"]
    _timestamp = row["timestamp"]
    _content_id = row["content_id"]
    _answered_correctly = row["answered_correctly"]
    _content_type_id = row["content_type_id"]

    # A missing value can arrive as None (int() raises TypeError) or as float
    # NaN (int() raises ValueError); both count as "no explanation".
    try:
        _prior_question_had_explanation = int(row["prior_question_had_explanation"])
    except (TypeError, ValueError):
        _prior_question_had_explanation = 0

    # None becomes 0; a float NaN passes through unchanged.
    try:
        _prior_question_elapsed_time = float(row["prior_question_elapsed_time"])
    except TypeError:
        _prior_question_elapsed_time = 0

    try:
        _timediff = _timestamp - last_timestamp_dic[_user_id]
    except KeyError:
        _timediff = 0

    # timestamp update
    last_timestamp_dic[_user_id] = _timestamp

    # get content tag values
    if _content_type_id == 0:
        n_work = q_ohe_dic[_content_id]
        n_correct = n_work * _answered_correctly
        n_lecture = np.zeros(188)
    else:
        n_work = np.zeros(188)
        n_correct = np.zeros(188)
        n_lecture = l_ohe_dic[_content_id]
    tag_feats = np.hstack([n_work, n_correct, n_lecture])

    # user features
    idx = np.where(user_agg_feats_v[:,0] == _user_id)[0]
    if len(idx) == 0:
        # append
        append_v = np.hstack([_user_id, tag_feats]).astype(int)
        user_agg_feats_v = np.vstack([user_agg_feats_v, append_v])
    else:
        # update
        idx = idx[0]
        user_agg_feats_v[idx, 1:] += tag_feats.astype(int)

    # count feature
    # Each dict is updated on its own so that a user missing from only one of
    # them does not wipe the counters already held in the other.
    is_correct = int(_answered_correctly == 1)
    prior_question_had_explanation_count[_user_id] = (
        prior_question_had_explanation_count.get(_user_id, 0)
        + _prior_question_had_explanation
    )
    counts = user_all_count.get(_user_id)
    if counts is None:
        user_all_count[_user_id] = [1, is_correct]
    else:
        counts[0] += 1
        counts[1] += is_correct

    # Window features
    # The windows hold at most 201 entries; the oldest entry is dropped from
    # every window once the correctness window is full. As above, each window
    # is handled independently instead of resetting all of them when one
    # lookup fails.
    window_is_full = len(user_correct_window_200.get(_user_id, ())) == 201
    window_updates = (
        (user_correct_window_200, is_correct),
        (prior_question_elapsed_time_window_dict, _prior_question_elapsed_time),
        (prior_question_had_explanation_window_dict, _prior_question_had_explanation),
        (timediff_window_dict, _timediff),
    )
    for window_dict, new_value in window_updates:
        window = window_dict.setdefault(_user_id, [])
        if window_is_full and window:
            window.pop(0)
        window.append(new_value)

窓関数

In [None]:
def get_window_n_correctry(_user_id):
    """Return [correct count, correct rate, correct std] over the user's window.

    The rate is only reported (count / 200) once the window holds 201 entries;
    before that it is 0. Unknown users yield [0, 0, 0].
    """
    try:
        window = user_correct_window_200[_user_id]
    except KeyError:
        return [0, 0, 0]
    hits = np.array(window) == 1
    n_correct = hits.sum()
    spread = np.std(hits, ddof=config.DDOF)
    rate = n_correct / 200 if len(hits) == 201 else 0
    return [n_correct, rate, spread]
    
def get_all_count(_user_id):
    """Return the user's stored counters followed by counter[1] / counter[0].

    Unknown users yield [0, 0, 0].
    """
    try:
        counts = user_all_count[_user_id]
    except KeyError:
        return [0, 0, 0]

    ratio = counts[1] / counts[0]
    return counts + [ratio]

# Infer

In [None]:
if not config.validaten_flg:
    # Submission mode: use the competition environment.
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
else:
    # Local validation mode: replay a saved validation split and collect the
    # predictions in a list instead of submitting them.
    target_df = pd.read_pickle('../input/riiid-cross-validation-files/cv1_valid.pickle')
    iter_test = Iter_Valid(target_df, max_user=1000)
    predicted = []
    def set_predict(df):
        predicted.append(df)
    # Shrink all per-user state to the first 10000 rows of the user matrix.
    user_agg_feats_v = user_agg_feats_v[:10000]
    kept_users = user_agg_feats_v[:, 0]
    last_timestamp_dic = {k: last_timestamp_dic[k] for k in kept_users}
    user_correct_window_200 = {k: user_correct_window_200[k] for k in kept_users}
    user_all_count = {k: user_all_count[k] for k in kept_users}
    prior_question_had_explanation_count = {k: prior_question_had_explanation_count[k] for k in kept_users}
    prior_question_elapsed_time_window_dict = {k: prior_question_elapsed_time_window_dict[k] for k in kept_users}
    prior_question_had_explanation_window_dict = {k: prior_question_had_explanation_window_dict[k] for k in kept_users}
    timediff_window_dict = {k: timediff_window_dict[k] for k in kept_users}

In [None]:
# Print the percentage of system memory in use at this point of the notebook.
import psutil
print(psutil.virtual_memory().percent)

In [None]:
import ast

# Main inference loop: for every batch, first fold the now-known answers of
# the previous batch into the per-user state, then build the features, run the
# XGBoost and the two SAKT models, blend and submit.
prev_df = None
pbar = tqdm(total=2500000)
warnings.simplefilter('ignore')
for (test_df, sample_prediction_df) in iter_test:
    if prev_df is not None:
        # The previous batch's answers arrive as the text form of a list in the
        # first row. Parse it as a literal instead of executing it with eval().
        prev_df["answered_correctly"] = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
        _ = prev_df.apply(update_infomation, axis=1)

        # update for SAKT
        prev_test_df = prev_df[prev_df.content_type_id == False]
        prev_group = prev_test_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            prev_group_content = prev_group[prev_user_id][0]
            prev_group_answered_correctly = prev_group[prev_user_id][1]
            if prev_user_id in group.index:
                group[prev_user_id] = (np.append(group[prev_user_id][0], prev_group_content),
                                       np.append(group[prev_user_id][1], prev_group_answered_correctly))
            else:
                group[prev_user_id] = (prev_group_content, prev_group_answered_correctly)

            # Keep only the most recent MAX_SEQ interactions per user.
            if len(group[prev_user_id][0]) > MAX_SEQ:
                new_group_content = group[prev_user_id][0][-MAX_SEQ:]
                new_group_answered_correctly = group[prev_user_id][1][-MAX_SEQ:]
                group[prev_user_id] = (new_group_content, new_group_answered_correctly)

    prev_df = test_df.reset_index(drop=True)

    # A batch without any question row has nothing to score, and building a
    # DataLoader with batch_size=0 would raise. Submit an empty prediction
    # frame and move on; the rows are still folded into the state on the next
    # iteration through prev_df.
    question_mask = (test_df['content_type_id'] == 0).values
    if not question_mask.any():
        empty_prediction = test_df.loc[question_mask, ['row_id']].copy()
        empty_prediction['answered_correctly'] = 0.5
        set_predict(empty_prediction)
        pbar.update(len(test_df))
        continue

    _test_df = test_df

    # merge features contents
    c_agg_feat_df = pd.DataFrame(_test_df["content_id"].map(get_content_feature).tolist(), columns=content_agg_feats_c)
    _test_df = pd.concat([_test_df.reset_index(drop=True), c_agg_feat_df], axis=1)

    # merge features users
    u_agg_feat_df = pd.DataFrame(_test_df["user_id"].map(get_user_feature).tolist(), columns=user_agg_feats_c)
    _test_df = pd.concat([_test_df.reset_index(drop=True), u_agg_feat_df], axis=1)

    # calculate per-tag correct rate (0/0 becomes 0 via nan_to_num)
    rate_df = pd.DataFrame(np.nan_to_num(_test_df[col2].values/_test_df[col1].values), columns=rate_col)
    _test_df = pd.concat([_test_df.reset_index(drop=True), rate_df], axis=1)

    # calc timediff
    _test_df["timediff"] = _test_df.apply(lambda x: get_timediff(x), axis=1)

    # fill NaN
    _test_df["prior_question_had_explanation"] = _test_df["prior_question_had_explanation"].fillna(0).astype(int)

    # Window features
    w_df = pd.DataFrame(_test_df["user_id"].map(get_window_n_correctry).tolist(),
                        columns=["correct_sum_w200", "rate_sum_w200", "correct_std_w200"])
    all_df = pd.DataFrame(_test_df["user_id"].map(get_all_count).tolist(),
                          columns=['work_sum_all', 'correct_sum_all', 'rate_sum_all'])
    _test_df = pd.concat([_test_df, w_df, all_df], axis=1)

    # LGBM features
    new_feat = pd.DataFrame(_test_df["user_id"].map(get_lgbm_window_feat).tolist(),
            columns=[
                "prior_question_elapsed_time_std_w200",
                "prior_question_elapsed_time_avg_w200",
                "prior_question_had_explanation_std_w200",
                "prior_question_had_explanation_avg_w200",
                "timediff_std_w200",
                "timediff_avg_w200",
                "prior_question_had_explanation_count"
    ])
    _test_df = pd.concat([_test_df, new_feat], axis=1)

    _test_df["work_per_time"] = (_test_df["work_sum_all"]/_test_df["timestamp"]).fillna(0).values
    _test_df["correct_per_time"] = (_test_df["correct_sum_all"]/_test_df["timestamp"]).fillna(0).values
    _test_df["prior_question_per_time"] = (_test_df["prior_question_had_explanation_count"]/_test_df["timestamp"]).fillna(0).values

    # transfer feature df row to dict
    # NOTE: kept as iterrows() on purpose. It casts each row to one common
    # dtype, and the Predictor matches categorical values by their str() form,
    # so switching to to_dict("records") could change the encoded values.
    feature_df = _test_df[feature_names].reset_index(drop=True)
    feature_dict = [row.to_dict() for _, row in feature_df.iterrows()]

    # SAKT
    sakt_test_df = test_df[test_df.content_type_id == False]

    test_dataset = TestDataset(group, sakt_test_df, n_skill, max_seq=MAX_SEQ)
    # One batch holding every question row of this group.
    test_dataloader = DataLoader(test_dataset, batch_size=len(sakt_test_df), shuffle=False)

    item = next(iter(test_dataloader))
    x = item[0].to(device).long()
    target_id = item[1].to(device).long()

    with torch.no_grad():
        output, _ = sakt_model(x, target_id)
    output = torch.sigmoid(output)
    # The last sequence position is the prediction for the target question.
    output = output[:, -1]
    sakt_pred = output.cpu().numpy()

    with torch.no_grad():
        output_b, _ = sakt_model_b(x, target_id)
    output_b = torch.sigmoid(output_b)
    output_b = output_b[:, -1]
    sakt_pred_b = output_b.cpu().numpy()

    # predict
    # XGB: scored on every row, then restricted to the question rows.
    pred = model.predict(feature_dict)
    pred = [p["answered_correctly_probs"][0] for p in pred]
    pred = [pred[i] for i, v in enumerate((test_df['content_type_id'] == 0).values) if v]

    # Weighted blend: XGBoost 4, each SAKT model 1.
    preds_avg = np.average(np.array([pred, sakt_pred, sakt_pred_b]).T, weights=np.array([4, 1, 1]), axis=1)
    test_df.loc[test_df['content_type_id'] == 0, "answered_correctly"] = preds_avg

    # Submit
    set_predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

    pbar.update(len(test_df))

pbar.close()