In [None]:
import pandas as pd
import numpy as np
import gc
import pickle
import psutil
import joblib
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm
import lightgbm as lgb
import riiideducation
import matplotlib.pyplot as plt
import seaborn as sns

import random
import os

In [None]:
# Name of the label column.
TARGET = 'answered_correctly'

# Columns used to train and predict, one entry per line.
# NOTE(review): the order is preserved exactly; presumably it fixes the column
# order handed to the model, so append rather than reorder — confirm with the
# training cell.
FEATURES = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'content_field',
    'answered_correctly_u_avg',
    'elapsed_time_u_avg',
    'explanation_u_avg',
    'elapsed_time_q_avg',
    'explanation_q_avg',
    'explanation_qtrue_avg',
    'explanation_qfalse_avg',
    'beta_q',
    'answered_correctly_uq_count',
    'timestamp_u_recency_1',
    'timestamp_u_recency_2',
    'timestamp_u_recency_3',
    'timestamp_u_incorrect_recency',
    'theta_u',
    'performance_u',
    'task_container_avg',
    'tags_avg',
    'mean_question_accuracy',
    'std_accuracy',
    'tags_encoded',
    'tag_1',
    'tag_2',
    'answered_correctly_u_num',
    'answered_correctly_u_num_field',
    'answered_correctly_u_avg_field',
    'answered_correctly_difficulty_weighted_avg',
    'answered_correctly_difficulty_weighted_avg_field',
    'min_u_wrong_difficulty',
    'min_u_wrong_difficulty_field',
    'max_u_solved_difficulty',
    'max_u_solved_difficulty_field',
    'session_u_time',
    'time_u_to_last_session',
    'elo_rate',
    'tags_lsi',
    'question_elapsed_time_mean',
    'tag_acc_max',
    'tag_acc_min',
    'tag_acc_count',
    'explanation_qtrue_mean',
    'explanation_qfalse_mean',
]

In [None]:
# Seed shared by every random number generator seeded below.
SEED = 123


def seed_everything(seed):
    """Seed the RNG sources this notebook controls.

    Seeds Python's ``random`` module and NumPy's global generator, and exports
    ``PYTHONHASHSEED`` into the process environment.

    NOTE(review): Python reads ``PYTHONHASHSEED`` at interpreter start-up, so
    setting it here only affects child processes, not the hash randomisation
    of the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(SEED)

In [None]:
# Functions for theta and beta
def get_new_theta(is_good_answer, beta, theta, nb_previous_answers):
    """Return the updated user rating ``theta`` after one answer.

    ``theta`` moves by the prediction error (observed outcome minus the
    predicted probability of a good answer), scaled by the user learning rate
    for ``nb_previous_answers``.
    """
    step = learning_rate_theta(nb_previous_answers)
    prediction_error = is_good_answer - probability_of_good_answer(theta, beta)
    return theta + step * prediction_error

def get_new_beta(is_good_answer, beta, theta, nb_previous_answers):
    """Return the updated question rating ``beta`` after one answer.

    ``beta`` moves in the opposite direction to the prediction error (a good
    answer lowers it, a bad answer raises it), scaled by the question learning
    rate for ``nb_previous_answers``.
    """
    step = learning_rate_beta(nb_previous_answers)
    prediction_error = is_good_answer - probability_of_good_answer(theta, beta)
    return beta - step * prediction_error

def learning_rate_theta(nb_answers):
    """Step size for user-rating updates: 0.3 decaying with ``nb_answers``, floored at 0.04."""
    decayed_rate = 0.3 / (1 + 0.01 * nb_answers)
    return max(decayed_rate, 0.04)

def learning_rate_beta(nb_answers):
    """Step size for question-rating updates: 1 at zero answers, decaying with ``nb_answers`` (no floor)."""
    damping = 1 + 0.05 * nb_answers
    return 1 / damping

def probability_of_good_answer(theta, beta, left_asymptote = 1/4):
    """Predicted probability of a good answer for ratings ``theta`` and ``beta``.

    A sigmoid of ``theta - beta`` rescaled into ``[left_asymptote, 1]``, so the
    prediction never drops below ``left_asymptote``.
    """
    headroom = 1 - left_asymptote
    return left_asymptote + headroom * sigmoid(theta - beta)

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); works on scalars and NumPy arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
# Function that updates the running user/question stats row by row (no feature columns are returned)
def add_train_features(df,
                answered_correctly_u_count, # v1 & v2
                answered_correctly_u_sum, # v1 & v2
                elapsed_time_u_sum, # v1
                explanation_u_sum, # v1
                timestamp_u, # v1
                timestamp_u_incorrect, # v1
                latest_u_theta, # v1 & v2
                answered_correctly_q_count, # v1 & v2
                answered_correctly_q_sum,  # v1
                elapsed_time_q_sum, # v1
                explanation_q_sum, # v1
                explanation_qtrue_sum, # v1
                explanation_qtrue_count, # v1
                latest_q_beta,  # v1 & v2
                answered_correctly_uq, # v1 & v2
                question_avg_sum_u, # v1
                task_container_sum, # v1 & v2
                task_container_count, # v1 & v2
                tags_sum, # v1
                tags_count, # v1
                answered_correctly_u_count_field, # v2
                answered_correctly_u_sum_field, # v2
                answered_correctly_difficulty_weighted_sum, # v2
                answered_correctly_difficulty_weighted_sum_field, # v2
                max_solved_difficulty, # v2
                max_solved_difficulty_field, # v2
                min_wrong_difficulty, # v2
                min_wrong_difficulty_field, # v2
                session_time, # v2
                since_last_session_time, # v2
                last_session_start_time, # v2
                first_action_time, # v2
                update = True):
    """Replay ``df`` row by row and fold every row into the running state.

    Unlike ``add_features`` this builds no feature columns and returns
    nothing; its only effects are the in-place updates listed below.

    Side effects
    ------------
    * ``df['tags']`` is overwritten in place with lists of ints. Values that
      are already a list/tuple/ndarray are kept (elements cast to int), so the
      function can be re-applied to a frame it has already processed.
    * Every state container passed in is mutated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``answered_correctly``, ``content_id``,
        ``prior_question_elapsed_time``, ``prior_question_had_explanation``,
        ``timestamp``, ``task_container_id``, ``tags``, ``content_field`` and
        ``mean_question_accuracy``. ``prior_question_had_explanation`` must be
        convertible with ``int()`` (no NaN).
    *_u_*, *_q_*, tags_*, task_container_*, ... : dict-like
        Running statistics. Containers indexed directly with ``+=`` must
        supply a default for unseen keys (e.g. ``defaultdict``);
        ``timestamp_u`` and ``timestamp_u_incorrect`` must default to a list.
        The ``*_field`` containers and ``answered_correctly_uq`` are
        initialised here on first sight of a user.
    session_time
        Accepted for signature parity with ``add_features``; not used.
    update : bool
        When True the outcome-dependent statistics (correct sums, theta/beta,
        difficulty extremes, last incorrect timestamp) are updated as well.
        When False only the outcome-independent counters are advanced.
    """
    # Per-user "field" statistics are fixed-size lists indexed by
    # ``content_field - 1``.
    # NOTE(review): assumes content_field takes values 1..7 — confirm.
    n_fields = 7
    # A new session starts when this much time has passed since the last
    # session start. NOTE(review): two hours if timestamps are in
    # milliseconds — confirm the unit.
    session_gap = 7200 * 1000

    def _parse_tags(raw_tags):
        # Already-parsed containers pass through; the str() round-trip used
        # for raw values would choke on "[1, 2]".
        if isinstance(raw_tags, (list, tuple, np.ndarray)):
            return [int(tag) for tag in raw_tags]
        # Raw values are whitespace-separated ids; NaN stringifies to 'nan'
        # and yields an empty list.
        return [int(tag) for tag in str(raw_tags).split() if tag != 'nan']

    df['tags'] = df['tags'].apply(_parse_tags)
    # -----------------------------------------------------------------------
    columns = ['user_id', 'answered_correctly', 'content_id',
               'prior_question_elapsed_time', 'prior_question_had_explanation',
               'timestamp', 'task_container_id', 'tags', 'content_field',
               'mean_question_accuracy']
    for (user, correct, question, elapsed, had_explanation, timestamp,
         container, tags, content_field, question_accuracy) in df[columns].values:
        field = int(content_field) - 1

        # First sight of this user: create the per-field accumulators.
        if user not in answered_correctly_u_count_field:
            answered_correctly_u_count_field[user] = [0] * n_fields
            answered_correctly_u_sum_field[user] = [0] * n_fields
            answered_correctly_difficulty_weighted_sum_field[user] = [0] * n_fields
            max_solved_difficulty_field[user] = [0] * n_fields
            min_wrong_difficulty_field[user] = [0] * n_fields

        # Session bookkeeping. A stored first_action_time of 0 doubles as the
        # "not seen yet" marker, exactly as in add_features.
        if first_action_time[user] == 0:
            first_action_time[user] = timestamp
            last_session_start_time[user] = timestamp
        elif timestamp - last_session_start_time[user] >= session_gap:
            since_last_session_time[user] = (timestamp - last_session_start_time[user]) /\
                                            1000 / 3600
            last_session_start_time[user] = timestamp

        # ------------------------------------------------------------------
        # Client features updates (independent of the answer outcome)
        answered_correctly_u_count[user] += 1
        answered_correctly_u_count_field[user][field] += 1
        elapsed_time_u_sum[user] += elapsed
        explanation_u_sum[user] += int(had_explanation)
        # Keep only the three most recent timestamps per user.
        if len(timestamp_u[user]) == 3:
            timestamp_u[user].pop(0)
        timestamp_u[user].append(timestamp)
        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_count[question] += 1
        elapsed_time_q_sum[question] += elapsed
        explanation_q_sum[question] += int(had_explanation)

        # Keyed by user (not by question, despite the "q" in the name).
        if had_explanation:
            explanation_qtrue_count[user] += 1
        # ------------------------------------------------------------------
        # Client Question updates
        if user not in answered_correctly_uq:
            answered_correctly_uq[user] = defaultdict(int)
        answered_correctly_uq[user][question] += 1
        # ------------------------------------------------------------------
        # Other features updates
        task_container_count[container] += 1
        for tag in tags:
            tags_count[tag] += 1
        # ------------------------------------------------------------------
        # Outcome-dependent updates (skipped when the answer is not known)
        if update:
            # Snapshot both ratings so the theta and beta updates use the
            # same pre-update values.
            theta = latest_u_theta[user]
            beta = latest_q_beta[question]
            # ------------------------------------------------------------------
            # Client features updates
            weighted_score = correct * (1 - question_accuracy) * 3
            answered_correctly_u_sum[user] += correct
            answered_correctly_u_sum_field[user][field] += correct
            answered_correctly_difficulty_weighted_sum[user] += weighted_score
            answered_correctly_difficulty_weighted_sum_field[user][field] += weighted_score
            if correct == 0:
                # Only the latest incorrect timestamp is kept.
                if len(timestamp_u_incorrect[user]) == 1:
                    timestamp_u_incorrect[user].pop(0)
                timestamp_u_incorrect[user].append(timestamp)

                # Stores the highest mean accuracy among questions the user
                # got wrong.
                if question_accuracy > min_wrong_difficulty[user]:
                    min_wrong_difficulty[user] = question_accuracy

                if question_accuracy > min_wrong_difficulty_field[user][field]:
                    min_wrong_difficulty_field[user][field] = question_accuracy
            else:
                solved_difficulty = 1 - question_accuracy
                if solved_difficulty > max_solved_difficulty[user]:
                    max_solved_difficulty[user] = solved_difficulty

                if solved_difficulty > max_solved_difficulty_field[user][field]:
                    max_solved_difficulty_field[user][field] = solved_difficulty

            # The answer count passed here already includes the current row.
            latest_u_theta[user] = get_new_theta(correct,
                                                 beta, theta,
                                                 answered_correctly_u_count[user])
            # ------------------------------------------------------------------
            # Other features updates
            task_container_sum[container] += correct

            for tag in tags:
                tags_sum[tag] += correct
            # ------------------------------------------------------------------
            # Question features updates
            answered_correctly_q_sum[question] += correct
            if had_explanation:
                explanation_qtrue_sum[user] += correct
            latest_q_beta[question] = get_new_beta(correct,
                                                   beta, theta,
                                                   answered_correctly_q_count[question])
            # ------------------------------------------------------------------
        # Question average sum by user updates (uses the question's accuracy
        # after this row has been counted)
        question_avg_sum_u[user] += answered_correctly_q_sum[question] /\
                                    answered_correctly_q_count[question]

In [None]:
# Function that builds the per-row features from the running stats, then updates those stats
def add_features(df,
                answered_correctly_u_count, # v1 & v2
                answered_correctly_u_sum, # v1 & v2
                elapsed_time_u_sum, # v1
                explanation_u_sum, # v1
                timestamp_u, # v1
                timestamp_u_incorrect, # v1
                latest_u_theta, # v1 & v2
                answered_correctly_q_count, # v1 & v2
                answered_correctly_q_sum,  # v1
                elapsed_time_q_sum, # v1
                explanation_q_sum, # v1
                explanation_qtrue_sum, # v1
                explanation_qtrue_count, # v1
                latest_q_beta,  # v1 & v2
                answered_correctly_uq, # v1 & v2
                question_avg_sum_u, # v1
                task_container_sum, # v1 & v2
                task_container_count, # v1 & v2
                tags_sum, # v1
                tags_count, # v1
                answered_correctly_u_count_field, # v2
                answered_correctly_u_sum_field, # v2
                answered_correctly_difficulty_weighted_sum, # v2
                answered_correctly_difficulty_weighted_sum_field, # v2
                max_solved_difficulty, # v2
                max_solved_difficulty_field, # v2
                min_wrong_difficulty, # v2
                min_wrong_difficulty_field, # v2
                session_time, # v2
                since_last_session_time, # v2
                last_session_start_time, # v2
                first_action_time, # v2
                update = True):
    """Compute per-row features from the running state, then advance the state.

    For each row of ``df``, in order, the feature values are first read from
    the state as it stood before that row; the row is then folded into the
    state. A feature therefore never includes the row's own outcome.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``answered_correctly``, ``content_id``,
        ``prior_question_elapsed_time``, ``prior_question_had_explanation``,
        ``timestamp``, ``task_container_id``, ``tags``, ``content_field`` and
        ``mean_question_accuracy``. ``tags`` must already be an iterable of
        tag ids per row, and ``prior_question_had_explanation`` must be
        convertible with ``int()`` (no NaN). ``df`` itself is not modified.
    *_u_*, *_q_*, tags_*, task_container_*, ... : dict-like
        Running statistics, mutated in place. Containers indexed directly
        must supply a default for unseen keys (e.g. ``defaultdict``);
        ``timestamp_u`` and ``timestamp_u_incorrect`` must default to a list.
        The ``*_field`` containers and ``answered_correctly_uq`` are
        initialised here on first sight of a user.
    session_time
        Accepted but not used.
    update : bool
        When True the outcome-dependent statistics (correct sums, theta/beta,
        difficulty extremes, last incorrect timestamp) are updated from
        ``answered_correctly``. When False only the outcome-independent
        counters are advanced.

    Returns
    -------
    pandas.DataFrame
        ``df``'s columns followed by the computed feature columns, carrying
        ``df``'s own index. The features are attached by row position, so a
        filtered or sliced frame is handled correctly.
    """
    # Per-user "field" statistics are fixed-size lists indexed by
    # ``content_field - 1``.
    # NOTE(review): assumes content_field takes values 1..7 — confirm.
    n_fields = 7
    # A new session starts when this much time has passed since the last
    # session start. NOTE(review): two hours if timestamps are in
    # milliseconds — confirm the unit.
    session_gap = 7200 * 1000
    n_rows = len(df)

    def _float_col():
        return np.zeros(n_rows, dtype = np.float32)

    def _int_col():
        return np.zeros(n_rows, dtype = np.int32)

    # -----------------------------------------------------------------------
    # Client features
    answered_correctly_u_avg = _float_col() # v1 & v2
    elapsed_time_u_avg = _float_col() # v1
    explanation_u_avg = _float_col() # v1
    timestamp_u_recency_1 = _float_col() # v1
    timestamp_u_recency_2 = _float_col() # v1
    timestamp_u_recency_3 = _float_col() # v1
    timestamp_u_incorrect_recency = _float_col() # v1
    theta_u = _float_col() # v1 & v2
    answered_correctly_u_num = _int_col() # v2
    answered_correctly_u_num_field = _int_col() # v2
    answered_correctly_u_avg_field = _float_col() # v2
    answered_correctly_difficulty_weighted_avg = _float_col() # v2
    answered_correctly_difficulty_weighted_avg_field = _float_col() # v2
    max_u_solved_difficulty = _float_col() # v2
    max_u_solved_difficulty_field = _float_col() # v2
    min_u_wrong_difficulty = _float_col() # v2
    min_u_wrong_difficulty_field = _float_col() # v2
    session_u_time = _float_col() # v2
    time_u_to_last_session = _float_col() # v2
    # -----------------------------------------------------------------------
    # Question features
    elapsed_time_q_avg = _float_col() # v1
    explanation_q_avg = _float_col() # v1
    explanation_qtrue_avg = _float_col() # v1
    explanation_qfalse_avg = _float_col() # v1
    beta_q = _float_col() # v1 & v2
    # -----------------------------------------------------------------------
    # User Question
    answered_correctly_uq_count = _int_col() # v1 & v2
    performance_u = _float_col() # v1
    elo_rate = _float_col() # v2
    # -----------------------------------------------------------------------
    # Other features
    task_container_avg = _float_col() # v1 & v2
    tags_avg = _float_col() # v1
    # -----------------------------------------------------------------------
    columns = ['user_id', 'answered_correctly', 'content_id',
               'prior_question_elapsed_time', 'prior_question_had_explanation',
               'timestamp', 'task_container_id', 'tags', 'content_field',
               'mean_question_accuracy']
    for num, (user, correct, question, elapsed, had_explanation, timestamp,
              container, tags, content_field, question_accuracy) in enumerate(df[columns].values):

        field = int(content_field) - 1
        # ------------------------------------------------------------------
        # Client features assignation (state as it was BEFORE this row)
        u_count = answered_correctly_u_count[user]
        answered_correctly_u_num[num] = u_count

        if u_count != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum[user] / u_count
            elapsed_time_u_avg[num] = elapsed_time_u_sum[user] / u_count
            explanation_u_avg[num] = explanation_u_sum[user] / u_count
            # User accuracy minus the mean accuracy of the questions they saw.
            performance_u[num] = answered_correctly_u_avg[num] - \
                                 question_avg_sum_u[user] / u_count
            theta_u[num] = latest_u_theta[user]
            answered_correctly_difficulty_weighted_avg[num] = \
                answered_correctly_difficulty_weighted_sum[user] / u_count
            max_u_solved_difficulty[num] = max_solved_difficulty[user]
            min_u_wrong_difficulty[num] = min_wrong_difficulty[user]
        else:
            answered_correctly_u_avg[num] = np.nan
            elapsed_time_u_avg[num] = np.nan
            explanation_u_avg[num] = np.nan
            performance_u[num] = np.nan
            theta_u[num] = 0
            answered_correctly_difficulty_weighted_avg[num] = np.nan
            max_u_solved_difficulty[num] = np.nan
            min_u_wrong_difficulty[num] = np.nan

        # First sight of this user: create the per-field accumulators.
        if user not in answered_correctly_u_count_field:
            answered_correctly_u_count_field[user] = [0] * n_fields
            answered_correctly_u_sum_field[user] = [0] * n_fields
            answered_correctly_difficulty_weighted_sum_field[user] = [0] * n_fields
            max_solved_difficulty_field[user] = [0] * n_fields
            min_wrong_difficulty_field[user] = [0] * n_fields
        field_count = answered_correctly_u_count_field[user][field]
        answered_correctly_u_num_field[num] = field_count

        if field_count != 0:
            answered_correctly_u_avg_field[num] = \
                answered_correctly_u_sum_field[user][field] / field_count
            answered_correctly_difficulty_weighted_avg_field[num] = \
                answered_correctly_difficulty_weighted_sum_field[user][field] / field_count
            max_u_solved_difficulty_field[num] = max_solved_difficulty_field[user][field]
            min_u_wrong_difficulty_field[num] = min_wrong_difficulty_field[user][field]
        else:
            answered_correctly_u_avg_field[num] = np.nan
            answered_correctly_difficulty_weighted_avg_field[num] = np.nan
            max_u_solved_difficulty_field[num] = np.nan
            min_u_wrong_difficulty_field[num] = np.nan

        # Session bookkeeping. A stored first_action_time of 0 doubles as the
        # "not seen yet" marker; session_u_time stays 0 on that branch.
        if first_action_time[user] == 0:
            first_action_time[user] = timestamp
            last_session_start_time[user] = timestamp
        elif timestamp - last_session_start_time[user] >= session_gap:
            since_last_session_time[user] = (timestamp - last_session_start_time[user]) /\
                                            1000 / 3600
            last_session_start_time[user] = timestamp
            session_u_time[num] = 0
        else:
            session_u_time[num] = (timestamp - last_session_start_time[user]) /\
                                  1000 / 60
        time_u_to_last_session[num] = since_last_session_time[user]

        # Gaps to the user's last three interactions, most recent first;
        # missing history is NaN. Lists longer than three are left at the
        # zero default, as before.
        recent = timestamp_u[user]
        if len(recent) <= 3:
            gaps = [timestamp - past for past in reversed(recent)]
            gaps += [np.nan] * (3 - len(recent))
            timestamp_u_recency_1[num] = gaps[0]
            timestamp_u_recency_2[num] = gaps[1]
            timestamp_u_recency_3[num] = gaps[2]

        if len(timestamp_u_incorrect[user]) == 0:
            timestamp_u_incorrect_recency[num] = np.nan
        else:
            timestamp_u_incorrect_recency[num] = timestamp - timestamp_u_incorrect[user][0]
        # ------------------------------------------------------------------
        # Question features assignation
        q_count = answered_correctly_q_count[question]
        if q_count != 0:
            elapsed_time_q_avg[num] = elapsed_time_q_sum[question] / q_count
            explanation_q_avg[num] = explanation_q_sum[question] / q_count
            beta_q[num] = latest_q_beta[question]
        else:
            elapsed_time_q_avg[num] = np.nan
            explanation_q_avg[num] = np.nan
            beta_q[num] = 0

        # User accuracy split by whether the prior question had an
        # explanation; both accumulators are keyed by user.
        qtrue_count = explanation_qtrue_count[user]
        if qtrue_count != 0:
            explanation_qtrue_avg[num] = explanation_qtrue_sum[user] / qtrue_count
        else:
            explanation_qtrue_avg[num] = np.nan

        if u_count - qtrue_count != 0:
            explanation_qfalse_avg[num] = \
                (answered_correctly_u_sum[user] - explanation_qtrue_sum[user]) /\
                (u_count - qtrue_count)
        else:
            explanation_qfalse_avg[num] = np.nan
        # ------------------------------------------------------------------
        # Client Question assignation
        elo_rate[num] = latest_u_theta[user] - latest_q_beta[question]
        # ------------------------------------------------------------------
        # Other features assignation
        if task_container_count[container] != 0:
            task_container_avg[num] = task_container_sum[container] /\
                                      task_container_count[container]
        else:
            task_container_avg[num] = np.nan

        # Mean accuracy over the row's tags that have been seen before.
        tags_means = [tags_sum[tag] / tags_count[tag]
                      for tag in tags if tags_count[tag] != 0]
        tags_avg[num] = np.mean(tags_means) if tags_means else np.nan
        # ------------------------------------------------------------------
        # ------------------------------------------------------------------
        # Client features updates (independent of the answer outcome)
        answered_correctly_u_count[user] += 1
        answered_correctly_u_count_field[user][field] += 1
        elapsed_time_u_sum[user] += elapsed
        explanation_u_sum[user] += int(had_explanation)
        # Keep only the three most recent timestamps per user.
        if len(timestamp_u[user]) == 3:
            timestamp_u[user].pop(0)
        timestamp_u[user].append(timestamp)
        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_count[question] += 1
        elapsed_time_q_sum[question] += elapsed
        explanation_q_sum[question] += int(had_explanation)

        if had_explanation:
            explanation_qtrue_count[user] += 1
        # ------------------------------------------------------------------
        # Client Question updates (the feature is the count before this row)
        if user not in answered_correctly_uq:
            answered_correctly_uq[user] = defaultdict(int)
        answered_correctly_uq_count[num] = answered_correctly_uq[user][question]
        answered_correctly_uq[user][question] += 1
        # ------------------------------------------------------------------
        # Other features updates
        task_container_count[container] += 1

        for tag in tags:
            tags_count[tag] += 1
        # ------------------------------------------------------------------
        # Outcome-dependent updates (skipped when the answer is not known)
        if update:
            # Snapshot both ratings so the theta and beta updates use the
            # same pre-update values.
            theta = latest_u_theta[user]
            beta = latest_q_beta[question]
            # ------------------------------------------------------------------
            # Client features updates
            weighted_score = correct * (1 - question_accuracy) * 3
            answered_correctly_u_sum[user] += correct
            answered_correctly_u_sum_field[user][field] += correct
            answered_correctly_difficulty_weighted_sum[user] += weighted_score
            answered_correctly_difficulty_weighted_sum_field[user][field] += weighted_score
            if correct == 0:
                # Only the latest incorrect timestamp is kept.
                if len(timestamp_u_incorrect[user]) == 1:
                    timestamp_u_incorrect[user].pop(0)
                timestamp_u_incorrect[user].append(timestamp)

                # Stores the highest mean accuracy among questions the user
                # got wrong.
                if question_accuracy > min_wrong_difficulty[user]:
                    min_wrong_difficulty[user] = question_accuracy

                if question_accuracy > min_wrong_difficulty_field[user][field]:
                    min_wrong_difficulty_field[user][field] = question_accuracy
            else:
                solved_difficulty = 1 - question_accuracy
                if solved_difficulty > max_solved_difficulty[user]:
                    max_solved_difficulty[user] = solved_difficulty

                if solved_difficulty > max_solved_difficulty_field[user][field]:
                    max_solved_difficulty_field[user][field] = solved_difficulty

            # The answer count passed here already includes the current row.
            latest_u_theta[user] = get_new_theta(correct,
                                                 beta, theta,
                                                 answered_correctly_u_count[user])
            # ------------------------------------------------------------------
            # Other features updates
            task_container_sum[container] += correct

            for tag in tags:
                tags_sum[tag] += correct
            # ------------------------------------------------------------------
            # Question features updates
            answered_correctly_q_sum[question] += correct
            if had_explanation:
                explanation_qtrue_sum[user] += correct
            latest_q_beta[question] = get_new_beta(correct,
                                                   beta, theta,
                                                   answered_correctly_q_count[question])
            # ------------------------------------------------------------------
        # Question average sum by user updates (uses the question's accuracy
        # after this row has been counted)
        question_avg_sum_u[user] += answered_correctly_q_sum[question] /\
                                    answered_correctly_q_count[question]

    # Built on df's own index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign any filtered or sliced frame.
    user_df = pd.DataFrame({'answered_correctly_u_avg': answered_correctly_u_avg,
                            'elapsed_time_u_avg': elapsed_time_u_avg,
                            'explanation_u_avg': explanation_u_avg,
                            'elapsed_time_q_avg': elapsed_time_q_avg,
                            'explanation_q_avg': explanation_q_avg,
                            'explanation_qtrue_avg': explanation_qtrue_avg,
                            'explanation_qfalse_avg': explanation_qfalse_avg,
                            'answered_correctly_uq_count': answered_correctly_uq_count,
                            'timestamp_u_recency_1': timestamp_u_recency_1,
                            'timestamp_u_recency_2': timestamp_u_recency_2,
                            'timestamp_u_recency_3': timestamp_u_recency_3,
                            'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency,
                            'performance_u': performance_u,
                            'tags_avg': tags_avg,
                            'answered_correctly_u_num': answered_correctly_u_num,
                            'answered_correctly_u_num_field': answered_correctly_u_num_field,
                            'answered_correctly_u_avg_field': answered_correctly_u_avg_field,
                            'answered_correctly_difficulty_weighted_avg': answered_correctly_difficulty_weighted_avg,
                            'answered_correctly_difficulty_weighted_avg_field': answered_correctly_difficulty_weighted_avg_field,
                            'max_u_solved_difficulty': max_u_solved_difficulty,
                            'max_u_solved_difficulty_field': max_u_solved_difficulty_field,
                            'min_u_wrong_difficulty': min_u_wrong_difficulty,
                            'min_u_wrong_difficulty_field': min_u_wrong_difficulty_field,
                            'session_u_time': session_u_time,
                            'time_u_to_last_session': time_u_to_last_session,
                            'theta_u': theta_u,
                            'beta_q': beta_q,
                            'elo_rate': elo_rate,
                            'task_container_avg': task_container_avg,
                            }, index = df.index)

    df = pd.concat([df, user_df], axis = 1)
    return df

In [None]:
def update_features(df,
                    answered_correctly_u_sum,
                    answered_correctly_q_sum,
                    timestamp_u_incorrect,
                    explanation_qtrue_sum,
                    task_container_sum,
                    latest_u_theta,
                    latest_q_beta,
                    answered_correctly_u_count,
                    answered_correctly_q_count,
                    tags_sum,
                    answered_correctly_u_sum_field,
                    answered_correctly_difficulty_weighted_sum,
                    answered_correctly_difficulty_weighted_sum_field,
                    max_solved_difficulty,
                    max_solved_difficulty_field,
                    min_wrong_difficulty,
                    min_wrong_difficulty_field,
                    ):
    """Fold the now-known answers of a previous batch into the running state.

    `df` must carry the columns listed in `columns` below, including the
    true `answered_correctly` values. Rows whose `content_type_id` is not 0
    are ignored. Every other argument is a mutable mapping that is updated
    in place; nothing is returned.
    """
    columns = ['user_id',
               'answered_correctly',
               'content_id',
               'content_type_id',
               'timestamp',
               'prior_question_had_explanation',
               'task_container_id',
               'tags',
               'content_field',
               'mean_question_accuracy']
    for (user_id, answer, content_id, content_type_id, timestamp,
         had_explanation, task_container_id, tags, content_field,
         question_accuracy) in df[columns].values:
        if content_type_id != 0:
            continue

        # content_field is 1-based in the metadata; per-field containers are 0-based.
        field = int(content_field) - 1

        # Snapshot both ratings before either one is updated so that the
        # user and the question updates use the same pre-answer values.
        theta = latest_u_theta[user_id]
        beta = latest_q_beta[content_id]

        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_sum[user_id] += answer
        answered_correctly_u_sum_field[user_id][field] += answer
        weighted_answer = answer * (1 - question_accuracy) * 3
        answered_correctly_difficulty_weighted_sum[user_id] += weighted_answer
        answered_correctly_difficulty_weighted_sum_field[user_id][field] += weighted_answer

        if answer == 0:
            # Keep at most one timestamp: drop the stored one (if exactly one
            # is present) and record the current one.
            incorrect_times = timestamp_u_incorrect[user_id]
            if len(incorrect_times) == 1:
                incorrect_times.pop(0)
            incorrect_times.append(timestamp)

            # Tracked as the highest mean accuracy among wrongly answered questions.
            if question_accuracy > min_wrong_difficulty[user_id]:
                min_wrong_difficulty[user_id] = question_accuracy

            if question_accuracy > min_wrong_difficulty_field[user_id][field]:
                min_wrong_difficulty_field[user_id][field] = question_accuracy
        else:
            solved_difficulty = 1 - question_accuracy
            if solved_difficulty > max_solved_difficulty[user_id]:
                max_solved_difficulty[user_id] = solved_difficulty

            if solved_difficulty > max_solved_difficulty_field[user_id][field]:
                max_solved_difficulty_field[user_id][field] = solved_difficulty

        latest_u_theta[user_id] = get_new_theta(answer,
                                                beta, theta,
                                                answered_correctly_u_count[user_id])

        # ------------------------------------------------------------------
        # Other features updates
        task_container_sum[task_container_id] += answer

        for tag in tags:
            tags_sum[tag] += answer

        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_sum[content_id] += answer
        if had_explanation:
            # NOTE(review): this counter is keyed by user_id even though it
            # sits with the question updates - confirm against add_features.
            explanation_qtrue_sum[user_id] += answer
        latest_q_beta[content_id] = get_new_beta(answer,
                                                 beta, theta,
                                                 answered_correctly_q_count[content_id])
        # ------------------------------------------------------------------

    return

In [None]:
def read_and_preprocess(feature_engineering = False, n_split = 3):
    """Load the CV train/valid frames and accumulate the running feature state.

    The train frame is written to `n_split` CSV chunks in the working
    directory and each chunk is fed through `add_train_features`, followed by
    the validation frame, so that all state mappings reflect every known
    interaction before inference starts.

    Parameters
    ----------
    feature_engineering : bool
        When True, only the last 1,000,000 rows of train and valid are kept
        (before lectures are filtered out) to limit memory use.
    n_split : int
        Number of chunks the train frame is processed in. Must be >= 1.

    Returns
    -------
    questions_df : DataFrame
        Question metadata with `tags` parsed into lists of ints.
    prior_question_elapsed_time_mean : float
        Train-set mean used to fill missing `prior_question_elapsed_time`.
    features_dicts : dict
        Name -> state mapping for every accumulator passed to
        `add_train_features`.

    Raises
    ------
    ValueError
        If `n_split` is smaller than 1.
    """
    if n_split < 1:
        raise ValueError('n_split must be a positive integer, got %r' % (n_split,))

    train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
    valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
    question_metadata_file = '../input/question-metadate-new-new/question_metadata_new_new.csv'
    question_data_file = '../input/riiid-test-answer-prediction/questions.csv'

    # Read data
    # NOTE: read_pickle can execute arbitrary code; only use trusted input files.
    columns_needed = ['timestamp',
                      'user_id',
                      'answered_correctly',
                      'content_id',
                      'content_type_id',
                      'prior_question_elapsed_time',
                      'prior_question_had_explanation',
                      'task_container_id']
    train = pd.read_pickle(train_pickle)[columns_needed]
    valid = pd.read_pickle(valid_pickle)[columns_needed]

    # Drop older training data to avoid RAM problems
    if feature_engineering:
        train = train.iloc[-1000000:]
        valid = valid.iloc[-1000000:]

    # Filter by content_type_id to discard lectures
    train = train.loc[train.content_type_id == False].reset_index(drop = True)
    valid = valid.loc[valid.content_type_id == False].reset_index(drop = True)

    print('train size: ')
    print(train.shape)

    print('valid size: ')
    print(valid.shape)

    # Changing dtype to avoid lightgbm error
    train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
    valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')

    # Fill prior question elapsed time with the train mean. Assign the result
    # back instead of calling fillna(inplace=True) on a column selection:
    # the chained in-place form does not update the frame under pandas
    # Copy-on-Write.
    prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().mean()
    train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
    valid['prior_question_elapsed_time'] = valid['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    # Merge with question dataframe. `tags` stays in its raw CSV form here and
    # is parsed into lists only at the end of this function.
    questions_df = pd.read_csv(question_metadata_file)
    questions_df_ = pd.read_csv(question_data_file)
    questions_df_.rename(columns = {'question_id': 'content_id'}, inplace = True)
    questions_df = questions_df.merge(questions_df_[['content_id', 'tags']], on = 'content_id', how = 'left')
    del questions_df_
    gc.collect()

    question_columns = ['content_id', 'tags', 'content_field', 'mean_question_accuracy']
    train = train.merge(questions_df[question_columns], on = 'content_id', how = 'left')
    valid = valid.merge(questions_df[question_columns], on = 'content_id', how = 'left')

    # Running state. The insertion order below IS the positional argument
    # order expected by add_train_features - do not reorder these entries.
    features_dicts = {
        # Client dictionaries
        'answered_correctly_u_count': defaultdict(int),
        'answered_correctly_u_sum': defaultdict(int),
        'elapsed_time_u_sum': defaultdict(int),
        'explanation_u_sum': defaultdict(int),
        'timestamp_u': defaultdict(list),
        'timestamp_u_incorrect': defaultdict(list),
        'latest_u_theta': defaultdict(np.float32),
        # Question dictionaries
        'answered_correctly_q_count': defaultdict(int),
        'answered_correctly_q_sum': defaultdict(int),
        'elapsed_time_q_sum': defaultdict(int),
        'explanation_q_sum': defaultdict(int),
        'explanation_qtrue_sum': defaultdict(int),
        'explanation_qtrue_count': defaultdict(int),
        'latest_q_beta': defaultdict(np.float32),
        # Client Question dictionaries
        'answered_correctly_uq': {},
        'question_avg_sum_u': defaultdict(np.float32),
        # Other features dictionaries
        'task_container_sum': defaultdict(int),
        'task_container_count': defaultdict(int),
        'tags_sum': defaultdict(int),
        'tags_count': defaultdict(int),
        # Client dictionaries (v2 features)
        'answered_correctly_u_count_field': {},
        'answered_correctly_u_sum_field': {},
        'answered_correctly_difficulty_weighted_sum': defaultdict(np.float32),
        'answered_correctly_difficulty_weighted_sum_field': {},
        'max_solved_difficulty': defaultdict(np.float32),
        'max_solved_difficulty_field': {},
        'min_wrong_difficulty': defaultdict(np.float32),
        'min_wrong_difficulty_field': {},
        'session_time': defaultdict(np.float32),
        'since_last_session_time': defaultdict(np.float32),
        'last_session_start_time': defaultdict(np.float32),
        'first_action_time': defaultdict(np.float32),
    }
    state_args = tuple(features_dicts.values())

    # Split the train set into n_split chunks on disk; the last chunk takes
    # the remainder rows. The chunk index is derived from the loop itself so
    # that n_split == 1 works as well.
    chunk_len = len(train) // n_split
    for n in range(n_split):
        start = chunk_len * n
        stop = chunk_len * (n + 1) if n < n_split - 1 else None
        train.iloc[start:stop].to_csv('train_%s.csv' % str(n))

    del train
    gc.collect()

    print('User feature calculation started...')
    print('\n')
    for n in range(n_split):
        train = pd.read_csv('train_%s.csv' % str(n))
        print(train.shape)
        add_train_features(train, *state_args)
        del train
        gc.collect()
        print(n + 1, '/', n_split)
    print('Finish train feature calculation!')

    add_train_features(valid, *state_args)
    print('Finish valid feature calculation!')
    del valid
    gc.collect()

    print('User feature calculation completed...')
    print('\n')

    # Parse the space-separated tag string into a list of ints; a missing
    # value (which stringifies to 'nan') becomes an empty list.
    questions_df['tags'] = questions_df['tags'].\
                        apply(lambda ts: [int(x) for x in str(ts).split() if x != 'nan'])

    return questions_df, prior_question_elapsed_time_mean, features_dicts

In [None]:
# Build the running feature state from the train/valid files. The train frame
# is processed in 25 chunks (n_split = 25).
questions_df, prior_question_elapsed_time_mean, features_dicts = read_and_preprocess(feature_engineering = False, n_split = 25)

# Load the already-trained LightGBM booster used for prediction.
LGBM_model = lgb.Booster(model_file = '../input/lgbm-v921-01/lgbm_model_V921_0.1.lgb')

In [None]:
# Get feature dict
# Bind every state mapping returned by read_and_preprocess to a module-level
# name; the inference loop passes these names to update_features/add_features.
answered_correctly_u_count = features_dicts['answered_correctly_u_count']
answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
explanation_u_sum = features_dicts['explanation_u_sum']
answered_correctly_q_count = features_dicts['answered_correctly_q_count']
answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
explanation_q_sum = features_dicts['explanation_q_sum']
explanation_qtrue_sum = features_dicts['explanation_qtrue_sum'] #
explanation_qtrue_count = features_dicts['explanation_qtrue_count'] #
answered_correctly_uq = features_dicts['answered_correctly_uq']
timestamp_u = features_dicts['timestamp_u']
timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
question_avg_sum_u = features_dicts['question_avg_sum_u']
task_container_sum = features_dicts['task_container_sum']
task_container_count = features_dicts['task_container_count']
latest_u_theta = features_dicts['latest_u_theta']
latest_q_beta = features_dicts['latest_q_beta']
tags_sum = features_dicts['tags_sum']
tags_count = features_dicts['tags_count']

# Per-field, difficulty and session state (marked "v2" at the call sites).
answered_correctly_u_count_field = features_dicts['answered_correctly_u_count_field']
answered_correctly_u_sum_field = features_dicts['answered_correctly_u_sum_field']
answered_correctly_difficulty_weighted_sum = features_dicts['answered_correctly_difficulty_weighted_sum']
answered_correctly_difficulty_weighted_sum_field = features_dicts['answered_correctly_difficulty_weighted_sum_field']
max_solved_difficulty = features_dicts['max_solved_difficulty']
max_solved_difficulty_field = features_dicts['max_solved_difficulty_field']
min_wrong_difficulty = features_dicts['min_wrong_difficulty']
min_wrong_difficulty_field = features_dicts['min_wrong_difficulty_field']
session_time = features_dicts['session_time']
since_last_session_time = features_dicts['since_last_session_time']
last_session_start_time = features_dicts['last_session_start_time']
first_action_time =features_dicts['first_action_time']

# The individual mappings stay alive through the names above; only the
# wrapper dict is dropped here.
del features_dicts
gc.collect()

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

# parameters
# MAX_SEQ is the history window: it is passed to create_model() as max_seq and
# sets the default BatchNorm size (MAX_SEQ - 1) of FFN/FFN0.
MAX_SEQ = 240 # 210
# NOTE(review): ACCEPTED_USER_CONTENT_SIZE and BATCH_SIZE are not referenced
# in the visible part of the notebook - presumably training leftovers.
ACCEPTED_USER_CONTENT_SIZE = 2 # 2
# Embedding width handed to create_model() as embed_dim.
EMBED_SIZE = 256 # 256
BATCH_SIZE = 64+32 # 96
# Default dropout rate of TransformerBlock / Encoder / SAKTModel.
DROPOUT = 0.1 # 0.1

# load data
# n_skill sizes the embedding tables (2 * n_skill + 1 and n_skill + 1 rows).
n_skill = 13523
print(n_skill)
# Stored per-user histories for the SAKT model; TestDataset unpacks each entry
# as (content_id array, answered_correctly array).
# presumably indexed by user_id - verify against the dataset that produced it.
group = joblib.load("../input/new-sakt-dataset/group.pkl.zip")

# model define
class FFN(nn.Module):
    """Feed-forward block: Linear -> ReLU -> BatchNorm1d -> Linear -> Dropout.

    `bn_size` is the number of BatchNorm1d features, i.e. the size of
    dimension 1 of the input; TransformerBlock feeds this block a
    [bs, s_len, embed] tensor, so it has to equal the sequence length.
    """

    def __init__(self, state_size = 200, forward_expansion = 1, bn_size = MAX_SEQ - 1, dropout=0.2):
        super().__init__()
        hidden_size = forward_expansion * state_size
        self.state_size = state_size

        # Attribute names are part of the checkpoint's state_dict keys.
        self.lr1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(bn_size)
        self.lr2 = nn.Linear(hidden_size, state_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.bn(self.relu(self.lr1(x)))
        return self.dropout(self.lr2(hidden))
    
class FFN0(nn.Module):
    """Feed-forward block: Linear -> ReLU -> Linear -> LayerNorm -> Dropout.

    Note: the `bn_size` and `dropout` arguments are accepted but not used;
    the dropout rate is fixed at 0.2 regardless of what the caller passes.
    """

    def __init__(self, state_size = 200, forward_expansion = 1, bn_size = MAX_SEQ - 1, dropout=0.2):
        super().__init__()
        hidden_size = forward_expansion * state_size
        self.state_size = state_size

        # Attribute names are part of the checkpoint's state_dict keys.
        self.lr1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(hidden_size, state_size)
        self.layer_normal = nn.LayerNorm(state_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        projected = self.lr2(self.relu(self.lr1(x)))
        return self.dropout(self.layer_normal(projected))
    
def future_mask(seq_length):
    """Return a bool tensor of shape [seq_length, seq_length] that is True
    strictly above the main diagonal (position i may not attend to j > i)."""
    upper_triangle = np.triu(np.ones((seq_length, seq_length), dtype=bool), k=1)
    return torch.from_numpy(upper_triangle)

class TransformerBlock(nn.Module):
    """Multi-head attention followed by two parallel feed-forward branches.

    The outputs of FFN and FFN0 are summed with the attention output
    (residual) and layer-normalised.
    """

    def __init__(self, embed_dim, heads = 8, dropout = DROPOUT, forward_expansion = 1):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys.
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.ffn0 = FFN0(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)

    def forward(self, value, key, query, att_mask):
        # The three tensors are handed to nn.MultiheadAttention positionally,
        # so the first argument (named `value` here) is the one used as the
        # attention query, and it is also the residual input below.
        attended, att_weight = self.multi_att(value, key, query, attn_mask=att_mask)
        normalised = self.layer_normal(attended + value)
        # [s_len, bs, embed] => [bs, s_len, embed]
        batch_first = self.dropout(normalised).permute(1, 0, 2)
        ffn_out = self.ffn(batch_first)
        ffn0_out = self.ffn0(batch_first)
        merged = self.layer_normal_2(ffn_out + ffn0_out + batch_first)
        return self.dropout(merged).squeeze(-1), att_weight
    
class Encoder(nn.Module):
    """Embeds interactions and target questions and runs the attention stack.

    Interaction ids index a (2 * n_skill + 1)-row table, question ids an
    (n_skill + 1)-row table, and positions a (max_seq - 1)-row table.
    Note: `heads` and `dropout` are not forwarded to the TransformerBlocks,
    which therefore use their own defaults.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, num_layers=1, heads = 8):
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim
        # Attribute names are part of the checkpoint's state_dict keys.
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        blocks = [TransformerBlock(embed_dim, forward_expansion = forward_expansion) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, question_ids):
        device = x.device
        interaction_emb = self.embedding(x)
        positions = torch.arange(interaction_emb.size(1)).unsqueeze(0).to(device)
        interaction_emb = self.dropout(interaction_emb + self.pos_embedding(positions))
        # [bs, s_len, embed] => [s_len, bs, embed]
        hidden = interaction_emb.permute(1, 0, 2)
        question_emb = self.e_embedding(question_ids).permute(1, 0, 2)
        for block in self.layers:
            att_mask = future_mask(question_emb.size(0)).to(device)
            block_out, att_weight = block(question_emb, hidden, hidden, att_mask=att_mask)
            # Blocks return batch-first tensors; go back to sequence-first
            # for the next block.
            hidden = block_out.permute(1, 0, 2)
        # Final output is batch-first: [bs, s_len, embed].
        return hidden.permute(1, 0, 2), att_weight

class SAKTModel(nn.Module):
    """Encoder plus a linear head producing one logit per sequence position.

    Note: `heads` is accepted but not forwarded to the Encoder.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, enc_layers=1, heads = 8):
        super().__init__()
        self.encoder = Encoder(n_skill, max_seq, embed_dim, dropout, forward_expansion, num_layers=enc_layers)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        encoded, att_weight = self.encoder(x, question_ids)
        # Drop the trailing singleton dimension of the linear head.
        logits = self.pred(encoded).squeeze(-1)
        return logits, att_weight
    
class TestDataset(Dataset):
    """Builds SAKT inputs for every row of a test batch.

    Parameters
    ----------
    samples : pandas.Series
        Indexed by user id; each value is a pair
        (content_id sequence, answered_correctly sequence).
    test_df : pandas.DataFrame
        Must contain `user_id` and `content_id` columns.
    n_skill : int
        Offset added to a content id when it was answered correctly.
    max_seq : int
        Length of the history window.

    Each item is a pair `(x, questions)` of int arrays of length
    `max_seq - 1`: `x` holds the left-padded interaction ids and `questions`
    the history shifted by one position with the target question appended.
    """

    def __init__(self, samples, test_df, n_skill, max_seq=100):
        super(TestDataset, self).__init__()
        self.samples, self.user_ids, self.test_df = samples, [x for x in test_df["user_id"].unique()], test_df
        self.n_skill, self.max_seq = n_skill, max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        test_info = self.test_df.iloc[index]

        user_id = test_info['user_id']
        target_id = test_info['content_id']

        # Zero-initialised buffers: unknown users and short histories are
        # left-padded with zeros.
        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            content_id, answered_correctly = self.samples[user_id]

            # Copy the most recent `n_keep` interactions into the tail of the
            # buffers. Guarding n_keep > 0 matters: seq[-0:] addresses the
            # WHOLE buffer, so an empty history would otherwise raise a
            # broadcast error. Always writing into the buffers also keeps the
            # output dtype independent of the stored history's dtype.
            n_keep = min(len(content_id), self.max_seq)
            if n_keep > 0:
                content_id_seq[-n_keep:] = content_id[-n_keep:]
                answered_correctly_seq[-n_keep:] = answered_correctly[-n_keep:]

        # Interaction id = content id, shifted by n_skill when answered correctly.
        x = content_id_seq[1:] + (answered_correctly_seq[1:] == 1) * self.n_skill

        questions = np.append(content_id_seq[2:], [target_id])

        return x, questions

def create_model():
    """Instantiate the SAKT network with the notebook-level hyper-parameters
    (n_skill, MAX_SEQ, EMBED_SIZE) and a single encoder layer."""
    model_kwargs = dict(
        max_seq=MAX_SEQ,
        embed_dim=EMBED_SIZE,
        forward_expansion=1,
        enc_layers=1,
        heads=4,
        dropout=0.1,
    )
    return SAKTModel(n_skill, **model_kwargs)

# load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = '../input/new-sakt-dataset/sakt_model.pt'
SAKT_model = create_model()
# Deserialize the checkpoint onto the CPU first so loading also works on a
# machine without a GPU; the whole model is moved to `device` afterwards.
SAKT_model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
SAKT_model.to(device)
# The model is only used for prediction here: switch to evaluation mode so
# that dropout is disabled and BatchNorm uses its stored running statistics
# instead of per-batch statistics.
SAKT_model.eval()

In [None]:
# Get api iterator and predictor
env = riiideducation.make_env()
# The inference loop unpacks each item of iter_test as
# (test_df, sample_prediction_df).
iter_test = env.iter_test()
# Called once per batch with the `row_id` and prediction columns.
set_predict = env.predict

In [None]:
%%time
# Blend weight of the SAKT prediction; only used by the (currently disabled)
# SAKT ensemble code below.
w = 0.225
previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The first row of each batch carries the true answers of the previous
        # batch as a string.
        # NOTE(review): eval() executes arbitrary expressions; consider
        # ast.literal_eval if this string can ever come from an untrusted source.
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        update_features(previous_test_df, 
                        answered_correctly_u_sum, 
                        answered_correctly_q_sum, 
                        timestamp_u_incorrect,
                        explanation_qtrue_sum,
                        task_container_sum,
                        latest_u_theta,
                        latest_q_beta,
                        answered_correctly_u_count,
                        answered_correctly_q_count,
                        tags_sum,
                        answered_correctly_u_sum_field,
                        answered_correctly_difficulty_weighted_sum,
                        answered_correctly_difficulty_weighted_sum_field,
                        max_solved_difficulty,
                        max_solved_difficulty_field,
                        min_wrong_difficulty,
                        min_wrong_difficulty_field)
        
        # HDKIM SAKT State Update
#         prev_group = previous_test_df[previous_test_df['content_type_id'] == 0][['user_id', 'content_id', 'answered_correctly']].\
#         groupby('user_id').apply(lambda r: (
#             r['content_id'].values,
#             r['answered_correctly'].values))
        
#         for prev_user_id in prev_group.index:
#             if prev_user_id in group.index:
#                 group[prev_user_id] = (
#                     np.append(group[prev_user_id][0], prev_group[prev_user_id][0])[-MAX_SEQ:], 
#                     np.append(group[prev_user_id][1], prev_group[prev_user_id][1])[-MAX_SEQ:]
#                 )
 
#             else:
#                 group[prev_user_id] = (
#                     prev_group[prev_user_id][0], 
#                     prev_group[prev_user_id][1]
#                 )
        # HDKIMHDKIM
        
    test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.\
                                                        fillna(False).astype('int8')
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # frame under pandas Copy-on-Write.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].\
                                                        fillna(prior_question_elapsed_time_mean)
    test_df = test_df.merge(questions_df, on = 'content_id', how = 'left')
    # Keep the full batch (lectures included) so that next iteration's answer
    # list lines up with it row by row.
    previous_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    # Placeholder target; the real answers are not known yet.
    test_df[TARGET] = 1/4
    test_df = add_features(test_df, 
                            answered_correctly_u_count, # v1 & v2 
                            answered_correctly_u_sum, # v1 & v2
                            elapsed_time_u_sum, # v1
                            explanation_u_sum, # v1
                            timestamp_u, # v1
                            timestamp_u_incorrect, # v1
                            latest_u_theta, # v1 & v2
                            answered_correctly_q_count, # v1 & v2 
                            answered_correctly_q_sum,  # v1
                            elapsed_time_q_sum, # v1
                            explanation_q_sum, # v1
                            explanation_qtrue_sum, # v1
                            explanation_qtrue_count, # v1
                            latest_q_beta,  # v1 & v2
                            answered_correctly_uq, # v1 & v2
                            question_avg_sum_u, # v1
                            task_container_sum, # v1 & v2
                            task_container_count, # v1 & v2
                            tags_sum, # v1
                            tags_count, # v1
                            answered_correctly_u_count_field, # v2
                            answered_correctly_u_sum_field, # v2
                            answered_correctly_difficulty_weighted_sum, # v2
                            answered_correctly_difficulty_weighted_sum_field, # v2
                            max_solved_difficulty, # v2
                            max_solved_difficulty_field, # v2
                            min_wrong_difficulty, # v2
                            min_wrong_difficulty_field, # v2
                            session_time, # v2
                            since_last_session_time, # v2
                            last_session_start_time, # v2
                            first_action_time, # v2
                            update = False)
    test_df[TARGET] = LGBM_model.predict(test_df[FEATURES])
    
    # HDKIM SAKT
#     test_dataset = TestDataset(group, test_df, n_skill, max_seq=MAX_SEQ)
#     test_dataloader = DataLoader(test_dataset, batch_size=len(test_df), shuffle=False)
    
#     SAKT_outs = []

#     for item in test_dataloader:
#         x = item[0].to(device).long()
#         target_id = item[1].to(device).long()

#         with torch.no_grad():
#             output, att_weight = SAKT_model(x, target_id)
 
#         output = torch.sigmoid(output)
#         output = output[:, -1]
#         SAKT_outs.extend(output.view(-1).data.cpu().numpy())
    
#     test_df[TARGET] = test_df[TARGET] * (1 - w) + np.array(SAKT_outs) * w
    # HDKIMHDKIM

    set_predict(test_df[['row_id', TARGET]])

print('Job Done')