In [None]:
# Install the pickle5 package from a local directory inside the attached input dataset
# (a filesystem path is given, so pip installs from that directory rather than by package name).
!pip install ../input/lgbm-inference-db-full-data/pickle5-0.0.11/

In [None]:
import ast
import gc
from collections import defaultdict

import lightgbm as lgb
import numpy as np
import pandas as pd
import pickle5 as pickle
from numba import jit
from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm

In [None]:
from contextlib import contextmanager
import time


@contextmanager
def timer(name):
    """
    Time a block of code and print how long it took, in minutes.

    Usage: ``with timer("step name"): ...``

    The elapsed time is reported even when the timed block raises, so a failing
    step still shows how long it ran before the exception propagates.

    :param name: label printed between square brackets in the report line.
    """
    # perf_counter is monotonic, so the measurement is not affected by wall-clock adjustments.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        elapsed_minutes = round((time.perf_counter() - t0) / 60, 2)
        print('\n[{}] done in {} Minutes\n'.format(name, elapsed_minutes))

In [None]:
# Locations of the pre-computed inputs inside the attached dataset directory.
train_pickle = '../input/lgbm-inference-db-full-data/train_df.pickle'
question_file = '../input/lgbm-inference-db-full-data/question_features.csv'
# feature_file = '../input/lgbm-inference-db-full-data/pre_features.csv'
# Number of milliseconds in one day (8.64e7). In the visible notebook it is only
# referenced from commented-out code.
ms_in_a_day = 8.64 * 10 ** 7

# Value used in the inference loop to fill missing `prior_question_elapsed_time`.
# NOTE(review): presumably the mean observed on the training data — confirm.
prior_question_elapsed_time_mean = 25439.41

## feature engineering

In [None]:
# Lower bound of the predicted probability of a correct answer; passed to
# get_new_theta / probability_of_good_answer when user abilities are updated.
left_asymptote = 0.25


@jit(nopython=True)
def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
    """
    Return the updated user ability after one answered question.

    The ability ``theta`` is moved by the difference between the observed outcome
    (``is_good_answer``) and the predicted probability of a correct answer, scaled
    by a step size that depends on ``nb_previous_answers``.
    """
    expected = probability_of_good_answer(theta, beta, left_asymptote)
    step = learning_rate_theta(nb_previous_answers)
    return theta + step * (is_good_answer - expected)


@jit(nopython=True)
def learning_rate_theta(nb_answers):
    """
    Step size for the ability update.

    Starts at 0.3 for ``nb_answers == 0``, shrinks as ``nb_answers`` grows and
    never drops below 0.04.
    """
    decayed_rate = 0.3 / (1 + 0.01 * nb_answers)
    return max(decayed_rate, 0.04)


@jit(nopython=True)
def probability_of_good_answer(theta, beta, left_asymptote):
    """
    Predicted probability of a correct answer.

    A sigmoid of ``theta - beta`` rescaled into the interval
    ``[left_asymptote, 1]``.
    """
    headroom = 1 - left_asymptote
    return left_asymptote + headroom * sigmoid(theta - beta)


@jit(nopython=True)
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
# funcs for user stats with loop
# def count_attempts(df):
#     cols = ['user_id', 'content_id', 'content_type_id']
#     for cnt, row in enumerate(tqdm(df[cols].values)):
#         if row[2] == 0:
#             attempt_dict[row[0]][row[1]] += 1
#     return df

In [None]:
def calc_user_feats_test(df, bundle_count, temp_values):
    """
    Compute the per-row user features for one test batch and attach them to ``df``.

    The rows of ``df`` are walked in order. For each row the module-level state
    containers are read to fill per-row feature arrays, which are written onto
    ``df`` as new columns once the loop is finished.

    Module-level state that is only read here:
        answered_correctly_sum_user_dict, question_count_dict, last_incorrect_time,
        beta_dict, theta_dict, difficulty_dict, bundles.
    Module-level state that is updated here:
        user_last_timestamp, user_last_timestamp_traceback, prior_question_lag_time,
        last_lecture_time, attempt_dict, user_time_dict, prior_bundle_count.

    :param df: batch to featurise; must contain the columns listed in ``feature_cols``.
        It is modified in place (new columns are added) and also returned.
    :param bundle_count: running counter carried over from the previous call; it is
        incremented or reset to 1 on every row whose content_type_id is not 1.
    :param temp_values: indexable container with slots 0-9 holding a snapshot of
        user statistics; it is overwritten in place and carried over between calls.
    :return: tuple ``(df, bundle_count, temp_values)``.

    NOTE(review): ``bundle_count`` and ``temp_values`` are single values shared by
    every row of the batch, not stored per user — confirm this is intended when a
    batch interleaves several users.
    """
    attempt_no_array = np.zeros(len(df), dtype=np.int32)
    last_lecture_time_array = np.zeros(len(df), dtype=np.float64)
    last_incorrect_time_array = np.zeros(len(df), dtype=np.float64)
    acsu = np.zeros(len(df), dtype=np.int32)  # answered-correctly sum per user
    acsu_part = np.zeros(len(df), dtype=np.int32)  # same, restricted to the row's part
    cu = np.zeros(len(df), dtype=np.int32)  # question count per user
    cu_part = np.zeros(len(df), dtype=np.int32)  # same, restricted to the row's part
    tu_part = np.zeros(len(df), dtype=np.int32)  # accumulated elapsed time per user and part
    lag_time_array = np.zeros(len(df), dtype=np.int64)  # diff between timestamps
    wait_time_array = np.zeros(len(df), dtype=np.float64)  # the time a student waits before starts the next question
    theta_array = np.zeros(len(df), dtype=np.float32)
    beta_array = np.zeros(len(df), dtype=np.float32)
    difficulty_correct_array = np.zeros(len(df), dtype=np.float64)
    difficulty_incorrect_array = np.zeros(len(df), dtype=np.float64)

    # Positional layout of `row` inside the loop below:
    #   row[0]=user_id, row[1]=prior_question_elapsed_time, row[2]=timestamp,
    #   row[3]=content_id, row[4]=content_type_id, row[5]=part, row[6]=bundle_id,
    #   row[7]=mean_content_accuracy_sm (not read inside the loop)
    feature_cols = ['user_id', 'prior_question_elapsed_time', 'timestamp',
                    'content_id', 'content_type_id', 'part', 'bundle_id',
                    'mean_content_accuracy_sm']

    for cnt, row in enumerate(df[feature_cols].values):
        # --- lag time / wait time -------------------------------------------------
        if row[2] == 0:
            # timestamp 0: there is nothing earlier to measure against
            lag_time_array[cnt] = 0
            prior_question_lag_time[row[0]] = np.nan
            wait_time_array[cnt] = np.nan
        elif row[2] == user_last_timestamp[row[0]]:  # if question is in the same bundle as the previous one
            # reuse the wait time saved for the first row of the bundle and measure
            # the lag against the timestamp seen before this bundle started
            wait_time_array[cnt] = temp_values[5]
            lag_time_array[cnt] = row[2] - user_last_timestamp_traceback[row[0]]
        else:
            lag_time_array[cnt] = row[2] - user_last_timestamp[row[0]]
            # no wait time when the lag is zero, when the lag equals the timestamp
            # itself, or when prior_bundle_count has no (bundle_id, count) pair for this user
            if (lag_time_array[cnt] == 0) | (lag_time_array[cnt] == row[2]) | (len(prior_bundle_count[row[0]]) != 2):
                wait_time_array[cnt] = np.nan
            else:
                # previous lag minus (rows counted in the previous bundle * prior elapsed time)
                wait_time_array[cnt] = prior_question_lag_time[row[0]] - prior_bundle_count[row[0]][1] * row[1]

            user_last_timestamp_traceback[row[0]] = user_last_timestamp[
                row[0]]  # assign last time stamp to track back dict
        user_last_timestamp[row[0]] = row[2]  # assign the latest timestamp to user

        # count attempts for the same question, lecture attendance (all) and lecture attendance (solving problem)
        if row[4] == 1:
            # NOTE(review): row[3] is content_id, yet the stored value is later subtracted
            # from row[2] (timestamp) to build `last_lecture_time`. This looks like it was
            # meant to be row[2]; confirm against the training pipeline before changing,
            # since the models were trained offline.
            last_lecture_time[row[0]] = row[3]
        else:
            # Decide whether this row continues the bundle of the previous row
            # (bundle_flg) and whether a fresh snapshot must be taken (save_temp_value_flg).
            if row[6] in bundles:
                if len(prior_bundle_count[row[0]]) == 2:
                    if row[6] == prior_bundle_count[row[0]][0]:
                        bundle_count += 1
                        bundle_flg = True
                        save_temp_value_flg = False
                    else:
                        bundle_count = 1
                        bundle_flg = False
                        save_temp_value_flg = True
                else:
                    bundle_count = 1
                    bundle_flg = False
                    save_temp_value_flg = True
            else:
                bundle_count = 1
                bundle_flg = False
                save_temp_value_flg = False

            prior_question_lag_time[row[0]] = lag_time_array[cnt]
            if save_temp_value_flg:
                # snapshot of the user's statistics at the first row of a bundle;
                # the following rows of the same bundle read these values back
                temp_values[0] = answered_correctly_sum_user_dict['total'][row[0]]
                temp_values[1] = answered_correctly_sum_user_dict[int(row[5])][row[0]]
                temp_values[2] = question_count_dict['total'][row[0]]
                temp_values[3] = question_count_dict[int(row[5])][row[0]]
                # temp_values[4] = sum(answer_list_20[row[0]])
                # temp_values[5] = len(answer_list_20[row[0]])
                temp_values[4] = last_incorrect_time[row[0]]   # 6
                temp_values[5] = wait_time_array[cnt]    # 7
                temp_values[6] = beta_dict[row[3]]     #  8
                temp_values[7] = theta_dict[row[0]]    # 9
                temp_values[8] = difficulty_dict[row[0]]['correct']
                temp_values[9] = difficulty_dict[row[0]]['incorrect']

            if bundle_flg:  # assign fixed temp values
                acsu[cnt] = temp_values[0]
                cu[cnt] = temp_values[2]
                difficulty_correct_array[cnt] = temp_values[8]
                difficulty_incorrect_array[cnt] = temp_values[9]

                # part feature
                acsu_part[cnt] = temp_values[1]
                cu_part[cnt] = temp_values[3]
                
                theta_array[cnt] = temp_values[7]
                beta_array[cnt] = temp_values[6]

                if row[2] == 0:
                    last_incorrect_time_array[cnt] = np.nan
                else:
                    last_incorrect_time_array[cnt] = row[2] - temp_values[4]

            else:
                # not a continuation of a bundle: read the live per-user state
                acsu[cnt] = answered_correctly_sum_user_dict['total'][row[0]]
                cu[cnt] = question_count_dict['total'][row[0]]
                difficulty_correct_array[cnt] = difficulty_dict[row[0]]['correct']
                difficulty_incorrect_array[cnt] = difficulty_dict[row[0]]['incorrect']

                # part feature
                acsu_part[cnt] = answered_correctly_sum_user_dict[int(row[5])][row[0]]
                cu_part[cnt] = question_count_dict[int(row[5])][row[0]]

                if row[2] == 0:
                    last_incorrect_time_array[cnt] = np.nan
                else:
                    last_incorrect_time_array[cnt] = row[2] - last_incorrect_time[row[0]]

                # keep track of the last 20 questions
#                 if len(answer_list_20[row[0]]) == 0:
#                     last_20_accuracy[cnt] = np.nan
#                 else:
#                     last_20_accuracy[cnt] = sum(answer_list_20[row[0]]) / len(answer_list_20[row[0]])
                
                # ELO Rating
                beta_array[cnt] = beta_dict[row[3]]
                theta_array[cnt] = theta_dict[row[0]]
                
            tu_part[cnt] = user_time_dict[int(row[5])][row[0]]

            # count attempts for the same question, lecture attendance, and record last lecture time
            # (the attempt counter is keyed by bundle_id and only bumped on the first row of a bundle)
            if bundle_count == 1:
                attempt_dict[row[0]][row[6]] += 1
            attempt_no_array[cnt] = attempt_dict[row[0]][row[6]]
            
            # a stored value of 0 is treated as "no lecture recorded for this user"
            if last_lecture_time[row[0]] == 0:
                last_lecture_time_array[cnt] = np.nan
                # learn_from_lecture_array[cnt] = 0
            else:
                last_lecture_time_array[cnt] = row[2] - last_lecture_time[row[0]]
#                 if last_lecture_time_array[cnt] < ms_in_a_day:
#                     learn_from_lecture_array[cnt] = (last_lecture_info[row[0]][1] in row[8])
#                 else:
#                     learn_from_lecture_array[cnt] = 0

            # keep track of the total time a student has spent on each part and in total
            if np.isnan(row[1]):
                user_time_dict[int(row[5])][row[0]] += 0
            else:
                user_time_dict[int(row[5])][row[0]] += row[1]

            # remember (bundle_id, rows seen so far in that bundle) for the next row of this user
            prior_bundle_count[row[0]] = (row[6], bundle_count)

    # attach the per-row arrays to the batch
    df['attempt_no'] = attempt_no_array
    df['last_lecture_time'] = last_lecture_time_array
    df['last_incorrect_time'] = last_incorrect_time_array
    df['lag_time'] = lag_time_array
    df['prior_question_wait_time'] = wait_time_array
    df['theta'] = theta_array
    df['beta'] = beta_array

    user_feats_df = pd.DataFrame({'answered_correctly_sum_user': acsu, 'answered_count': cu,
                                  'answered_correctly_sum_user_part': acsu_part, 'answered_count_part': cu_part,
                                  'total_time_spent_user_part': tu_part,
                                  'difficulty_correct_count': difficulty_correct_array,
                                  'difficulty_incorrect_count': difficulty_incorrect_array
                                  })

    # ratios derived from the raw counters
    user_feats_df['mean_user_accuracy'] = user_feats_df['answered_correctly_sum_user'] / user_feats_df['answered_count']
    user_feats_df['mean_user_accuracy_part'] = user_feats_df['answered_correctly_sum_user_part'] / user_feats_df[
        'answered_count_part']
    user_feats_df['mean_user_spent_time_part'] = user_feats_df['total_time_spent_user_part'] / user_feats_df[
        'answered_count_part']
    # no questions answered in this part yet: mark the mean as missing
    user_feats_df.loc[user_feats_df['answered_count_part'] == 0, 'mean_user_spent_time_part'] = np.nan
    user_feats_df['difficulty_correct'] = user_feats_df['difficulty_correct_count'] / user_feats_df['answered_correctly_sum_user']
    user_feats_df['difficulty_incorrect'] = user_feats_df['difficulty_incorrect_count'] / \
                                            (user_feats_df['answered_count'] - user_feats_df['answered_correctly_sum_user'])
    user_feats_df['difficulty_diff'] = user_feats_df['difficulty_correct'] - user_feats_df['difficulty_incorrect']
    # only the derived difficulty features are kept; the raw sums and `difficulty_correct` are dropped
    user_feats_df.drop(columns=['difficulty_correct_count', 'difficulty_incorrect_count', 'difficulty_correct'], inplace=True)
    
    # copy by position (.values) so the index of `df` does not have to match
    feats_cols = user_feats_df.columns
    for col in feats_cols:
        df[col] = user_feats_df[col].values

    # harmonic mean of the user accuracy and the content accuracy
    df['hmean_user_content_accuracy'] = 2 * (df['mean_user_accuracy'] * df['mean_content_accuracy_sm']) / \
                                        (df['mean_user_accuracy'] + df['mean_content_accuracy_sm'])
    # floor the anomalies (?) at zero; could try other values
    df.loc[df['prior_question_wait_time'] < 0, 'prior_question_wait_time'] = 0

    return df, bundle_count, temp_values


In [None]:
def update_user_feats(df):
    """
    Fold the revealed answers of a previous batch into the module-level user state.

    Only rows with ``content_type_id == 0`` are processed; every other row is
    skipped. For each processed row the function updates:

    * ``answered_correctly_sum_user_dict`` and ``question_count_dict``
      (both under the ``'total'`` key and under the row's integer ``part``),
    * ``theta_dict`` for the user, via ``get_new_theta`` using the question's
      ``beta_dict`` entry, the module-level ``left_asymptote`` and the row's
      ``answered_count`` as the number of previous answers,
    * ``last_incorrect_time`` (when the answer is 0) and the
      ``'correct'`` / ``'incorrect'`` sums in ``difficulty_dict``.

    :param df: batch containing the columns listed in ``state_cols`` below.
    :return: None; all effects are on the module-level dictionaries.
    """
    state_cols = ['user_id', 'answered_correctly', 'content_type_id', 'timestamp',
                  'part', 'content_id', 'answered_count', 'mean_content_accuracy_sm']

    for (user_id, answered_correctly, content_type_id, timestamp,
         part, content_id, answered_count, content_accuracy) in df[state_cols].values:
        if content_type_id != 0:
            continue

        # cumulatively add values for total & part
        answered_correctly_sum_user_dict['total'][user_id] += answered_correctly
        part_key = int(part)
        answered_correctly_sum_user_dict[part_key][user_id] += answered_correctly
        question_count_dict['total'][user_id] += 1
        question_count_dict[part_key][user_id] += 1

        # the question difficulty (beta) is only read here, it is not updated
        theta = theta_dict[user_id]
        beta = beta_dict[content_id]
        theta_dict[user_id] = get_new_theta(answered_correctly, beta, left_asymptote, theta, answered_count)

        if answered_correctly == 0:
            last_incorrect_time[user_id] = timestamp
            difficulty_dict[user_id]['incorrect'] += content_accuracy
        else:
            difficulty_dict[user_id]['correct'] += content_accuracy

In [None]:
# Load the pickled training frame; a later cell uses it to build the per-user
# attempt counts and then deletes it.
# NOTE(review): pickle.load executes arbitrary code from the file — only use with trusted data.
with open(train_pickle, 'rb') as file:
    df = pickle.load(file)

In [None]:
def multi_level_dict():
    """Return a new ``defaultdict`` whose missing keys start at integer zero."""
    inner_counts = defaultdict(int)
    return inner_counts


# Two-level counter: attempt_dict[user_id][bundle_id] -> number of attempts.
attempt_dict = defaultdict(multi_level_dict)

In [None]:
def multi_level_float_dict():
    """
    Return a new ``defaultdict`` whose missing keys start at ``0.0``.

    NOTE(review): not called anywhere in the visible notebook; presumably it has to
    exist under this name so that the pickled dictionaries can be loaded — confirm.
    """
    inner_floats = defaultdict(float)
    return inner_floats

In [None]:
# with timer("Counting Attempts"):
#     train = count_attempts(train)
#     del train
#     gc.collect()
#     valid = count_attempts(valid)
#     del valid
#     gc.collect()

In [None]:
# Seed attempt_dict with the number of training rows per (user, bundle) pair.
with timer("counting"):
    keys = np.sort(df['user_id'].unique())
    total = len(keys)

    # One entry per (user_id, bundle_id) pair. int32 is used instead of uint8 so
    # that a pair with more than 255 rows is not silently wrapped around by the cast.
    bundle_row_counts = (
        df.groupby(['user_id', 'bundle_id'])['bundle_id']
        .count()
        .astype(np.int32)
    )

    # Bundle ids and counts are taken from the same grouped Series, so they are
    # aligned by construction rather than by the sort order of two separate group-bys.
    per_user_counts = bundle_row_counts.groupby(level='user_id')
    for user_id, user_counts in tqdm(per_user_counts, total=per_user_counts.ngroups):
        attempt_dict[user_id] = defaultdict(
            int, zip(user_counts.index.get_level_values('bundle_id'), user_counts.values))

# free the training frame and the intermediates before the state pickles are loaded
del bundle_row_counts, per_user_counts, df
gc.collect()

In [None]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the single object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the pre-computed lookup table and every piece of per-user / per-question
# state that the feature functions read from module scope.
with timer("Loading Data..."):
    # answered correctly average for each content
    # content_df = pd.read_csv('../input/lgbm-inference-db-full-data/content_df.csv', index_col = 0)
    part_df = pd.read_csv('../input/lgbm-inference-db-full-data/part_df.csv', index_col = 0)

    answered_correctly_sum_user_dict = _read_pickle(
        '../input/lgbm-inference-db-full-data/answered_correctly_sum_user_dict.pickle')
    question_count_dict = _read_pickle(
        '../input/lgbm-inference-db-full-data/question_count_dict.pickle')
    user_time_dict = _read_pickle(
        '../input/lgbm-inference-db-full-data/user_time_dict.pickle')
    user_last_timestamp = _read_pickle(
        '../input/lgbm-inference-db-full-data/user_last_timestamp.pickle')
    user_last_timestamp_traceback = _read_pickle(
        '../input/lgbm-inference-db-full-data/user_last_timestamp_traceback.pickle')

#     with open('../input/lgbm-inference-db-full-data/answer_list_20.pickle', 'rb') as file:
#         answer_list_20 = pickle.load(file)

    last_lecture_time = _read_pickle(
        '../input/lgbm-inference-db-full-data/last_lecture_time.pickle')
    last_incorrect_time = _read_pickle(
        '../input/lgbm-inference-db-full-data/last_incorrect_time.pickle')
    prior_question_lag_time = _read_pickle(
        '../input/lgbm-inference-db-full-data/prior_question_lag_time.pickle')
    prior_bundle_count = _read_pickle(
        '../input/lgbm-inference-db-full-data/prior_bundle_count.pickle')
    theta_dict = _read_pickle(
        '../input/lgbm-inference-db-full-data/theta_dict.pickle')
    beta_dict = _read_pickle(
        '../input/lgbm-inference-db-full-data/beta_dict.pickle')

    # bundle ids are stored as a NumPy array rather than a pickle
    with open('../input/lgbm-inference-db-full-data/bundles.npy', 'rb') as file:
        bundles = np.load(file)

    difficulty_dict = _read_pickle(
        '../input/lgbm-inference-db-full-data/difficulty_dict.pickle')
        

In [None]:
# Per-question feature table, indexed by question_id; the inference loop looks rows up
# by content_id via `questions_df.reindex(...)`.
questions_df = pd.read_csv(question_file, index_col='question_id')
# questions_df['tags'].fillna('-1', inplace=True)
# questions_df['tags'] = questions_df['tags'].apply(lambda x: np.array(x.split()).astype(int))

In [None]:
# feature_df = pd.read_csv(feature_file, index_col='content_id')
# questions_df = questions_df.merge(feature_df, left_index=True, right_index=True)

## modeling

In [None]:
# Name of the label column; also used for the prediction column that is submitted.
TARGET = 'answered_correctly'
# Columns passed to model_1.predict in the inference loop.
# NOTE(review): presumably the selection and order must match what the model was
# trained on — confirm before editing.
FEATS_1 = ['mean_user_accuracy',
         # 'answered_correctly_sum_user',
         'answered_count',
         'mean_content_accuracy_sm',
         'prior_question_elapsed_time',
         # 'hmean_user_content_accuracy',
         'last_incorrect_time', 'prior_question_wait_time',
         # 'prior_question_had_explanation',
         'content_freq_encoding',
         'lag_time',
         'attempt_no', 'last_lecture_time',
         'mean_user_spent_time_part',
         'answered_correctly_sum_user_part',
         'mean_user_accuracy_part',
         'part', 'theta', 'beta',
         'question_avg_explanation_sm',
         'question_avg_elapsed_time_sm',
         'tags_lsi',
         'difficulty_incorrect', 'difficulty_diff'
         ]

# Columns cast to the pandas 'category' dtype before prediction.
categorical_features = ['part', 'tags_lsi']

In [None]:
# Columns passed to model_2.predict in the inference loop. Compared with FEATS_1 this
# list adds 'answered_correctly_sum_user' and 'hmean_user_content_accuracy' and leaves
# out 'difficulty_diff'.
# NOTE(review): presumably the selection and order must match what the model was
# trained on — confirm before editing.
FEATS_2 = ['mean_user_accuracy',
         'answered_correctly_sum_user',
         'answered_count',
         'mean_content_accuracy_sm',
         'prior_question_elapsed_time',
         'hmean_user_content_accuracy',
         'last_incorrect_time', 'prior_question_wait_time',
         # 'prior_question_had_explanation',
         'content_freq_encoding',
         'lag_time',
         'attempt_no', 'last_lecture_time',
         'mean_user_spent_time_part',
         'answered_correctly_sum_user_part',
         'mean_user_accuracy_part',
         'part', 'theta', 'beta',
         'question_avg_explanation_sm',
         'question_avg_elapsed_time_sm',
         'tags_lsi',
         'difficulty_incorrect',
         # 'difficulty_diff'
         ]

In [None]:
# First LightGBM model of the blend, loaded from its saved text file; it is fed FEATS_1.
model_1 = lgb.Booster(model_file='../input/lgbm-inference-db-full-data/lightgbm_v11.5.txt')

In [None]:
# Second LightGBM model of the blend, loaded from its saved text file; it is fed FEATS_2.
model_2 = lgb.Booster(model_file='../input/lgbm-inference-db-full-data/lightgbm_v11.6.txt')

## inference

In [None]:
import riiideducation
# Create the competition environment and keep handles to its test iterator and to
# the function used to submit predictions for each batch.
env = riiideducation.make_env()
iter_test = env.iter_test()
set_predict = env.predict

In [None]:
# use fast merging method for faster iterations of test
previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    test_df = pd.concat([test_df.reset_index(drop=True), 
                         questions_df.reindex(test_df['content_id'].values).reset_index(drop=True)], axis=1)
    test_df = pd.concat([test_df.reset_index(drop=True), 
                         part_df.reindex(test_df['part'].values).reset_index(drop=True)], axis=1)
    
    
    if previous_test_df is not None:
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        update_user_feats(previous_test_df)
    else:
        bundle_count = 1
        temp_values = np.empty(10) * np.nan
    # previous_test_df = test_df.copy()
    
    test_df, bundle_count, temp_values = calc_user_feats_test(test_df, bundle_count, temp_values)
    previous_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
    test_df['part'] = test_df.part.fillna(False).astype('int8')
    test_df['prior_question_elapsed_time'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
    
    for col in categorical_features:
        test_df[col] = test_df[col].astype('category')
    
    test_df[TARGET] =  model_1.predict(test_df[FEATS_1]) * 0.5 + model_2.predict(test_df[FEATS_2]) * 0.5
    # test_df[TARGET] =  model_1.predict(test_df[FEATS_1])
    set_predict(test_df[['row_id', TARGET]])