## general

This is our final inference pipeline. Unfortunately, we did not have time to blend anything, but this LightGBM (LGB) model achieves .7936 CV / .794 Private LB on its own.

The full training code can be found here:
https://github.com/nicohrubec/riid_solution

**Thanks to @gmilosev for the cool collaboration. This is a joint effort.**

## features

The final model uses only 35 features. Some of the most important were:
- mean answered_correctly for user
- rolling mean answered_correctly for user
- mean answered_correctly for user on question
- mean answered_correctly for user on part
- last time the user has seen the current question
- last time the user was seen
- last n times the user was seen
- user count
- user count for question
- bin questions to difficulty level and compute mean user answered_correctly per difficulty bin


## memory management
Not every user in train appears in the test set, so it does not make sense to precompute the feature dictionaries for all users seen in train, especially since we only have 16 GB of RAM. We solve this by computing the dictionaries on the fly as follows:
1. init empty feature dictionaries before inference
2. check for each user we encounter if the user can already be found in the feature dictionaries
3. if not: check if the user is in train
4. if we don't have the user in the dicts and the user can be found in train: query the train data and update the feature dicts with the train data for this user.

In the beginning I was struggling a lot with kernel memory restrictions when adding new features until I found this strategy. After we switched to this strategy we never had any problems.

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import lightgbm as lgb
import riiideducation
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import gc

import warnings
warnings.filterwarnings('ignore')

In [None]:
# name of the label column
TARGET = 'answered_correctly'

# the 35 columns handed to the model, in the order they are passed to predict
feats = [
    # raw / meta columns
    'content_id',
    'task_container_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'part',
    'content_id_target_mean',
    # overall user and user-question statistics
    'user_count',
    'user_correct_mean',
    'user_question_count',
    'user_question_correct_mean',
    'last_time_user',
    'last_time_question_user',
    'last_time_user_inter',
    'user_last_n_correct',
    # sorted answer distribution of the question
    'answer1',
    'answer2',
    'answer3',
    'answer4',
    # time since the user's earlier interactions
    'user_last_n_time',
    'user_last_n_time2',
    'user_last_n_time3',
    # per-part user statistics
    'user_part_count',
    'user_part_correct_mean',
    'user_part1_mean',
    'user_part2_mean',
    'user_part3_mean',
    'user_part4_mean',
    'user_part5_mean',
    'user_part6_mean',
    'user_part7_mean',
    # task container deltas
    'task_container_eq1',
    'task_container_eq2',
    # per-difficulty-bin user statistics
    'user_hardness_count',
    'user_hardness_mean',
    'user_hardness_inter',
]

In [None]:
# pickled model that is loaded for inference
model_path = '../input/lgb-baseline-riid/lgb_0.7936991.dat'
# question and lecture metadata tables (read by prepare_questions / prepare_lectures)
questions_file = '../input/riiid-test-answer-prediction/questions.csv'
lectures_file = '../input/riiid-test-answer-prediction/lectures.csv'
# NOTE(review): train_file and test_file are not read anywhere in the visible code;
# the training data is loaded from a pickle instead — confirm they are still needed
train_file = '../input/riiid-test-answer-prediction/train.csv'
test_file = '../input/riiid-test-answer-prediction/example_test.csv'

In [None]:
def get_and_merge_feat(trn, target, feat):
    """Mean-encode the column ``feat`` with the column ``target``.

    Returns a tuple ``(trn, feat_dict)``: ``trn`` is the input frame left-merged
    with a new ``'<feat>_target_mean'`` column, and ``feat_dict`` maps each
    distinct value of ``feat`` to its float32 target mean so that other frames
    can be encoded later.
    """
    feat_name = f'{feat}_target_mean'

    # per-category mean of the target, as a one-column frame indexed by ``feat``
    mean = trn[[feat, target]].groupby([feat]).agg(['mean'])
    mean.columns = [feat_name]

    # attach the encoding to every row of the training frame
    trn = trn.merge(mean, on=feat, how='left')

    # lookup table {category: mean} used to encode frames seen later
    feat_dict = mean.astype('float32').to_dict()[feat_name]

    return trn, feat_dict

In [None]:
def get_answer_feats(trn):
    """Return the sorted answer-share distribution of every question.

    For each ``content_id`` the share of every ``user_answer`` option is
    computed and the shares are sorted ascending within the row, so
    ``answer1`` is the least chosen and ``answer4`` the most chosen option
    (missing options are NaN and sort last).

    The result is a float32 frame with the columns ``content_id``,
    ``answer1`` .. ``answer4`` and one row per question. ``content_id`` holds
    the real question id; the previous implementation stored the row position
    instead, which is only the same thing when the ids are exactly 0..n-1.

    Raises ``ValueError`` when the data does not contain exactly four distinct
    answer options.

    NOTE: a second function with the same name is defined further down in this
    file and shadows this one once that definition has been executed.
    """
    answer_cols = ['answer1', 'answer2', 'answer3', 'answer4']

    # rows = content_id, columns = answer option, values = share of answers
    shares = trn.groupby('content_id')['user_answer'].value_counts(normalize=True).unstack()

    # sort each row so the features describe the shape of the distribution
    # instead of which concrete option was picked
    sorted_shares = shares.values.astype(np.float32)
    sorted_shares.sort(axis=1)

    answer_counts_unstack = pd.DataFrame(sorted_shares, columns=answer_cols)
    # keep the actual question ids taken from the groupby index
    answer_counts_unstack.insert(0, 'content_id', shares.index.values)

    return answer_counts_unstack.astype(np.float32)

In [None]:
def get_q_hardness_bins(df):
    """Bin questions into difficulty codes based on ``content_id_target_mean``.

    Adds (or overwrites) the integer column ``question_hardness`` on ``df`` in
    place and returns ``df``:

        mean >= 0.9         -> -10  (very easy)
        0.7 <= mean < 0.9   -> -11  (easier)
        0.5 <= mean < 0.7   -> -12  (harder)
        mean < 0.5          -> -13  (hard)
        NaN                 ->   0  (no bin)

    The column is assigned as a whole instead of being written through
    ``Series.values``, which only works when pandas hands out a writable view
    of the underlying data.
    """
    target_mean = df['content_id_target_mean'].to_numpy()

    # np.select takes the first matching condition, so the lower bounds alone
    # are enough to express the cascading ranges
    conditions = [
        target_mean >= 0.9,
        target_mean >= 0.7,
        target_mean >= 0.5,
        target_mean < 0.5,
    ]
    codes = [-10, -11, -12, -13]

    df['question_hardness'] = np.select(conditions, codes, default=0).astype(np.int64)

    return df

In [None]:
needed_cols = ['user_id', 'content_id', 'timestamp', 'part', 'task_container_id', TARGET, 'user_answer']

print("Get content mean feature ...")
# NOTE: read_pickle / pickle.load execute code embedded in the file; only use trusted inputs
train = pd.read_pickle('../input/local-training-file-pickled/train_all.pkl')[needed_cols]
train = train[train.answered_correctly != -1] # exclude lectures
train, content_dict = get_and_merge_feat(train, TARGET, 'content_id')  # get question target mean
train = get_q_hardness_bins(train)  # bin questions into target mean bins --> question difficulty clusters

print("Get question answer distribution ...")
answer_counts = get_answer_feats(train) # answer distribution for each question

# user_answer was only needed for the answer distribution
del train['user_answer']
gc.collect()

# close the model file deterministically instead of leaking the handle
with open(model_path, "rb") as model_file:
    model = pickle.load(model_file)

# {user_id: [index of first train row, index of last train row]};
# the inference loop uses it to test whether a user occurs in train
user_idx = (
    pd.concat([
        train[['user_id']].drop_duplicates(keep='first'),
        train[['user_id']].drop_duplicates(keep='last'),
    ])
    .reset_index()
    .sort_values(['user_id', 'index'])
    .groupby(['user_id'])['index']
    .agg([list])
    .to_dict()['list']
)

In [None]:
# turn the answer distribution frame into a {question: [answer1, answer2, answer3, answer4]} lookup;
# column 0 of each record is the question id, columns 1-4 are the sorted answer shares
answer_dist_dict = {
    int(record[0]): [record[1], record[2], record[3], record[4]]
    for record in answer_counts.values
}

# the frame is no longer needed once the lookup exists
del answer_counts

In [None]:
def update_dicts(row, count_dict, correct_dict, time_dict, last_n_dict):
    """Fold one answered interaction into the per-user state dictionaries.

    ``row`` is indexed positionally: 0 user id, 1 question id, 2 timestamp,
    3 part, 4 task container id, 5 question hardness bin, 6 answered correctly.

    All four dictionaries are keyed by user id and are modified in place:
      * ``count_dict[user]``   - number of interactions overall (``'sum'``) and
        per question / part / hardness key
      * ``correct_dict[user]`` - number of correct answers for the same keys
      * ``time_dict[user]``    - last timestamp overall (``'last'``) and per question
      * ``last_n_dict[user]``  - rolling windows: the last 5 outcomes with their
        sum, the last 18 timestamps with three look-back values, and the last
        two task container ids

    The part is stored under its negated value (presumably so that part keys
    cannot collide with question ids living in the same dict - TODO confirm).

    Returns the four dictionaries in the order they were passed in.
    """
    user, question, timestamp = int(row[0]), int(row[1]), int(row[2])
    part = int(-row[3])
    task_id, hardness, correct = int(row[4]), int(row[5]), int(row[6])

    if user not in count_dict:
        # first interaction of this user: create all entries from scratch
        count_dict[user] = {'sum': 1, question: 1, part: 1, hardness: 1}
        correct_dict[user] = {'sum': correct, question: correct, part: correct, hardness: correct}
        time_dict[user] = {'last': timestamp, question: timestamp}
        last_n_dict[user] = {'sum': correct, 'time_sum': 0, 'time_sum2': 0, 'time_sum3': 0,
                             'last_n': [0, 0, 0, 0, correct], 'last_task': task_id, 'last_task2': np.nan,
                             'last_n_time': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, timestamp]}
        return count_dict, correct_dict, time_dict, last_n_dict

    # --- known user: overall statistics ---
    counts = count_dict[user]
    counts['sum'] += 1
    corrects = correct_dict[user]
    corrects['sum'] += correct
    times = time_dict[user]
    times['last'] = timestamp
    recent = last_n_dict[user]
    recent['last_task2'] = recent['last_task']
    recent['last_task'] = task_id

    # --- rolling window over the most recent outcomes ---
    outcome_window = recent['last_n']
    outcome_window.append(correct)
    recent['sum'] += correct - outcome_window.pop(0)

    # --- rolling window over the most recent timestamps ---
    stamp_window = recent['last_n_time']
    stamp_window.append(timestamp)
    recent['time_sum'] = stamp_window.pop(0)  # the timestamp that just left the window
    recent['time_sum2'] = stamp_window[9]
    recent['time_sum3'] = stamp_window[14]

    # --- counters per hardness bin, part and question (same update rule for each) ---
    for key in (hardness, part, question):
        if key in counts:
            counts[key] += 1
            corrects[key] += correct
        else:
            counts[key] = 1
            corrects[key] = correct

    # remember when the user last saw this question
    times[question] = timestamp

    return count_dict, correct_dict, time_dict, last_n_dict


def get_row_values(row, count_dict, correct_dict, time_dict, last_n_dict):
    """Read the raw user statistics for one interaction from the state dicts.

    ``row`` is indexed positionally: 0 user id, 1 question id, 2 timestamp,
    3 part, 4 task container id, 5 question hardness bin. The dictionaries are
    only read, never modified.

    Returns a float32 vector of length 23; slots without information stay NaN
    (all of them when the user is unknown):
        0/1    interaction count / correct count of the user
        2/3    count / correct count of the user on this question
        4      time since the user's last interaction
        5      time since the user last saw this question
        6      sum over the rolling outcome window
        7-9    time since the three stored look-back timestamps
        10/11  count / correct count of the user on this part
        12-18  correct ratio of the user on parts 1-7
        19/20  task container id minus the last / second to last one
        21/22  count / correct count of the user in this hardness bin
    """
    values = np.full(23, fill_value=np.nan, dtype=np.float32)

    user = int(row[0])
    question = int(row[1])
    timestamp = int(row[2])
    part = int(-row[3])  # parts are stored under their negated value
    task_id = int(row[4])
    hardness = int(row[5])

    # nothing is known about this user: every feature stays NaN
    if user not in count_dict:
        return values

    counts = count_dict[user]
    corrects = correct_dict[user]
    times = time_dict[user]
    recent = last_n_dict[user]

    # overall user statistics
    values[0] = counts['sum']
    values[1] = corrects['sum']
    values[4] = timestamp - times['last']
    values[6] = recent['sum']
    values[7] = timestamp - recent['time_sum']
    values[8] = timestamp - recent['time_sum2']
    values[9] = timestamp - recent['time_sum3']
    values[19] = task_id - recent['last_task']
    values[20] = task_id - recent['last_task2']

    # statistics for the hardness bin of the current question
    if hardness in counts:
        values[21] = counts[hardness]
        values[22] = corrects[hardness]

    # statistics for the part of the current question
    if part in counts:
        values[10] = counts[part]
        values[11] = corrects[part]

    # correct ratio on every part the user has seen so far (keys -1 .. -7)
    for slot, part_key in enumerate(range(-1, -8, -1), start=12):
        if part_key in counts:
            values[slot] = corrects[part_key] / counts[part_key]

    # statistics for exactly this question, if the user saw it before
    if question in counts:
        values[2] = counts[question]
        values[3] = corrects[question]
        values[5] = timestamp - times[question]

    return values


def get_user_feats(trn, val, save_dicts=False):
    """Compute the user features for a train frame and a validation frame.

    The state dicts are first built up row by row on ``trn`` and then carried
    over to ``val``. With ``save_dicts=True`` the four final state dicts are
    pickled to the paths configured on ``configs``.

    NOTE(review): ``configs`` is not defined in the visible part of this file,
    so ``save_dicts=True`` presumably relies on it being provided elsewhere -
    confirm before using it.

    Returns ``(trn, val)`` with the feature columns added.
    """
    def _dump(message, path_attr, payload):
        # announce the step, then pickle one state dict to its configured path
        print(message)
        with open(getattr(configs, path_attr), 'wb') as f:
            pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

    # train pass: starts from empty state
    trn, count_dict, correct_dict, time_dict, last_n_dict = calc_dicts_and_add(trn)
    # validation pass: continues from the state left behind by the train pass
    val, count_dict, correct_dict, time_dict, last_n_dict = calc_dicts_and_add(
        val, count_dict, correct_dict, time_dict, last_n_dict
    )

    if save_dicts:
        _dump("Save count dict ...", 'count_dict_path', count_dict)
        _dump("Save correct dict ...", 'correct_dict_path', correct_dict)
        _dump("Save time dict ...", 'time_dict_path', time_dict)
        _dump("Save last n dict ...", 'last_n_dict_path', last_n_dict)

    # the state is not returned, so release it right away
    del count_dict, correct_dict, time_dict, last_n_dict
    gc.collect()

    return trn, val


def calc_feats_from_stats(df, user_feats):
    """Turn the raw per-row statistics into feature columns on ``df``.

    ``user_feats`` is the ``(len(df), 23)`` matrix produced by
    ``get_row_values``. The correct counts in columns 1, 3, 11 and 22 are
    divided by their counts in columns 0, 2, 10 and 21 directly inside
    ``user_feats`` (the array is modified in place). Every column is then
    stored on ``df`` as float32, together with two derived features:
    ``last_time_user_inter`` and ``user_hardness_inter`` (the latter needs an
    existing ``content_id_target_mean`` column).

    Returns ``df``, which is also modified in place.
    """
    # (correct count, count) column pairs -> correctness means, written back in place
    for correct_col, count_col in ((1, 0), (3, 2), (11, 10), (22, 21)):
        user_feats[:, correct_col] = user_feats[:, correct_col] / user_feats[:, count_col]

    # replace -inf in the per-question mean with 0
    question_mean = user_feats[:, 3]
    question_mean[question_mean == -np.inf] = 0

    # columns 0-5 of the matrix
    leading_names = (
        'user_count',
        'user_correct_mean',
        'user_question_count',
        'user_question_correct_mean',
        'last_time_user',
        'last_time_question_user',
    )
    for col_idx, name in enumerate(leading_names):
        df[name] = user_feats[:, col_idx].astype(np.float32)

    # derived feature, inserted here to keep the original column order
    since_user = df['last_time_user'].astype(np.float32)
    since_question = df['last_time_question_user'].astype(np.float32)
    df['last_time_user_inter'] = since_user - since_question

    # columns 6-22 of the matrix
    trailing_names = (
        'user_last_n_correct',
        'user_last_n_time',
        'user_last_n_time2',
        'user_last_n_time3',
        'user_part_count',
        'user_part_correct_mean',
        'user_part1_mean',
        'user_part2_mean',
        'user_part3_mean',
        'user_part4_mean',
        'user_part5_mean',
        'user_part6_mean',
        'user_part7_mean',
        'task_container_eq1',
        'task_container_eq2',
        'user_hardness_count',
        'user_hardness_mean',
    )
    for col_idx, name in enumerate(trailing_names, start=6):
        df[name] = user_feats[:, col_idx].astype(np.float32)

    # how the user does in this difficulty bin relative to the question mean
    df['user_hardness_inter'] = df['user_hardness_mean'] - df['content_id_target_mean']

    return df


def calc_dicts_and_add(df, count_dict=None, correct_dict=None, time_dict=None, last_n_dict=None):
    """Compute the user features of ``df`` row by row while updating the state dicts.

    For every row the features are read from the state accumulated over all
    earlier rows; only afterwards is the row itself folded into the state, so
    a row never sees its own outcome.

    State dictionaries (a fresh one is created for each that is ``None``,
    otherwise the passed dict is continued and modified in place):
      * ``count_dict``   - user specific counts for questions, parts, question difficulty etc.
      * ``correct_dict`` - user specific counts of correct answers for the same keys
      * ``time_dict``    - last time we saw the user, last time the user saw a question
      * ``last_n_dict``  - rolling windows over the last outcomes / timestamps / task ids

    ``df`` must contain ``user_id``, ``content_id``, ``timestamp``, ``part``,
    ``task_container_id``, ``question_hardness``, ``answered_correctly`` and
    ``row_id``. The columns ``timestamp``, ``user_id`` and ``row_id`` are
    deleted from the passed frame (``KeyError`` if one is missing).

    Returns ``(df, count_dict, correct_dict, time_dict, last_n_dict)``. The
    returned dicts include every row of ``df``; previously the last row was
    never added to them.
    """
    # `is None` instead of truthiness: an empty dict passed by the caller is
    # a valid state to continue from and must not be replaced
    if count_dict is None:
        count_dict = {}
    if correct_dict is None:
        correct_dict = {}
    if time_dict is None:
        time_dict = {}
    if last_n_dict is None:
        last_n_dict = {}

    # init numpy storage for all features and create row iterator
    user_feats = np.full((len(df), 23), fill_value=np.nan, dtype=np.float32)
    feat_iterator = df[['user_id', 'content_id', 'timestamp', 'part', 'task_container_id', 'question_hardness',
                        'answered_correctly']].values
    del df['timestamp']
    del df['user_id']
    del df['row_id']

    for row_id, curr_row in enumerate(tqdm(feat_iterator)):
        # features for the current row come from the state built on earlier rows only
        user_feats[row_id] = get_row_values(curr_row, count_dict, correct_dict, time_dict, last_n_dict)

        # afterwards add the current row to the state; doing this for every row
        # (including the last one) keeps the returned dicts complete
        count_dict, correct_dict, time_dict, last_n_dict = update_dicts(curr_row, count_dict, correct_dict,
                                                                        time_dict, last_n_dict)

    # calculate and add features from preprocessed state dicts to data
    del feat_iterator
    df = calc_feats_from_stats(df, user_feats)

    return df, count_dict, correct_dict, time_dict, last_n_dict

In [None]:
# methods to prepare and merge the meta file to the main data

def prepare_questions():
    """Load the question metadata from ``questions_file``.

    ``bundle_id`` and ``correct_answer`` are dropped, ``question_id`` is
    renamed to ``content_id`` and a constant ``content_type_id`` of 0 is added
    so the table can be merged on ``(content_type_id, content_id)``.
    """
    column_types = {
        'question_id': 'int16',
        'bundle_id': 'int16',
        'correct_answer': 'int8',
        'part': 'int8',
    }
    questions = pd.read_csv(questions_file, dtype=column_types)

    questions = questions.drop(columns=['bundle_id', 'correct_answer'])
    questions = questions.rename(columns={'question_id': 'content_id'})
    questions['content_type_id'] = 0  # marks the rows as questions

    return questions

def prepare_lectures():
    """Load the lecture metadata from ``lectures_file``.

    ``type_of`` is dropped, ``lecture_id`` / ``tag`` are renamed to
    ``content_id`` / ``tags`` and a constant ``content_type_id`` of 1 is added
    so the table can be merged on ``(content_type_id, content_id)``.
    """
    column_types = {
        'lecture_id': 'int16',
        'tag': 'int16',
        'part': 'int8',
    }
    lectures = pd.read_csv(lectures_file, dtype=column_types)

    lectures = lectures.drop(columns=['type_of'])
    lectures = lectures.rename(columns={'lecture_id': 'content_id', 'tag': 'tags'})
    lectures['content_type_id'] = 1  # marks the rows as lectures

    return lectures

def prepare_meta_feats():
    """Return question and lecture metadata stacked into one frame.

    The question rows come first, followed by the lecture rows; the original
    row labels of both tables are kept. ``pd.concat`` is used because
    ``DataFrame.append`` is no longer available in pandas 2.x.
    """
    questions = prepare_questions()
    lectures = prepare_lectures()
    meta = pd.concat([questions, lectures])

    return meta

In [None]:
def replace_bools(df):
    """Encode ``prior_question_had_explanation`` as float16 (False -> 0, True -> 1).

    Values that are neither ``True`` nor ``False`` (e.g. missing values) become
    NaN. The column is replaced on ``df`` in place and ``df`` is returned.

    The column is assigned as a whole instead of through ``df.loc[:, col]``,
    which tries to write the mapped numbers into the existing (bool or object)
    column before the final cast.
    """
    col = 'prior_question_had_explanation'
    df[col] = df[col].map({False: 0, True: 1}).astype(np.float16)

    return df

In [None]:
def merge_feat_val(key_feat, feat_dict):
    """Look up every key of the series ``key_feat`` in ``feat_dict``.

    Returns a float64 array with one entry per key; keys that are missing from
    ``feat_dict`` are encoded as -1.
    """
    keys = key_feat.values
    return np.array([feat_dict.get(key, -1) for key in keys], dtype=np.float64)

In [None]:
def get_answer_feats(row, ans_dict):
    """Return the four answer distribution values of the question in ``row``.

    ``row[1]`` is the question id and ``ans_dict`` maps question ids to a
    sequence of at least four values. The result is a float64 array of
    length 4. Raises ``KeyError`` when the question is not in ``ans_dict``.

    NOTE: this definition reuses the name of the train-time
    ``get_answer_feats(trn)`` defined earlier in this file and replaces it.
    """
    question = int(row[1])
    distribution = ans_dict[question]

    return np.array(
        [distribution[0], distribution[1], distribution[2], distribution[3]],
        dtype=np.float64,
    )

def add_answer_feats(df, ans_feats):
    """Store the four columns of ``ans_feats`` on ``df`` as ``answer1`` .. ``answer4``.

    ``ans_feats`` is a 2-D array with one row per row of ``df``; the columns
    are written as float32. ``df`` is modified in place and returned.
    """
    for col_idx, name in enumerate(('answer1', 'answer2', 'answer3', 'answer4')):
        df[name] = ans_feats[:, col_idx].astype(np.float32)

    return df

In [None]:
# question + lecture metadata; merged onto every test batch in the inference loop
meta_features = prepare_meta_feats()
# competition environment (presumably the Kaggle time-series API of the
# riiideducation package — semantics are not visible in this file)
env = riiideducation.make_env()
# called once per batch with the predictions for that batch
set_predict = env.predict
# iterator over (test_df, sample_prediction_df) pairs, consumed by the inference loop
iter_test = env.iter_test()

In [None]:
# init empty feature dictionaries which will be built up on the fly during inference
# (four independent dicts: counts, correct counts, timestamps, rolling windows)
count_dict, correct_dict, time_dict, last_n_dict = {}, {}, {}, {}

In [None]:
# row train iterator for feature engineering
# keep only the columns update_dicts reads, in exactly the positional order it expects:
# 0 user, 1 question, 2 timestamp, 3 part, 4 task container, 5 hardness bin, 6 target
train = train[['user_id', 'content_id', 'timestamp', 'part', 'task_container_id', 'question_hardness', TARGET]]

In [None]:
%%time
# previous batch (with merged meta data and hardness bins); its labels only arrive with the next batch
prev_test_df = None
# one iteration per test batch: update state with the previous batch, build features, predict
for (test_df, sample_prediction_df) in iter_test:
    if prev_test_df is not None:
        # the first row of the current batch carries the labels of the previous batch as a string
        # NOTE(review): eval() executes arbitrary code; the string comes from the competition API.
        # If it is always a plain list literal, ast.literal_eval would be the safer parser — TODO confirm.
        prev_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        for row_id, prev_row in enumerate(prev_test_df[['user_id', 'content_id', 'timestamp', 'part', 'task_container_id', 'question_hardness', TARGET, 'content_type_id']].values):
            if prev_row[7]==0: # only if not lecture
                # update state dicts
                count_dict, correct_dict, time_dict, last_n_dict = update_dicts(prev_row, count_dict, correct_dict, time_dict, last_n_dict)
    
    test_df = pd.merge(test_df, meta_features, on=['content_type_id', 'content_id'], how='left') # merge meta data
    test_df['content_id_target_mean'] = merge_feat_val(test_df['content_id'], content_dict) # mean encode questions
    test_df = get_q_hardness_bins(test_df) # get content difficulty bins
    # the copy is taken before sorting and before lectures are removed, so it keeps
    # every row of the batch in its original order for the label assignment above
    prev_test_df = test_df.copy()  # copy for updates in next iteration
    # rows are ordered by user and timestamp, both descending
    test_df = test_df.sort_values(['user_id','timestamp'], ascending=False)
    test_df = replace_bools(test_df)
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)  # remove lectures
    
    # init empty numpy arrays --> storage for row features
    user_feats = np.full((len(test_df), 23), dtype=np.float32, fill_value=np.nan)
    answer_feats = np.zeros((len(test_df), 4), dtype=np.float32)
    
    # get features for each row
    # the state dicts are only read for the rows of the current batch; apart from loading
    # a user's train history they are not updated until the labels arrive with the next batch
    for row_id, curr_row in enumerate(test_df[['user_id', 'content_id', 'timestamp', 'part', 'task_container_id', 'question_hardness']].values):
        user = curr_row[0]
        
        # update dicts on the fly if we encounter a test user which is seen in the train set
        # check for each row if we have already queried the train for the current user
        if user not in count_dict:
            if user in user_idx: # we have not queried train yet so check if we even have the user in train
                # query train data for seen test user and update dictionaries with the train data
                # NOTE(review): the boolean mask is evaluated over the whole train frame for every
                # newly seen user; user_idx stores a first/last row index per user that is not used
                # for slicing here — presumably a cheaper lookup was intended, verify
                for train_row_id, train_curr_row in enumerate(train[train.user_id==user].values):
                    count_dict, correct_dict, time_dict, last_n_dict = update_dicts(train_curr_row, count_dict, correct_dict, time_dict, last_n_dict)
        
        # get features values and fill numpy arrays
        user_feats[row_id] = get_row_values(curr_row, count_dict, correct_dict, time_dict, last_n_dict)
        answer_feats[row_id] = get_answer_feats(curr_row, answer_dist_dict)
    
    # calc ratios and assign computed features to new columns in the df
    test_df = calc_feats_from_stats(test_df, user_feats)
    test_df = add_answer_feats(test_df, answer_feats)
    
    # predict on the model's feature columns and hand the predictions for this batch back
    test_preds = model.predict(test_df[feats])
    test_df[TARGET] = test_preds
    set_predict(test_df[['row_id', TARGET]])