# Version notes

To download the pickled files, go directly to Version 8.

- Ver 1: testing if the features can be dumped into a pickle, train is last 50m rows
- Ver 6 (interactive): just dumping the train dictionaries without adding features to the df
- Ver 7 (quick-save): switch the workflow to `itertuples()`, which is the fastest
- Ver 9: getting rid of feature columns to save even more memory

## main contribution
- Un-encapsulating the functions to be more debuggable.
- Pickling and loading nested dictionaries for fast inference.


### TO-DO:

- Add a rolling mean of `time_recency`? (too much for inference?)

Reference:
- https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering
- https://www.kaggle.com/ragnar123/riiid-model-lgbm

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from contextlib import contextmanager
import psutil
import math
from time import time
from tqdm.notebook import tqdm
import lightgbm as lgb
import riiideducation
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import os
import sys
from utils import *

In [None]:
def get_memory(num_var=10):
    """Print the `num_var` largest module-level names and their sizes.

    Sizes are taken with ``sys.getsizeof``, which is shallow: the contents of
    containers are not followed. Formatting and colouring are delegated to the
    ``color`` / ``Colors`` / ``get_size`` helpers.
    """
    sizes = [(var_name, sys.getsizeof(obj)) for var_name, obj in globals().items()]
    # Largest first; ties keep their original order.
    ranked = sorted(sizes, key=lambda pair: -pair[1])
    for var_name, n_bytes in ranked[:num_var]:
        label = color(f"{var_name:>30}:", color=Colors.green)
        amount = color(f"{get_size(n_bytes):>8}", color=Colors.magenta)
        print(label, amount)

get_system()

In [None]:
# Notebook-wide switches.
DEBUG = True  # when True, only the first 10M rows of train are kept further down
FOLD = 1  # NOTE(review): not referenced by any of the visible cells
SEED = 802  # passed to seed_everything()

In [None]:

# Function to seed everything
def seed_everything(seed):
    """Seed the ``random`` and NumPy global generators and export PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
seed_everything(SEED)

# Feature engineering

In [None]:
train_parquet = '../input/cv-strategy-in-the-kaggle-environment/cv1_train.parquet'
question_file = '../input/riiid-test-answer-prediction/questions.csv'

# Columns needed by the feature loop; nothing else is read from the parquet file.
features = ['timestamp', 
           'user_id', 
           'answered_correctly',
           'content_id', 
           'content_type_id', 
           'prior_question_elapsed_time', 
           'prior_question_had_explanation']

# Compact dtypes applied right after loading to keep the frame small.
# NOTE(review): the 'bool' cast of prior_question_had_explanation happens
# before the later fillna(False); if the stored column holds missing values
# they are converted (or rejected) by this cast first -- confirm.
train_dtypes = {
    'timestamp': 'int64',
    'user_id': 'int32', 
    'answered_correctly': 'int8', 
    'content_id': 'int16', 
    'content_type_id':'int8', 
#     'task_container_id': 'int16',
    #'user_answer': 'int8',
    'prior_question_elapsed_time': 'float32', 
    'prior_question_had_explanation': 'bool'
}

with timer("Loading train"):
    # Ask the reader for the needed columns only instead of loading the whole
    # file and sub-selecting afterwards; this lowers peak memory.
    train = pd.read_parquet(train_parquet, columns=features).astype(train_dtypes)

In [None]:
# Keep only the first 10M rows in DEBUG mode to avoid RAM problems
if DEBUG:
    train = train.iloc[:10_000_000]

# Discard lectures: keep only the rows whose content_type_id is 0
train = train.loc[train['content_type_id'] == 0].reset_index(drop=True)

# Changing dtype to avoid lightgbm error
train['prior_question_had_explanation'] = (
    train['prior_question_had_explanation'].fillna(False).astype('int8')
)

# Fill prior question elapsed time with the mean. A fixed constant is used
# instead of the commented-out computation:
# prior_question_elapsed_time_mean = \
# train['prior_question_elapsed_time'].dropna().mean()
prior_question_elapsed_time_mean = 13005.081
# Assign the result back: an in-place fillna on a column selection is not
# guaranteed to write through to the frame (pandas warns about this pattern).
train['prior_question_elapsed_time'] = (
    train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
)

##### Question dataframe; the merge is not needed for the feature dump,
##### but the inference loop merges 'part' from it.
questions_df = pd.read_csv(question_file)
questions_df['part'] = questions_df['part'].astype(np.int32)
questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)

# train = pd.merge(train, questions_df[['question_id', 'part']], 
#                  left_on = 'content_id', right_on = 'question_id', how = 'left')
gc.collect()

If the goal is only to dump the user dictionaries, there is no need to store the per-row features.

In [None]:
# Funcion for user stats with loops
def add_features(row):
    """Fold one training row into the global running-statistics dictionaries.

    ``row`` is a tuple as yielded by ``DataFrame.itertuples()``: the frame
    index first, followed by

        user_id, answered_correctly, content_id,
        prior_question_elapsed_time, prior_question_had_explanation, timestamp

    Nothing is returned and no per-row feature is stored; only the module-level
    dictionaries are updated. The per-row feature assignment that used to live
    here is the same logic as in ``add_features_test``.
    """
    # row[0] is the frame index, which is not needed for the dictionary updates.
    user_id, correct, content_id, elapsed, explained, timestamp = row[1:7]

    # ------------------------------------------------------------------
    # Client (user) updates that do not depend on the label
    answered_correctly_u_count[user_id] += 1
    elapsed_time_u_sum[user_id] += elapsed
    explanation_u_sum[user_id] += int(explained)

    # Keep at most the three latest timestamps, oldest first.
    recent = timestamp_u[user_id]
    if len(recent) == 3:
        recent.pop(0)
    recent.append(timestamp)

    # ------------------------------------------------------------------
    # Question updates that do not depend on the label
    answered_correctly_q_count[content_id] += 1
    elapsed_time_q_sum[content_id] += elapsed
    explanation_q_sum[content_id] += int(explained)

    # ------------------------------------------------------------------
    # Client-question update: one more attempt of this question by this user
    answered_correctly_uq[user_id][content_id] += 1

    # ------------------------------------------------------------------
    # Label-dependent updates (at inference these are done by update_features)
    answered_correctly_u_sum[user_id] += correct
    if correct == 0:
        # Only the latest incorrect timestamp is kept.
        last_wrong = timestamp_u_incorrect[user_id]
        if len(last_wrong) == 1:
            last_wrong.pop(0)
        last_wrong.append(timestamp)

    answered_correctly_q_sum[content_id] += correct

In [None]:
# train_len = len(train)

# # -----------------------------------------------------------------------
# # Client features
# answered_correctly_u_avg = np.zeros(train_len, dtype = np.float32)
# elapsed_time_u_avg = np.zeros(train_len, dtype = np.float32)
# explanation_u_avg = np.zeros(train_len, dtype = np.float32)
# timestamp_u_recency_1 = np.zeros(train_len, dtype = np.float32)
# timestamp_u_recency_2 = np.zeros(train_len, dtype = np.float32)
# timestamp_u_recency_3 = np.zeros(train_len, dtype = np.float32)
# timestamp_u_incorrect_recency = np.zeros(train_len, dtype = np.float32)
# # -----------------------------------------------------------------------
# # Question features
# answered_correctly_q_avg = np.zeros(train_len, dtype = np.float32)
# elapsed_time_q_avg = np.zeros(train_len, dtype = np.float32)
# explanation_q_avg = np.zeros(train_len, dtype = np.float32)

# # -----------------------------------------------------------------------
# # User Question
# answered_correctly_uq_count = np.zeros(train_len, dtype = np.int32)

# # -----------------------------------------------------------------------

In [None]:
# Client (per-user) dictionaries: module-level state updated in place by
# add_features, add_features_test and update_features.
answered_correctly_u_count = defaultdict(int)  # user_id -> number of rows seen
answered_correctly_u_sum = defaultdict(int)  # user_id -> sum of answered_correctly
elapsed_time_u_sum = defaultdict(int)  # user_id -> sum of prior_question_elapsed_time
explanation_u_sum = defaultdict(int)  # user_id -> sum of prior_question_had_explanation
timestamp_u = defaultdict(list)  # user_id -> up to 3 latest timestamps, oldest first
timestamp_u_incorrect = defaultdict(list)  # user_id -> [timestamp of the latest incorrect answer]

# Question dictionaries, keyed by content_id; same update rules as above.
answered_correctly_q_count = defaultdict(int)  # content_id -> number of rows seen
answered_correctly_q_sum = defaultdict(int)  # content_id -> sum of answered_correctly
elapsed_time_q_sum = defaultdict(int)  # content_id -> sum of prior_question_elapsed_time
explanation_q_sum = defaultdict(int)  # content_id -> sum of prior_question_had_explanation

# Client-question dictionary: user_id -> content_id -> number of rows seen for
# that pair. A user that has not answered any question yet yields an empty
# defaultdict(int).
answered_correctly_uq = defaultdict(lambda: defaultdict(int))

In [None]:
len(train)

In [None]:
# itertuples() yields (index, user_id, answered_correctly, content_id,
# prior_question_elapsed_time, prior_question_had_explanation, timestamp);
# add_features relies on exactly this column order.
iters = train[['user_id',
          'answered_correctly', 
          'content_id', 
          'prior_question_elapsed_time', 
          'prior_question_had_explanation',
          'timestamp']].itertuples()
train_len = len(train)  # only used as the progress-bar total

# Single pass over the rows; each call updates the global dictionaries.
with timer("User feature calculation"):
    for _row in tqdm(iters, total=train_len):
        add_features(_row)
gc.collect()

# Dumping features

In [None]:
user_ids = train['user_id'].unique()

In [None]:
del train
gc.collect()

In [None]:
for item in answered_correctly_u_sum.items():
    print(item)
    break

In [None]:
len(answered_correctly_u_sum)

# Dumping regular dicts

In [None]:
# Persist the per-user row counts, then delete the in-memory dict.
# The loading cell further down re-creates the name from this file.
with open('answered_correctly_u_count.pickle', 'wb') as f:
    pickle.dump(answered_correctly_u_count, f, protocol=pickle.HIGHEST_PROTOCOL)
del answered_correctly_u_count

In [None]:
# Persist the per-user sums of answered_correctly, then delete the in-memory dict.
with open('answered_correctly_u_sum.pickle', 'wb') as f:
    pickle.dump(answered_correctly_u_sum, f, protocol=pickle.HIGHEST_PROTOCOL)
del answered_correctly_u_sum

In [None]:
# Persist the per-user sums of prior_question_elapsed_time, then delete the in-memory dict.
with open('elapsed_time_u_sum.pickle', 'wb') as f:
    pickle.dump(elapsed_time_u_sum, f, protocol=pickle.HIGHEST_PROTOCOL)
del elapsed_time_u_sum

In [None]:
# Persist the per-user sums of prior_question_had_explanation, then delete the in-memory dict.
with open('explanation_u_sum.pickle', 'wb') as f:
    pickle.dump(explanation_u_sum, f, protocol=pickle.HIGHEST_PROTOCOL)
del explanation_u_sum

In [None]:
# Persist the per-question row counts, then delete the in-memory dict.
with open('answered_correctly_q_count.pickle', 'wb') as f:
    pickle.dump(answered_correctly_q_count, f, protocol=pickle.HIGHEST_PROTOCOL)
del answered_correctly_q_count

In [None]:
# Persist the per-question sums of answered_correctly, then delete the in-memory dict.
with open('answered_correctly_q_sum.pickle', 'wb') as f:
    pickle.dump(answered_correctly_q_sum, f, protocol=pickle.HIGHEST_PROTOCOL)
del answered_correctly_q_sum

In [None]:
# Persist the per-question sums of prior_question_elapsed_time, then delete the in-memory dict.
with open('elapsed_time_q_sum.pickle', 'wb') as f:
    pickle.dump(elapsed_time_q_sum, f, protocol=pickle.HIGHEST_PROTOCOL)
del elapsed_time_q_sum

In [None]:
    
# Persist the per-question sums of prior_question_had_explanation, then delete the in-memory dict.
with open('explanation_q_sum.pickle', 'wb') as f:
    pickle.dump(explanation_q_sum, f, protocol=pickle.HIGHEST_PROTOCOL)
del explanation_q_sum

In [None]:
# Persist the per-user lists of latest timestamps (at most 3 each), then delete the in-memory dict.
with open('timestamp_u.pickle', 'wb') as f:
    pickle.dump(timestamp_u, f, protocol=pickle.HIGHEST_PROTOCOL)
del timestamp_u

In [None]:
# Persist the per-user latest incorrect-answer timestamp lists, then delete the in-memory dict.
with open('timestamp_u_incorrect.pickle', 'wb') as f:
    pickle.dump(timestamp_u_incorrect, f, protocol=pickle.HIGHEST_PROTOCOL)
del timestamp_u_incorrect

In [None]:
gc.collect()

# Dumping nested dictionary
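`answered_correctly_uq` is a nested `defaultdict` keyed by `user_id`. Its default factory is a lambda function, and lambdas cannot be pickled, so the dictionary cannot be dumped directly. Indexing it with a `user_id` returns that user's inner dict: the keys are the `content_id`s of the questions the user has attempted (correctly or not), and each value is the number of times the user has attempted that question.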

In [None]:
answered_correctly_uq[926573062] # user_id == 926573062

The way to dump it is first to convert it to a regular dict as follows.

In [None]:
# The outer level of answered_correctly_uq uses a lambda as default factory,
# which cannot be pickled. Move every user's inner defaultdict(int) into a
# container whose factory is picklable; the inner dicts are shared, not copied.
answered_correctly_uq_dict = defaultdict(int)

# Let tqdm drive the loop so the bar ends exactly at len(user_ids); the manual
# pbar.update(50) on every 50th item already fired at item 0 and overshot the
# total.
for user in tqdm(user_ids):
    answered_correctly_uq_dict[user] = answered_correctly_uq[user]

In [None]:
get_memory(num_var=5) # lambda function is so small because it has not been applied

In [None]:
# Dump the picklable copy of the user -> question -> count mapping.
with open('answered_correctly_uq_dict.pickle', 'wb') as f:
    pickle.dump(answered_correctly_uq_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

After dumping, we find that its file size is relatively big.

In [None]:
files = find_files('pickle', '../working/')
print_file_size(files)

In [None]:
# answered_correctly_u_count[898778487] # total q for a user
# answered_correctly_u_sum[898778487] # correct for a user

In [None]:
answered_correctly_uq_dict[898778487] 

# Load models
To make sure we can do inference, we load the model and do a mock run.

In [None]:
# Name of the label column; also used for the prediction column at inference.
TARGET = 'answered_correctly'
# Features to train and predict; the inference loop passes test_df[FEATURES]
# to model.predict(), so the columns reach the model in this order.
FEATURES = ['prior_question_elapsed_time', 
            'prior_question_had_explanation', 
            'part', 
            'answered_correctly_u_avg', 
            'elapsed_time_u_avg', 
            'explanation_u_avg',
            'answered_correctly_q_avg', 
            'elapsed_time_q_avg', 
            'explanation_q_avg', 
            'answered_correctly_uq_count', 
            'timestamp_u_recency_1',
            'timestamp_u_recency_2', 
            'timestamp_u_recency_3', 
            'timestamp_u_incorrect_recency']

# NOTE(review): the file name says fold 0 while FOLD = 1 and the train file is
# cv1 -- confirm this is the intended model.
model_file = '../input/riiid-lgb-models/lgb_loop_fold_0_auc_0.7739.txt'
model = lgb.Booster(model_file=model_file)

# Inference

In [None]:
# Funcion for user stats with loops for test
def add_features_test(df):
    """Append the loop features to a test batch and update the running stats.

    Rows are processed in order. For each row the features are first read from
    the global dictionaries and only afterwards are the label-independent
    dictionaries updated with that row, so a row never sees its own values.
    The label-dependent dictionaries (``answered_correctly_u_sum``,
    ``answered_correctly_q_sum``, ``timestamp_u_incorrect``) are only read
    here; ``update_features`` writes them once the labels are available.

    Parameters
    ----------
    df : pandas.DataFrame
        Batch with at least the columns 'user_id', 'content_id',
        'prior_question_elapsed_time', 'prior_question_had_explanation' and
        'timestamp'. 'prior_question_had_explanation' must be convertible
        with ``int()`` (no missing values).

    Returns
    -------
    pandas.DataFrame
        ``df`` with eleven feature columns appended. Features that cannot be
        computed yet (unseen user or question, too little history) are NaN;
        'answered_correctly_uq_count' is 0 for an unseen user/question pair.
    """
    n_rows = len(df)

    def _nan_column():
        # float32 to match the dtype used for these features elsewhere
        return np.full(n_rows, np.nan, dtype=np.float32)

    # -----------------------------------------------------------------------
    # Client features
    answered_correctly_u_avg = _nan_column()
    elapsed_time_u_avg = _nan_column()
    explanation_u_avg = _nan_column()
    timestamp_u_recency_1 = _nan_column()
    timestamp_u_recency_2 = _nan_column()
    timestamp_u_recency_3 = _nan_column()
    timestamp_u_incorrect_recency = _nan_column()
    # Recency slot k holds "now minus the k-th most recent timestamp".
    recency_slots = (timestamp_u_recency_1,
                     timestamp_u_recency_2,
                     timestamp_u_recency_3)
    # -----------------------------------------------------------------------
    # Question features
    answered_correctly_q_avg = _nan_column()
    elapsed_time_q_avg = _nan_column()
    explanation_q_avg = _nan_column()
    # -----------------------------------------------------------------------
    # User Question
    answered_correctly_uq_count = np.zeros(n_rows, dtype=np.int32)
    # -----------------------------------------------------------------------

    # itertuples keeps each column's own type (like the training loop), so
    # user_id / content_id stay integers instead of being upcast to float by
    # a mixed-dtype `.values` array.
    rows = df[['user_id',
               'content_id',
               'prior_question_elapsed_time',
               'prior_question_had_explanation',
               'timestamp']].itertuples(index=False, name=None)

    for num, (user_id, content_id, elapsed, explained, timestamp) in enumerate(rows):

        # ------------------------------------------------------------------
        # Client features assignation
        seen_u = answered_correctly_u_count[user_id]
        if seen_u != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum[user_id] / seen_u
            elapsed_time_u_avg[num] = elapsed_time_u_sum[user_id] / seen_u
            explanation_u_avg[num] = explanation_u_sum[user_id] / seen_u

        # History is stored oldest first; walk it newest first. Slots without
        # a matching history entry stay NaN.
        history = timestamp_u[user_id]
        for slot, past in zip(recency_slots, reversed(history[-3:])):
            slot[num] = timestamp - past

        wrong = timestamp_u_incorrect[user_id]
        if wrong:
            timestamp_u_incorrect_recency[num] = timestamp - wrong[-1]

        # ------------------------------------------------------------------
        # Question features assignation
        seen_q = answered_correctly_q_count[content_id]
        if seen_q != 0:
            answered_correctly_q_avg[num] = answered_correctly_q_sum[content_id] / seen_q
            elapsed_time_q_avg[num] = elapsed_time_q_sum[content_id] / seen_q
            explanation_q_avg[num] = explanation_q_sum[content_id] / seen_q

        # ------------------------------------------------------------------
        # Client Question assignation
        answered_correctly_uq_count[num] = answered_correctly_uq[user_id][content_id]

        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_count[user_id] += 1
        elapsed_time_u_sum[user_id] += elapsed
        explanation_u_sum[user_id] += int(explained)

        if len(history) == 3:
            history.pop(0)
        history.append(timestamp)

        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_count[content_id] += 1
        elapsed_time_q_sum[content_id] += elapsed
        explanation_q_sum[content_id] += int(explained)

        # ------------------------------------------------------------------
        # Client Question updates
        answered_correctly_uq[user_id][content_id] += 1

    # Build the feature frame on df's own index: concat(axis=1) aligns on the
    # index, so a default RangeIndex here would misalign the features whenever
    # df does not carry a 0..n-1 index.
    user_df = pd.DataFrame({'answered_correctly_u_avg': answered_correctly_u_avg,
                            'elapsed_time_u_avg': elapsed_time_u_avg,
                            'explanation_u_avg': explanation_u_avg,
                            'answered_correctly_q_avg': answered_correctly_q_avg,
                            'elapsed_time_q_avg': elapsed_time_q_avg,
                            'explanation_q_avg': explanation_q_avg,
                            'answered_correctly_uq_count': answered_correctly_uq_count,
                            'timestamp_u_recency_1': timestamp_u_recency_1,
                            'timestamp_u_recency_2': timestamp_u_recency_2,
                            'timestamp_u_recency_3': timestamp_u_recency_3,
                            'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency},
                           index=df.index)

    return pd.concat([df, user_df], axis=1)

# Loading a nested dict
Loading the nested dict is a bit tricky: we copy the pickled entries into a fresh `defaultdict` whose default factory is a lambda function.

In [None]:
# NOTE: unpickling can execute code stored in the file; only load pickles that
# this notebook produced itself.
with open('../working/answered_correctly_uq_dict.pickle', 'rb') as f:
    answered_correctly_uq_dict = pickle.load(f)
    
# Rebuild the nested defaultdict so that unseen users again get an empty
# defaultdict(int). The loaded per-user dicts are reused as-is (no copy).
answered_correctly_uq = defaultdict(lambda: defaultdict(int))
for key in tqdm(answered_correctly_uq_dict.keys()):
    answered_correctly_uq[key] = answered_correctly_uq_dict[key]

In [None]:
# Get feature dicts: re-create the globals that were deleted after dumping.
# The files are read from ../working/, where the dump cells above wrote them
# (assuming the notebook's working directory is ../working/ -- TODO confirm).
# User-level dictionaries
with open('../working/answered_correctly_u_count.pickle', 'rb') as f:
    answered_correctly_u_count = pickle.load(f)
    
with open('../working/answered_correctly_u_sum.pickle', 'rb') as f:
    answered_correctly_u_sum = pickle.load(f)

with open('../working/elapsed_time_u_sum.pickle', 'rb') as f:
    elapsed_time_u_sum = pickle.load(f)

with open('../working/explanation_u_sum.pickle', 'rb') as f:
    explanation_u_sum = pickle.load(f)    

# Question-level dictionaries
with open('../working/answered_correctly_q_count.pickle', 'rb') as f:
    answered_correctly_q_count = pickle.load(f)  
    
with open('../working/answered_correctly_q_sum.pickle', 'rb') as f:
    answered_correctly_q_sum = pickle.load(f)  
    
with open('../working/elapsed_time_q_sum.pickle', 'rb') as f:
    elapsed_time_q_sum = pickle.load(f)  

with open('../working/explanation_q_sum.pickle', 'rb') as f:
    explanation_q_sum = pickle.load(f)     
    
# Per-user timestamp histories
with open('../working/timestamp_u.pickle', 'rb') as f:
    timestamp_u = pickle.load(f)         
    
with open('../working/timestamp_u_incorrect.pickle', 'rb') as f:
    timestamp_u_incorrect = pickle.load(f)     

In [None]:
def update_features(df):
    """Apply the labels of an already-predicted batch to the global statistics.

    Only rows with ``content_type_id == 0`` are used. For each of them the
    user's and the question's sum of ``answered_correctly`` is increased, and
    for an incorrect answer the user's latest incorrect timestamp is replaced.
    ``df`` itself is not modified and nothing is returned.
    """
    columns = ['user_id',
               'answered_correctly',
               'content_id',
               'content_type_id',
               'timestamp']
    for user_id, correct, content_id, content_type, timestamp in df[columns].values:
        if content_type != 0:
            continue

        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_sum[user_id] += correct
        if correct == 0:
            # Keep a single entry: the most recent incorrect timestamp.
            last_wrong = timestamp_u_incorrect[user_id]
            if len(last_wrong) == 1:
                last_wrong.pop(0)
            last_wrong.append(timestamp)

        # ------------------------------------------------------------------
        # Question features updates
        answered_correctly_q_sum[content_id] += correct
            # ------------------------------------------------------------------

In [None]:
# Competition API handles: the inference loop iterates iter_test for
# (test_df, sample_prediction_df) pairs and calls set_predict once per batch.
env = riiideducation.make_env()
iter_test = env.iter_test()
set_predict = env.predict

In [None]:
%%time
previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        update_features(previous_test_df)
    previous_test_df = test_df.copy()
    
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
    
    test_df['prior_question_had_explanation'] = \
    test_df.prior_question_had_explanation.fillna(False).astype('int8')
    
    test_df['prior_question_elapsed_time'].\
    fillna(prior_question_elapsed_time_mean, inplace = True)
    test_df = pd.merge(test_df, questions_df[['question_id', 'part']], 
                       left_on = 'content_id', 
                       right_on = 'question_id', 
                       how = 'left')
    test_df[TARGET] = 0.67
    
    test_df = add_features_test(test_df)
    
    test_df[TARGET] =  model.predict(test_df[FEATURES])
    set_predict(test_df[['row_id', TARGET]])


In [None]:
sub = pd.read_csv('../working/submission.csv')
sub['answered_correctly'].hist(bins=15);

`answered_correctly_uq` is once again a `defaultdict` whose default factory is a lambda function.

In [None]:
answered_correctly_uq[705741139.0][7922.0]

In [None]:
## debug
# previous_test_df = None
# test_df, sample_prediction_df = next(iter_test)
# if previous_test_df is not None:
#     previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
#     update_features(previous_test_df)
# previous_test_df = test_df.copy()

# test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)

# test_df['prior_question_had_explanation'] = \
# test_df.prior_question_had_explanation.fillna(False).astype('int8')

# test_df['prior_question_elapsed_time'].\
# fillna(prior_question_elapsed_time_mean, inplace = True)
# test_df = pd.merge(test_df, questions_df[['question_id', 'part']], 
#                    left_on = 'content_id', 
#                    right_on = 'question_id', 
#                    how = 'left')
# test_df[TARGET] = 0.66

# test_df = add_features_test(test_df)

# test_df[TARGET] =  model.predict(test_df[FEATURES])
# set_predict(test_df[['row_id', TARGET]])
