In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Check if GPU is enabled
import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
# `gpu` is read later when the LightGBM / XGBoost parameter dicts are built:
# it is the string 'gpu' when TensorFlow reports a GPU device, otherwise None.
gpu = 'gpu' if "GPU" in device_name else None
if gpu is not None:
    print('Found GPU at: {}'.format(device_name))

In [None]:
from IPython.core.magic import register_cell_magic
# This magic is used to skip some cells when running
@register_cell_magic
def skip(line, cell=None):
    '''Run the cell body only when the magic's argument is falsy.

    ``line`` is the text following ``%%skip``; it is evaluated with ``eval``.
    A truthy result skips the cell, otherwise ``cell`` is handed to
    ``get_ipython().run_cell``. Nothing is returned in either case.
    '''
    should_skip = eval(line)
    if not should_skip:
        get_ipython().run_cell(cell)

In [None]:
#From https://www.kaggle.com/rohanrao/ashrae-half-and-half

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False,verbose=True):
    """
    Downcast the columns of a dataframe, in place, to reduce its memory usage.

    Datetime, categorical and boolean columns are left untouched (bool is
    already one byte per value). Object columns become ``category``. Signed and
    unsigned integer columns are cast to the smallest integer type of the same
    signedness whose range contains the column's min and max (bounds inclusive).
    Every other column is treated as floating point and cast to float16 (only
    when ``use_float16`` is set), float32 or float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    use_float16 : bool
        Allow float16 for float columns whose range fits.
    verbose : bool
        Print memory usage before/after and the relative decrease.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)
        # Compare the dtype name instead of calling the deprecated
        # is_categorical_dtype helper.
        if is_datetime(df[col]) or type_name == "category" or type_name == "bool":
            continue
        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if type_name.startswith("int") or type_name.startswith("uint"):
            # Keep unsigned columns unsigned rather than letting them fall
            # through to the float branch.
            candidates = signed_types if type_name.startswith("int") else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches when min/max are NaN (empty column); the
            # column then keeps its dtype.
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
        # Guard the ratio so a zero-sized frame cannot raise ZeroDivisionError.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("Decreased by {:.1f}%".format(decrease))

    return df

In [None]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
import json
from sklearn.metrics import roc_auc_score,accuracy_score
import lightgbm as lgb
import dill
import time
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras

### Track everything:

We start by converting the dataset into **.feather** format. Next, we show how to pick a subset of users from the full dataset so that the positive and negative classes are balanced. We use an autoencoder to reduce the tags to a single column. We then preprocess the selected subset and do some feature engineering on it. We also track every user's state in the metadata file, which we migrated to SQLite files.
https://www.kaggle.com/tchaye59/riiid-dataset-to-feather

https://www.kaggle.com/tchaye59/riiid-preprocess-and-balance-the-dataset

https://www.kaggle.com/tchaye59/riiid-metadata-to-sqlite

# Current goal
In this notebook, we show how to train and submit a model using the SQLite files. We also update the database incrementally at each prediction step.

Since training takes time, we work in two steps: first we train the models and save them, then we load them and submit.

In [None]:
# Seed NumPy, TensorFlow and the stdlib RNG (in that order) with the same value.
for seed_rng in (np.random.seed, tf.random.set_seed, random.seed):
    seed_rng(59)

# Files produced by the "riiid-preprocess-and-balance-the-dataset" notebook.
path_train1 = '../input/riiid-preprocess-and-balance-the-dataset/train0.feather'
path_train2 = '../input/riiid-preprocess-and-balance-the-dataset/train1.feather'
path_content = '../input/riiid-preprocess-and-balance-the-dataset/content.feather'
path_metadata = '../input/riiid-preprocess-and-balance-the-dataset/metadata.dill'

In [None]:
# submit or train the models
SUBMIT = True

We retrieve the models that were trained in the previous commit.

In [None]:
! cp -r /kaggle/input/riiid-work-with-the-full-state-using-sqlalchemy/*.dill ./

# Load data
Load the training data and metadata prepared from: https://www.kaggle.com/tchaye59/riiid-preprocess-and-balance-the-dataset

In [None]:
%%skip SUBMIT
metadata = dill.load(open(path_metadata,'rb'))

target = 'answered_correctly'
train_df = pd.read_feather(path_train1)
train_df = train_df.append(pd.read_feather(path_train2))
train_df.shape

Merge global statistics with the training dataset

In [None]:
%%skip SUBMIT
for key in metadata:
    if key in [('content_id',),('bundle_id',)]:
        print(key)
        train_df = train_df.merge(metadata[key],how='left',on=key)
del metadata

In [None]:
%%skip SUBMIT
# Replace missing values with 0 in place (presumably rows left unmatched by the
# left merges above -- confirm), avoiding a second copy of the training frame.
train_df.fillna(0,inplace=True)
# Downcast numeric columns and convert object columns to categoricals.
train_df = reduce_mem_usage(train_df)
train_df.info()

Compute mean from counts and sums

In [None]:
def add_mean(df,remove=False):
    """Add ``*_mean`` columns computed as sum / count, modifying ``df`` in place.

    Means are derived for the rolling statistics of ``user_id``, ``content_id``
    and ``bundle_id`` (``<col>_roll_sum`` / ``<col>_roll_count``) and for the
    global ``bundle`` and ``content`` statistics (``<col>_sum`` / ``<col>_count``).
    Each mean is stored as float32. When ``remove`` is true the source count and
    sum columns are deleted. Finally every remaining NaN in the frame is
    replaced by -1. The same ``df`` object is returned.
    """
    def _derive(count_col, sum_col, mean_col):
        # One ratio column; optionally drop the two columns it was built from.
        df[mean_col] = (df[sum_col]/df[count_col]).astype(np.float32)
        if remove:
            del df[count_col],df[sum_col]

    for col in ['user_id','content_id','bundle_id',]:
        _derive(f'{col}_roll_count', f'{col}_roll_sum', f'{col}_roll_mean')
    for col in ['bundle','content']:
        _derive(f'{col}_count', f'{col}_sum', f'{col}_mean')

    df.fillna(-1,inplace=True)
    return df

In [None]:
%%skip SUBMIT
ignore_columns = ['user_answer','user_id','row_id']
train_df = add_mean(train_df,remove=False)

# Build models

In [None]:
%%skip SUBMIT
features = list(train_df.columns)
for col in ignore_columns+['answered_correctly']:
    if col in features:features.remove(col)
print(features)
print('Features size: ',len(features))
dill.dump(features,open('features.dill','wb'))
dill.dump(ignore_columns,open('ignore_columns.dill','wb'))

Split the dataset

In [None]:
%%skip SUBMIT

X_test,y_test = None,None
if True:
    train_df,test_df = train_test_split(train_df,
                                        stratify=train_df[target],
                                        test_size=0.2, 
                                        random_state=1)
    test_df.drop(columns=ignore_columns,inplace=True)
    X_test,y_test = test_df[features].astype(np.float32),test_df[target]
    del test_df
# remove ignore_columns
train_df.drop(columns=ignore_columns,inplace=True)
X,y = train_df[features].astype(np.float32),train_df[target]
del train_df
X.shape

In [None]:
gc.collect()

# LGB Model

In [None]:
class CustomLGBClassifier(BaseEstimator, ClassifierMixin):
    """LightGBM booster behind a scikit-learn style fit/predict/score interface.

    Inputs pass through ``self.pipe`` (a ``MinMaxScaler`` pipeline) before they
    reach the booster, both when fitting and when predicting.
    """

    def __init__(self):
        # Booster parameters. `gpu` is the notebook-level value set by the GPU
        # check cell.
        self.params = dict(
            objective='binary',
            learning_rate=0.1,
            #num_leaves=10,
            #max_depth=32,
            verbose=1,
            device=gpu,
            metrics=['binary','auc'],
            nthread=4,
            seed=1,
        )
        self.model = None
        self.pipe = Pipeline([
            ('scaler',MinMaxScaler()),
            #('kbest',SelectKBest(score_func=chi2,k=30)),
        ])
        self.score_value = None

    def fit(self, X,y, X_test=None,y_test=None):
        """Fit the scaler and train the booster; returns ``self``.

        When ``X_test`` / ``y_test`` are given they are scaled with the fitted
        pipeline and added as a second evaluation set after the training set.
        """
        train_set = lgb.Dataset(self.pipe.fit_transform(X,y), label=y)
        eval_sets = [train_set]
        if X_test is not None:
            eval_sets.append(lgb.Dataset(self.pipe.transform(X_test), label=y_test))

        self.model = lgb.train(
            self.params,
            train_set,
            num_boost_round=3000,
            valid_sets=eval_sets,
            early_stopping_rounds=50,
            verbose_eval=True,
        )
        return self

    def predict(self, X):
        """Scale ``X`` and return the booster's ``predict`` output."""
        scaled = self.pipe.transform(X)
        return self.model.predict(scaled)

    def predict_proba(self,X):
        """Return the predictions with a trailing axis added."""
        return self.predict(X)[:,np.newaxis]

    def score(self,X,y):
        """Return ``(auc, accuracy)``, each rounded to two decimals.

        Accuracy thresholds the predictions at 0.5.
        """
        predicted = self.predict(X)
        return (
            round(roc_auc_score(y, predicted),2),
            round(accuracy_score(y, predicted>0.5),2),
        )

Train

In [None]:
train_lgbc = False

In [None]:
%%skip SUBMIT or not train_lgbc
model = CustomLGBClassifier()
model.fit(X,y,X_test,y_test)
dill.dump(model,open(f'lgb_model.dill','wb'))

# XGB Model

In [None]:
class CustomXGBClassifier(BaseEstimator, ClassifierMixin):
    """XGBoost booster behind a scikit-learn style fit/predict/score interface.

    Inputs pass through ``self.pipe`` (a ``MinMaxScaler`` pipeline). Every
    ``DMatrix`` is labelled with the notebook-level ``features`` list, so that
    global must be defined before ``fit``, ``predict`` or ``plot_features`` run.
    """

    def __init__(self,):
        # `gpu` is the notebook-level value set by the GPU check cell.
        self.params = dict(
            objective='binary:logistic',
            learning_rate=0.1,
            max_depth=10,
            eval_metric='auc',
            subsample=0.6,
            colsample_bytree=0.6,
            tree_method='gpu_hist' if gpu else None,
            #nthread=4,
            seed=1,
        )
        self.model = None
        self.pipe = Pipeline([
            ('scaler',MinMaxScaler()),
            #('kbest',SelectKBest(score_func=chi2,k=40)),
        ])
        self.score_value = None
        self.best_ntree_limit = 0

    def fit(self, X,y, X_test=None,y_test=None):
        """Fit the scaler and train the booster; returns ``self``.

        When ``X_test`` / ``y_test`` are given they are scaled with the fitted
        pipeline and evaluated under the name ``'eval'`` after ``'train'``.
        """
        train_matrix = xgb.DMatrix(self.pipe.fit_transform(X,y), label=y,feature_names=features)
        watchlist = [(train_matrix, 'train')]
        if X_test is not None:
            eval_matrix = xgb.DMatrix(self.pipe.transform(X_test), label=y_test,feature_names=features)
            watchlist.append((eval_matrix, 'eval'))

        booster = xgb.train(
            self.params,
            train_matrix,
            3000,
            evals=watchlist,
            early_stopping_rounds=50,
            verbose_eval=True,
        )
        self.model = booster
        self.best_ntree_limit = booster.best_ntree_limit
        return self

    def plot_features(self):
        """Plot the importance of every feature of the trained booster."""
        xgb.plot_importance(self.model,show_values=True,max_num_features=len(features))

    def init_thread(self):
        """Reload the booster through 'tmp.model' into one created with nthread=4.

        ``best_ntree_limit`` is copied from the current booster first.
        """
        self.best_ntree_limit = self.model.best_ntree_limit
        self.model.save_model('tmp.model')
        reloaded = xgb.Booster({'nthread': 4})
        reloaded.load_model('tmp.model')
        self.model = reloaded

    def predict(self, X):
        """Scale ``X`` and return the booster's ``predict`` output.

        NOTE(review): ``best_ntree_limit`` is not passed here (left disabled
        by the author), so all trained trees are used.
        """
        matrix = xgb.DMatrix(self.pipe.transform(X),feature_names=features)
        return self.model.predict(
            matrix,
            #ntree_limit=self.best_ntree_limit,
        )

    def predict_proba(self,X):
        """Return the predictions with a trailing axis added."""
        return self.predict(X)[:,np.newaxis]

    def score(self,X,y):
        """Return the ROC AUC of the predictions, rounded to two decimals."""
        return round(roc_auc_score(y, self.predict(X)),2)

In [None]:
train_xgbc = False

In [None]:
%%skip SUBMIT or not train_xgbc
model = CustomXGBClassifier()
model.fit(X,y,X_test,y_test)
print('Saving...')
dill.dump(model,open(f'xgb_model.dill','wb'))
print('Done...')

In [None]:
%%skip SUBMIT or not train_xgbc
model.plot_features()

# Neural Network

In [None]:
train_nn = False

In [None]:
%%skip SUBMIT or not train_nn


model_path="best_nn_model.h5"
callbacks_list = [
    keras.callbacks.ModelCheckpoint(model_path, 
                                    verbose=0,
                                    monitor='auc', 
                                    save_best_only=True, 
                                    mode='max'),
]
batch_size = 2**14

val_data = None if X_test is None else (X_test,y_test)

In [None]:
%%skip SUBMIT or not train_nn

layers = [keras.layers.BatchNormalization(input_shape=(len(features),)),]

for u in range(5):
    layers.extend([
        keras.layers.BatchNormalization(input_shape=(len(features),)),
        keras.layers.Dense(256),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        #keras.layers.Dropout(0.1),
    ])

model = keras.Sequential([
    *layers,
    keras.layers.Dense(1,activation='sigmoid'),
])
if os.path.exists("nn_model.h5"):
    print('Loading...')
    model = keras.models.load_model("nn_model.h5")
    
model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.Adam(0.01),
              metrics=[
                'acc',
                keras.metrics.AUC(name='auc'),
             ]
             )
model.summary()

In [None]:
%%skip SUBMIT or not train_nn

history = model.fit(X,y,
                    batch_size=batch_size,
                    validation_data=val_data,
                    epochs=100,
                    callbacks=callbacks_list,
                   )
model.save("nn_model.h5")

In [None]:
%%skip SUBMIT or not train_nn
pd.DataFrame(history.history).plot()

# Make Submission

In [None]:
%%skip not SUBMIT
import json
from sklearn.metrics import roc_auc_score,accuracy_score
from sqlalchemy import create_engine
from sqlalchemy.sql import select,delete,and_
from sqlalchemy.sql.expression import table,column
import multiprocessing as mp
import threading
from queue import Queue
from mlxtend.classifier import EnsembleVoteClassifier

lock = mp.Lock()

#### Copy the databases files

In [None]:
%%skip not SUBMIT
! rm -rf ./*db.*
! cp /kaggle/input/riiid-metadata-to-sqlite/*db.* ./

#### Select the model that we want to submit

In [None]:
use_lgb = True
use_xgb = False
use_nn = False

In [None]:
%%skip not (use_lgb and SUBMIT)
model = dill.load(open('lgb_model.dill','rb'))
model

In [None]:
%%skip not (use_xgb and SUBMIT)
model = dill.load(open('xgb_model.dill','rb'))
#model.init_thread()
model

In [None]:
%%skip not (use_nn and SUBMIT)
model = keras.models.load_model("best_nn_model.h5")

### Connect to databases

In [None]:
%%skip not SUBMIT

metadata_info = dill.load(open('/kaggle/input/riiid-metadata-to-sqlite/metadata_info.dill','rb'))
features = dill.load(open('features.dill','rb'))
ignore_columns = dill.load(open('ignore_columns.dill','rb'))
content_df = pd.read_feather('../input/riiid-preprocess-and-balance-the-dataset/content.feather')
print('Features size: ',len(features))

In [None]:
%%skip not SUBMIT
for key in metadata_info:
    name = f"db.{'_'.join(key)}.sqlite"
    engine = create_engine(f'sqlite:///{name}?check_same_thread=False', echo=False)
    sqlite_connection = engine.connect()
    cols = metadata_info[key][-1]
    cols = [column(col) for col in cols]+[column('index'),]
    tab = table('_'.join(key), *cols)
    metadata_info[key][0] = (sqlite_connection,tab)

In [None]:
# Runs a process in a thread
class Worker(threading.Thread):
    """Thread that calls ``process(args)`` once and optionally publishes the result.

    Parameters
    ----------
    process : callable
        Called with ``args`` as its single positional argument.
    args : object
        Passed to ``process`` unchanged.
    queue : object with a ``put`` method, optional
        When given, receives the return value of ``process``.
    """

    def __init__(self, process,args,queue=None):
        super().__init__()
        self.queue = queue
        self.process = process
        self.args = args

    def run(self):
        res = self.process(self.args)
        # Test against None explicitly: a queue-like object that defines
        # __len__ is falsy while empty and would silently drop the result.
        if self.queue is not None:
            self.queue.put(res)

In [None]:
def build_index(x):
    """Build the string key under which a row is stored in the SQLite tables.

    A scalar number (Python or NumPy) becomes ``str(int(x))``. Any other value
    is treated as an iterable of numbers whose integer forms are joined with
    ``'_'``, e.g. ``(1, 2.0)`` -> ``'1_2'``.
    """
    # isinstance also accepts NumPy scalars such as np.int64, which the previous
    # exact type() comparison sent to the iterable branch, where they raise.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return str(int(x))
    return '_'.join(str(int(part)) for part in x)
    
# Get data from the database
# We also keep the DB result in a cache in order to update it later
def getValues(args):
    """Fetch the stored state for the rows of ``df`` from the table of ``key``.

    Parameters
    ----------
    args : tuple
        ``(key, df, cache)``: ``key`` is a tuple of column names identifying a
        table in ``metadata_info``; ``df`` must contain those columns;
        ``cache`` is a dict that receives the fetched state.

    Returns
    -------
    pandas.DataFrame
        The stored value columns only, one row per row of ``df`` and carrying
        ``df``'s index. Rows with no database entry are filled with 0.

    Side effect: ``cache[key]`` is set to the last fetched row of every distinct
    key combination, indexed by the key columns.
    """
    key,df,cache= args
    index = df.index
    # Explicit copy: adding `tmp_key` below must not write into a slice of the
    # caller's frame.
    df = df[list(key)].copy()
    db,tab = metadata_info[key][0]
    cols = metadata_info[key][-1]

    ids = df.apply(build_index,axis=1).values
    # Address the lookup column by name throughout, matching the select list.
    index_col = tab.c['index']
    query = select([index_col,] + [tab.c[col] for col in cols]).where(index_col.in_(np.unique(ids)))
    result = list(db.execute(query))

    res_df = pd.DataFrame(result,columns=['tmp_key']+cols)
    res_df.set_index('tmp_key',inplace=True)
    df['tmp_key'] = ids
    df = df.merge(res_df,how='left',on='tmp_key').fillna(0)
    # keep cache for later updates
    cache[key] = df.groupby([*key,]).tail(1).set_index([*key,])
    df = df.drop(columns=[*key,'tmp_key'])
    # The merge resets the row labels; restore the caller's index.
    df.index = index
    return df

def merge_metadata(df):
    """Join the stored state of every metadata table onto ``df``.

    One ``Worker`` thread per key in ``metadata_info`` runs ``getValues``.

    Returns
    -------
    tuple
        ``(merged, cache)``: ``merged`` is the column-wise concatenation of the
        fetched frames (in completion order) followed by ``df``; ``cache`` maps
        each key to the state fetched for it.

    Raises
    ------
    RuntimeError
        If a worker finished without delivering a result.
    """
    cache = {}
    workers= []
    queue = Queue()
    for key in metadata_info:
        worker =  Worker(getValues,(key,df,cache),queue)
        worker.start()
        workers.append(worker)
    # Join first, then drain. A blocking queue.get() per worker would wait
    # forever if one of the lookups died with an exception.
    for worker in workers:
        worker.join()
    frames = []
    while not queue.empty():
        frames.append(queue.get())
    if len(frames) != len(workers):
        raise RuntimeError(
            f'{len(workers) - len(frames)} metadata lookup(s) failed; '
            'see the worker traceback printed above'
        )
    frames.append(df)
    return  pd.concat(frames,axis=1),cache

def update_metadata(args):
    """Replace the stored rows of one metadata table with the rows of ``df``.

    ``args`` is ``(df, key)``; ``key`` is a column name or a sequence of column
    names. Nothing happens when ``df`` is empty or ``key`` is not a key of
    ``metadata_info``. Otherwise the rows whose index value matches are deleted
    from the table and the new rows are appended in their place.
    """
    df,key = args
    if df.shape[0] == 0:
        return
    key = tuple(key) if type(key) != str else (key,)
    if key not in metadata_info:
        return
    (db,tab),cols = metadata_info[key]

    stored_cols = [c for c in cols if c not in ignore_columns]

    # Make sure the key columns end up in the index before building the DB keys.
    key_in_index = all(k in df.index.names for k in key)
    if key_in_index:
        rows = df.loc[:,[c for c in stored_cols if c not in key]]
    else:
        rows = df.set_index(list(key)).loc[:,stored_cols]
    db_index = [build_index(entry) for entry in rows.index]
    rows.index = db_index
    # Delete the outdated rows, then add the new values.
    db.execute(delete(tab).where(tab.c.index.in_(db_index)))
    rows.to_sql('_'.join(key), db, if_exists='append')
    
def add_previous_state(df,key,cache):
    """Add the cached state ``cache[key]`` to the batch statistics in ``df``.

    ``key`` is a column name or a sequence of column names. If those names are
    not already index levels of ``df``, ``df`` is re-indexed on them in place.
    The sum is aligned on the index and restricted to the columns of ``df``
    that are not listed in ``ignore_columns``; when no such column exists,
    ``df`` is returned unchanged.
    """
    key = tuple(key) if type(key) != str else (key,)
    key_is_index = all(k in df.index.names for k in key)
    if not key_is_index:
        df.set_index(list(key),inplace=True)
    tracked = [c for c in df.columns if c not in ignore_columns]
    if tracked:
        return (df+cache[key])[tracked]
    return df

In [None]:
def user_roll_features(df,cols):
    """Compute, per (user, ``col``) pair, the answer statistics of one batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``answered_correctly`` and every column named
        in ``cols``. It is not modified.
    cols : iterable of str
        Columns to pair with ``user_id``.

    Returns
    -------
    dict
        Maps ``('user_id', col)`` to a frame indexed by ``['user_id', col]``
        holding the last row of each pair with the columns ``{col}_roll_count``,
        ``{col}_roll_sum`` and, for the windows 50 and 10,
        ``{col}_win{win}_sum`` / ``{col}_win{win}_count``. The window columns
        are copies of the running sum and count.
    """
    metadata = {}
    for col in cols:
        #roll count
        c1 = f'{col}_roll_count'
        c2 = f'{col}_roll_sum'

        # Explicit copy: the columns added below must not be written into a
        # slice of the caller's frame (chained-assignment warning on each batch).
        df_tmp = df[[col,'user_id','answered_correctly',]].copy()
        answers = df_tmp.groupby([col,'user_id'])['answered_correctly']
        roll_count = answers.cumcount()+1
        roll_sum = answers.cumsum()
        df_tmp[c1] = roll_count
        df_tmp[c2] = roll_sum

        # In windows
        wincols = []
        for win in [50,10]:
            sum_col = f'{col}_win{win}_sum'
            count_col = f'{col}_win{win}_count'
            df_tmp[sum_col] = df_tmp[c2]
            df_tmp[count_col] = df_tmp[c1]
            wincols.append(sum_col)
            wincols.append(count_col)

        #save metadata
        last_rows = df_tmp.groupby(['user_id',col]).tail(1)
        metadata[('user_id',col)] = last_rows.set_index(['user_id',col])[[c1,c2,*wincols]]

    return metadata

##### Do some tests before we submit

In [None]:
# Skip tests when submitting the notebook
skip_test = True

In [None]:
%%skip skip_test
val_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                     header=0, 
                     skiprows=range(1,1000000*5),
                     nrows=500000)
val_df['row_id'] = range(val_df.shape[0])
val_df = val_df[['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'prior_question_elapsed_time',
       'prior_question_had_explanation',
        'answered_correctly','user_answer']]

In [None]:
%%skip skip_test

# Offline rehearsal of the submission loop on `val_df`.
# Each iteration takes the rows with the smallest remaining task_container_id
# as one batch, predicts them, prints AUC / accuracy, and then writes the
# answers of the PREVIOUS batch into the SQLite state.
prev_test_df = None
prev_cache = None

# Running sums used to print the mean AUC / accuracy over the batches so far.
auc_sum = 0 
acc_sum = 0
df = val_df
count = 0
while df.shape[0]:
    count+=1
    # Next batch: every remaining row with the smallest task_container_id.
    selector = df.task_container_id == df.task_container_id.min()
    test_df = df.loc[selector]
    df = df.loc[~selector]
    print(f'{test_df.shape[0]}/{df.shape[0]}')
    
    
    # merge content file 
    test_df = test_df.merge(content_df,how='left',on=['content_id',])
    test_df.prior_question_had_explanation = test_df.prior_question_had_explanation.fillna(False).astype(np.int8)
     
    
    # Only rows with content_type_id == 0 are scored (presumably the question
    # rows -- confirm against the dataset description).
    test_state = test_df.loc[test_df['content_type_id'] == 0]
    test_state.replace([np.inf, -np.inf], 999999999,inplace=True)
    # Merge metadata: stored state from the SQLite tables, plus the cache that
    # the next iteration uses to update that state.
    test_state,cache = merge_metadata(test_state)
    # prior_duration
    test_state['prior_duration'] = test_state.timestamp-test_state.prev_timestamp
    #add means
    test_state = add_mean(test_state)
    
    test_state['answered_correctly'] = model.predict(test_state[features])
    
    # NOTE(review): `y` rebinds the notebook-level name that holds the training
    # labels in the training cells.
    y = test_df[test_df['content_type_id'] == 0]['answered_correctly']
    y_pred = test_state['answered_correctly']
    auc = roc_auc_score(y, y_pred)
    acc = accuracy_score(y, y_pred>0.5)
    auc_sum += auc 
    acc_sum += acc
    print(f"AUC:{round(auc,4)} | Accuracy: {round(acc,4)} | AUC MEAN: {round(auc_sum/count,4)} | ACC MEAN: {round(acc_sum/count,4)}")
    # Update metadata
    if prev_test_df is not None:
        workers = []
        
        # Unlike the submission loop, the answers of the previous batch are
        # already present in val_df, so the prior_group_* parsing stays disabled.
        #prior_group_answers_correct = json.loads(test_df.iloc[0].prior_group_answers_correct)
        #prior_group_responses = json.loads(test_df.iloc[0].prior_group_responses)
        #prior_group_responses = np.array(prior_group_responses)
        #prior_group_answers_correct = np.array(prior_group_answers_correct)
        #prev_test_df['user_answer'] = prior_group_responses
        #prev_test_df['answered_correctly'] = prior_group_answers_correct
        prev_test_df = prev_test_df[prev_test_df.content_type_id == 0]
        
        # update user performances over time
        user_df = prev_test_df[['user_id','answered_correctly']]
        grp = user_df.groupby(['user_id'])
        user_df['user_id_roll_sum'] = grp['answered_correctly'].cumsum()
        user_df['user_id_roll_count'] = grp['answered_correctly'].cumcount()+1
        # take last state
        user_df = user_df.groupby(['user_id',]).tail(1)
        user_df.set_index('user_id',inplace=True)
        del user_df['answered_correctly']
        # Also, update performance in the small window
        win_cols = []
        for win in [50,10]:
            sum_col = f'user_id_win{win}_sum'
            count_col = f'user_id_win{win}_count'
            user_df[sum_col] = user_df.user_id_roll_sum
            user_df[count_col] = user_df.user_id_roll_count
            win_cols.append(sum_col)
            win_cols.append(count_col)
        # Batch-only window values, captured before the cached state is added.
        win_current_state = user_df[win_cols] 
        user_df = add_previous_state(user_df,('user_id',),prev_cache)
        # Don't let the count exceed the window size: where it does, the window
        # columns are reset to the batch-only values captured above.
        for win in [50,10]:
            sum_col = f'user_id_win{win}_sum'
            count_col = f'user_id_win{win}_count'
            selector = (user_df[count_col]>win).values
            user_df.loc[selector,[sum_col,count_col]] = win_current_state.loc[selector,[sum_col,count_col]]
        
        # prev answers: last row per user, columns renamed with a prev_ prefix
        tmp_df = prev_test_df[['user_answer','answered_correctly','user_id','timestamp']].groupby('user_id').tail(1)
        tmp_df.set_index('user_id',inplace=True)
        tmp_df.columns = [f'prev_{col}' for col in tmp_df.columns]
        user_df = user_df.merge(tmp_df,how='left',on=['user_id',])
        worker =  Worker(update_metadata,(user_df,['user_id'],))
        worker.start()
        workers.append(worker)
        
        #user statistics
        cols = ['content_id','bundle_id',]
        metadata = user_roll_features(prev_test_df,cols)
        for key in metadata:
            col = key[-1]
            tmp_df = metadata[key]
            win_current_state = tmp_df
            tmp_df = add_previous_state(tmp_df,key,prev_cache)
            
            # Don't let the count exceed the window size
            for win in [50,10]:
                sum_col = f'{col}_win{win}_sum'
                count_col = f'{col}_win{win}_count'
                selector = (tmp_df[count_col]>win).values
                tmp_df.loc[selector,[sum_col,count_col]] = win_current_state.loc[selector,[sum_col,count_col]]
            
            if key == ('user_id','content_id'):
                #The prev user's answer in the content
                tmp = prev_test_df[['content_id','user_id','answered_correctly',]]
                tmp.rename(columns={'answered_correctly':'prev_content_answered_correctly'},inplace=True)
                tmp = tmp.groupby(['content_id','user_id',]).tail(1)
                tmp.set_index(['user_id','content_id'],inplace=True)
                tmp_df = tmp_df.merge(tmp,how='left',on=['user_id','content_id'])
            worker =  Worker(update_metadata,(tmp_df,key))
            workers.append(worker)
            worker.start()
        # We wait for the workers to finish  
        for worker in workers:
            worker.join()
    # Remember this batch and its fetched state for the next iteration's update.
    prev_test_df = test_df
    prev_cache = cache

### Submit

In [None]:
%%skip not SUBMIT
# Submission loop: for every batch served by the competition API, predict the
# rows with content_type_id == 0, then write the answers of the PREVIOUS batch
# (delivered with the current one) into the SQLite state.
import riiideducation
env = riiideducation.make_env()
iter_test = env.iter_test()

prev_test_df = None
prev_cache = None

for (test_df, sample_prediction_df) in iter_test:
    
    # merge content file 
    test_df = test_df.merge(content_df,how='left',on=['content_id',])
    test_df.prior_question_had_explanation = test_df.prior_question_had_explanation.fillna(False).astype(np.int8)
     
    
    # Only rows with content_type_id == 0 are predicted (presumably the
    # question rows -- confirm against the dataset description).
    test_state = test_df.loc[test_df['content_type_id'] == 0]
    test_state.replace([np.inf, -np.inf], 999999999,inplace=True)
    # Merge metadata: stored state from the SQLite tables, plus the cache that
    # the next iteration uses to update that state.
    test_state,cache = merge_metadata(test_state)
    # prior_duration
    test_state['prior_duration'] = test_state.timestamp-test_state.prev_timestamp
    #add means
    test_state = add_mean(test_state)
    test_state['answered_correctly'] = model.predict(test_state[features])
    
    # Update metadata
    if prev_test_df is not None:
        workers = []
        
        # The first row of the current batch carries, as JSON lists, the
        # responses and correctness of every row of the previous batch.
        prior_group_answers_correct	= json.loads(test_df.iloc[0].prior_group_answers_correct)
        prior_group_responses = json.loads(test_df.iloc[0].prior_group_responses)
        prior_group_responses = np.array(prior_group_responses)
        prior_group_answers_correct = np.array(prior_group_answers_correct)
        prev_test_df['user_answer'] = prior_group_responses
        prev_test_df['answered_correctly'] = prior_group_answers_correct
        prev_test_df = prev_test_df[prev_test_df.content_type_id == 0]
        
        # update user performances over time
        user_df = prev_test_df[['user_id','answered_correctly']]
        grp = user_df.groupby(['user_id'])
        user_df['user_id_roll_sum'] = grp['answered_correctly'].cumsum()
        user_df['user_id_roll_count'] = grp['answered_correctly'].cumcount()+1
        # take last state
        user_df = user_df.groupby(['user_id',]).tail(1)
        user_df.set_index('user_id',inplace=True)
        del user_df['answered_correctly']
        # Also, update performance in the small window
        win_cols = []
        for win in [50,10]:
            sum_col = f'user_id_win{win}_sum'
            count_col = f'user_id_win{win}_count'
            user_df[sum_col] = user_df.user_id_roll_sum
            user_df[count_col] = user_df.user_id_roll_count
            win_cols.append(sum_col)
            win_cols.append(count_col)
        # Batch-only window values, captured before the cached state is added.
        win_current_state = user_df[win_cols] 
        user_df = add_previous_state(user_df,('user_id',),prev_cache)
        # Don't let the count exceed the window size: where it does, the window
        # columns are reset to the batch-only values captured above.
        for win in [50,10]:
            sum_col = f'user_id_win{win}_sum'
            count_col = f'user_id_win{win}_count'
            selector = (user_df[count_col]>win).values
            user_df.loc[selector,[sum_col,count_col]] = win_current_state.loc[selector,[sum_col,count_col]]
        
        # prev answers: last row per user, columns renamed with a prev_ prefix
        tmp_df = prev_test_df[['user_answer','answered_correctly','user_id','timestamp']].groupby('user_id').tail(1)
        tmp_df.set_index('user_id',inplace=True)
        tmp_df.columns = [f'prev_{col}' for col in tmp_df.columns]
        user_df = user_df.merge(tmp_df,how='left',on=['user_id',])
        worker =  Worker(update_metadata,(user_df,['user_id'],))
        worker.start()
        workers.append(worker)
        
        #user statistics
        cols = ['content_id','bundle_id',]
        metadata = user_roll_features(prev_test_df,cols)
        for key in metadata:
            col = key[-1]
            tmp_df = metadata[key]
            win_current_state = tmp_df
            tmp_df = add_previous_state(tmp_df,key,prev_cache)
            
            # Don't let the count exceed the window size
            for win in [50,10]:
                sum_col = f'{col}_win{win}_sum'
                count_col = f'{col}_win{win}_count'
                selector = (tmp_df[count_col]>win).values
                tmp_df.loc[selector,[sum_col,count_col]] = win_current_state.loc[selector,[sum_col,count_col]]
            
            if key == ('user_id','content_id'):
                #The prev user's answer in the content
                tmp = prev_test_df[['content_id','user_id','answered_correctly',]]
                tmp.rename(columns={'answered_correctly':'prev_content_answered_correctly'},inplace=True)
                tmp = tmp.groupby(['content_id','user_id',]).tail(1)
                tmp.set_index(['user_id','content_id'],inplace=True)
                tmp_df = tmp_df.merge(tmp,how='left',on=['user_id','content_id'])
            worker =  Worker(update_metadata,(tmp_df,key))
            workers.append(worker)
            worker.start()
        # We wait for the workers to finish  
        for worker in workers:
            worker.join()
    # Remember this batch and its fetched state for the next iteration's update.
    prev_test_df = test_df
    prev_cache = cache
    
    # Hand the predictions for this batch back to the competition API.
    env.predict(test_state[['row_id', 'answered_correctly']])


In [None]:
#pd.read_csv('submission.csv')

In [None]:
%%skip not SUBMIT
! rm -rf ./*db.*