## Time-series API (iter_test) Emulator 

This script emulates all the features that have been officially announced for the Time-series API:
<pre>
The hidden test set contains new users but not new questions.
The test data follows chronologically after the train data. The test iterations give interactions of users chronologically.
Each group will contain interactions from many different users, but no more than one task_container_id of questions from any single user. 
Each group has between 1 and 1000 users.
Expect to see roughly 2.5 million questions in the hidden test set.
The API will also consume roughly 15 minutes of runtime for loading and serving the data.
The API loads the data using the types specified in Data Description page.
</pre>

I hope this helps with validation, especially with reducing "Submission Scoring Error".

This emulator may help you check the following, which cannot be checked with the official visible test set:
* Memory usage
* Disk size consumed
* The time taken for inference
* Handling of New Users
* Handling of not only questions but also lectures.
* etc.

To deal with "Submission Scoring Error", you should refer to [this discussion](https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/192124).

And, of course, this will help you to check your validation score.


In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import os
import gc

## loading validation file.
This file was created by the following notebook:
https://www.kaggle.com/its7171/cv-strategy

In [None]:
# Load the validation frame that the emulator below will serve group by group.
target_df = pd.read_pickle('../input/riiid-cross-validation-files/cv2_valid.pickle')

In [None]:
# Display the loaded validation frame (notebook cell output).
target_df

## iter_test emulator
This class emulates iter_test().

In [None]:
class Iter_Valid(object):
    """Iterator that emulates ``iter_test()`` on a local validation frame.

    Every step yields a ``(test_df, sample_prediction_df)`` pair:

    * ``test_df`` is a contiguous run of rows of the input frame.  Its first
      row carries, in ``prior_group_responses`` and
      ``prior_group_answers_correct``, the string-encoded lists of
      ``user_answer`` / ``answered_correctly`` of the previously served group
      (``"[]"`` everywhere else).
    * ``sample_prediction_df`` has one row (``row_id``, ``answered_correctly``
      initialised to 0) for each question row (``content_type_id == 0``) of
      that same group.

    Args:
        df: frame with at least ``row_id``, ``user_id``, ``task_container_id``,
            ``content_type_id``, ``user_answer`` and ``answered_correctly``.
            It is not modified; a re-indexed copy is kept internally.
        max_user: maximum number of distinct users served in one group.

    Raises:
        ValueError: if ``max_user`` is smaller than 1 (the iterator could never
            advance and would yield empty groups forever).
    """

    def __init__(self, df, max_user=1000):
        if max_user < 1:
            raise ValueError("max_user must be at least 1")
        # A fresh 0..n-1 index makes positions and labels coincide, which
        # fix_df() relies on when it writes to the first row of a group.
        df = df.reset_index(drop=True)
        self.df = df
        # String versions are pre-computed once so groups can be joined cheaply.
        self.user_answer = df['user_answer'].astype(str).values
        self.answered_correctly = df['answered_correctly'].astype(str).values
        df['prior_group_responses'] = "[]"
        df['prior_group_answers_correct'] = "[]"
        # Question rows only; the index keeps the row labels of ``df`` so a
        # group's questions can be looked up by label range.  The explicit
        # copy avoids assigning a column on a filtered view.
        self.sample_df = df.loc[df['content_type_id'] == 0, ['row_id']].copy()
        self.sample_df['answered_correctly'] = 0
        self.len = len(df)
        self.user_id = df.user_id.values
        self.task_container_id = df.task_container_id.values
        self.content_type_id = df.content_type_id.values
        self.max_user = max_user
        self.current = 0
        # Answers of the group served last; published with the next group.
        self.pre_user_answer_list = []
        self.pre_answered_correctly_list = []

    def __iter__(self):
        return self

    def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
        """Build the pair for rows ``pre_start`` .. ``self.current - 1``.

        Args:
            user_answer_list: string user answers of the group being served.
            answered_correctly_list: string correctness flags of that group.
            pre_start: position of the first row of the group.

        Returns:
            ``(test_df, sample_prediction_df)`` for the group.
        """
        df = self.df[pre_start:self.current].copy()
        # ``sample_df`` holds question rows only, so its positions do not line
        # up with positions in ``self.df`` once lectures are present.  Slice by
        # label instead (``.loc`` is inclusive on both ends) so the sample
        # frame always matches the questions of this very group.
        sample_df = self.sample_df.loc[pre_start:self.current - 1].copy()
        df.loc[pre_start,'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
        df.loc[pre_start,'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
        # Remember this group's answers for the next call.
        self.pre_user_answer_list = user_answer_list
        self.pre_answered_correctly_list = answered_correctly_list
        return df, sample_df

    def __next__(self):
        """Collect and return the next group, or raise ``StopIteration``."""
        added_user = set()
        pre_start = self.current
        pre_added_user = -1
        pre_task_container_id = -1
        pre_content_type_id = -1
        user_answer_list = []
        answered_correctly_list = []
        while self.current < self.len:
            crr_user_id = self.user_id[self.current]
            crr_task_container_id = self.task_container_id[self.current]
            crr_content_type_id = self.content_type_id[self.current]
            if crr_user_id in added_user and (crr_user_id != pre_added_user or (crr_task_container_id != pre_task_container_id and crr_content_type_id == 0 and pre_content_type_id == 0)):
                # Known user that is either not the previous user, or moves on
                # to a different task container with both rows being questions:
                # the group ends before this row.
                return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            if len(added_user) == self.max_user:
                # The group is full: only rows continuing the previous user's
                # current container (or a lecture of that user) may still join.
                if  crr_user_id == pre_added_user and (crr_task_container_id == pre_task_container_id or crr_content_type_id == 1):
                    user_answer_list.append(self.user_answer[self.current])
                    answered_correctly_list.append(self.answered_correctly[self.current])
                    self.current += 1
                    continue
                else:
                    return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
            added_user.add(crr_user_id)
            pre_added_user = crr_user_id
            pre_task_container_id = crr_task_container_id
            pre_content_type_id = crr_content_type_id
            user_answer_list.append(self.user_answer[self.current])
            answered_correctly_list.append(self.answered_correctly[self.current])
            self.current += 1
        # Input exhausted: flush the last, partially filled group if any.
        if pre_start < self.current:
            return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
        else:
            raise StopIteration()

## emulator setting

In [None]:
# True  -> serve `target_df` through the local Iter_Valid emulator.
# False -> use the real competition environment.
validaten_flg = True
if not validaten_flg:
    import riiideducation
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
else:
    # Prediction frames are collected here, one entry per served group.
    predicted = []
    iter_test = Iter_Valid(target_df,max_user=1000)
    def set_predict(df):
        # Local stand-in for env.predict: just remember the frame.
        predicted.append(df)

## Feature extraction and prediction

In [None]:
def Feature_and_extracting():
    """Build the training features, fit the Keras classifier and return it.

    Steps:
      1. Load the training interactions (cached in ``train.pkl`` after the
         first read of ``train.csv``) and ``questions.csv``.
      2. Keep rows with ``answered_correctly != -1`` and compute ``dif``, the
         mean ``answered_correctly`` per ``content_id``.
      3. Assemble the four model inputs -- ``timestamp``,
         ``prior_question_elapsed_time``, ``prior_question_had_explanation``
         and ``dif`` -- in the same order ``Predict`` uses, replace missing
         values by 0 and min-max scale them.
      4. Train a small dense network whose two softmax outputs are
         (answered correctly, answered incorrectly) and save it to ``model``.

    Returns:
        The fitted Keras model.  The ``MinMaxScaler`` fitted on the training
        features is attached to it as ``feature_scaler`` so inference code can
        apply the training-time scaling instead of re-fitting a scaler.

    Side effects:
        May write ``train.pkl``; writes the saved model to ``model``; prints a
        memory report of the feature frame.
    """
    # Heavy third-party imports are kept local to the function.
    from sklearn import preprocessing
    from tensorflow import keras

    # Model inputs, in the column order Predict() feeds to the network.
    feature_cols = [
        'timestamp',
        'prior_question_elapsed_time',
        'prior_question_had_explanation',
        'dif'
    ]
    # NOTE(review): the raw `prior_question_had_explanation` column is expected
    # to hold True/False with missing values, which cannot be parsed straight
    # into int8, and `content_type_id` is expected to hold 0/1 -- confirm
    # against the competition's data description.  They are read with safe
    # dtypes and converted below to the dtypes the cache originally declared.
    dtype = {
        'timestamp':'float32',
        'content_type_id':'int8',
        'content_id':'int16',
        'answered_correctly':'int8',
        'prior_question_elapsed_time':'float32',
        'prior_question_had_explanation':'boolean'
    }
    cols = [
        'timestamp',
        'content_type_id',
        'content_id',
        'answered_correctly',
        'prior_question_elapsed_time',
        'prior_question_had_explanation'
    ]
    path="/kaggle/input/riiid-test-answer-prediction/"
    if os.path.exists('train.pkl'):
        df_train = pd.read_pickle('train.pkl')
    else:
        df_train = pd.read_csv(path+'train.csv',sep=',',usecols=cols,dtype=dtype)
        df_train['content_type_id'] = df_train['content_type_id'].astype('bool')
        df_train['prior_question_had_explanation'] = (
            df_train['prior_question_had_explanation'].fillna(False).astype('int8')
        )
        df_train.to_pickle('train.pkl')
    df_q = pd.read_csv(path+'questions.csv', sep=',')

    # Rows flagged with -1 carry no answer and must not enter the statistics.
    df_train = df_train[df_train['answered_correctly'] != -1]

    # Difficulty = mean correctness per content_id, attached by question_id
    # rather than by relying on df_q's positional index.
    difficulty = df_train.groupby('content_id')['answered_correctly'].mean()
    df_q['dif'] = df_q['question_id'].map(difficulty)
    del difficulty
    df_q = df_q[['question_id','dif']]
    df_train = df_train.merge(df_q,how = 'inner',left_on='content_id',right_on='question_id')

    # Two-column target: column 0 = correct, column 1 = incorrect.
    answers = df_train['answered_correctly'].to_numpy(dtype='bool')
    answers = np.array([answers, ~answers],dtype='int8').transpose()

    # Select exactly the four inputs; leaving any other column in (e.g.
    # content_type_id) would not match the network's declared input width.
    features = df_train[feature_cols].copy()
    del df_train
    gc.collect()
    features['prior_question_had_explanation'] = (
        features['prior_question_had_explanation'].fillna(False).astype('float32')
    )
    features = features.astype('float32').fillna(0)
    features.info(memory_usage='deep')
    x_train = features.to_numpy()
    del features
    gc.collect()

    min_max_scaler = preprocessing.MinMaxScaler()
    x_train = min_max_scaler.fit_transform(x_train)

    model = keras.Sequential()
    act = 'relu'
    model.add(keras.layers.Dense(350, input_dim=len(feature_cols), activation=act))
    model.add(keras.layers.Dense(100, activation=act))
    model.add(keras.layers.Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['AUC'])
    model.fit(x_train, answers, epochs=5, batch_size=10000,verbose=1,validation_split=0.3)

    model.save('model')
    # Attached after saving so it cannot interfere with model serialisation.
    model.feature_scaler = min_max_scaler
    return model
def Predict(df_test, df_q, model, scaler=None):
    """Return the predicted probability of a correct answer for each row.

    Args:
        df_test: question rows to score; must contain ``content_id``,
            ``timestamp``, ``prior_question_elapsed_time`` and
            ``prior_question_had_explanation``.  It is not modified.
        df_q: lookup frame with ``question_id`` and ``dif`` columns.
        model: object whose ``predict(X)`` returns an ``(n, 2)`` array with
            the "correct" probability in column 0.
        scaler: optional fitted scaler exposing ``transform``.  When omitted,
            ``model.feature_scaler`` is used if the model has one; otherwise a
            ``MinMaxScaler`` is fitted on the batch itself (the original
            behaviour).

    Returns:
        1-D array with one value per row of ``df_test``, in the same order.
    """
    # A batch without question rows has nothing to score (and a scaler cannot
    # be fitted on zero samples).
    if len(df_test) == 0:
        return np.empty(0, dtype='float32')

    # Look difficulty up by question id.  A mapping keeps every input row and
    # its order, so the result always lines up with df_test.
    difficulty = df_q.drop_duplicates('question_id').set_index('question_id')['dif']
    features = pd.DataFrame({
        'timestamp': df_test['timestamp'].astype('float32'),
        'prior_question_elapsed_time': df_test['prior_question_elapsed_time'].astype('float32'),
        'prior_question_had_explanation': df_test['prior_question_had_explanation'].fillna(False).astype('float32'),
        'dif': df_test['content_id'].map(difficulty).astype('float32'),
    }, index=df_test.index)
    feature_cols = ['timestamp','prior_question_elapsed_time','prior_question_had_explanation','dif']
    # Missing values become 0, mirroring the fillna(0) applied to the training
    # features in Feature_and_extracting.
    x = features[feature_cols].fillna(0).to_numpy(dtype='float32')

    if scaler is None:
        scaler = getattr(model, 'feature_scaler', None)
    if scaler is not None:
        x = scaler.transform(x)
    else:
        # NOTE(review): fitting on the batch scales features differently from
        # training (a one-row batch collapses to all zeros); pass `scaler`
        # whenever the training-time scaler is available.
        from sklearn import preprocessing
        x = preprocessing.MinMaxScaler().fit_transform(x)

    pred = np.asarray(model.predict(x))
    # Column 0 holds P(correct).  Reshaping (n, 2) to (2, n) would interleave
    # both classes instead of selecting that column.
    return pred[:, 0]

## iterator
Now we can use iter_test(wrapper for env.iter_test) and set_predict(wrapper for env.predict) as usual.

In [None]:
# Build `df_q`, the per-question difficulty table (`question_id`, `dif`) that
# the prediction step looks difficulties up in.
#
# NOTE(review): the raw `prior_question_had_explanation` column is expected to
# hold True/False with missing values, which cannot be parsed straight into
# int8, and `content_type_id` is expected to hold 0/1 -- confirm against the
# competition's data description.  They are read with safe dtypes and then
# converted to the dtypes the cache originally declared.
dtype = {
    'timestamp':'float32',
    'content_type_id':'int8',
    'content_id':'int16',
    'answered_correctly':'int8',
    'prior_question_elapsed_time':'float32',
    'prior_question_had_explanation':'boolean'
}
cols = [
    'timestamp',
    'content_type_id',
    'content_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation'
]
path="/kaggle/input/riiid-test-answer-prediction/"
if(os.path.exists('train.pkl')):
    df_train = pd.read_pickle('train.pkl')
else:
    df_train = pd.read_csv(path+'train.csv',sep=',',usecols=cols,dtype=dtype)
    df_train['content_type_id'] = df_train['content_type_id'].astype('bool')
    df_train['prior_question_had_explanation'] = (
        df_train['prior_question_had_explanation'].fillna(False).astype('int8')
    )
    df_train.to_pickle('train.pkl')
df_q = pd.read_csv(path+'questions.csv', sep=',')
# Average over answered rows only.  Rows with answered_correctly == -1 (the
# ones Feature_and_extracting also drops) would otherwise drag down the mean
# of any question that shares their content_id.
asd = (
    df_train.loc[df_train['answered_correctly'] != -1, ['content_id','answered_correctly']]
    .groupby('content_id')['answered_correctly']
    .mean()
)
# Attach by question id instead of relying on df_q's positional index.
df_q['dif'] = df_q['question_id'].map(asd)
del asd
df_q = df_q[['question_id','dif']]
del df_train


In [None]:
# Run the garbage collector now that the previous cell has deleted `df_train`.
gc.collect()

In [None]:
# Train the network (Feature_and_extracting also saves it to `model`) and keep
# the returned model for the prediction loop.
model = Feature_and_extracting()

In [None]:
# Main inference loop: consume groups from `iter_test`, score the question
# rows with `Predict` and hand the result to `set_predict`.
import ast

pbar = tqdm(total=2500000)
previous_test_df = None
for (current_test, current_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The first row of a group carries the previous group's outcomes as
        # string-encoded lists.  They are parsed with literal_eval rather than
        # eval so text coming from the API is never executed as code.
        answers = ast.literal_eval(current_test["prior_group_answers_correct"].iloc[0])
        responses = ast.literal_eval(current_test["prior_group_responses"].iloc[0])
        previous_test_df['answered_correctly'] = answers
        previous_test_df['user_answer'] = responses
    previous_test_df = current_test.copy()
    # Only questions are scored; the explicit copy makes the column assignment
    # below act on an independent frame instead of a filtered view.
    current_test = current_test[current_test.content_type_id == 0].copy()
    # your prediction code here
    if len(current_test):
        current_test['answered_correctly'] = Predict(current_test,df_q,model)
    else:
        # A group made only of lectures still has to be answered, with an
        # empty prediction frame.
        current_test['answered_correctly'] = np.empty(0, dtype='float32')
    set_predict(current_test.loc[:,['row_id', 'answered_correctly']])
    pbar.update(len(current_test))
pbar.close()

In [None]:
# if validaten_flg:
#     #validation score
#     y_true = target_df[target_df.content_type_id == 0].answered_correctly
#     y_pred = pd.concat(predicted).answered_correctly
#     print('validation auc:',roc_auc_score(y_true, y_pred))