In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import pickle
import json
import typing
import math
from threading import Lock


In [None]:
import sys
# Put the model dataset directory on the import path. The `modeling` and
# `model_config` modules imported right after are presumably located there.
sys.path.append('../input/riiidmodel4thplace/')

In [None]:
from modeling import RiiidAnswerModel
from model_config import config

In [None]:
# Per-user sequence store. It is read further down through the keys 'data',
# 'index' and 'next_index'.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# make sure this file only ever comes from a trusted source.
with open('../input/riiidmodel4thplace/data_map.pickle', 'rb') as f:
    data_map = pickle.load(f)

# Content description handed to the model constructor.
with open('../input/riiidmodel4thplace/encoded_content_map_v2.json', 'r') as f:
    encoded_content_map = json.load(f)

# Mapping from (content_id, content_type_id) to encoded_content_id.
with open('../input/riiidmodel4thplace/encoded_content_id_map.json', 'r') as f:
    encoded_content_id_map = json.load(f)

# Tabular form of the id mapping with compact integer dtypes, so it can be
# merged onto the incoming test frames.
encoded_content_table = pd.DataFrame.from_dict(encoded_content_id_map).astype(
    {
        'content_id': 'int16',
        'content_type_id': 'int8',
        'encoded_content_id': 'int32',
    }
)

# Largest encoded id among the rows whose content_type_id is 0.
max_question_id = encoded_content_table.loc[
    encoded_content_table['content_type_id'] == 0,
    'encoded_content_id',
].max()

In [None]:
SEQUENCE_LENGTH = 512  # length of every per-user sequence fed to the model
BATCH_SIZE = 32        # number of user sequences per prediction batch

# Each feature name is written next to the tensor dtype it is converted to,
# then the pairs are split into two parallel lists. Position i of `data_keys`
# always corresponds to position i of `data_types`.
data_keys, data_types = map(list, zip(
    ('timestamp', tf.int64),
    ('encoded_content_id', tf.int32),
    ('user_answer', tf.int32),
    ('answered_correctly', tf.int32),
    ('question_elapsed_time', tf.float32),
    ('question_had_explanation', tf.int32),
    ('non_padding_mask', tf.float32),
    ('time_lag', tf.float32),
))

In [None]:

def build_test_ds_function(test_agg_df, data_map):
    """Build a batched ``tf.data.Dataset`` of model inputs for one test batch.

    For every user id in ``test_agg_df.index`` the user's stored sequence in
    ``data_map['data']`` is updated **in place** with the new interactions and
    the updated sequence is emitted as one dataset element.

    Args:
        test_agg_df: DataFrame indexed by user id. Each row must hold lists
            under 'timestamp', 'encoded_content_id',
            'prior_question_elapsed_time' and 'prior_question_had_explanation'.
        data_map: dict with the keys
            'data'       - feature name -> array indexed as [slot, position],
            'index'      - user id -> slot in those arrays,
            'next_index' - next unused slot.
            All three are mutated by iterating the returned dataset.

    Returns:
        A dataset of dicts keyed by ``data_keys``, batched by ``BATCH_SIZE``.
        The element work (and therefore the mutation of ``data_map``) only
        happens while the dataset is iterated.
    """
    data = data_map['data']
    index_map = data_map['index']
    # The map below runs with parallel calls, so slot allocation for unseen
    # users is serialised through this lock.
    lock = Lock()

    def process_row_i(i):
        """Update one user's stored sequence and return it, ordered as data_keys."""
        user_id = i.numpy()
        row = test_agg_df.loc[user_id]
        n = len(row['timestamp'])  # number of new interactions for this user
        index = index_map.get(user_id, None)
        if index is not None:
            # Known user: back-fill the "prior question" values reported by
            # this batch onto the most recent stored question(s).
            is_question = data['encoded_content_id'][index] <= max_question_id
            questions = data['timestamp'][index][is_question]
            if len(questions) > 0:
                last_question_timestamp = questions.max()
                # Every stored position sharing that latest timestamp is updated.
                prior_question_cond = data['timestamp'][index] == last_question_timestamp
                data['question_elapsed_time'][index][prior_question_cond] = row['prior_question_elapsed_time'][0]
                data['question_had_explanation'][index][prior_question_cond] = row['prior_question_had_explanation'][0]

            # Shift the stored history left by n to free the last n positions.
            for k in data_keys:
                data[k][index,:-n] = data[k][index, n:]

        else:
            # Unseen user: reserve the next free slot. The context manager
            # guarantees the lock is released even if an exception is raised
            # in between, which a bare acquire()/release() pair does not.
            with lock:
                index = data_map['next_index']
                index_map[user_id] = index
                data_map['next_index'] += 1

        # Write the new interactions into the last n positions. Answer related
        # features are zeroed here; presumably they are filled in later, once
        # the true answers become available.
        data['timestamp'][index, -n:] = row['timestamp']
        data['encoded_content_id'][index,-n:] = row['encoded_content_id']
        data['user_answer'][index, -n:] = 0
        data['answered_correctly'][index, -n:] = 0
        data['question_elapsed_time'][index, -n:] = 0
        data['question_had_explanation'][index, -n:] = 0
        data['non_padding_mask'][index,-n:] = 1
        # All n new positions share one lag: first new timestamp minus the
        # timestamp stored immediately before them.
        # NOTE(review): -(n+1) is out of range when n equals the stored
        # sequence length, so a user with that many rows in one batch would
        # raise here - confirm this cannot happen.
        data['time_lag'][index, -n:] = row['timestamp'][0] - data['timestamp'][index, -(n+1)]
        return [data[k][index] for k in data_keys]

    def map_function(i):
        """Wrap the Python row processing as a graph op and name its outputs."""
        x = tf.py_function(process_row_i, inp=[i], Tout=data_types)
        return {
            k:v for k,v in zip(data_keys, x)
        }

    return tf.data.Dataset.from_tensor_slices(test_agg_df.index).map(
        map_function,
        num_parallel_calls = tf.data.experimental.AUTOTUNE
    ).prefetch(
        tf.data.experimental.AUTOTUNE
    ).batch(BATCH_SIZE)
    

In [None]:
def update_map_with_user_answer(data_map, prev_test_agg_df):
    """Write the revealed answers of the previous batch into the stored sequences.

    Args:
        data_map: dict whose 'data' entry maps feature names to arrays indexed
            as [slot][position] and whose 'index' entry maps user id -> slot.
            The 'user_answer' and 'answered_correctly' arrays are modified in
            place.
        prev_test_agg_df: DataFrame indexed by user id with list-valued
            columns 'user_answer' and 'answered_correctly'.

    Raises:
        KeyError: if a user id of ``prev_test_agg_df`` has no slot in
            ``data_map['index']``.
    """
    history = data_map['data']
    for user_id, answers in prev_test_agg_df.iterrows():
        # The answers belong to the most recent positions of the user's row.
        tail = slice(-len(answers['user_answer']), None)
        slot = data_map['index'][user_id]
        for column in ('user_answer', 'answered_correctly'):
            history[column][slot][tail] = answers[column]



In [None]:

# Build the network from the content map and the keyword settings in `config`.
model = RiiidAnswerModel(encoded_content_map, **config)

@tf.function(
    input_signature=(
        dict(
            (feature, tf.TensorSpec(shape=(None, SEQUENCE_LENGTH), dtype=dtype))
            for feature, dtype in zip(data_keys, data_types)
        ),
    )
)
def predict_batch(test_data):
    """Run the model in inference mode and return sigmoid probabilities.

    Args:
        test_data: dict with one tensor per entry of ``data_keys``, each of
            shape (batch, SEQUENCE_LENGTH) and the matching ``data_types``
            dtype, as fixed by the input signature.

    Returns:
        Sigmoid of the model output with its last axis squeezed away.
        NOTE(review): presumably shape (batch, SEQUENCE_LENGTH) - confirm
        against the model's output shape.
    """
    return tf.sigmoid(
        tf.squeeze(model(test_data, training=False), axis=-1)
    )


# Load the model weights.
# One forward pass is run on the first BATCH_SIZE stored sequences before the
# weights are restored. NOTE(review): presumably this is what creates the
# model's variables so load_weights has something to fill - confirm.
sample_data = {
    feature: values[0:BATCH_SIZE, :]
    for feature, values in data_map['data'].items()
}
predict_batch(sample_data)
model.load_weights('../input/riiidmodel4thplace/weights.h5')


In [None]:

def predict_and_update(test_agg_df, data_map):
    """Fold the current test batch into ``data_map`` and predict for its users.

    Args:
        test_agg_df: per-user aggregated test rows, passed straight to
            ``build_test_ds_function``.
        data_map: stored per-user sequences; updated as a side effect of
            iterating the dataset built from it.

    Returns:
        A flat list holding one prediction tensor per dataset element, i.e.
        the rows of every ``predict_batch`` output unstacked along the first
        axis, in iteration order.
    """
    probs = []
    for batch in build_test_ds_function(test_agg_df, data_map):
        # Split the batch output into one tensor per user.
        probs.extend(tf.unstack(predict_batch(batch)))
    return probs

In [None]:
import riiideducation
# Environment object whose iter_test() / predict() methods drive the
# inference loop that follows.
env = riiideducation.make_env()

In [None]:
%%time

# Main inference loop: for every test batch handed out by the environment,
# (1) store the answers revealed for the previous batch, (2) append the new
# interactions to the per-user sequences and predict, (3) submit predictions.
iter_test = env.iter_test()
prev_test_df = None  # previous batch, kept until its answers are revealed
for test_df, sample_prediction_df in iter_test:
    if prev_test_df is not None:
        '''
        Updating answers of previous batch
        '''
        # The first row of the current batch carries the previous batch's
        # responses / correctness as strings that are parsed into lists.
        # NOTE(review): eval() executes arbitrary code from externally
        # supplied text; ast.literal_eval would be a safer parser if the
        # strings are plain list literals - confirm the format first.
        prev_test_df['user_answer'] = eval(test_df.prior_group_responses.iloc[0])
        prev_test_df['answered_correctly'] = eval(test_df.prior_group_answers_correct.iloc[0])
        
        # One row per user, each cell holding that user's answers as a list,
        # in first-appearance order (sort = False).
        prev_test_agg_df = prev_test_df.groupby(
                'user_id',
                sort = False,
                as_index = True
            )[['user_answer', 'answered_correctly'
                                   ]].agg(list)
        
        update_map_with_user_answer(data_map, prev_test_agg_df)

    
    '''
    Now build test data for the current batch
    '''
    
    # In-place fill: test_df itself is modified and is later kept as
    # prev_test_df.
    test_df.fillna(0, inplace = True)
    # Attach encoded_content_id (rows without a match get 0 from the second
    # fillna), then collapse to one row per user with list-valued columns.
    test_agg_df = test_df.merge(
            encoded_content_table,
            on = ['content_id', 'content_type_id'],
            how = 'left'
        ).fillna(0).groupby('user_id', 
                            sort = False,
                            as_index = True)[['timestamp',
                                           'encoded_content_id',
                                           'prior_question_elapsed_time',
                                           'prior_question_had_explanation',
                                           'row_id'
                                          ]].agg(list)
   
    probs = predict_and_update(test_agg_df, data_map)
    
    extracted_probs = []
    row_id = []
    
    
    # Pairing by position: this relies on `probs` being in the same order as
    # the rows of test_agg_df.
    for prob, (_, row) in zip(probs, 
                                 test_agg_df.iterrows()):
            
        '''
        Finding the indices and extract probs for each user
        '''
        # The user's n rows of this batch occupy the last n sequence positions.
        n = len(row['row_id'])
        extracted_probs += prob[-n:].numpy().tolist()
        row_id += row['row_id']
    
    pred_df = pd.DataFrame.from_dict({
            'row_id':row_id,
            'answered_correctly': extracted_probs,
    })
    # Keep exactly the row_ids requested by the environment, in its order;
    # any requested row without a prediction falls back to 0.5.
    pred_df = sample_prediction_df[['row_id']].merge(
        pred_df, 
        on = 'row_id',
        how = 'left'
    ).fillna(0.5)
        
    env.predict(pred_df)

    # Remember this batch so its answers can be stored on the next iteration.
    prev_test_df = test_df
