In [None]:
from functools import reduce
import gc
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import time

## Summary
Inspired by [Simulating Pipeline for Test Data](https://www.kaggle.com/abdurrafae/simulating-pipeline-for-test-data), this notebook creates a validation set generator that serves batches similarly to the way the competition testing api serves them:
* Each batch includes interactions from only one `task_container_id` for each `user_id`
* For each `user_id`, `timestamps` increase monotonically and are greater than the latest `timestamp` in the training set
* A maximum of 1,000 interactions per batch


The code that ensures that only one `task_container_id` per `user_id` is included in each batch is slow (averaging ~5 seconds per batch with a total validation set size of approximately 2.5 million) and may not have any discernible benefit with respect to cross validation. I noted in the [discussion forums](https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/192919#1058744) that some competitors at the top of the leaderboard are utilizing a simpler approach. The [competition api](https://www.kaggle.com/c/riiid-test-answer-prediction/data) says it gets through 2.5 million records in roughly 15 minutes, so I'm obviously way off the mark. It would be much faster if the `task_container_id` logic was eliminated and reworked to instead include only one `row_id` per `user_id`, maintaining the other constraints, but it would be nice to have something fast that replicated the test api as closely as possible.

It could be faster using [cudf](https://docs.rapids.ai/api/cudf/stable/), but it may be better just to fall back to the simpler `user_id` logic. I'm sure there are some obvious ways to speed things up that I'm missing - I'd be appreciative of any suggestions.

## Read train.csv

In [None]:
%%time

dtypes = {'row_id': 'int64',
          'timestamp': 'int64',
          'user_id': 'int32',
          'content_id': 'int16',
          'content_type_id': 'int8',
          'task_container_id': 'int16',
          'user_answer': 'int8',
          'answered_correctly': 'int8',
          'prior_question_elapsed_time': 'float32', 
          'prior_question_had_explanation': 'boolean'}

df_train = pd.read_feather('../input/riiid-train-data-multiple-formats/riiid_train.feather')

for c,t in dtypes.items():
    df_train[c] = df_train[c].astype(t)

In [None]:
%%time
# ensure task_container_id increases monotonically per 
# https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/189465

if True:
    df_train.task_container_id = (df_train.groupby('user_id')['task_container_id']
                                  .transform(lambda x: pd.factorize(x)[0])
                                  .astype('int16'))

## Create Validation Set

In [None]:
np.random.seed(42)

# share of users sampled into the validation pool, and the number of trailing
# user_id-task_container_id combinations held out for each sampled user
VALID_USER_FRACTION = 0.1
VALID_TASKS_PER_USER = 100

# get unique user_ids
user_ids = df_train.user_id.unique()

# choose random set of user ids
# replace=False: np.random.choice samples WITH replacement by default, which
# draws some users more than once, so the number of distinct validation users
# ends up smaller than len(user_ids_valid) suggests
user_ids_valid = np.random.choice(user_ids,
                                  int(VALID_USER_FRACTION * len(user_ids)),
                                  replace=False)

# filter training set to include only records with chosen user_ids
df_user_valid = df_train[df_train.user_id.isin(user_ids_valid)][['user_id', 'task_container_id', 'row_id']]

# get unique user_id-task_container_id combinations from records of chosen user_ids
df_user_task_valid = df_user_valid.groupby(['user_id', 'task_container_id']).head(1).reset_index()

# get index of trailing number of unique user_id-task_container_id combinations
index_valid = (df_user_task_valid.groupby('user_id').tail(VALID_TASKS_PER_USER)
               .set_index(['user_id', 'task_container_id']).index)

# use index to get ids of all rows in the chosen set of user_id-task_container combinations
row_valid = df_train.set_index(['user_id', 'task_container_id'])['row_id'].loc[index_valid].values

# get train row_ids using inverse of valid row_ids
row_train = df_train.row_id[np.isin(df_train.row_id, row_valid, invert=True)].values

# create train index of unique user_id-task_container_id combinations
# NOTE(review): .iloc treats row_ids as row positions - this assumes row_id
# equals the positional index of df_train; confirm against the source data
index_train = (df_train[['user_id', 'task_container_id']].iloc[row_train]
               .set_index(['user_id', 'task_container_id']).index)

# compare valid and train indices to make sure they are mutually exclusive 
# for index_train_split in np.array_split(index_train, 20):
#     assert index_valid.isin(index_train_split).sum() == 0

# del index_train_split
# gc.collect()

In [None]:
# size of the pool the validation set was drawn from
print(
    f'Validation set selected from {len(df_user_valid):,d} interactions '
    f'by {len(user_ids_valid):,d} users in \n {len(df_user_task_valid):,d} '
    f'unique user_id-task_container_id combinations.\n'
)

# size of the validation set itself
print(
    f'Validation set includes {len(row_valid):,d} interactions '
    f'in \n {len(index_valid):,d} unique user_id-task_container_id combinations.'
)

In [None]:
# validation rows keyed by (user_id, task_container_id), keeping only row_id
# NOTE(review): .loc looks row_valid up as index labels - this assumes
# df_train's index labels coincide with row_id; confirm
df_valid = (
    df_train
    .loc[row_valid, ['user_id', 'task_container_id', 'row_id']]
    .set_index(['user_id', 'task_container_id'])
)

In [None]:
# quick memory check to see what the big variables are

# snapshot the namespace first: the loop below binds new names, which would
# otherwise change the dict while it is being iterated
for name, value in list(locals().items()):
    n_bytes = sys.getsizeof(value)
    if n_bytes > 1e7:
        print(f'{name:<18}{n_bytes/1e6:>10,.1f} MB')

## Create Validation Batch Generator

In [None]:
def valid_batches(df_valid=None, verbose=True, tests=True):
    """Return a generator that serves the validation set in test-api-like batches.

    Each batch holds, for a random subset of the remaining users, all rows of
    that user's first remaining user_id-task_container_id combination. Served
    rows are removed from the working copy, so successive batches walk through
    every user's containers in order until nothing is left.

    Parameters
    ----------
    df_valid : DataFrame, optional
        Validation rows indexed by a ('user_id', 'task_container_id')
        MultiIndex. When omitted, the notebook-level ``df_valid`` is looked up
        at call time (not at definition time, so re-creating ``df_valid`` does
        not require re-running this cell).
    verbose : bool
        Print row/combination counts for every batch.
    tests : bool
        Run per-batch consistency assertions (one container per user in the
        batch; a user's remaining containers come after the served one).

    Returns
    -------
    generator of DataFrame
        One frame per batch, a subset of the rows of ``df_valid``.

    Raises
    ------
    ValueError
        If ``df_valid`` is omitted and no notebook-level ``df_valid`` exists.
    """
    if df_valid is None:
        df_valid = globals().get('df_valid')
        if df_valid is None:
            raise ValueError('df_valid was not passed and no global df_valid is defined')
    return _iter_valid_batches(df_valid, verbose, tests)


def _iter_valid_batches(df_valid, verbose, tests):
    """Generator behind `valid_batches`: yield batches until df_valid is exhausted."""
    while len(df_valid):
        if verbose:
            print(f'Count of row_ids in validation set before selecting batch: {len(df_valid):,d}')

        # set number of users to include in batch; the lower bound of 1 avoids
        # yielding an empty batch
        # NOTE: this bounds the number of users, not rows - a batch can hold
        # more than 1,000 rows when containers contain several rows
        n_users = np.random.randint(1, 1000)

        # get remaining user_ids in random order (permutation returns a
        # shuffled copy, so the array backing the index is never written to)
        users_unique = df_valid.index.unique(level='user_id').to_numpy()
        users_shuffled = np.random.permutation(users_unique)

        # get index of first user_id-task_container_id for each user
        df_valid_g = df_valid.loc[users_shuffled[:n_users]].reset_index().groupby(['user_id'])
        index_valid_batch = df_valid_g.head(1).set_index(['user_id', 'task_container_id']).index.sort_values()
        if verbose:
            print(f'Count of unique user_id-task_container combinations to include in batch: {len(index_valid_batch):,d}')

        # select row_ids to include in batch
        df_valid_batch = df_valid.loc[index_valid_batch]
        if verbose:
            print(f'Count of row_ids included in batch: {len(df_valid_batch):,d}')

        # drop selected row_ids from validation set
        df_valid = df_valid.drop(index_valid_batch)
        if verbose:
            print(f'Count of row_ids in validation set after selecting batch: {len(df_valid):,d}')

        if tests:
            batch_keys = df_valid_batch.index.to_frame(index=False)

            # ensure only one task_container_id for each user_id
            assert (batch_keys.groupby('user_id')['task_container_id'].nunique() == 1).all()

            # ensure task_container_ids in remaining valid set are greater than
            # task_container_ids in batch for each user; users whose last
            # container was just served have nothing left and are skipped
            batch_tid = batch_keys.groupby('user_id')['task_container_id'].first()
            in_batch = df_valid.index.get_level_values('user_id').isin(batch_tid.index)
            remaining_keys = df_valid.index[in_batch].to_frame(index=False)
            next_tid = remaining_keys.groupby('user_id')['task_container_id'].first()
            assert (next_tid > batch_tid.reindex(next_tid.index)).all()

        gc.collect()

        yield df_valid_batch

In [None]:
# build the batch generator over the default validation frame;
# tests=False skips the per-batch consistency assertions
valid_gen = valid_batches(tests=False)

In [None]:
%%time
# pull the next batch from the generator; %%time reports how long it took
df_valid_batch = next(valid_gen)