This is a fork of [this notebook](https://www.kaggle.com/its7171/cv-strategy). I randomly sampled 1/10 of the users to reduce the dataset size.

In [None]:
import pandas as pd
import random
import gc
from sklearn.model_selection import GroupShuffleSplit
# Seed Python's global RNG so the random per-user offsets drawn later via
# random.randint are the same on every fresh run.
random.seed(1)

In [None]:
# Explicit dtype for every column, instead of letting pandas infer them.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}
train = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    dtype=train_dtypes,
)

In [None]:
# Row count of the training data as loaded, before any users are dropped.
len(train)

In [None]:
# Fraction passed as `train_size` to the user-grouped split below, i.e. the
# share of users that is kept.
reduced_train_size=0.1

In [None]:
# Subsample by user: the split is grouped on user_id, so every row of a kept
# user stays in the reduced frame and no user is partially included.
user_splitter = GroupShuffleSplit(
    n_splits=1,
    train_size=reduced_train_size,
    random_state=42,
)
train_idx, test_idx = next(user_splitter.split(train, groups=train.user_id))
train = train.iloc[train_idx]

In [None]:
# Row count after keeping only the sampled users.
len(train)

In [None]:
# Baseline split: hold out the last 5 rows of every user and compare the mean
# of answered_correctly between the two parts, restricted to rows with
# content_type_id == 0 (the rows later cells treat as questions).
valid_split1 = train.groupby('user_id').tail(5)
train_split1 = train.loc[
    ~train.row_id.isin(valid_split1.row_id) & (train.content_type_id == 0)
]
valid_split1 = valid_split1.loc[valid_split1.content_type_id == 0]
print(f'{train_split1.answered_correctly.mean():.3f} {valid_split1.answered_correctly.mean():.3f}')

In [None]:
# The baseline frames are no longer needed; drop them and reclaim the memory.
del valid_split1
del train_split1
gc.collect()

Since training data and test data are split by time, the validation data should also be split by time.
However, the given timestamp is the time that has elapsed since the user's first event, not the actual time.
So I set a random first access time for each user within a certain interval.

In [None]:
# Largest timestamp reached by each user, one row per user_id ...
max_timestamp_u = train.groupby('user_id')['timestamp'].max().reset_index()
max_timestamp_u.columns = ['user_id', 'max_time_stamp']
# ... and the largest timestamp reached by any user.
MAX_TIME_STAMP = max_timestamp_u['max_time_stamp'].max()

The interval for each user is `[0, MAX_TIME_STAMP - max_time_stamp]`, where `MAX_TIME_STAMP` is the largest timestamp over all users and `max_time_stamp` is that user's own largest timestamp.

In [None]:
def rand_time(max_time_stamp, max_total_time_stamp=None):
    """Draw a random start offset for one user.

    The offset is a uniformly distributed integer in
    ``[0, max_total_time_stamp - max_time_stamp]`` (both ends inclusive).

    Parameters
    ----------
    max_time_stamp : int
        Largest timestamp of the user the offset is drawn for.
    max_total_time_stamp : int, optional
        Upper bound used to compute the interval. When omitted, the
        notebook-level ``MAX_TIME_STAMP`` is used, which keeps the
        one-argument call (e.g. via ``Series.apply``) working as before.

    Returns
    -------
    int
        The random offset.

    Raises
    ------
    ValueError
        Raised by ``random.randint`` when ``max_time_stamp`` exceeds the
        upper bound, i.e. the interval is negative.
    """
    if max_total_time_stamp is None:
        # Looked up at call time so the function follows the current global.
        max_total_time_stamp = MAX_TIME_STAMP
    # Coerce to a plain int: values arriving through pandas are NumPy scalars.
    interval = int(max_total_time_stamp - max_time_stamp)
    return random.randint(0, interval)

# One random offset per user, joined back onto every row of that user and
# added to the row's own timestamp.
max_timestamp_u['rand_time_stamp'] = max_timestamp_u['max_time_stamp'].apply(rand_time)
train = pd.merge(train, max_timestamp_u, how='left', on='user_id')
train['viretual_time_stamp'] = train['timestamp'] + train['rand_time_stamp']

In [None]:
# The per-user helper columns were only needed to build the virtual timestamp.
for helper_col in ('max_time_stamp', 'rand_time_stamp'):
    del train[helper_col]
del max_timestamp_u
gc.collect()

In [None]:
kaggle_env = False
if kaggle_env:
    # A Kaggle kernel does not have enough memory to sort the full frame,
    # so only the first 10,000,000 rows are kept there.
    train = train.iloc[:10_000_000]
# Order rows by virtual time; row_id breaks ties so the order is deterministic.
train = train.sort_values(by=['viretual_time_stamp', 'row_id'], ignore_index=True)

In [None]:
# Number of folds peeled off the (virtual-time-sorted) end of `train`.
N_FOLDS = 5

if kaggle_env:
    val_size = 250000
else:
    # 2,500,000 rows per fold was chosen for the un-subsampled data. Only a
    # `reduced_train_size` fraction of users is kept in this notebook, so the
    # fold size is scaled by the same fraction. Unscaled, five folds remove
    # 12,500,000 rows, which can exceed what is left after subsampling and
    # leaves the later training sets tiny or empty.
    val_size = int(2500000 * reduced_train_size)

# Fail loudly instead of silently pickling empty training frames.
if val_size <= 0 or val_size * N_FOLDS >= len(train):
    raise ValueError(
        f'val_size={val_size} with {N_FOLDS} folds does not leave any training rows '
        f'(len(train)={len(train)})'
    )

for cv in range(N_FOLDS):
    # Each fold takes the most recent `val_size` rows as validation; everything
    # earlier stays as training data. `train` shrinks on every iteration, so
    # later folds are strictly earlier in virtual time.
    valid = train.iloc[-val_size:]
    train = train.iloc[:-val_size]
    # check new users and new contents
    new_users = valid.loc[~valid.user_id.isin(train.user_id), 'user_id'].nunique()
    valid_question = valid[valid.content_type_id == 0]
    train_question = train[train.content_type_id == 0]
    unseen_content = ~valid_question.content_id.isin(train_question.content_id)
    new_contents = valid_question.loc[unseen_content, 'content_id'].nunique()
    print(f'cv{cv} {train_question.answered_correctly.mean():.3f} {valid_question.answered_correctly.mean():.3f} {new_users} {new_contents}')
    # Files are numbered from 1 (cv1_* ... cv5_*), while the printed label is 0-based.
    valid.to_pickle(f'cv{cv+1}_valid.pickle')
    train.to_pickle(f'cv{cv+1}_train.pickle')