This script can be run in the Kaggle environment.
A GPU accelerator is required.

The original script is [CV Strategy](https://www.kaggle.com/its7171/cv-strategy).

In [None]:
import gc
import random
import numpy as np
import pandas as pd
import cudf
import matplotlib.pyplot as plt

In [None]:
def fast_merge(left, right, key):
    """Append the columns of ``right`` to ``left`` via an index lookup.

    ``right`` is reindexed with the values of ``left[key]``, so its index is
    expected to hold the key values. Both sides are then given a fresh
    positional index and concatenated column-wise, which keeps the row order
    of ``left``.
    """
    left_rows = left.reset_index(drop=True)
    looked_up = right.reindex(left[key].values).reset_index(drop=True)
    return cudf.concat([left_rows, looked_up], axis=1)

In [None]:
# Seed both the NumPy and the standard-library generators so that the
# random draws made below are repeatable.
seed = 1
np.random.seed(seed)
random.seed(seed)

In the original script, the users are slightly biased.
Please see [this notebook](https://www.kaggle.com/marisakamozz/riiid-cv-strategy-users-are-slightly-biased) for details.
Therefore, I sample the random numbers from a beta distribution instead of a uniform distribution.

In [None]:
# Shape parameters passed to np.random.beta for the random draws below.
a, b = 2.2, 2.3

In [None]:
# Draw a large sample from Beta(a, b) and plot its histogram to inspect
# the shape of the distribution.
size = 1000000
samples = pd.Series(np.random.beta(a, b, size))
samples.hist(bins=100)
plt.show()

In [None]:
# Summary statistics of the sampled beta values.
samples.describe()

In [None]:
# Explicit column dtypes handed to the CSV reader for train.csv.
dtypes = {
    # 64-bit identifiers / timestamps
    'row_id': 'int64',
    'timestamp': 'int64',
    # narrower integer columns
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    # columns describing the prior question
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

In [None]:
%%time
# Read the competition training CSV into a cudf DataFrame, using the
# explicit per-column dtypes from `dtypes`.
train = cudf.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', dtype=dtypes)

In [None]:
# Largest timestamp observed for each user.
max_timestamp_u = train[['user_id', 'timestamp']].groupby(['user_id']).max()
max_timestamp_u.columns = ['max_timestamp']

# Gap between each user's last timestamp and the largest last timestamp
# over all users.
latest = max_timestamp_u['max_timestamp'].max()
max_timestamp_u['interval'] = latest - max_timestamp_u['max_timestamp']

# One random fraction per user, drawn from Beta(a, b). The original script
# drew it from a uniform distribution: np.random.rand(len(max_timestamp_u)).
max_timestamp_u['random'] = np.random.beta(a, b, len(max_timestamp_u))

# Random offset per user: a fraction of the gap, truncated to an integer.
offset = max_timestamp_u['interval'] * max_timestamp_u['random']
max_timestamp_u['random_timestamp'] = offset.astype(int)

# Only max_timestamp and random_timestamp are kept.
max_timestamp_u.drop(['interval', 'random'], axis=1, inplace=True)

In [None]:
# Summary statistics of the per-user max_timestamp and random_timestamp.
max_timestamp_u.describe()

In [None]:
# Attach each row's per-user max_timestamp / random_timestamp columns.
train = fast_merge(train, max_timestamp_u, 'user_id')
# Shift every row's timestamp by its user's random offset.
train['virtual_timestamp'] = train.timestamp + train.random_timestamp
# Order the rows by virtual_timestamp, with row_id as the secondary key,
# by indexing on both, sorting the index, and turning it back into columns.
train.set_index(['virtual_timestamp', 'row_id'], inplace=True)
train.sort_index(inplace=True)
train.reset_index(inplace=True)
# The helper columns brought in by the merge are no longer needed.
train.drop(columns=['max_timestamp', 'random_timestamp'], inplace=True)

In [None]:
# Diagnostic over the last 100M rows of the sorted frame, split into
# 40 consecutive windows of 2.5M rows each. For every window we record
#   * the mean max_timestamp of the users appearing in it, and
#   * the mean of answered_correctly over the rows of that same window.
last100m = train[-100000000:]
interval = 2500000
mean_max_timestamp = []
target_means = []
for i in range(40):
    start = i * interval
    # Slice once so that both statistics describe exactly the same rows.
    # Filtering the whole frame first and slicing afterwards would shift
    # the accuracy windows relative to the user windows (the filtered frame
    # is shorter) and would redo the full filter on every iteration.
    window = last100m[start:start+interval]
    user_list = window.user_id.unique()
    mean_max_timestamp.append(max_timestamp_u[['max_timestamp']].reindex(user_list).mean())
    # Rows with answered_correctly == -1 are excluded from the accuracy
    # (presumably rows that are not questions -- confirm against the
    # dataset description).
    answered = window[window.answered_correctly != -1]
    target_means.append(answered.answered_correctly.mean())
# Each appended item is a one-element result; stitch them into one series.
mean_max_timestamp = cudf.concat(mean_max_timestamp).to_pandas()
target_means = pd.Series(target_means)

In [None]:
# One bar per window: mean max_timestamp of the users in that window.
plt.bar(np.arange(40), mean_max_timestamp)
plt.show()

In [None]:
# Line plot of the per-window mean of answered_correctly.
target_means.plot.line()
plt.show()

In [None]:
# Same diagnostic at a finer scale: the last 10M rows of the sorted frame,
# split into 10 consecutive windows of 1M rows each. For every window we
# record the mean max_timestamp of its users and the mean of
# answered_correctly over the rows of that same window.
last10m = train[-10000000:]
interval = 1000000
mean_max_timestamp = []
target_means = []
for i in range(10):
    start = i * interval
    # Slice once so that both statistics describe exactly the same rows.
    # Filtering the whole frame first and slicing afterwards would shift
    # the accuracy windows relative to the user windows (the filtered frame
    # is shorter) and would redo the full filter on every iteration.
    window = last10m[start:start+interval]
    user_list = window.user_id.unique()
    mean_max_timestamp.append(max_timestamp_u[['max_timestamp']].reindex(user_list).mean())
    # Rows with answered_correctly == -1 are excluded from the accuracy
    # (presumably rows that are not questions -- confirm against the
    # dataset description).
    answered = window[window.answered_correctly != -1]
    target_means.append(answered.answered_correctly.mean())
# Each appended item is a one-element result; stitch them into one series.
mean_max_timestamp = cudf.concat(mean_max_timestamp).to_pandas()
target_means = pd.Series(target_means)

In [None]:
# One bar per window: mean max_timestamp of the users in that window.
plt.bar(np.arange(10), mean_max_timestamp)
plt.show()

In [None]:
# Line plot of the per-window mean of answered_correctly.
target_means.plot.line()
plt.show()

The timestamps are almost the same, but the accuracy rate is lower. This makes it difficult to build an unbiased CV.
You may want to throw away the last 2.5 million rows.

In [None]:
# Build 5 train/validation pairs by repeatedly peeling the last `val_size`
# rows off `train`: fold 1 validates on the final 2.5M rows, fold 2 on the
# 2.5M rows before those, and so on. `train` shrinks on every iteration,
# and each fold's train file holds everything preceding its validation rows.
val_size = 2500000
for cv in range(5):
    # The right-hand side is evaluated in full before either name is
    # rebound, so both slices are taken from the same frame.
    valid, train = train[-val_size:], train[:-val_size]
    valid.to_parquet(f'cv{cv+1}_valid.parquet')
    train.to_parquet(f'cv{cv+1}_train.parquet')