**Create a (random) subset of users' data, split into train and val, to enable quick model experiments without much hassle**

principles:
* keep each user's time course healthy, i.e. do not downsample for the sake of smaller data
* first part of time course should go into train - remaining part into val
* across users, have a fixed portion go into train and val, respectively

In [None]:
import pandas as pd
import random

# Share of all users to keep in the sample.
fraction_users = 0.001
# Per-user quantile of the time course separating train (earlier) from val (later).
split = 0.8

# Fixed seed so the random user sample is reproducible.
random.seed(33)

In [None]:
%%time 

data = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv',
                   dtype={'row_id': 'int64',
                          'timestamp': 'int64',
                          'user_id': 'int32',
                          'content_id': 'int16',
                          'content_type_id': 'int8',
                          'task_container_id': 'int16',
                          'user_answer': 'int8',
                          'answered_correctly':'int8',
                          'prior_question_elapsed_time': 'float32',
                          'prior_question_had_explanation': 'boolean'
                         })

take a subset of users

In [None]:
# All distinct users, and how many of them the configured fraction amounts to.
users = data['user_id'].unique()
no_users_sample = int(round(len(users) * fraction_users, 0))

print(
    f'no. of unique users: {len(users)}',
    f'no. of users in sample: {no_users_sample}',
    sep='\n',
)

# Draw the sample of users and keep only their rows.
users = random.sample(list(users), no_users_sample)
mask = data['user_id'].isin(users)
data = data.loc[mask]

get xth percentile of timestamp, to prepare split

In [None]:
# Per-user threshold: the `split` quantile of each user's timestamps,
# returned as a frame with columns user_id and timestamp.
timestamps_by_user = data.groupby('user_id')['timestamp']
my_planet = timestamps_by_user.quantile(split).reset_index()

mark rows with train, val flag

In [None]:
# Peek at the first rows of the per-user thresholds.
my_planet.head(5)

In [None]:
# Name the threshold column 'quartile' and attach it to every row of the
# corresponding user (inner join on the only shared column, user_id).
my_planet.columns = ['user_id', 'quartile']
data = data.merge(my_planet, how='inner', on='user_id')


In [None]:
# True for rows whose timestamp lies strictly after the user's threshold.
mask = data['timestamp'].gt(data['quartile'])
data['mask'] = mask

In [None]:
# Start every row with a placeholder label in the 'split' column.
data = data.assign(split='none')

In [None]:

# Label rows through boolean-mask assignment with .loc: rows whose 'mask' is
# True (timestamp after the user's threshold) become 'val', the rest 'train'.
data.loc[data['mask'], 'split'] = 'val'
data.loc[~data['mask'], 'split'] = 'train'

In [None]:
# Peek at the first rows, including the helper columns added so far.
data.head(5)

In [None]:
%%time

# https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
pd.options.mode.chained_assignment = None

from tqdm import tqdm

data['split'] = 'na'

for i in tqdm(range(len(data))):
    current_user = data.user_id.iloc[i]
    percentile = user_percentile[current_user]
    if data.timestamp.iloc[i] <= percentile:
        data.at[i, split] = 'train'
    else:
        data.at[i, split] = 'val'


split

In [None]:
# Select rows by their label using bracket access, then drop every helper
# column created while preparing the split so that train/val keep only the
# columns of the loaded data. errors='ignore' tolerates a helper column that
# was never created.
helper_columns = ['split', 'quartile', 'mask']

train = data[data['split'] == 'train'].drop(columns=helper_columns, errors='ignore')
val = data[data['split'] == 'val'].drop(columns=helper_columns, errors='ignore')

check properties for train and val

In [None]:
# Sanity check: user and record counts of both parts.
n_train_users = train['user_id'].unique().size
n_val_users = val['user_id'].unique().size

print(f'no. of users in train: {n_train_users}')
print(f'no. of users in val: {n_val_users}')
print('-> should be the same')
print(f'no. of records in train: {train.shape[0]}')
print(f'no. of records in val: {val.shape[0]}')

oops - what's this?

In [None]:
# Users that have rows in train but none in val.
train_ids = set(train['user_id'].unique())
val_ids = set(val['user_id'].unique())
missing_ids = train_ids - val_ids

# Print each affected user's timestamps to inspect why no row reached val.
for user in missing_ids:
    print(data.loc[data['user_id'] == user, ['timestamp', 'user_id']])

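Interesting: these users have a number of entries with equal timestamps. If the percentile threshold coincides with a user's latest timestamp, no row lies strictly after it, so all of that user's rows land in train and none in val. For now, let us simply remove these users from train (they have no rows in val anyway).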

In [None]:
# Keep only train rows of users that are also present in val.
is_missing_user = train['user_id'].isin(missing_ids)
train = train.loc[~is_missing_user]

check again

In [None]:
# Repeat the sanity check after removing users without val rows.
n_train_users = train['user_id'].unique().size
n_val_users = val['user_id'].unique().size

print(f'no. of users in train: {n_train_users}')
print(f'no. of users in val: {n_val_users}')
print('-> should be the same')
print(f'no. of records in train: {train.shape[0]}')
print(f'no. of records in val: {val.shape[0]}')

look at mean, standard deviation

In [None]:
# Restrict both parts to rows with content_type_id == 0 (the "question" rows,
# going by the variable names) before summarising answered_correctly.
train_question = train.loc[train['content_type_id'] == 0]
valid_question = val.loc[val['content_type_id'] == 0]

# Report mean and standard deviation of answered_correctly, train first.
for part_name, part in (('train', train_question), ('val', valid_question)):
    answers = part['answered_correctly']
    print(f'accuracy of answering in {part_name}: {round(answers.mean(), 2)} (stdev={round(answers.std(), 2)})')

write train, val to csv files

In [None]:
# Percentage label used in the file names. The ':g' format keeps fractional
# percentages (fraction_users = 0.001 -> '0.1') and avoids truncation errors
# from float arithmetic, whereas int() would turn 0.1 into 0.
percent_label = f'{fraction_users * 100:g}'

train.to_csv(f'/kaggle/working/train_{percent_label}percent.csv')
val.to_csv(f'/kaggle/working/val_{percent_label}percent.csv')


Thank you, tito (https://www.kaggle.com/its7171/cv-strategy), for valuable insights.