In [None]:
import gc
import os
import pickle
import random

import numpy as np
import pandas as pd
import tqdm
from tqdm.auto import tqdm as tqdm_auto

from sklearn.model_selection import GroupShuffleSplit

In [None]:
def seed_everything(seed):
    """Seed the random number generators this notebook draws from.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Note: the running interpreter fixed its hash seed at start-up, so the
    environment variable only matters for processes launched afterwards.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

In [None]:
class config:
    """Tunable constants for the cross-validation split built in this notebook."""

    # Split geometry handed to GroupShuffleSplit.
    N_FOLD = 3          # number of independent train/validation splits
    TEST_RATIO = 0.3    # share of user groups placed in each validation split

    # Probability that a validation user has a leading chunk of rows removed
    # from the validation set (compared against np.random.rand() per user).
    N_SAMPLE_TEST = 0.1

    # Seed for both the splitter and the per-fold random draws.
    SEED = 416

In [None]:
# Load the training table and keep only rows whose `answered_correctly` is not -1
# (presumably -1 marks non-question rows such as lectures -- confirm against the dataset).
# The index is renumbered from 0 so that index labels equal row positions; the
# fold-building cell below mixes the two.
train = (
    pd.read_feather("/kaggle/input/riiid-make-train-feather/train.feather")
    .loc[lambda frame: frame["answered_correctly"] != -1]
    .reset_index(drop=True)
)

In [None]:
# Build the validation row positions for every fold.
#
# cv_index["valid"][k] holds the positional row indices (into `train`) of fold k's
# validation set. cv_index["train"] is kept so the dict layout stays the same, but it
# is left empty: training indices are not materialised in this notebook.
cv_index = {"train": [], "valid": []}
gsp = GroupShuffleSplit(config.N_FOLD, test_size=config.TEST_RATIO, random_state=config.SEED)
for fold, (train_idx, valid_idx) in enumerate(gsp.split(train, groups=train['user_id'])):
    print(f"### Fold-{fold} ###")
    # Re-seeding inside the loop means every fold replays the same random stream.
    seed_everything(config.SEED)

    # Sort so that "first rows of a user" below means first in frame order.
    valid_idx = np.sort(valid_idx)
    valid_users = train['user_id'].iloc[valid_idx]
    user_count_dict = valid_users.value_counts().to_dict()

    # For roughly N_SAMPLE_TEST of the validation users, draw how many of their
    # leading rows are taken out of the validation set (Binomial(n_rows, 0.5)).
    # The draws happen in the same order as value_counts() lists the users, so the
    # outcome is reproducible for a given seed.
    rows_to_release = {}
    for user, n_rows in tqdm_auto(user_count_dict.items()):
        if np.random.rand() > (1 - config.N_SAMPLE_TEST):
            samples_to_add = np.random.binomial(n_rows, 0.50)
            if samples_to_add > 0:
                rows_to_release[user] = samples_to_add

    # Drop the released rows in one vectorised pass instead of scanning the whole
    # frame once per user: a row stays in validation when its 0-based rank within
    # its user's rows is at least that user's quota (quota 0 = keep everything).
    row_rank = valid_users.groupby(valid_users.to_numpy()).cumcount().to_numpy()
    release_quota = valid_users.map(rows_to_release).fillna(0).to_numpy()
    valid_idx = valid_idx[row_rank >= release_quota]

    cv_index["valid"].append(valid_idx)

In [None]:
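# Optional (currently disabled): persist the raw per-fold index arrays with pickle.
# Uncomment to write riiid_cv_index.pkl in addition to the CSV exported at the end.
#with open("riiid_cv_index.pkl", "wb") as f:
#    pickle.dump(cv_index, f)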

In [None]:
# Only `row_id` is needed from here on. Rebinding `train` to a one-column frame
# drops this name's reference to the full table; the explicit collection that
# follows asks Python to reclaim whatever became unreachable.
train = train.loc[:, ["row_id"]]
gc.collect()

In [None]:
# Collapse the per-fold validation indices into a single label per row:
# -1 means the row is in no validation set, otherwise the fold number.
#
# The labels are filled in a NumPy buffer and assigned as a column once. The previous
# chained form `train["valid_fold"].iloc[...] = fold` writes through a temporary
# Series, which pandas warns about and which does not update `train` at all when
# Copy-on-Write is enabled.
#
# NOTE(review): the folds are written in order, so a row that appears in more than
# one fold's validation set ends up labelled with the highest such fold. Shuffle-style
# splits are drawn independently and can overlap -- confirm this is intended.
valid_fold = np.full(len(train), -1, dtype=np.int64)
for fold, v_idx in enumerate(cv_index["valid"]):
    # Cast defensively: an empty index array may arrive with a float dtype.
    valid_fold[np.asarray(v_idx, dtype=np.int64)] = fold
train["valid_fold"] = valid_fold

train.to_csv("cv_fold_info.csv", index=False)