**The goal of this notebook is to show how Python slices can be used to create cross-validation folds.**

Suggestions to improve the approach and/or its performance are welcome in the comments.

# Imports

In [None]:
import itertools
import typing as T
import numpy as np
import pandas as pd

# Configs

In [None]:
# location of the training CSV, given relative to the notebook's working directory
TRAIN_CSV: str = '../input/ubiquant-market-prediction/train.csv'

# Schemas

In [None]:
# identifier / label columns and the 32-bit dtype each one is parsed with
metas_dtypes = dict(time_id='int32', investment_id='int32', target='float32')
# the 300 feature columns f_0 .. f_299 all share the same float dtype
feats_dtypes = dict.fromkeys((f'f_{i}' for i in range(300)), 'float32')
# full schema: metadata columns first, then the features (insertion order is kept)
train_dtypes = dict(metas_dtypes, **feats_dtypes)
len(train_dtypes)

# Datasets

In [None]:
# parse the CSV with the explicit schema; `row_id` becomes the index (row labels)
df = pd.read_csv(TRAIN_CSV, dtype=train_dtypes, index_col='row_id')
# sanity checks: only the two declared dtypes are present, and every schema column was loaded
expected_dtypes = {np.dtype('int32'), np.dtype('float32')}
assert set(df.dtypes) == expected_dtypes
assert df.shape[1] == len(train_dtypes)
df.head()

# Folds

In [None]:
# (fold number, train slice, valid slice, test slice); slice bounds are index labels
Fold = T.Tuple[int, slice, slice, slice]

def fold_index(df: pd.DataFrame, n: int = 5, step: int = 10) -> T.Iterator[Fold]:
    """Yield up to `n` time-ordered folds as `(i, train, valid, test)` label slices.

    Periods are walked backwards from the largest `time_id`. For fold `i`
    (0 is the most recent), with `last = df['time_id'].max()`:

    * `end = last - (i + 1) * step` and `start = end - step`;
    * valid covers the rows with `start <= time_id <= end` (fixed time span);
    * train covers the rows with `time_id < start`;
    * test covers the rows with `end < time_id`.

    Periods are only generated while `start > 0`. Consecutive validation
    windows share one boundary `time_id` (the `end` of fold `i + 1` equals the
    `start` of fold `i`).

    Each slice is `slice(first_label, last_label, 1)` built from the index
    labels of the partition, so both bounds are inclusive. `df.loc[s]` selects
    the partition for any index dtype; plain `df[s]` reads integer bounds as
    positions. Label slicing presumably requires the labels to be unique.

    Args:
        df: frame with a `time_id` column sorted in non-decreasing order.
        n: maximum number of periods to consider (must be > 0).
        step: distance in `time_id` units between `start` and `end`, and
            between consecutive folds (must be > 0).

    Yields:
        `(i, train_slice, valid_slice, test_slice)` where `i` is the period
        number. Generation stops at the first period with no training rows
        (every later period starts even earlier), and a period with no
        validation rows (a gap in `time_id`) is skipped, so fewer than `n`
        folds may be produced and `i` may have gaps. An empty frame yields
        nothing.

    Raises:
        AssertionError: if `n` or `step` is not positive, or `time_id` is not
            sorted in non-decreasing order.
    """
    assert n > 0
    assert step > 0
    time_ids = df['time_id']
    # `Series.is_monotonic` was removed in pandas 2.0; this is the same check
    assert time_ids.is_monotonic_increasing
    if time_ids.empty:
        return
    times = time_ids.to_numpy()
    labels = df.index
    # the column is sorted, so its last value is the maximum
    last_time = int(times[-1])
    # create the start and end iterators (most recent period first)
    starts = range(last_time - 2 * step, 0, -step)
    ends = range(last_time - step, 0, -step)
    # combine starts and ends into numbered periods and keep the first n
    periods = itertools.islice(enumerate(zip(starts, ends)), n)
    for i, (start, end) in periods:
        # Binary search on the sorted column instead of filtering the frame:
        # positions [0, valid_lo) are train, [valid_lo, valid_hi) are valid and
        # [valid_hi, len) are test, so the three parts cover df by construction.
        valid_lo = int(np.searchsorted(times, start, side='left'))
        valid_hi = int(np.searchsorted(times, end, side='right'))
        if valid_lo == 0:
            # no row before `start`: later periods start earlier, so stop here
            break
        if valid_lo == valid_hi:
            # no row inside [start, end]: nothing to validate on for this period
            continue
        # test is never empty here because end < last_time
        train_slice = slice(labels[0], labels[valid_lo - 1], 1)
        valid_slice = slice(labels[valid_lo], labels[valid_hi - 1], 1)
        test_slice = slice(labels[valid_hi], labels[-1], 1)
        # lazily produced (can be cast to a list for convenience)
        yield i, train_slice, valid_slice, test_slice

# materialize the generator so the folds can be indexed and reused below
# (n and step are spelled out with their default values)
folds = list(fold_index(df, n=5, step=10))
folds

# Usage

In [None]:
# get the slices associated with the first fold
i, train_slice, valid_slice, test_slice = folds[0]
# create the train/valid/test dataframe from the slices
train, valid, test = df[train_slice], df[valid_slice], df[test_slice]

In [None]:
# display the rows selected by train_slice
train

In [None]:
# display the rows selected by valid_slice
valid

In [None]:
# display the rows selected by test_slice
test