Some of the comments on the notebooks posted here remarked that improvements in the out-of-fold (OOF) score often lead to a worse score. I believe this is related to the way the sampling is done in the CV.

For instance, this notebook

https://www.kaggle.com/nishanthaddagatla/lgbm-baseline

and many others split the whole matrix consisting of pairs ['stock_id', 'time_id'] into five folds.

In the training dataset, the majority of the time_ids include all 112 stock ids, and only a few of them include just 111. This means that the sampling (i.e. the split into training and test datasets) is done by 'time_id' rather than by the pair ['stock_id', 'time_id']. This suggests that the CV should do the same.

I tried doing the CV both ways. Doing the CV by pairs ['stock_id', 'time_id'] suggests that LGBM keeps improving when we take the number of iterations up to 2000 (notice that LGBM rarely finishes through early stopping this way, which means you don't even need the validation set). However, with this number of iterations the score worsens.

If you do the splitting into folds via the time id, it indeed shows that the score worsens with 2000 iterations.

In [None]:
import cupy as cp
import cudf

import pandas as pd
import numpy as np

import cuml
import glob
from tqdm import tqdm
import lightgbm as lgb

from sklearn.model_selection import KFold
from sklearn import preprocessing, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

# Root directory of the competition data; the CSV and parquet inputs below are located relative to it.
PATH = "/kaggle/input/optiver-realized-volatility-prediction"

# Feature engineering

taken mostly from here 
https://www.kaggle.com/nishanthaddagatla/lgbm-baseline

and here
https://www.kaggle.com/alexioslyon/accelerating-trading-on-gpu-via-rapids

In [None]:
# Clusters for feature engineering

def load_data(mode, path="/kaggle/input/optiver-realized-volatility-prediction"):
    """Read ``<path>/<mode>.csv`` into a pandas DataFrame.

    Args:
        mode: Base name of the CSV file without extension, e.g. "train" or "test".
        path: Directory that contains the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = f'{path}/{mode}.csv'
    frame = pd.read_csv(csv_path)
    return frame

# Load train.csv and reshape the targets into a time_id x stock_id matrix
# (one column per stock).
dev_df = load_data("train", path=PATH)
train_p = dev_df.pivot(index='time_id', columns='stock_id', values='target')
train_p.head()

# use correlation matrix for relative error

# Pairwise correlation between the stocks' target columns; the index of `corr`
# holds the stock ids.
corr = train_p.corr()
ids = corr.index
# Cluster the stocks into 7 groups, using each stock's row of correlations as its feature vector.
kmeans = KMeans(n_clusters=7, random_state=0).fit(corr.values)

# l[n] is the list of stock ids assigned to cluster n.
# (ids+1) * mask zeroes out non-members; the +1 shift keeps stock id 0 from
# being dropped by the `x > 0` filter and is undone by `x-1`.
l = []
for n in range(7):
    l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )

for lst in l:
    print(lst)
    
# Only clusters 0, 1, 3, 4 and 6 are kept for the cluster features.
# NOTE(review): the reason clusters 2 and 5 are skipped is not visible here
# (presumably chosen from the printed lists above) - confirm.
clusters = [l[0], l[1], l[3], l[4], l[6]]

# order and trade books
# One parquet path per match of <stock directory>/<file>; glob.glob does not
# guarantee any particular ordering of the returned paths.
order_book_training = glob.glob(f'{PATH}/book_train.parquet/*/*')
order_book_test = glob.glob(f'{PATH}/book_test.parquet/*/*')

trades_training = glob.glob(f'{PATH}/trade_train.parquet/*/*')
trades_test = glob.glob(f'{PATH}/trade_test.parquet/*/*')

In [None]:
# Change into the utilities dataset directory - presumably so that the
# `cu_utils` package imported in the next cell can be found.
%cd /kaggle/input/rapids-kaggle-utils/

In [None]:
import cu_utils.transform as cutran

def log_diff(df, in_col):
    """Return ``log(df[in_col]) - log(shifted df[in_col])`` computed per ``time_id``.

    The shifted series comes from ``cutran.get_cu_shift_transform(shift_by=1, ...)``
    applied to each ``time_id`` group; positions where that transform emits the
    sentinel ``null_val`` are set to 0.0 in the result.
    NOTE(review): presumably the transform yields the previous row's value within
    the group and the sentinel for the first row - confirm against cu_utils.

    Side effect: a ``"logx"`` column holding ``log(df[in_col])`` is written to
    ``df``; it is not read back inside this function.

    Args:
        df: cuDF DataFrame with a ``time_id`` column and the column ``in_col``.
        in_col: Name of the column to difference.

    Returns:
        A series with the per-row log difference.
    """
    null_val = -9999
    df["logx"] = df[in_col].log()

    shift_kernel = cutran.get_cu_shift_transform(shift_by=1, null_val=null_val)
    per_time_id = df[["time_id", in_col]].groupby("time_id")
    shifted_frame = per_time_id.apply_grouped(
        shift_kernel,
        incols={in_col: 'x'},
        outcols=dict(y_out=cp.float32),
        tpb=32,
    )
    shifted = shifted_frame["y_out"]

    res = df[in_col].log() - shifted.log()
    # Rows that received the sentinel get a zero log return.
    no_previous = shifted == null_val
    res[no_previous] = 0.0
    return res

def realized_volatility(s):
    """Return ``s.sum()``.

    Despite the name, no squaring or square root is applied here; the function
    only sums whatever values it is given.
    """
    total = s.sum()
    return total

def extract_raw_book_features(df, null_val=-9999):
    """Add per-row order-book features to ``df`` in place and return it.

    Columns added, in this order: ``wap1``..``wap4``, ``square_log_return1``..
    ``square_log_return4`` (squared ``log_diff`` of each wap), ``wap_balance``,
    ``price_spread``, ``price_spread1``, ``bid_spread``, ``ask_spread``,
    ``bid_ask_spread``, ``total_volume`` and ``volume_imbalance``.

    Args:
        df: Order-book frame with ``bid_price{1,2}``, ``ask_price{1,2}``,
            ``bid_size{1,2}``, ``ask_size{1,2}`` plus whatever ``log_diff`` needs.
        null_val: Accepted for interface compatibility; not used in the body.

    Returns:
        The same ``df`` object with the new columns.
    """
    bid_p1, ask_p1 = df['bid_price1'], df['ask_price1']
    bid_p2, ask_p2 = df['bid_price2'], df['ask_price2']
    bid_s1, ask_s1 = df['bid_size1'], df['ask_size1']
    bid_s2, ask_s2 = df['bid_size2'], df['ask_size2']

    # wap1/wap2 weight each price by the opposite side's size;
    # wap3/wap4 weight each price by its own side's size.
    df['wap1'] = (bid_p1 * ask_s1 + ask_p1 * bid_s1) / (bid_s1 + ask_s1)
    df['wap2'] = (bid_p2 * ask_s2 + ask_p2 * bid_s2) / (bid_s2 + ask_s2)
    df['wap3'] = (bid_p1 * bid_s1 + ask_p1 * ask_s1) / (bid_s1 + ask_s1)
    df['wap4'] = (bid_p2 * bid_s2 + ask_p2 * ask_s2) / (bid_s2 + ask_s2)
    for level in (1, 2, 3, 4):
        df[f"square_log_return{level}"] = log_diff(df, in_col=f"wap{level}") ** 2

    df['wap_balance'] = df['wap1'] - df['wap2']
    # Spreads relative to the mid price of the corresponding level.
    df['price_spread'] = (ask_p1 - bid_p1) / ((ask_p1 + bid_p1) / 2)
    df['price_spread1'] = (ask_p2 - bid_p2) / ((ask_p2 + bid_p2) / 2)
    df['bid_spread'] = bid_p1 - bid_p2
    df['ask_spread'] = ask_p1 - ask_p2
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])

    ask_depth = ask_s1 + ask_s2
    bid_depth = bid_s1 + bid_s2
    df['total_volume'] = ask_depth + bid_depth
    df['volume_imbalance'] = abs(ask_depth - bid_depth)

    return df


def extract_raw_trade_features(df):
    """Add per-row trade features to ``df`` in place and return it.

    Columns added: ``trade_log_return`` (``log_diff`` of ``price``),
    ``trade_square_log_return``, ``tendency`` (log return times ``size``),
    the boolean flags ``goes_up`` / ``goes_down`` and ``amount``
    (``price`` times ``size``).
    """
    df["trade_log_return"] = log_diff(df, in_col="price")
    log_ret = df["trade_log_return"]

    df["trade_square_log_return"] = log_ret ** 2
    df["tendency"] = log_ret * df['size']
    df["goes_up"] = log_ret > 0
    df["goes_down"] = log_ret < 0
    df["amount"] = df['price'] * df['size']

    return df


def agg(df, feature_dict):
    """Aggregate ``df`` per ``time_id`` and flatten the resulting column names.

    Args:
        df: Frame with a ``time_id`` column.
        feature_dict: Mapping ``column -> list of aggregation names`` passed to
            ``groupby(...).agg``.

    Returns:
        A frame with one row per ``time_id`` and columns named
        ``<column>_<aggregation>`` (the ``time_id`` key keeps its plain name).
        Columns whose name contains ``square_log_return`` together with
        ``mean`` or ``sum`` are replaced by their square root.
    """
    agg_df = df.groupby("time_id").agg(feature_dict).reset_index()

    # Two-level names become "<column>_<stat>"; an empty second level (the
    # group key) keeps only the first level.
    agg_df.columns = [
        col[0] if col[1] == "" else col[0] + "_" + col[1]
        for col in agg_df.columns
    ]

    vol_columns = [
        name for name in agg_df.columns
        if 'square_log_return' in name and ('mean' in name or 'sum' in name)
    ]
    if not vol_columns:
        return agg_df

    agg_df[vol_columns] = agg_df[vol_columns].sqrt()
    return agg_df


def extract_book_stats(df):
    """Aggregate the full-window order-book features per ``time_id`` via ``agg``.

    The four waps get sum and std, the four squared log returns get sum, and
    the spread/volume features get sum and max.
    """
    wap_cols = ('wap1', 'wap2', 'wap3', 'wap4')
    return_cols = (
        'square_log_return1',
        'square_log_return2',
        'square_log_return3',
        'square_log_return4',
    )
    spread_volume_cols = (
        'price_spread',
        'price_spread1',
        'wap_balance',
        'bid_spread',
        'ask_spread',
        'total_volume',
        'volume_imbalance',
        "bid_ask_spread",
    )
    feature_dict = {
        **{col: ["sum", "std"] for col in wap_cols},
        **{col: ["sum"] for col in return_cols},
        **{col: ["sum", "max"] for col in spread_volume_cols},
    }

    return agg(df, feature_dict)
    
    
def extract_trade_stats(df):
    """Aggregate the full-window trade features per ``time_id`` via ``agg``."""
    # (column, aggregations) pairs, in the order the output columns should appear.
    spec = (
        ('price', ['std']),
        ('trade_square_log_return', ["sum"]),
        ('seconds_in_bucket', ["nunique"]),
        ('size', ["sum", 'max', 'min', 'std']),
        ('order_count', ["sum", 'max']),
        ('amount', ['sum', 'max', 'min']),
        ('tendency', ['sum']),
        ('goes_up', ['sum']),
        ('goes_down', ['sum']),
    )
    return agg(df, dict(spec))


def extract_book_stats_time(df):
    """Aggregate only the four squared log returns (sum) per ``time_id``.

    Used for the partial-window statistics.
    """
    return_cols = (
        'square_log_return1',
        'square_log_return2',
        'square_log_return3',
        'square_log_return4',
    )
    return agg(df, {col: ["sum"] for col in return_cols})

def extract_trade_stats_time(df):
    """Aggregate the reduced trade feature set per ``time_id``.

    Used for the partial-window statistics: summed squared log return, number
    of distinct ``seconds_in_bucket`` values, summed size and summed order count.
    """
    feature_dict = {}
    feature_dict['trade_square_log_return'] = ["sum"]
    feature_dict['seconds_in_bucket'] = ["nunique"]
    feature_dict['size'] = ["sum"]
    feature_dict['order_count'] = ["sum"]
    return agg(df, feature_dict)


def time_constraint_fe(df, stats_df, seconds_from, func):
    """Left-join statistics of the late part of each bucket onto ``stats_df``.

    Args:
        df: Raw per-row frame with ``seconds_in_bucket`` and ``time_id``.
        stats_df: Per-``time_id`` statistics to extend.
        seconds_from: Only rows with ``seconds_in_bucket >= seconds_from`` are used.
        func: Callable turning the filtered frame into per-``time_id`` statistics.

    Returns:
        ``stats_df`` merged (left join on ``time_id``) with ``func``'s output;
        clashing column names from the right side get the suffix
        ``_<seconds_from>``.
    """
    in_window = df["seconds_in_bucket"] >= seconds_from
    window_df = df[in_window].reset_index(drop=True)
    window_stats = func(window_df)
    right_suffix = f'_{seconds_from}'
    return stats_df.merge(
        window_stats,
        on="time_id",
        how="left",
        suffixes=('', right_suffix),
    )

# Function to get group stats for the stock_id and time_id
def get_time_stock(df):
    """Append per-stock and per-time group statistics of the volatility columns.

    For each of the twelve volatility columns the mean, std, max and min are
    computed once per ``stock_id`` (suffix ``_stock``) and once per ``time_id``
    (suffix ``_time``), then left-joined back onto every row.

    Args:
        df: Frame with ``stock_id``, ``time_id`` and the volatility columns.

    Returns:
        A new frame: the input columns followed by the stock-level and then the
        time-level statistics. The input frame itself is not modified.
    """
    vol_cols = [
        'square_log_return1_sum', 'square_log_return2_sum',
        'square_log_return1_sum_400', 'square_log_return2_sum_400',
        'square_log_return1_sum_300', 'square_log_return2_sum_300',
        'square_log_return1_sum_200', 'square_log_return2_sum_200',
        'trade_square_log_return_sum', 'trade_square_log_return_sum_400',
        'trade_square_log_return_sum_300', 'trade_square_log_return_sum_200',
    ]
    # (group key, column suffix, name of the key column after flattening)
    groupings = (
        ('stock_id', 'stock', 'stock_id__stock'),
        ('time_id', 'time', 'time_id__time'),
    )

    # Compute both sets of statistics on the untouched input first.
    stat_frames = []
    for key, tag, flat_key in groupings:
        stats = df.groupby([key])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
        # Flatten the two-level names and tag them with the grouping; the key
        # column ends up as e.g. "stock_id__stock".
        stats.columns = ['_'.join(col) + '_' + tag for col in stats.columns]
        stat_frames.append((key, flat_key, stats))

    # Merge with original dataframe, stock statistics first, then time statistics.
    for key, flat_key, stats in stat_frames:
        df = df.merge(stats, how='left', left_on=[key], right_on=[flat_key])

    # The flattened key columns duplicate stock_id / time_id.
    df.drop(['stock_id__stock', 'time_id__time'], axis=1, inplace=True)
    return df
    

def feature_engineering(book_path, trade_path):
    """Build the per-``time_id`` feature table for one stock.

    Args:
        book_path: Parquet file with the stock's order-book rows.
        trade_path: Parquet file with the stock's trade rows.

    Returns:
        Book statistics left-joined with trade statistics on ``time_id``. Both
        contain full-window statistics plus partial-window statistics for rows
        with ``seconds_in_bucket`` >= 100, 200, 300, 400 and 500.
    """
    # Full-window stats for the order book.
    book_df = extract_raw_book_features(cudf.read_parquet(book_path))
    book_stats = extract_book_stats(book_df)

    # Full-window stats for the trades.
    trade_df = extract_raw_trade_features(cudf.read_parquet(trade_path))
    trade_stats = extract_trade_stats(trade_df)

    # Partial-window stats.
    for seconds_from in (100, 200, 300, 400, 500):
        book_stats = time_constraint_fe(book_df, book_stats, seconds_from, extract_book_stats_time)
        trade_stats = time_constraint_fe(trade_df, trade_stats, seconds_from, extract_trade_stats_time)

    def _mark_trade_seconds(name):
        # Prefix the seconds_in_bucket statistics that come from the trade table.
        if name.startswith("seconds_in_bucket"):
            return "trade_" + name
        return name

    trade_stats = trade_stats.rename(columns=_mark_trade_seconds)
    return book_stats.merge(trade_stats, on="time_id", how="left")
    
def add_tau_features(df):
    """Add the ``size_tau*`` features to ``df`` in place and return it.

    ``size_tau`` and its ``_400/_300/_200`` variants are ``sqrt(1 / n)`` where
    ``n`` is the matching ``trade_seconds_in_bucket_nunique`` column.
    ``size_tau2`` and its variants are ``sqrt(c / order_count_sum)`` with
    ``c`` = 1, 0.33, 0.5 and 0.66 respectively. ``size_tau2_d`` is
    ``size_tau2_400 - size_tau2``.
    """
    nunique_sources = (
        ('size_tau', 'trade_seconds_in_bucket_nunique'),
        ('size_tau_400', 'trade_seconds_in_bucket_nunique_400'),
        ('size_tau_300', 'trade_seconds_in_bucket_nunique_300'),
        ('size_tau_200', 'trade_seconds_in_bucket_nunique_200'),
    )
    for target, source in nunique_sources:
        df[target] = np.sqrt(1 / df[source])

    order_count_scales = (
        ('size_tau2', 1),
        ('size_tau2_400', 0.33),
        ('size_tau2_300', 0.5),
        ('size_tau2_200', 0.66),
    )
    for target, scale in order_count_scales:
        df[target] = np.sqrt(scale / df['order_count_sum'])

    # delta tau
    df['size_tau2_d'] = df['size_tau2_400'] - df['size_tau2']
    return df

def add_cluster_features(df, l):
    """Append per-cluster mean features for every ``time_id``.

    For each cluster (a collection of stock ids) the selected features are
    averaged over the cluster's stocks within each ``time_id``. The per-cluster
    tables are pivoted so every (feature, cluster) pair becomes one column named
    ``<feature>_c<cluster index>``, and the result is left-joined onto ``df``.

    Args:
        df: cuDF frame with ``stock_id``, ``time_id`` and the feature columns.
        l: Sequence of clusters, each a collection of stock ids.

    Returns:
        ``df`` merged with the wide cluster-feature table on ``time_id``.
    """
    features = [
         'square_log_return1_sum',
         'total_volume_sum',
         'size_sum',
         'order_count_sum',
         'price_spread_sum',
         'bid_spread_sum',
         'ask_spread_sum',
         'volume_imbalance_sum',
         'bid_ask_spread_sum',
         'size_tau2',
    ]
    selected_cols = ['time_id'] + features

    cluster_frames = []
    for n, ind in enumerate(l):
        members = df[selected_cols][df['stock_id'].isin(ind)]
        if members.shape[0] > 0:
            cluster_df = members.groupby(['time_id']).agg("mean").reset_index()
        else:
            # No stock of this cluster is present: emit zeros for every time_id
            # so the cluster's columns still exist after the pivot.
            cluster_df = cudf.DataFrame()
            cluster_df['time_id'] = df['time_id'].unique()
            for name in features:
                cluster_df[name] = 0.0

        # Label the rows with the cluster tag that becomes the column suffix.
        cluster_df['stock_id'] = "c" + str(n)
        cluster_frames.append(cluster_df)

    wide = cudf.concat(cluster_frames).reset_index()
    if 'index' in wide.columns:
        wide = wide.drop(['index'], axis=1)
    wide = wide.pivot(index='time_id', columns='stock_id')
    wide.columns = ["_".join(x) for x in wide.columns.ravel()]
    wide.reset_index(inplace=True)

    return cudf.merge(df, wide, how='left', on='time_id')
    

def _stock_id_from_path(path):
    """Parse the integer after the first ``=`` in ``path`` (``.../stock_id=<N>/<file>``)."""
    return int(path.split("=")[1].split("/")[0])


def process_data(order_book_paths, trade_paths, clusters):
    """Build the full feature table for all stocks.

    Each stock's order-book file is paired with the trade file of the *same*
    stock id. The pairing is done by the stock id parsed from the paths rather
    than by list position, because the path lists come from ``glob.glob``,
    which returns matches in arbitrary order; positional pairing could silently
    combine the book of one stock with the trades of another.

    Args:
        order_book_paths: Paths of the order-book parquet files, each
            containing ``stock_id=<N>/``.
        trade_paths: Paths of the trade parquet files, same naming scheme.
        clusters: Stock-id clusters forwarded to ``add_cluster_features``.

    Returns:
        The concatenated per-stock features (stocks in ascending id order) with
        the stock/time statistics, tau features and cluster features added.

    Raises:
        ValueError: If the two path lists do not describe the same stocks.
    """
    # sorted() is stable, so several files of one stock keep their given order.
    book_sorted = sorted(order_book_paths, key=_stock_id_from_path)
    trade_sorted = sorted(trade_paths, key=_stock_id_from_path)
    if len(book_sorted) != len(trade_sorted):
        raise ValueError(
            f"Got {len(book_sorted)} order book paths but {len(trade_sorted)} trade paths"
        )

    stock_dfs = []
    for book_path, trade_path in tqdm(list(zip(book_sorted, trade_sorted))):
        stock_id = _stock_id_from_path(book_path)
        trade_stock_id = _stock_id_from_path(trade_path)
        if trade_stock_id != stock_id:
            raise ValueError(
                f"Order book of stock {stock_id} has no matching trade file "
                f"(closest candidate belongs to stock {trade_stock_id})"
            )

        df = feature_engineering(book_path, trade_path)
        df["stock_id"] = stock_id
        stock_dfs.append(df)

    total_df = cudf.concat(stock_dfs)
    total_df = get_time_stock(total_df)
    total_df = add_tau_features(total_df)
    total_df = add_cluster_features(total_df, clusters)

    return total_df

# Build the feature tables for the train and test files with cuDF, then
# convert them to pandas DataFrames for the rest of the notebook.
train = process_data(order_book_training, trades_training, clusters).to_pandas()
test = process_data(order_book_test, trades_test, clusters).to_pandas()

def add_row_id(df):
    """Add a ``row_id`` column of the form ``"<stock_id>-<time_id>"`` to ``df``.

    Both ids are converted to integers first, so float-typed id columns yield
    ``"12-31"`` rather than ``"12.0-31.0"``. The column is built with
    vectorised string operations (the same construction ``read_train_test``
    uses) instead of a row-wise ``apply``, which is far slower on a wide frame
    and does not produce a usable column for an empty frame.

    Args:
        df: pandas DataFrame with ``stock_id`` and ``time_id`` columns that
            contain no missing values.

    Returns:
        The same ``df`` object with the ``row_id`` column added.
    """
    stock_part = df['stock_id'].astype('int64').astype(str)
    time_part = df['time_id'].astype('int64').astype(str)
    df['row_id'] = stock_part + '-' + time_part
    return df

# Attach the "<stock_id>-<time_id>" key that is later used to merge the
# features with the rows of train.csv / test.csv.
train = add_row_id(train)
test = add_row_id(test)

In [None]:
# Print the test and train column names side by side, position by position,
# so a mismatch between the two feature tables is easy to spot.
# train.columns is indexed with test's positions, so this assumes train has at
# least as many columns as test.
for i, name in enumerate(test.columns):
    print(f"{i}.\t{name}\t{train.columns[i]}")

In [None]:
# Switch to the working directory; read_train_test() reads its CSV files via
# paths relative to the current directory ('../input/...').
%cd /kaggle/working/

In [None]:
# Function to read our base train and test set
def read_train_test():
    """Read the competition's train.csv and test.csv and add a ``row_id`` key.

    The files are read via paths relative to the current working directory.
    ``row_id`` is ``"<stock_id>-<time_id>"`` and serves as the key for merging
    with the book and trade features.

    Returns:
        Tuple ``(train, test)`` of pandas DataFrames.
    """
    train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
    test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

    # Create a key to merge with book and trade data
    for frame in (train, test):
        frame['row_id'] = frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)

    n_rows = train.shape[0]
    print(f'Our training set has {n_rows} rows')
    return train, test

# train_y / test_y are the raw train.csv / test.csv rows (with row_id added).
train_y, test_y = read_train_test()

# Merge with books with target

# Make sure the merge key is a string on both sides.
train['row_id'] = train['row_id'].astype(str)
train_y['row_id'] = train_y['row_id'].astype(str)

# stock_id and time_id are dropped from the CSV side because the feature tables
# carry their own copies. The left join keeps every CSV row; a row without a
# matching feature row gets missing values in the feature columns.
train_ = train_y.drop(['stock_id', 'time_id'], axis=1).merge(train, on = ['row_id'], how = 'left')
test_ = test_y.drop(['stock_id', 'time_id'], axis=1).merge(test, on = ['row_id'], how = 'left')

# LGBM



In [None]:
# Function to early stop with root mean squared percentage error
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Computed as ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation callback reporting RMSPE.

    Args:
        y_pred: Predictions for the dataset being evaluated.
        lgb_train: Dataset object exposing ``get_label()`` for the true values.

    Returns:
        The tuple ``('RMSPE', value, False)``.
        NOTE(review): the trailing ``False`` is presumably LightGBM's
        "is higher better" flag - confirm against the LightGBM docs.
    """
    labels = lgb_train.get_label()
    score = rmspe(labels, y_pred)
    return 'RMSPE', score, False

def train_and_evaluate_lgb_cvtimeid(train, test, iters=500):
    """Run 5-fold LightGBM cross-validation with folds drawn over ``time_id``.

    The unique ``time_id`` values are split into folds, so all rows sharing a
    ``time_id`` land in the same fold. Each fold's model predicts its held-out
    rows (out-of-fold predictions) and the ``test`` rows; the test predictions
    are averaged over the folds. The out-of-fold RMSPE is printed.

    Args:
        train: DataFrame with ``time_id``, ``target``, ``row_id``, ``stock_id``
            and the feature columns.
        test: DataFrame containing at least the feature columns of ``train``.
        iters: Number of boosting rounds per fold.

    Returns:
        Tuple ``(test_predictions, oof_predictions)`` of NumPy arrays.
    """
    # Every column except the identifiers and the label is a model feature.
    non_features = {"time_id", "target", "row_id"}
    features = [name for name in train.columns if name not in non_features]
    stock_id_pos = features.index('stock_id')

    seed = 2021
    n_folds = 5

    # Hyperparameters (just basic)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'max_depth': -1,
        'max_bin':100,
        'min_data_in_leaf':500,
        'learning_rate': 0.05,
        'subsample': 0.72,
        'subsample_freq': 4,
        'feature_fraction': 0.5,
        'lambda_l1': 0.5,
        'lambda_l2': 1.0,
        'categorical_column':[stock_id_pos],
        'seed':seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'n_jobs':-1,
        'verbose': -1}

    y = train['target']
    oof_predictions = np.zeros(train.shape[0])
    test_predictions = np.zeros(test.shape[0])
    kfold = KFold(n_splits=n_folds, random_state=2021, shuffle=True)

    # Split the time ids, not the rows, then translate back to row masks.
    unique_time_ids = train['time_id'].unique()
    for fold, (trn_pos, _val_pos) in enumerate(kfold.split(unique_time_ids)):
        print(f'Training fold {fold + 1}')

        in_train = train['time_id'].isin(unique_time_ids[trn_pos])
        in_val = ~in_train

        x_train = train.loc[in_train, features]
        x_val = train.loc[in_val, features]
        y_train = y[in_train]
        y_val = y[in_val]

        # Weighting squared error by 1 / y^2 turns it into squared percentage error.
        train_dataset = lgb.Dataset(x_train, y_train, weight=1 / np.square(y_train))
        val_dataset = lgb.Dataset(x_val, y_val, weight=1 / np.square(y_val))

        # NOTE(review): early_stopping_rounds=0 is presumably meant to switch
        # early stopping off - confirm for the installed LightGBM version.
        model = lgb.train(
            params=params,
            train_set=train_dataset,
            num_boost_round=iters,
            valid_sets=[train_dataset, val_dataset],
            feval=feval_rmspe,
            early_stopping_rounds=0,
            verbose_eval=0,
        )

        # Out-of-fold predictions for the held-out rows of this fold.
        oof_predictions[in_val] = model.predict(x_val)
        # Each fold contributes an equal share to the averaged test prediction.
        test_predictions += model.predict(test[features]) / n_folds

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    return test_predictions, oof_predictions

def train_and_evaluate_lgb_cvtimeid_stockid(train, test, iters=500):
    """Run 5-fold LightGBM cross-validation with folds drawn over individual rows.

    The rows of ``train`` (i.e. the (stock_id, time_id) pairs) are split into
    folds directly, so rows sharing a ``time_id`` can end up in different
    folds. Each fold's model predicts its held-out rows (out-of-fold
    predictions) and the ``test`` rows; the test predictions are averaged over
    the folds. The out-of-fold RMSPE is printed.

    Args:
        train: DataFrame with ``time_id``, ``target``, ``row_id``, ``stock_id``
            and the feature columns.
        test: DataFrame containing at least the feature columns of ``train``.
        iters: Number of boosting rounds per fold.

    Returns:
        Tuple ``(test_predictions, oof_predictions)`` of NumPy arrays.
    """
    # Every column except the identifiers and the label is a model feature.
    non_features = {"time_id", "target", "row_id"}
    features = [name for name in train.columns if name not in non_features]
    stock_id_pos = features.index('stock_id')

    seed = 2021
    n_folds = 5

    # Hyperparameters (just basic)
    params = {
        'objective': 'rmse',
        'boosting_type': 'gbdt',
        'max_depth': -1,
        'max_bin':100,
        'min_data_in_leaf':500,
        'learning_rate': 0.05,
        'subsample': 0.72,
        'subsample_freq': 4,
        'feature_fraction': 0.5,
        'lambda_l1': 0.5,
        'lambda_l2': 1.0,
        'categorical_column':[stock_id_pos],
        'seed':seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'n_jobs':-1,
        'verbose': -1}

    y = train['target']
    oof_predictions = np.zeros(train.shape[0])
    test_predictions = np.zeros(test.shape[0])
    kfold = KFold(n_splits=n_folds, random_state=2021, shuffle=True)

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
        print(f'Training fold {fold + 1}')

        x_train = train.iloc[trn_ind][features]
        x_val = train.iloc[val_ind][features]
        y_train = y.iloc[trn_ind]
        y_val = y.iloc[val_ind]

        # Weighting squared error by 1 / y^2 turns it into squared percentage error.
        train_dataset = lgb.Dataset(x_train, y_train, weight=1 / np.square(y_train))
        val_dataset = lgb.Dataset(x_val, y_val, weight=1 / np.square(y_val))

        # NOTE(review): early_stopping_rounds=0 is presumably meant to switch
        # early stopping off - confirm for the installed LightGBM version.
        model = lgb.train(
            params=params,
            train_set=train_dataset,
            num_boost_round=iters,
            valid_sets=[train_dataset, val_dataset],
            feval=feval_rmspe,
            early_stopping_rounds=0,
            verbose_eval=0,
        )

        # Out-of-fold predictions for the held-out rows of this fold.
        oof_predictions[val_ind] = model.predict(x_val)
        # Each fold contributes an equal share to the averaged test prediction.
        test_predictions += model.predict(test[features]) / n_folds

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    return test_predictions, oof_predictions

# OOF by pairs ['time_id', 'stock_id']

We compare the OOF score with out-of-sample score, where out-of-sample is taken randomly from the train set by taking 20% of the time_ids, and the remaining 80% are treated as the training set

In [None]:
def get_split_train_test(train, frac, seed=414928305):
    """Split ``train`` into two parts by randomly assigning whole ``time_id``s.

    Every unique ``time_id`` is assigned to the first part with probability
    ``frac``, independently of the others; all rows of a ``time_id`` go to the
    same part. The draw uses a local ``RandomState(seed)``, which produces the
    same mask as ``np.random.seed(seed)`` followed by ``np.random.rand`` but
    leaves NumPy's global random state untouched.

    Args:
        train: DataFrame with a ``time_id`` column.
        frac: Probability for a ``time_id`` to land in the first part.
        seed: Seed of the random assignment.

    Returns:
        Tuple ``(train_split, valid_split)`` of DataFrames.
    """
    all_time_ids = train['time_id'].unique()

    rng = np.random.RandomState(seed)
    msk = rng.rand(len(all_time_ids)) < frac

    train_time_ids = all_time_ids[msk]
    valid_time_ids = all_time_ids[~msk]

    train_split = train[train['time_id'].isin(train_time_ids)]
    valid_split = train[train['time_id'].isin(valid_time_ids)]

    return train_split, valid_split

def do_split_experiment(train, train_and_eval_func, seed=57001):
    """Compare the OOF score with a held-out score for several iteration counts.

    ``train`` is split by ``time_id`` into roughly 80% / 20%. For each iteration
    count, ``train_and_eval_func`` is run on the 80% part (producing
    out-of-fold predictions) with the 20% part passed as its test set. Both
    RMSPE curves are plotted against the iteration count: out-of-fold in blue,
    held-out in red.

    Args:
        train: DataFrame with ``time_id``, ``target`` and the feature columns.
        train_and_eval_func: Callable ``(train, test, iters=...)`` returning
            ``(test_predictions, oof_predictions)``.
        seed: Seed for the train/held-out split.
    """
    train_split, valid_split = get_split_train_test(train, 0.8, seed=seed)
    # Plain float arrays of the targets, detached from the DataFrame index.
    train_split_target = np.array(train_split['target'], dtype=float)
    valid_split_target = np.array(valid_split['target'], dtype=float)

    # learn LGBM
    iters = [300, 600, 900, 1200, 1500]
    errs_oof, errs_val = [], []
    for n_iters in iters:
        held_out_pred, oof_pred = train_and_eval_func(
            train_split, valid_split, iters=n_iters)
        errs_oof.append(rmspe(train_split_target, oof_pred))
        errs_val.append(rmspe(valid_split_target, held_out_pred))

    plt.clf()
    for errors, colour in ((errs_oof, 'blue'), (errs_val, 'red')):
        plt.plot(iters, errors, color=colour)
    plt.show()

In the plot above, the blue line is the OOF score and the red line is the out-of-sample score. We can see that the two lines are inconsistent.

# OOF by 'time_id'

We compare the OOF score with out-of-sample score, where out-of-sample is taken randomly from the train set by taking 20% of the time_ids

this looks better (colors same as above)