In [1]:
import warnings
warnings.filterwarnings("ignore")
from itertools import combinations

import gc
import os
import sys
import json


import numpy as np
# %load_ext cudf.pandas
import pandas as pd

import numba as nb
import polars as pl
import lightgbm as lgb

from lightgbm import LGBMRegressor

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

device = 'cpu'

# Functions

In [2]:
@nb.jit(nopython=False, parallel=True)
def reduce_mem_usage(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
        
    return df


@nb.jit(nopython=False, parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in range(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)
    return imbalance_features


@nb.jit(nopython=False, fastmath=True)
def calculate_triplet_imbalance(price, df):
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]
    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)
    return features


@nb.jit(nopython=False, parallel=True)
def calculate_weighted_wap(df):
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
    return df


@nb.jit(nopython=False, parallel=True)
def calculate_rolling_mean_and_std_expressions(df):
    # Convert from pandas to Polars
    pl_df = pl.from_pandas(df)

    # Define the windows and columns for which you want to calculate the rolling statistics
    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    # prepare the operations for each column and window
    group = ["stock_id"]
    expressions = []

    # Loop over each window and column to create the rolling mean and std expressions
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    # Run the operations using Polars' lazy API
    lazy_df = pl_df.lazy().with_columns(expressions)

    # Execute the lazy expressions and overwrite the pl_df variable
    pl_df = lazy_df.collect()

    # Convert back to pandas if necessary
    df = pl_df.to_pandas()
    
    return df

@nb.jit(nopython=False, parallel=True)
def calculate_diff_features(df):
    # Calculate diff features for specific columns
    for col in ['imb_s1', 'ask_price', 'bid_price', 'wap', 'ask_size', 'bid_size', 'weighted_wap', 'price_spread']:
        for window in [3, 5, 10]: # 2
            df[f"{col}_diff_{window}"] = df.groupby(["stock_id", "date_id"])[col].diff(window)
    return df
    
    
@nb.jit(nopython=False, parallel=True)
def calculate_gain_loss_features(df):
    window = 5
    for col in ['imb_s1', 'ask_price', 'bid_price', 'wap']:
        df[f"{col}_gain"] = df[f"{col}_diff_{window}"].where(df[f"{col}_diff_{window}"] > 0, 0)
        df[f"{col}_loss"] = df[f"{col}_diff_{window}"].where(df[f"{col}_diff_{window}"] < 0, 0)
        df[f"{col}_gain_avg"] = df[f"{col}_gain"].rolling(window=window).mean()
        df[f"{col}_loss_avg"] = df[f"{col}_loss"].rolling(window=window).mean()
    return df


@nb.jit(nopython=False, parallel=True)
def calculate_norm_features(df):
    grouped = df.groupby(["stock_id", "date_id"])
    for col in ['imb_s1', 'ask_price', 'bid_price', 'wap']:
        df[f"{col}_cummax"] = grouped[col].cummax() 
        df[f"{col}_cummin"] = grouped[col].cummin() 
        df[f"{col}_min_max_norm"] = df.eval(f'({col} - {col}_cummin) / ({col}_cummax - {col}_cummin)')
        
    columns_to_keep = [col for col in df.columns if 'cummax' not in col]
    df = df[columns_to_keep]
    columns_to_keep = [col for col in df.columns if 'cummin' not in col]
    df = df[columns_to_keep]
    
    return df
    
@nb.jit(nopython=False, parallel=True)
def calculate_bollinger(df):
    # Bollinger Bands 
    periods = [3, 5, 7, 9]
    for col in ['imb_s1', 'ask_price', 'bid_price', 'wap']:
        grouped = df.groupby(["stock_id", "date_id"])[col]
        for p in periods: 
            grouped_std = grouped.rolling(window=p).std().reset_index(drop=True)            
            df[f"{col}_bollinger_upper_{p}"] = df[f"{col}"] + 2 * grouped_std
            df[f"{col}_bollinger_lower_{p}"] = df[f"{col}"] - 2 * grouped_std
    return df


@nb.jit(nopython=False, parallel=True)
def calculate_diff_2_features(df):
    for window in [3, 5, 10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']
    return df
        

@nb.jit(nopython=False, parallel=True)
def calculate_all_prices_and_sizes_features(df, prices, sizes):
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
    return df


@nb.jit(nopython=False, parallel=True)
def calculate_pct_change_features(df, prices):
    for i in range(len(prices)-1):
        p1 = prices[i]
        for j in range(i+1, len(prices)):
            p2 = prices[j]
            df[f'{p1}_{p2}_pct_change'] = ((df[p1] - df[p2]) / df[p2]) * 100
    return df


@nb.jit(nopython=False, parallel=True)
def compute_combination_features(df, prices):
    for c in combinations(prices, 2):
        # df[f'{c[0]}_minus_{c[1]}'] = df[f'{c[0]}'] - df[f'{c[1]}']
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]} - {c[1]}) / ({c[0]} + {c[1]})')
        # df[f'{c[0]}_{c[1]}_urgency'] = df[f'{c[0]}_minus_{c[1]}'] * df['imb_s1']
    # columns_to_keep = [col for col in df.columns if 'minus' not in col]
    # df = df[columns_to_keep]
    return df
    


def feat_eng(df):   
    cols = [c for c in df.columns if c not in ['row_id', 'time_id', 'currently_scored']]
    df = df[cols]
    
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    df = calculate_weighted_wap(df)
        
    # Adding some features
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  
    df["minute"] = df["seconds_in_bucket"] // 60
    df['time_to_market_close'] = 540 - df['seconds_in_bucket']
    
    df['imbalance_buy_flag'] = np.where(df['imbalance_buy_sell_flag']==1, 1, 0) 
    df['imbalance_sell_flag'] = np.where(df['imbalance_buy_sell_flag']==-1, 1, 0) 
    df['bid_plus_ask_sizes'] = df['bid_size'] + df['ask_size']
    df['imbalance_ratio'] = df.eval('imbalance_size / matched_size')
    
    df['imb_s1'] = df.eval('(bid_size - ask_size) / (bid_size + ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size - matched_size) / (matched_size + imbalance_size)')
    
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["matched_imbalance"] = df.eval("(imbalance_size - matched_size)/(matched_size + imbalance_size)")
        
    df['ask_x_size'] = df.eval('ask_size * ask_price')
    df['bid_x_size'] = df.eval('bid_size * bid_price')
        
    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size'] 
    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df['market_urgency'] = df['price_spread'] * df['imb_s1']
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    
    df = calculate_diff_features(df)
    df = calculate_rolling_mean_and_std_expressions(df)
    # df = calculate_gain_loss_features(df)
    df = calculate_norm_features(df)
    df = calculate_bollinger(df)
          
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance(c, df)
        df[triplet_feature.columns] = triplet_feature.values
        
    df = compute_combination_features(df, prices)
    df = calculate_all_prices_and_sizes_features(df, prices, sizes)
    df = calculate_diff_2_features(df)
    # df = calculate_pct_change_features(df, prices)
            
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())
        
    df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['global_median_vol'], 1, 0) 
    
    # Reduce memory usage
    df = reduce_mem_usage(df)
    
    # Run garbage collector
    gc.collect()
    
    return df.replace([np.inf, -np.inf], 0)

weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weights = {int(k):v for k,v in enumerate(weights)}

# Dataset

In [3]:
# Load the dataset
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')

# Drop nan on target column
train.dropna(subset=['target'], inplace=True)

global_stock_id_feats = {
        "median_vol": train.groupby("stock_id")["bid_size"].median() + train.groupby("stock_id")["ask_size"].median(),
        "std_size": train.groupby("stock_id")["bid_size"].std() + train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": train.groupby("stock_id")["bid_size"].max() - train.groupby("stock_id")["bid_size"].min(),
        "median_price": train.groupby("stock_id")["bid_price"].median() + train.groupby("stock_id")["ask_price"].median(),
        "std_price": train.groupby("stock_id")["bid_price"].std() + train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": train.groupby("stock_id")["bid_price"].max() - train.groupby("stock_id")["ask_price"].min()
}

# Run garbage collector
gc.collect()

# Drop other relevant nan
train.dropna(subset=['bid_price', 'ask_price', 'wap'], inplace=True)

# Reset the index
train.reset_index(drop=True)

# Apply features engineering
train = feat_eng(train)

# Clustering

In [4]:
n_models = 6
cluster_cols = ['wap', 'reference_price', 'ask_price', 'bid_price', 'imb_s1', 'weighted_wap', 'target']
kmeans = KMeans(n_clusters=n_models)
X = train.drop(columns=['date_id'])
train['cluster'] = kmeans.fit_predict(train[cluster_cols])

# Classification

In [5]:
test_size = 0.25
stopping_rounds = 16
log_evaluation_periods = 4

y = train['cluster']
X = train.drop(columns=['target', 'date_id', 'cluster'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, shuffle=True, random_state=42)

lgb_params = {
    'max_depth': 10,
    'num_leaves': 136,
    'objective': 'multiclass',
    'num_class': n_models,
    'n_estimators': 360,
    'colsample_bytree': 0.75,
    'learning_rate': 0.01,
    'reg_alpha': 0.00001,
    'reg_lambda': 0.00001,
    'importance_type' : 'gain',
    'subsample': 0.70,
    'verbosity': 1,
    'device': device,
    'n_jobs': -1,
}

# Run garbage collector
gc.collect()

# Define a LightGBM model for the current fold
lgb_model_classifier = lgb.LGBMClassifier(**lgb_params)

# Train the LightGBM model for the current fold
lgb_model_classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[
        lgb.callback.early_stopping(stopping_rounds=stopping_rounds),
        lgb.callback.log_evaluation(period=log_evaluation_periods),
    ],
)

# Free up memory by deleting fold specific variables
del X, y, X_train, y_train, X_valid, y_valid
    
# Run garbage collector
gc.collect()

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34797
[LightGBM] [Info] Number of data points in the train set: 3928320, number of used features: 171
[LightGBM] [Info] Start training from score -1.911860
[LightGBM] [Info] Start training from score -0.989755
[LightGBM] [Info] Start training from score -1.096277
[LightGBM] [Info] Start training from score -3.849720
[LightGBM] [Info] Start training from score -4.306562
[LightGBM] [Info] Start training from score -2.192395
Training until validation scores don't improve for 16 rounds
[4]	valid_0's multi_logloss: 1.39435
[8]	valid_0's multi_logloss: 1.38865
[12]	valid_0's multi_logloss: 1.38351
[16]	valid_0's multi_logloss: 1.3789
[20]	valid_0's multi_logloss: 1.37463
[24]	valid_0's multi_logloss: 1.37073
[28]	valid_0's multi_logloss: 1.36714
[32]	valid_0's multi_logloss: 1.36383
[36]	valid_0's multi_logloss: 1.36075
[40]	valid_0's multi_logloss: 1.35794
[44]	valid_0's multi_logloss: 1.35532
[48]	valid_

250

In [6]:
"""
from sklearn.metrics import accuracy_score
y_pred = lgb_model_classifier.predict(X_valid)
y_pred_classes = np.argmax(y_pred)
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

# Free up memory by deleting fold specific variables
del X, y, X_train, y_train, X_valid, y_valid
    
# Run garbage collector
gc.collect()
"""

"\nfrom sklearn.metrics import accuracy_score\ny_pred = lgb_model_classifier.predict(X_valid)\ny_pred_classes = np.argmax(y_pred)\naccuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)\nprint(f'Accuracy: {accuracy}')\n\n# Free up memory by deleting fold specific variables\ndel X, y, X_train, y_train, X_valid, y_valid\n    \n# Run garbage collector\ngc.collect()\n"

# Regression

In [7]:
test_size = 0.20
stopping_rounds = 100
log_evaluation_periods = 4

lgb_params = {
    'max_depth': 12,
    'num_leaves': 164,
    'objective': 'mae',
    'n_estimators': 3000,
    'colsample_bytree': 0.85,
    'learning_rate': 0.008,
    'reg_alpha': 0.00001,
    'reg_lambda': 0.00001,
    'importance_type' : 'gain',
    'subsample': 0.85,
    'verbosity': 1,
    'device': device
}

models = {}
for cluster_id in train['cluster'].unique():
    print(f"Train LGBM Regressor for cluster {cluster_id}")
    
    cluster_data = train[train['cluster'] == cluster_id]
    
    columns_to_keep = [col for col in cluster_data.columns if 'cluster' not in col]
    cluster_data = cluster_data[columns_to_keep]
    
    y = cluster_data['target']
    X = cluster_data.drop(columns=['target', 'date_id'])
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, shuffle=False)

    # Define a LightGBM model for the current fold
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    
    # Train the LightGBM model for the current fold
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=stopping_rounds),
            lgb.callback.log_evaluation(period=log_evaluation_periods),
        ],
    )
    
    models[cluster_id] = lgb_model
        
    # Free up memory by deleting fold specific variables
    del X, y, cluster_data, X_train, y_train, X_valid, y_valid
    
    # Run garbage collector
    gc.collect()
    
    print("\n\n")

Train LGBM Regressor for cluster 1
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34360
[LightGBM] [Info] Number of data points in the train set: 1556912, number of used features: 171
[LightGBM] [Info] Start training from score -2.458984
Training until validation scores don't improve for 100 rounds
[4]	valid_0's l1: 1.73362
[8]	valid_0's l1: 1.73263
[12]	valid_0's l1: 1.73168
[16]	valid_0's l1: 1.73078
[20]	valid_0's l1: 1.72994
[24]	valid_0's l1: 1.72915
[28]	valid_0's l1: 1.72839
[32]	valid_0's l1: 1.72768
[36]	valid_0's l1: 1.72697
[40]	valid_0's l1: 1.72631
[44]	valid_0's l1: 1.7257
[48]	valid_0's l1: 1.72511
[52]	valid_0's l1: 1.72456
[56]	valid_0's l1: 1.72402
[60]	valid_0's l1: 1.72352
[64]	valid_0's l1: 1.72304
[68]	valid_0's l1: 1.72259
[72]	valid_0's l1: 1.72216
[76]	valid_0's l1: 1.72174
[80]	valid_0's l1: 1.72136
[84]	valid_0's l1: 1.72098
[88]	valid_0's l1: 1.72062
[92]	valid_0's l1: 1.72027
[96]	valid_0's l1: 1.71994
[100]	valid_0's

# Submission

In [8]:
# Init
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()


@nb.jit(nopython=False, parallel=True)
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    return out
    
# Init a counter
counter = 0

# To clip predictions
y_min, y_max = -64, 64

# Init an empty dataframe
cache = pd.DataFrame()

for (test, revealed_targets, sample_prediction) in iter_test:       
    # Add data to the chache dataset
    cache = pd.concat([cache, test], ignore_index=True, axis=0)
    if counter > 0:
        cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
    
    # First iteration 
    if test.currently_scored.iloc[0]==False:
        sample_prediction['target'] = 0
        env.predict(sample_prediction)
        continue
        
    features = feat_eng(cache)[-len(test):]
    features = features.drop(columns='date_id').reset_index(drop=True)
    features['cluster'] = lgb_model_classifier.predict(features)
    
    predictions = np.zeros(len(test))
    for i in range(n_models):
        selected_indices = features[features['cluster']==i].index
        if len(selected_indices) == 0:
            continue
        else:
            selected_features = features.iloc[selected_indices]
            selected_features = selected_features.drop(columns=['cluster'])
            if len(selected_indices) == 1:
                selected_features = (selected_features.values).reshape(1, -1)
        
        model_predictions = models[i].predict(selected_features)
        predictions[selected_indices] = model_predictions
    
    sample_prediction['target'] = zero_sum(predictions, test['bid_size'] + test['ask_size'])
    
    num_nan_values = sample_prediction['target'].isna().sum()
    print("Numero di righe con valori NaN nella colonna 'target':", num_nan_values)
    
    # Predict
    env.predict(sample_prediction)
        
    # Update the counter
    counter += 1
    
    # Run garbage collector
    gc.collect()

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
