In [1]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import joblib  
import lightgbm as lgb  
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
is_offline = False 
is_train = True  
is_infer = False 
max_lookback = np.nan 
split_day = 435




# Data Loading and Preprocessing 

In [2]:
df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
df = df.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
df_shape = df.shape

# Memory Optimization

In [3]:
def reduce_mem_usage(df, verbose=0):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
               
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")
    return df


 # Parallel Triplet Imbalance Calculation

In [4]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]
    features_array = compute_triplet_imbalance(df_values, comb_indices)
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)
    return features


# Feature Generation Functions 

In [5]:
@njit(parallel=True)
def log_bid_order_flow(df_bid_size, shifted_df_bid_size, df_bid_price, shifted_df_bid_price):
    if df_bid_price > shifted_df_bid_price:
        return np.log(df_bid_size)

    elif df_bid_price == shifted_df_bid_price:
        return np.log(df_bid_size) - np.log(shifted_df_bid_size)

    else:
        return - np.log(shifted_df_bid_size)
    
@njit(parallel=True)
def log_ask_order_flow(df_ask_size, shifted_df_ask_size, df_ask_price, shifted_df_ask_price):
    if df_ask_price > shifted_df_ask_price:
        return - np.log(shifted_df_ask_size)

    elif df_ask_price == shifted_df_ask_price:
        return np.log(df_ask_size) - np.log(shifted_df_ask_size)

    else:
        return np.log(df_ask_size)

def calculate_log_bid_order_flow(group):
    return group.apply(lambda x: log_bid_order_flow(x["bid_size"], x["shifted_bid_size"], x["bid_price"], x["shifted_bid_price"]), axis=1)

def calculate_log_ask_order_flow(group):
    return group.apply(lambda x: log_ask_order_flow(x["ask_size"], x["shifted_ask_size"], x["ask_price"], x["shifted_ask_price"]), axis=1)

# Define a function to apply EWMA to a specific column with a variable span
def apply_ewm(column, span):
    return column.ewm(span=span, adjust=False).mean()

In [6]:
# from tqdm.notebook import tqdm
# tqdm.pandas()

In [7]:
def imbalance_features(df):
        
    df["bid_size"] = df["bid_size"].apply(lambda x: x if x != 0 else 1)
    df["ask_size"] = df["ask_size"].apply(lambda x: x if x != 0 else 1)
    
    # Define lists of price and size-related column names
    
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    # Calculate various statistical aggregation features
    
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby(["date_id", "stock_id"], as_index=False)[col].shift(window).reset_index(level=0, drop=True)
            df[f"{col}_ret_{window}"] = df.groupby(["date_id", "stock_id"], as_index=False)[col].pct_change(window).reset_index(level=0, drop=True)
#             df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
#             df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
        
    for window in [1, 2, 3, 10]:
            df[f"market_urgency_shift_{window}"] = df.groupby(["date_id", "stock_id"], as_index=False)["market_urgency"].shift(window).reset_index(level=0, drop=True)
    
    for span in [3,6,9]:
            df[f"ewm_market_urgency_{span}"] = df.groupby(["date_id", "stock_id"])["market_urgency"].transform(apply_ewm, span=span)
            
    # Calculate diff features for specific columns
    
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
        for window in [1, 2, 3, 6]:
            df[f"{col}_diff_{window}"] = df.groupby(["date_id", "stock_id"], as_index=False)[col].diff(window).reset_index(level=0, drop=True)
    
    df = reduce_mem_usage(df)
    
    # OFI
    
    df["shifted_bid_price"] = df.groupby(["date_id", "stock_id"]).bid_price.shift(1).fillna(0)
    df["shifted_bid_size"] = df.groupby(["date_id", "stock_id"]).bid_size.shift(1).fillna(1)

    df["shifted_ask_price"] = df.groupby(["date_id", "stock_id"]).ask_price.shift(1).fillna(10)
    df["shifted_ask_size"] = df.groupby(["date_id", "stock_id"]).ask_size.shift(1).fillna(1)
            
    df["log_bid_orderflow"] = df.groupby(["date_id", "stock_id"], as_index=False).apply(calculate_log_bid_order_flow).reset_index(level=0, drop=True)
    df["log_ask_orderflow"] = df.groupby(["date_id", "stock_id"], as_index=False).apply(calculate_log_ask_order_flow).reset_index(level=0, drop=True)
    df["log_orderflow_imbalance"] = df.eval("log_bid_orderflow - log_ask_orderflow")
    
    # Global OFI
    df["associated_weights"] = df.stock_id.map(weights)
    weights_of_day = dict(df.groupby(["date_id", "seconds_in_bucket"], group_keys=True)["associated_weights"].sum())
    df["weighted_log_bid_orderflow"] = df.eval("associated_weights*log_bid_orderflow")
    df["weighted_log_ask_orderflow"] = df.eval("associated_weights*log_ask_orderflow")

    w = dict(df.groupby(["date_id", "seconds_in_bucket"], group_keys=True)["weighted_log_bid_orderflow"].sum())
    df["global_log_bid_orderflow"] = pd.Series(list(zip(df.date_id, df.seconds_in_bucket))).map(w)

    w = dict(df.groupby(["date_id", "seconds_in_bucket"], group_keys=True)["weighted_log_ask_orderflow"].sum())
    df["global_log_ask_orderflow"] = pd.Series(list(zip(df.date_id, df.seconds_in_bucket))).map(w)

    df["global_log_bid_orderflow"] = df["global_log_bid_orderflow"]/pd.Series(list(zip(df.date_id, df.seconds_in_bucket))).map(weights_of_day)
    df["global_log_ask_orderflow"] = df["global_log_ask_orderflow"]/pd.Series(list(zip(df.date_id, df.seconds_in_bucket))).map(weights_of_day)
    df["global_ofi"] = df.eval("global_log_bid_orderflow -  global_log_ask_orderflow")
    df["diff_ofi_global_ofi"] = df.eval("log_orderflow_imbalance -  global_ofi")
    
    df = reduce_mem_usage(df)
    
    # index_wap 
    #df["associated_weights"] = df.stock_id.map(weights)
    df.eval("weighted_wap = wap*associated_weights", inplace=True)
    w = dict(df.groupby(["date_id", "seconds_in_bucket"], group_keys=True)["weighted_wap"].sum())
    df["index_wap"] = pd.Series(list(zip(df.date_id, df.seconds_in_bucket))).map(w)
    #weights_of_day = dict(df.groupby(["date_id", "seconds_in_bucket"], group_keys=True)["associated_weights"].sum())
    df["index_wap_norm"] = df["index_wap"]/pd.Series(list(zip(df.date_id, df.seconds_in_bucket))).map(weights_of_day)
    df["shift_wap"] = df.groupby(["date_id", "stock_id"], as_index=False)["wap"].shift().fillna(1).reset_index(level=0, drop=True)
    df["shift_index_wap_norm"] = df.groupby(["date_id", "stock_id"], as_index=False)["index_wap_norm"].shift().fillna(1).reset_index(level=0, drop=True)
    df["var_wap"] = df["wap"]/df["shift_wap"]
    df["var_index_wap_norm"] = df["index_wap_norm"]/df["shift_index_wap_norm"]
    df.eval("diff_var_wap_var_index_wap=var_wap-var_index_wap_norm", inplace=True)
    
    del weights_of_day
    del w
    
    # New lag features 
    
    for col in ['log_bid_orderflow', 'log_ask_orderflow', 'log_orderflow_imbalance']:
        for window in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
            df[f"{col}_shift_{window}"] = df.groupby([ "date_id", "stock_id"], as_index=False)[col].shift(window).reset_index(level=0, drop=True)
            
    for span in [6, 9]:
        df[f"ewm_diff_var_wap_var_index_wap_{span}"] = df.groupby(["date_id", "stock_id"])["diff_var_wap_var_index_wap"].transform(apply_ewm, span=span)
    
    df.drop(['shifted_bid_price', 'shifted_bid_size', 'shifted_ask_price', 'shifted_ask_size',
             'associated_weights', 'index_wap', "shift_wap", "shift_index_wap_norm"], axis=1, inplace=True)
    
    df = reduce_mem_usage(df)
    
    #new features
      #diff price features
    df["diff_ref_price_and_wap"] = df["reference_price"] - df["wap"]
    df["diff_near_price_and_far_price"] = df["near_price"] - df["far_price"]

    #imbalance features
    df["imbalance_1"] = df["imbalance_size"] * df["imbalance_buy_sell_flag"]
    df["imbalance_2"] = df["imbalance_size"] / df["matched_size"]

    #volume/size ratio
    df["ratio_bid_ask"] = df["bid_size"] / df["ask_size"]
    df["ratio_size_imbalance_and_bid"] = df["imbalance_size"] / df["bid_size"]
    df["ratio_size_imbalance_and_ask"] = df["imbalance_size"] / df["ask_size"]
    df["ratio_size_matched_and_askbid"] = df["matched_size"] / (df["bid_size"] + df["ask_size"])
    
    #features ratio with shift 
    for index in ["ratio_bid_ask", "ratio_size_imbalance_and_bid", "ratio_size_imbalance_and_ask", "ratio_size_matched_and_askbid", "imbalance_2"]:
        df[f"shift_ratio_{index}"] = df.groupby(["date_id", "stock_id"], as_index=False)[index].shift().reset_index(level=0, drop=True)
        df[f"ratio_to_ratio_and_shift_ratio_{index}"] = df[index]/df[f"shift_ratio_{index}"]

        df.drop([f"shift_ratio_{index}"], axis=1, inplace=True)
    
    return df.replace([np.inf, -np.inf], 0)

def other_features(df):
    df["auction"] = df["seconds_in_bucket"].apply(lambda x: 1 if x >= 300 else 0)
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  
    df["minute"] = df["seconds_in_bucket"] // 60  
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

def generate_all_features(df):
    # Select relevant columns for feature generation
    cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    df = df[cols]
    
    # Generate imbalance features
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()  
    feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id"]]
    
    return df[feature_name]


In [8]:
#df = imbalance_features(df)

In [9]:
#df 

In [10]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weights = {int(k):v for k,v in enumerate(weights)}

## Data Splitting

In [11]:
if is_offline:
    
    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")
else:
    df_train = df
    print("Online mode")


Online mode


In [12]:
if is_train:
    global_stock_id_feats = {
        "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
        "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
        "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
        "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
    }
    if is_offline:
        df_train_feats = generate_all_features(df_train)
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)
    else:
        df_train_feats = generate_all_features(df_train)
        print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

Build Online Train Feats Finished.


In [13]:
# a = df_train_feats.columns

In [14]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import gc

lgb_params = {
    "objective": "mae",
    "n_estimators": 10000,
    "num_leaves": 200,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
#     "learning_rate": 0.00871,
    "learning_rate": 0.01,
    'max_depth': 11,
    "n_jobs": -1,
    "device": "gpu",
    "verbosity": -1,
    "importance_type": "gain",
    "reg_alpha": 0.8,
    "reg_lambda": 3.25
}
feature_name = list(df_train_feats.columns)
print(f"Feature length = {len(feature_name)}")

num_folds = 5
fold_size = 480 // num_folds
gap = 10

models = []
scores = []

model_save_path = 'modelitos_para_despues' 
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

date_ids = df_train['date_id'].values

for i in range(num_folds):
    start = i * fold_size
    end = start + fold_size
    if i < num_folds - 1:  # No need to purge after the last fold
        purged_start = end - 2
        purged_end = end + gap + 2
        train_indices = (date_ids >= start) & (date_ids < purged_start) | (date_ids > purged_end)
    else:
        train_indices = (date_ids >= start) & (date_ids < end)
    
    test_indices = (date_ids >= end) & (date_ids < end + fold_size)
    
    df_fold_train = df_train_feats[train_indices]
    df_fold_train_target = df_train['target'][train_indices]
    df_fold_valid = df_train_feats[test_indices]
    df_fold_valid_target = df_train['target'][test_indices]

    print(f"Fold {i+1} Model Training")
    
    # Train a LightGBM model for the current fold
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        df_fold_train[feature_name],
        df_fold_train_target,
        eval_set=[(df_fold_valid[feature_name], df_fold_valid_target)],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ],
    )

    models.append(lgb_model)
    # Save the model to a file
    model_filename = os.path.join(model_save_path, f'doblez_{i+1}.txt')
    lgb_model.booster_.save_model(model_filename)
    print(f"Model for fold {i+1} saved to {model_filename}")

    # Evaluate model performance on the validation set
    fold_predictions = lgb_model.predict(df_fold_valid[feature_name])
    fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
    scores.append(fold_score)
    print(f"Fold {i+1} MAE: {fold_score}")

    # Free up memory by deleting fold specific variables
    del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
    gc.collect()

# Calculate the average best iteration from all regular folds
average_best_iteration = int(np.mean([model.best_iteration_ for model in models]))

# Update the lgb_params with the average best iteration
final_model_params = lgb_params.copy()
final_model_params['n_estimators'] = average_best_iteration

print(f"Training final model with average best iteration: {average_best_iteration}")

# Train the final model on the entire dataset
final_model = lgb.LGBMRegressor(**final_model_params)
final_model.fit(
    df_train_feats[feature_name],
    df_train['target'],
    callbacks=[
        lgb.callback.log_evaluation(period=100),
    ],
)

# Append the final model to the list of models
models.append(final_model)

# Save the final model to a file
final_model_filename = os.path.join(model_save_path, 'doblez-conjunto.txt')
final_model.booster_.save_model(final_model_filename)
print(f"Final model saved to {final_model_filename}")

# Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
print(f"Average MAE across all folds: {np.mean(scores)}")


Feature length = 191
Fold 1 Model Training
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 7.08752
[200]	valid_0's l1: 7.0412
[300]	valid_0's l1: 7.01438
[400]	valid_0's l1: 6.99372
[500]	valid_0's l1: 6.97766
[600]	valid_0's l1: 6.96398
[700]	valid_0's l1: 6.95148
[800]	valid_0's l1: 6.9398
[900]	valid_0's l1: 6.92935
[1000]	valid_0's l1: 6.92055
[1100]	valid_0's l1: 6.91165
[1200]	valid_0's l1: 6.90303
[1300]	valid_0's l1: 6.89572
[1400]	valid_0's l1: 6.88855
[1500]	valid_0's l1: 6.88191
[1600]	valid_0's l1: 6.87529
[1700]	valid_0's l1: 6.86882
[1800]	valid_0's l1: 6.86275
[1900]	valid_0's l1: 6.85696
[2000]	valid_0's l1: 6.85134
[2100]	valid_0's l1: 6.84541
[2200]	valid_0's l1: 6.83964
[2300]	valid_0's l1: 6.83383
[2400]	valid_0's l1: 6.82824
[2500]	valid_0's l1: 6.82268
[2600]	valid_0's l1: 6.8177
[2700]	valid_0's l1: 6.81216
[2800]	valid_0's l1: 6.80727
[2900]	valid_0's l1: 6.80191
[3000]	valid_0's l1: 6.79829
[3100]	valid_0's l1: 6.79341
[3200]	v

In [15]:
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out

if is_infer:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    y_min, y_max = -64, 64
    qps, predictions = [], []
    cache = pd.DataFrame()

    # Weights for each fold model
    model_weights = [1/len(models)] * len(models) 
    
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        feat = generate_all_features(cache)[-len(test):]
        feat.drop(["currently_scored"], axis=1, inplace=True)
        # Generate predictions for each model and calculate the weighted average
        lgb_predictions = np.zeros(len(test))
        for model, weight in zip(models, model_weights):
            lgb_predictions += weight * model.predict(feat)

        lgb_predictions = zero_sum(lgb_predictions, test['bid_size'] + test['ask_size'])
        clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")
