In [19]:
# =========================
# 1) IMPORTS
# =========================

# Core
import os
import gc
import sys
import math
import json
import warnings
from datetime import datetime

# Numerics / Data
import numpy as np
import pandas as pd

# Modeling
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Tree models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# ARIMA (statsmodels)
import statsmodels.api as sm

# Plotting
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / performance warnings raised by
# pandas, scikit-learn and statsmodels in the cells below — consider narrowing.
warnings.filterwarnings("ignore")
# Allow pandas to render up to 200 columns when a frame is displayed.
pd.set_option("display.max_columns", 200)


In [None]:
# =========================
# 2) CONSTANTS / CONFIG
# =========================

# Paths (adjust if needed)
# All input files are resolved relative to DATA_DIR.
DATA_DIR = "./dataset/"
TRAIN_FEATS_PATH = os.path.join(DATA_DIR, "train.csv")
TRAIN_LABELS_PATH = os.path.join(DATA_DIR, "train_labels.csv")
TEST_FEATS_PATH = os.path.join(DATA_DIR, "test.csv")
# Only read when the file exists (see the LOAD DATA cell).
TARGET_PAIRS_PATH = os.path.join(DATA_DIR, "target_pairs.csv")  # optional

# Output
# NOTE(review): nothing in the visible cells creates OUTPUT_DIR — confirm it is
# created before SUBMISSION_PATH is written.
OUTPUT_DIR = os.path.join(DATA_DIR, "outputs")
SUBMISSION_PATH = os.path.join(OUTPUT_DIR, "predictions_submission.csv")

# Cross-validation
# N_SPLITS is passed to TimeSeriesSplit; RANDOM_STATE seeds both tree models.
N_SPLITS = 5  # time series splits
RANDOM_STATE = 42

# Feature engineering config
# LAG_WINDOWS: shift distances (in rows) applied to every numeric feature.
# ROLL_WINDOWS: rolling-window lengths (in rows); ROLL_STATS selects which
# statistics ("mean" and/or "std") are computed for each window.
LAG_WINDOWS = [1, 2, 5, 10]
ROLL_WINDOWS = [3, 5, 10]
ROLL_STATS = ["mean", "std"]

# Columns we will never use as features
# This set is unioned with the detected join keys, target columns and the time
# column when the numeric feature candidates are selected.
NEVER_FEATURE_COLS = {
    # common identifiers to exclude from features
    "row_id",
    "id",
    "time_id",
    "timestamp",
    "date",
    "datetime",
    "symbol",
    "asset",
    "ticker",
    "target",
    "y",
}

# XGB / LGBM baseline hyperparams (feel free to tune)
# Unpacked as keyword arguments into XGBRegressor(**XGB_PARAMS).
XGB_PARAMS = dict(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    tree_method="hist",
)

# Unpacked as keyword arguments into LGBMRegressor(**LGBM_PARAMS).
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0 — confirm whether row subsampling is actually active.
LGBM_PARAMS = dict(
    n_estimators=900,
    learning_rate=0.03,
    max_depth=-1,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# ARIMA config (very small search space to keep it quick)
# Each tuple is a (p, d, q) order; the ARIMA baseline fits every order and keeps
# the one with the lowest AIC.
ARIMA_ORDERS = [(1, 1, 0), (1, 1, 1), (2, 1, 1)]


In [21]:
# =========================
# 3) UTILS (Metrics, Plotting)
# =========================


def mape(y_true, y_pred, eps=1e-9):
    """
    Mean absolute percentage error, in percent.

    Both inputs are flattened to 1-D float arrays. Any actual value whose
    magnitude is below `eps` uses `eps` as its denominator so the division
    never hits zero (near-zero actuals can therefore produce very large values).
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()
    magnitude = np.abs(actual)
    safe_magnitude = np.where(magnitude < eps, eps, magnitude)
    relative_error = np.abs((actual - forecast) / safe_magnitude)
    return np.mean(relative_error) * 100.0


def smape(y_true, y_pred, eps=1e-9):
    """
    Symmetric mean absolute percentage error, in percent.

    The denominator is the mean of |actual| and |predicted| per element,
    floored at `eps` so that pairs which are both zero do not divide by zero.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()
    half_sum = (np.abs(actual) + np.abs(forecast)) / 2.0
    floor_applied = np.maximum(eps, half_sum)
    abs_error = np.abs(actual - forecast)
    return np.mean(abs_error / floor_applied) * 100.0


# sklearn-agnostic RMSE: take the square root of the plain MSE ourselves instead
# of relying on the `squared=` keyword, which older scikit-learn releases lack
# and newer ones have deprecated and then removed.
def rmse(y_true, y_pred):
    """
    Root mean squared error between two array-likes.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; each is flattened to a 1-D float array.

    Returns
    -------
    numpy.float64
        sqrt(mean((y_true - y_pred) ** 2)).

    Notes
    -----
    The previous implementation tried `mean_squared_error(..., squared=False)`
    and fell back on TypeError. That path emits deprecation warnings on some
    versions and could hide an unrelated TypeError; computing the root directly
    gives the same value on every scikit-learn version.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(mean_squared_error(y_true, y_pred))


def evaluate_metrics(y_true, y_pred):
    """
    Compute the notebook's standard regression scorecard.

    Inputs are flattened to 1-D float arrays. Returns a dict with the keys
    "MAE", "RMSE", "MAPE(%)", "sMAPE(%)" and "R2", in that order.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    forecast = np.asarray(y_pred, dtype=float).ravel()

    scores = {}
    scores["MAE"] = mean_absolute_error(actual, forecast)
    scores["RMSE"] = rmse(actual, forecast)
    scores["MAPE(%)"] = mape(actual, forecast)
    scores["sMAPE(%)"] = smape(actual, forecast)
    scores["R2"] = r2_score(actual, forecast)
    return scores


def plot_actual_vs_pred(df_plot, title="Actual vs Predicted"):
    """
    Draw the actual series and every prediction column on one line chart.

    df_plot may contain 'timestamp' (used as the x axis; a 0..n-1 range is used
    when it is absent), 'actual' (drawn with a thicker line) and any number of
    other columns, each drawn as its own line labelled with the column name.
    """
    reserved = ("timestamp", "actual")

    plt.figure(figsize=(12, 4))
    x = (
        df_plot["timestamp"]
        if "timestamp" in df_plot.columns
        else np.arange(len(df_plot))
    )

    if "actual" in df_plot:
        plt.plot(x, df_plot["actual"], label="Actual", linewidth=2)

    # Every non-reserved column is treated as a model's prediction series.
    prediction_cols = [name for name in df_plot.columns if name not in reserved]
    for name in prediction_cols:
        plt.plot(x, df_plot[name], label=name)

    plt.title(title)
    plt.xlabel("Time")
    plt.ylabel("Value")
    plt.legend()
    plt.tight_layout()
    plt.show()


def bar_compare_metrics(metrics_dict):
    """
    Grouped bar chart comparing models across the standard metrics.

    metrics_dict maps a model name to its metric dict, e.g.
        {'XGB': {'MAE': ..., 'RMSE': ..., ...}, 'LGBM': {...}, 'ARIMA': {...}}
    Each model becomes one group on the x axis; each metric one bar per group.
    """
    metric_order = ["MAE", "RMSE", "MAPE(%)", "sMAPE(%)", "R2"]

    # Transpose so that rows are models and columns are metrics.
    per_model = pd.DataFrame(metrics_dict).T
    ax = per_model[metric_order].plot.bar(figsize=(12, 5))

    ax.set_title("Model Comparison (Validation Metrics)")
    ax.set_ylabel("Score (lower is better for errors; higher better for R2)")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()


In [22]:
# =========================
# 4) LOAD DATA
# =========================

# Read the three mandatory CSVs, reporting each shape right after its load.
train = pd.read_csv(TRAIN_FEATS_PATH)
train_labels = pd.read_csv(TRAIN_LABELS_PATH)
test = pd.read_csv(TEST_FEATS_PATH)

print("train shape:", train.shape)
print("train_labels shape:", train_labels.shape)
print("test shape:", test.shape)

# target_pairs.csv is an optional helper file: load it only when present,
# otherwise leave the name bound to None.
target_pairs = (
    pd.read_csv(TARGET_PAIRS_PATH) if os.path.exists(TARGET_PAIRS_PATH) else None
)
if target_pairs is not None:
    print("target_pairs shape:", target_pairs.shape)


train shape: (1961, 558)
train_labels shape: (1961, 425)
test shape: (134, 559)
target_pairs shape: (424, 3)


In [23]:
# =========================
# 5) AUTO-DETECT JOIN KEYS & TARGETS
# =========================

# Columns that exist in both frames are the only possible merge keys.
shared_cols = [name for name in train.columns if name in train_labels.columns]

# First pass: keep shared columns whose (case-insensitive) name is a well-known
# identifier. Numeric target-looking columns are detected separately below.
candidate_id_cols = [
    name
    for name in shared_cols
    if name.lower()
    in (
        "row_id",
        "id",
        "time_id",
        "timestamp",
        "date",
        "datetime",
        "symbol",
        "asset",
        "ticker",
    )
]

# Second pass (only when the first found nothing): accept any shared column
# that is object-typed or whose name contains "id".
if not candidate_id_cols:
    candidate_id_cols = [
        name
        for name in shared_cols
        if train[name].dtype == "O" or "id" in name.lower()
    ]

# Targets are the numeric label columns that are not join keys.
target_cols = [
    name
    for name in train_labels.columns
    if np.issubdtype(train_labels[name].dtype, np.number)
    and name not in candidate_id_cols
]

print("Detected join keys:", candidate_id_cols)
print("Detected target columns:", target_cols)

# Attach the labels to the training features.
if candidate_id_cols and target_cols:
    label_subset = train_labels[candidate_id_cols + target_cols]
    train_full = train.merge(label_subset, on=candidate_id_cols, how="inner")
elif "row_id" in train.columns and "row_id" in train_labels.columns:
    # Last resort: join on 'row_id' and re-derive keys/targets from that choice.
    train_full = train.merge(train_labels, on="row_id", how="inner")
    candidate_id_cols = ["row_id"]
    target_cols = [
        name
        for name in train_labels.columns
        if name != "row_id" and np.issubdtype(train_labels[name].dtype, np.number)
    ]
else:
    raise ValueError(
        "Could not auto-detect join keys or target columns. Please set candidate_id_cols and target_cols manually."
    )

print("train_full shape:", train_full.shape)
display(train_full.head(3))


Detected join keys: ['date_id']
Detected target columns: ['target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8', 'target_9', 'target_10', 'target_11', 'target_12', 'target_13', 'target_14', 'target_15', 'target_16', 'target_17', 'target_18', 'target_19', 'target_20', 'target_21', 'target_22', 'target_23', 'target_24', 'target_25', 'target_26', 'target_27', 'target_28', 'target_29', 'target_30', 'target_31', 'target_32', 'target_33', 'target_34', 'target_35', 'target_36', 'target_37', 'target_38', 'target_39', 'target_40', 'target_41', 'target_42', 'target_43', 'target_44', 'target_45', 'target_46', 'target_47', 'target_48', 'target_49', 'target_50', 'target_51', 'target_52', 'target_53', 'target_54', 'target_55', 'target_56', 'target_57', 'target_58', 'target_59', 'target_60', 'target_61', 'target_62', 'target_63', 'target_64', 'target_65', 'target_66', 'target_67', 'target_68', 'target_69', 'target_70', 'target_71', 'target_72', 'ta

Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,JPX_RSS3_Rubber_Futures_Open,JPX_Gold_Mini_Futures_High,JPX_Gold_Rolling-Spot_Futures_High,JPX_Gold_Standard_Futures_High,JPX_Platinum_Mini_Futures_High,JPX_Platinum_Standard_Futures_High,JPX_RSS3_Rubber_Futures_High,JPX_Gold_Mini_Futures_Low,JPX_Gold_Rolling-Spot_Futures_Low,JPX_Gold_Standard_Futures_Low,JPX_Platinum_Mini_Futures_Low,JPX_Platinum_Standard_Futures_Low,JPX_RSS3_Rubber_Futures_Low,JPX_Gold_Mini_Futures_Close,JPX_Gold_Rolling-Spot_Futures_Close,JPX_Gold_Standard_Futures_Close,JPX_Platinum_Mini_Futures_Close,JPX_Platinum_Standard_Futures_Close,JPX_RSS3_Rubber_Futures_Close,JPX_Gold_Mini_Futures_Volume,JPX_Gold_Rolling-Spot_Futures_Volume,JPX_Gold_Standard_Futures_Volume,JPX_Platinum_Mini_Futures_Volume,JPX_Platinum_Standard_Futures_Volume,JPX_RSS3_Rubber_Futures_Volume,JPX_Gold_Mini_Futures_settlement_price,JPX_Gold_Rolling-Spot_Futures_settlement_price,JPX_Platinum_Mini_Futures_settlement_price,JPX_RSS3_Rubber_Futures_settlement_price,JPX_Gold_Mini_Futures_open_interest,JPX_Gold_Rolling-Spot_Futures_open_interest,JPX_Gold_Standard_Futures_open_interest,JPX_Platinum_Mini_Futures_open_interest,JPX_Platinum_Standard_Futures_open_interest,JPX_RSS3_Rubber_Futures_open_interest,US_Stock_ACWI_adj_open,US_Stock_AEM_adj_open,US_Stock_AG_adj_open,US_Stock_AGG_adj_open,US_Stock_ALB_adj_open,US_Stock_AMP_adj_open,US_Stock_BCS_adj_open,US_Stock_BKR_adj_open,US_Stock_BND_adj_open,US_Stock_BNDX_adj_open,US_Stock_BP_adj_open,US_Stock_BSV_adj_open,US_Stock_CAT_adj_open,US_Stock_CCJ_adj_open,US_Stock_CLF_adj_open,US_Stock_COP_adj_open,US_Stock_CVE_adj_open,US_Stock_CVX_adj_open,US_Stock_DE_adj_open,US_Stock_DVN_adj_open,US_Stock_EEM_adj_open,US_Stock_EFA_adj_open,US_Stock_EMB_adj_open,US_Stock_ENB_adj_open,US_Stock_EOG_adj_open,US_Stock_EWJ_adj
_open,US_Stock_EWT_adj_open,US_Stock_EWY_adj_open,US_Stock_EWZ_adj_open,US_Stock_FCX_adj_open,US_Stock_FNV_adj_open,US_Stock_FXI_adj_open,US_Stock_GDX_adj_open,US_Stock_GDXJ_adj_open,US_Stock_GLD_adj_open,US_Stock_GOLD_adj_open,US_Stock_HAL_adj_open,US_Stock_HES_adj_open,US_Stock_HL_adj_open,US_Stock_IAU_adj_open,US_Stock_IEF_adj_open,US_Stock_IEMG_adj_open,US_Stock_IGSB_adj_open,US_Stock_JNK_adj_open,US_Stock_KGC_adj_open,US_Stock_KMI_adj_open,US_Stock_LQD_adj_open,US_Stock_LYB_adj_open,US_Stock_MBB_adj_open,US_Stock_MPC_adj_open,US_Stock_MS_adj_open,US_Stock_NEM_adj_open,US_Stock_NUE_adj_open,US_Stock_NUGT_adj_open,US_Stock_OIH_adj_open,...,target_324,target_325,target_326,target_327,target_328,target_329,target_330,target_331,target_332,target_333,target_334,target_335,target_336,target_337,target_338,target_339,target_340,target_341,target_342,target_343,target_344,target_345,target_346,target_347,target_348,target_349,target_350,target_351,target_352,target_353,target_354,target_355,target_356,target_357,target_358,target_359,target_360,target_361,target_362,target_363,target_364,target_365,target_366,target_367,target_368,target_369,target_370,target_371,target_372,target_373,target_374,target_375,target_376,target_377,target_378,target_379,target_380,target_381,target_382,target_383,target_384,target_385,target_386,target_387,target_388,target_389,target_390,target_391,target_392,target_393,target_394,target_395,target_396,target_397,target_398,target_399,target_400,target_401,target_402,target_403,target_404,target_405,target_406,target_407,target_408,target_409,target_410,target_411,target_412,target_413,target_414,target_415,target_416,target_417,target_418,target_419,target_420,target_421,target_422,target_423
0,0,2264.5,7205.0,2570.0,3349.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,63.4457,39.6837,6.7563,89.5868,117.2076,147.0081,8.1897,25.5772,66.0686,43.4882,27.4272,67.0996,133.7797,9.0349,6.9508,42.4813,7.9941,90.9993,140.4689,30.3211,46.6643,57.3474,81.7757,23.9998,82.8197,53.0634,24.3186,66.7279,27.2048,17.446,74.4045,41.3834,21.6651,31.7992,124.66,98.3121,42.6159,42.879,3.8859,25.24,89.9489,47.5108,42.5442,69.7568,3.9328,11.9254,94.2649,61.3057,87.5953,51.9964,42.0562,30.2279,54.7107,153.7555,467.8681,...,,-0.04059,,-0.001146,,,-0.00907,,0.012078,-0.035902,-0.031057,-0.022328,0.029666,0.0122,,0.025222,,,-0.03869,-0.006624,-0.007629,0.020645,,-0.006132,,-0.049421,-0.008647,,0.004848,0.048748,-0.043554,-0.026801,,-0.028981,-0.001919,0.003485,-0.04848,-0.08299,,,,-0.039969,0.041481,0.014812,,0.044623,,,,0.055153,-0.019665,,0.009049,0.022842,,0.04637,0.04635,0.028058,,0.019017,0.004705,,-0.012729,,-0.011773,0.023571,,-0.091022,0.006293,0.022177,0.040977,0.002379,-0.051436,0.005262,,-0.066416,0.007124,-0.014872,,,,0.035734,0.02009,,-0.042561,-0.012987,0.027634,-0.041252,0.031637,,,0.021239,-0.005595,,-0.004628,0.033793,,0.038234,,0.02731
1,1,2228.0,7147.0,2579.0,3327.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,63.7519,40.0082,7.1514,89.5786,120.0349,146.4136,8.1748,26.158,66.0524,43.4962,27.6685,67.0741,132.7149,9.4531,7.4622,42.7049,8.4975,92.2661,141.5628,30.7927,47.1575,57.6402,82.1634,24.5669,83.94,53.4962,24.5506,67.1337,27.4936,18.1421,73.3529,41.6183,21.9606,32.2969,125.05,97.99,43.3825,42.9416,4.0884,25.32,89.8722,47.9304,42.5238,69.7947,4.0323,12.2834,93.9697,61.7308,87.6446,53.0582,41.6896,30.4993,56.909,159.9869,478.4485,...,-0.037377,-0.035228,0.018944,-0.00399,0.001439,0.005442,0.018436,-0.017276,0.013023,-0.04535,-0.00396,-0.01207,0.023311,0.00161,-0.015401,0.019943,0.010631,0.003091,-0.033982,-0.016824,0.012047,-0.000923,0.009749,-0.003478,0.001487,-0.035106,-0.008274,-0.003159,-0.004567,0.055604,-0.032553,-0.043308,0.005173,-0.023901,-0.001651,-0.003429,-0.038367,-0.08524,0.028655,-0.008811,-0.046645,-0.040424,0.040197,0.011455,-0.021626,0.051413,0.012968,-0.01171004,-0.00551,0.038579,-0.014224,0.021975,0.009566,0.006713,-0.005479,0.031304,0.036766,0.02467,-0.004608,0.004908,-0.007125,0.04698,-0.000298,0.004824,-0.010374,0.006768,-0.009518,-0.070225,-0.001248,0.059259,0.020991,0.008073,-0.040249,-0.001909,0.00972,-0.051488,-0.007198,-0.012201,-0.028174,0.028013,-0.037905,0.019292,0.017704,-0.018676,-0.022584,-0.005958,0.021191,-0.040295,0.029351,-0.006528,0.003377,0.021372,-0.001517,0.012846,0.010547,0.030527,-0.000764,0.025021,0.003548,0.02094
2,2,2250.0,7188.5,2587.0,3362.0,4684.0,4691.0,4684.0,3363.0,3367.0,207.0,4735.0,4746.0,4735.0,3443.0,3449.0,207.7,4679.0,4688.0,4679.0,3362.0,3362.0,206.4,4727.0,4739.0,4730.0,3426.0,3427.0,206.9,2681.0,37908.0,30656.0,624.0,13713.0,4128.0,4730.0,4735.0,3423.0,206.9,1768.0,128380.0,17671.0,1323.0,7893.0,7391.0,64.3732,39.6239,6.9638,89.3901,120.2258,146.2327,8.227,27.2469,65.9713,43.4882,28.0467,67.0232,133.4332,9.4725,7.519,43.4991,9.001,92.6208,142.7373,31.112,47.4554,58.4289,82.1422,24.7498,84.1305,54.5031,24.6367,66.8514,27.8939,17.9772,71.9231,42.0531,21.7482,31.8269,124.89,96.106,44.3059,44.1841,3.9631,25.28,89.6589,48.2512,42.5076,70.1364,4.0504,12.3616,93.8453,62.2718,87.5542,54.002,42.0881,30.1321,57.5396,155.455,490.1049,...,-0.010037,-0.01876,0.031047,0.003446,-0.003856,-0.082631,0.02444,-0.060391,-0.002704,-0.032089,-0.0105,0.045023,0.011767,-0.012111,-0.024611,0.006784,-0.002298,-0.00553,-0.025162,-0.010426,0.026367,-0.028056,0.018295,0.004207,0.00393,-0.01442,0.010339,0.014611,0.014157,0.060654,-0.034887,-0.040393,-0.013646,-0.019227,0.00509,0.012471,-0.021991,-0.097312,0.045188,0.003952,-0.036997,-0.034077,0.038468,-0.005488,-0.014796,0.040806,0.015353,-2.795074e-07,0.005538,0.043075,-0.000577,0.014431,-0.009259,0.004082,-0.022966,0.010501,0.02347,0.010894,-0.022604,-0.010525,-0.030055,0.028862,0.017025,0.041938,0.005172,-0.000601,-0.031099,-0.056066,-6.3e-05,0.115478,0.036307,0.015766,-0.052541,0.010377,0.020045,-0.045181,-0.012679,0.003707,-0.011758,0.038694,-0.037981,0.005764,-0.001951,-0.012845,-0.007365,0.008149,0.013421,-0.090242,0.016779,-0.003223,-0.006712,0.009308,0.001857,-0.012761,-0.002345,0.017529,-0.005394,0.004835,-0.009075,0.001706


In [24]:
# =========================
# 6) BASIC PRE-PROCESSING
#    - Ensure time order
#    - Create a unified timestamp if available
#    - Handle categorical
#    - Create lags/rolling for numeric features
# =========================

# Use the first well-known time column that exists in the merged training frame.
time_col = next(
    (c for c in ["timestamp", "datetime", "date", "time_id"] if c in train_full.columns),
    None,
)

if time_col is None:
    # No explicit time column: fall back to row order as the time axis.
    # The test index continues AFTER the last train index. If both started at 0,
    # sorting the train+test combination by this column (done when building
    # lag/rolling features) would interleave test rows with the earliest train
    # rows, and test lags would be computed from unrelated neighbours.
    # NOTE(review): this assumes rows are already in chronological order and that
    # test follows train in time — confirm against the data description.
    train_full["_fake_time_idx"] = np.arange(len(train_full))
    test["_fake_time_idx"] = np.arange(len(test)) + len(train_full)
    time_col = "_fake_time_idx"

# Identify potential grouping columns (e.g., symbol, asset, ticker); used to build
# per-series features. An empty list means the whole dataset is one series.
group_cols = [c for c in ["symbol", "asset", "ticker"] if c in train_full.columns]

# Identify numeric feature candidates (exclude ids, time, targets).
# "_rid" is the helper row id added by the feature-engineering cell; excluding it
# here keeps this cell safe to re-run after that cell has executed.
drop_like = (
    set(candidate_id_cols)
    | NEVER_FEATURE_COLS
    | set(target_cols)
    | {time_col, "_rid"}
)
num_feats = [
    c
    for c in train_full.columns
    if c not in drop_like and np.issubdtype(train_full[c].dtype, np.number)
]

# Object-typed columns are frequency-encoded in the next cell.
cat_cols = [
    c for c in train_full.columns if c not in drop_like and train_full[c].dtype == "O"
]

print(f"time_col: {time_col}")
print(f"group_cols: {group_cols}")
print(f"numeric feature candidates: {len(num_feats)}")
print(f"categorical columns: {cat_cols}")


time_col: _fake_time_idx
group_cols: []
numeric feature candidates: 557
categorical columns: []


In [25]:
# =========================
# 6a) CATEGORICAL ENCODING (target-agnostic, count/freq)
# =========================

for c in cat_cols:
    # Simple frequency encoding: share of training rows holding each category.
    freq = train_full[c].value_counts(normalize=True).to_dict()
    train_full[f"{c}_freq"] = train_full[c].map(freq)
    if c in test.columns:
        # Categories never seen in training get frequency 0.0.
        test[f"{c}_freq"] = test[c].map(freq).fillna(0.0)

# Update numeric features to include these encodings.
# Only append names that are not already present, so re-running this cell does
# not duplicate feature names (the previous `+=` grew the list on every run).
enc_cols = [f"{c}_freq" for c in cat_cols]
num_feats = num_feats + [c for c in enc_cols if c not in num_feats]

# Drop raw cats from modeling sets (kept for grouping if needed)
X_base_cols = sorted(set(num_feats + candidate_id_cols + group_cols + [time_col]))


In [26]:
# =========================
# 6b) SORT BY TIME & FEATURE ENGINEERING (LAGS/ROLLS) — INDEX-SAFE, RID-MERGE
# =========================

# 0) Give every row a unique '_rid' so engineered features can be merged back by
#    key instead of by index. Frames that already carry '_rid' are left alone.
if "_rid" not in train_full.columns:
    train_full = train_full.assign(_rid=np.arange(len(train_full)))
if "_rid" not in test.columns:
    # Test ids start at 10,000,000 to avoid collisions with train ids.
    test = test.assign(_rid=np.arange(len(test)) + 10_000_000)


def _sorted_groupby(df, group_cols, time_col):
    """Return (df_sorted, groupby_obj_or_None) sorted by (group_cols + time_col)."""
    if group_cols:
        df_sorted = df.sort_values(group_cols + [time_col]).copy()
        gobj = df_sorted.groupby(group_cols, sort=False)
    else:
        df_sorted = df.sort_values([time_col]).copy()
        gobj = None
    return df_sorted, gobj


def compute_lag_roll_features(
    df,
    group_cols,
    time_col,
    base_numeric_cols,
    lag_windows=None,
    roll_windows=None,
    roll_stats=None,
):
    """
    Compute lag and rolling-window features on a time-sorted copy of `df`.

    Parameters
    ----------
    df : DataFrame
        Must contain '_rid', `time_col`, `group_cols` and `base_numeric_cols`.
    group_cols : list of str
        Columns identifying independent series; an empty list treats the whole
        frame as a single series.
    time_col : str
        Column used to order rows before shifting / rolling.
    base_numeric_cols : list of str
        Columns to derive features from.
    lag_windows, roll_windows, roll_stats : optional
        Override the module-level LAG_WINDOWS / ROLL_WINDOWS / ROLL_STATS.
        When None (the default) the module-level values are used.

    Returns
    -------
    (features, feature_names)
        `features` is a thin frame holding '_rid' followed by the engineered
        columns in sorted name order; `feature_names` is that sorted name list.
        Merge `features` back by '_rid' rather than by index.

    Raises
    ------
    ValueError
        If `df` has no '_rid' column.

    Notes
    -----
    Columns are collected in a dict and assembled in one DataFrame construction.
    Inserting thousands of columns one at a time fragments the frame (pandas
    raises a PerformanceWarning for this) and is considerably slower.
    """
    if "_rid" not in df.columns:
        raise ValueError("Expected a unique row id column '_rid'.")

    lag_windows = LAG_WINDOWS if lag_windows is None else lag_windows
    roll_windows = ROLL_WINDOWS if roll_windows is None else roll_windows
    roll_stats = ROLL_STATS if roll_stats is None else roll_stats

    df_sorted, gobj = _sorted_groupby(df, group_cols, time_col)

    def _series_for(col):
        # Per-group view when grouping is active, otherwise the plain column.
        return gobj[col] if gobj is not None else df_sorted[col]

    def _drop_group_levels(result):
        # Grouped rolling results carry the group keys as extra index levels;
        # strip them so the result aligns with df_sorted's own index.
        if gobj is None:
            return result
        return result.reset_index(level=group_cols, drop=True)

    engineered = {}

    # ---- Lags
    for w in lag_windows:
        for col in base_numeric_cols:
            engineered[f"{col}_lag{w}"] = _series_for(col).shift(w)

    # ---- Rolling stats (the window includes the current row)
    for w in roll_windows:
        for col in base_numeric_cols:
            roll = _series_for(col).rolling(w)
            if "mean" in roll_stats:
                engineered[f"{col}_roll{w}_mean"] = _drop_group_levels(roll.mean())
            if "std" in roll_stats:
                engineered[f"{col}_roll{w}_std"] = _drop_group_levels(
                    roll.std(ddof=0)
                )

    feature_names = sorted(engineered)
    # Build all engineered columns at once, aligned on df_sorted's index.
    engineered_frame = pd.DataFrame(engineered, index=df_sorted.index)
    features = pd.concat(
        [df_sorted[["_rid"]], engineered_frame[feature_names]], axis=1
    )
    return features, feature_names


# --- TRAIN features are computed from training rows only, so nothing from the
#     test frame can influence them.
train_base_cols = list(set((group_cols or []) + [time_col, *num_feats, "_rid"]))
train_for_feats = train_full[train_base_cols].copy()

train_feats_only, fe_cols_train = compute_lag_roll_features(
    train_for_feats, group_cols, time_col, num_feats
)

# --- TEST features are computed on TRAIN followed by TEST, so the first test
#     rows have history to lag / roll over.
test_base_cols = list(train_base_cols)
test_for_feats = test[test_base_cols].copy()

combo_for_feats = pd.concat((train_for_feats, test_for_feats), ignore_index=True)
combo_feats_only, fe_cols_combo = compute_lag_roll_features(
    combo_for_feats, group_cols, time_col, num_feats
)

# Separate the combined result again using '_rid' membership (not the index).
combo_feats_train = combo_feats_only.loc[
    combo_feats_only["_rid"].isin(train_full["_rid"])
]
combo_feats_test = combo_feats_only.loc[combo_feats_only["_rid"].isin(test["_rid"])]

# Train keeps the train-only features (no influence from test rows); test takes
# the features computed with training history. combo_feats_train is available
# if continuity across the train/test boundary is ever wanted.
train_fe = train_full.merge(train_feats_only, how="left", on="_rid")
test_fe = test.merge(combo_feats_test, how="left", on="_rid")

# Final feature list: raw numeric inputs plus every engineered column name.
FEATURES = sorted(set(num_feats) | set(fe_cols_train) | set(fe_cols_combo))
print("Total feature count:", len(FEATURES))


Total feature count: 6127


In [27]:
# =========================
# 6c) IMPUTE + SCALE (optional but helpful)
#     We'll fit imputer/scaler per CV fold to avoid leakage.
# =========================

# Standardise features to zero mean / unit variance (not required by tree
# models, kept so every model sees the same inputs).
scaler = StandardScaler(with_std=True, with_mean=True)
# Fill missing numeric values with the per-column median.
imputer = SimpleImputer(strategy="median")

# We will fit these inside CV loops.


In [28]:
# =========================
# 7) TRAIN/EVAL HELPER — SHAPE-SAFE PREDICTIONS
# =========================
def get_time_series_order(df, time_col):
    """Return df's index labels arranged by ascending `time_col`."""
    chronological = df.sort_values(by=time_col)
    return chronological.index


def train_eval_single_target(df, target_col, model_kind="xgb"):
    """
    Train and evaluate XGB/LGBM on one target using TimeSeriesSplit.

    Parameters
    ----------
    df : DataFrame
        Must contain `target_col`, the module-level `time_col` and every column
        listed in FEATURES. Rows are re-ordered by `time_col` internally.
    target_col : str
        Name of the target column.
    model_kind : {"xgb", "lgbm"}
        Which regressor to fit in each fold.

    Returns
    -------
    (oof, agg, models)
        oof    : float array in time-sorted row order. Rows that never fall in
                 a validation fold (the first block of a TimeSeriesSplit) keep
                 the placeholder value 0.0.
        agg    : metric dict computed ONLY over rows that received an
                 out-of-fold prediction.
        models : list with the fitted model of each fold. They were trained on
                 imputed + scaled inputs; the per-fold imputer/scaler state is
                 not returned.

    Raises
    ------
    AssertionError
        If `target_col` is not a column of `df`.
    ValueError
        If `model_kind` is not 'xgb' or 'lgbm'.
    RuntimeError
        If a model returns a different number of predictions than rows given.
    """
    assert target_col in df.columns, f"Target {target_col} not found in dataframe."
    if model_kind not in ("xgb", "lgbm"):
        # Fail before any fitting work is done.
        raise ValueError("model_kind must be 'xgb' or 'lgbm'")

    idx_sorted = get_time_series_order(df, time_col)
    df = df.loc[idx_sorted].reset_index(drop=True)

    # Features/target arrays
    X = df[FEATURES].values
    y = df[target_col].astype(float).values.ravel()

    tscv = TimeSeriesSplit(n_splits=N_SPLITS)
    oof = np.zeros(len(df), dtype=float)
    # Marks the rows that actually received an out-of-fold prediction.
    scored = np.zeros(len(df), dtype=bool)
    models = []

    for fold, (tr_idx, va_idx) in enumerate(tscv.split(X), 1):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        # Impute/scale per fold: statistics come from the training part only.
        X_tr = imputer.fit_transform(X_tr)
        X_tr = scaler.fit_transform(X_tr)
        X_va = imputer.transform(X_va)
        X_va = scaler.transform(X_va)

        if model_kind == "xgb":
            model = XGBRegressor(**XGB_PARAMS)
        else:
            model = LGBMRegressor(**LGBM_PARAMS)

        model.fit(X_tr, y_tr)
        p = np.asarray(model.predict(X_va), dtype=float).ravel()

        # Safety: align lengths
        if len(p) != len(va_idx):
            raise RuntimeError(
                f"Prediction length {len(p)} != validation length {len(va_idx)}"
            )

        oof[va_idx] = p
        scored[va_idx] = True
        models.append(model)

        met = evaluate_metrics(y_va, p)
        met["fold"] = fold
        print(f"[{model_kind.upper()}] Fold {fold}:", met)

    # Aggregate metrics over predicted rows only. Scoring the whole array would
    # compare the never-validated first block against its 0.0 placeholders.
    agg = evaluate_metrics(y[scored], oof[scored])
    print(f"\n[{model_kind.upper()}] OOF metrics ({target_col}):", agg)
    return oof, agg, models


In [29]:
# =========================
# 8) ARIMA BASELINE (per-series, simple)
# =========================


def _fit_arima(y_series, order):
    """Fit one non-seasonal ARIMA of the given (p, d, q) order; return the results."""
    model = sm.tsa.ARIMA(
        y_series,
        order=order,
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    # `disp` is an optimizer option, not a parameter of ARIMA.fit itself, so it
    # has to be forwarded through `method_kwargs`. Passing it directly raises
    # TypeError, which the callers' best-effort handling would swallow.
    return model.fit(method="statespace", method_kwargs={"disp": 0})


def _arima_forecast_series(y_series, steps):
    """
    Forecast `steps` values beyond the end of the 1-D array `y_series`.

    Series shorter than 10 points use a naive forecast (last observed value, or
    0.0 when empty). Otherwise every order in ARIMA_ORDERS is fitted and the
    lowest-AIC fit is used; if none fits, (1, 1, 1) is tried once, and if that
    also fails the naive forecast is returned.
    """
    naive_value = y_series[-1] if len(y_series) else 0.0
    if len(y_series) < 10:
        return np.full(steps, naive_value, dtype=float)

    best_aic, best_res = np.inf, None
    for order in ARIMA_ORDERS:
        try:
            res = _fit_arima(y_series, order)
        except Exception:
            # Best effort: an order that fails to fit is skipped.
            continue
        if res.aic < best_aic:
            best_aic, best_res = res.aic, res

    try:
        if best_res is None:
            best_res = _fit_arima(y_series, (1, 1, 1))
        return np.asarray(best_res.forecast(steps=steps), dtype=float)
    except Exception:
        return np.full(steps, naive_value, dtype=float)


def arima_fit_forecast(train_df, test_df, target_col):
    """
    Fit a simple non-seasonal ARIMA to each group (or one global series when the
    module-level `group_cols` is empty) and forecast as many steps as that
    series has rows in `test_df`.

    Parameters
    ----------
    train_df : DataFrame
        Training rows; sorted by the module-level `time_col` per series, with
        NaN targets dropped before fitting.
    test_df : DataFrame
        Rows to forecast. Forecast step k is assigned to the k-th matching row
        in `test_df`'s current order, so callers should pass it time-sorted.
    target_col : str
        Column holding the series values.

    Returns
    -------
    numpy.ndarray
        Float array aligned to `test_df` row order. Test rows whose group never
        appears in `train_df` keep the initial value 0.0.
    """
    preds = np.zeros(len(test_df))

    if not group_cols:
        # Single global series
        y_series = train_df.sort_values(time_col)[target_col].dropna().values
        preds[:] = _arima_forecast_series(y_series, len(test_df))
        return preds

    # Work by groups to keep series coherent
    for keys, g_tr in train_df.groupby(group_cols):
        # matching rows in test
        mask_te = (
            (test_df[group_cols] == pd.Series(keys, index=group_cols))
            .all(axis=1)
            .to_numpy()
        )
        steps = int(mask_te.sum())
        if steps == 0:
            continue

        y_series = g_tr.sort_values(time_col)[target_col].dropna().values
        preds[mask_te] = _arima_forecast_series(y_series, steps)

    return preds


In [None]:
# =========================
# 9) TRAINING & VALIDATION (for each detected target)
# =========================

all_results = {}

for target in target_cols:
    print("\n" + "=" * 80)
    print(f"TARGET: {target}")
    print("=" * 80)

    # Drop rows where target is NaN
    df_t = train_fe.dropna(subset=[target]).copy()

    # XGB
    xgb_oof, xgb_metrics, xgb_models = train_eval_single_target(
        df_t, target, model_kind="xgb"
    )

    # LGBM
    lgbm_oof, lgbm_metrics, lgbm_models = train_eval_single_target(
        df_t, target, model_kind="lgbm"
    )

    # ARIMA (we'll evaluate on the same validation fold segmentation approach by aligning time)
    # For a fair-ish comparison, we simulate last split as validation for ARIMA:
    idx_sorted = get_time_series_order(df_t, time_col)
    df_t_sorted = df_t.loc[idx_sorted].reset_index(drop=True)

    tscv = TimeSeriesSplit(n_splits=N_SPLITS)
    arima_preds_full = np.zeros(len(df_t_sorted))
    last_va_idx = None
    for fold, (tr_idx, va_idx) in enumerate(tscv.split(df_t_sorted), 1):
        tr_df = df_t_sorted.iloc[tr_idx]
        va_df = df_t_sorted.iloc[va_idx]
        arima_va = arima_fit_forecast(tr_df, va_df, target_col=target)
        arima_preds_full[va_idx] = arima_va
        last_va_idx = va_idx

    arima_metrics = evaluate_metrics(df_t_sorted[target].values, arima_preds_full)
    print(f"[ARIMA] OOF-like metrics ({target}):", arima_metrics)

    # Collect results
    all_results[target] = {
        "XGB": xgb_metrics,
        "LGBM": lgbm_metrics,
        "ARIMA": arima_metrics,
    }

    # =========================
    # 10) VISUALIZE per target (last fold segment)
    # =========================
    try:
        # Build a small plot df on last validation segment
        plot_df = pd.DataFrame(
            {
                "timestamp": df_t_sorted.loc[last_va_idx, time_col].values,
                "actual": df_t_sorted.loc[last_va_idx, target].values,
                "XGB": xgb_oof[idx_sorted][last_va_idx]
                if len(xgb_oof) == len(df_t_sorted)
                else xgb_oof[last_va_idx],
                "LGBM": lgbm_oof[idx_sorted][last_va_idx]
                if len(lgbm_oof) == len(df_t_sorted)
                else lgbm_oof[last_va_idx],
                "ARIMA": arima_preds_full[last_va_idx],
            }
        ).sort_values("timestamp")
        plot_actual_vs_pred(plot_df, title=f"{target} – Last Fold Validation")
        bar_compare_metrics(all_results[target])
    except Exception as e:
        print("Plotting skipped due to:", e)

    # Free memory
    del xgb_oof, lgbm_oof
    gc.collect()

# Final summary: for every target, list each model followed by its metrics dict
print("\n\n=== Aggregated Results (per target) ===")
for tgt_name in all_results:
    per_model_metrics = all_results[tgt_name]
    print(f"\nTarget: {tgt_name}")
    for model_name, model_metrics in per_model_metrics.items():
        print(f"  {model_name}: {model_metrics}")



TARGET: target_0
[XGB] Fold 1: {'MAE': 0.010927124079082366, 'RMSE': np.float64(0.019158447139537994), 'MAPE(%)': np.float64(4336670.624559823), 'sMAPE(%)': np.float64(138.79446918794778), 'R2': -0.04395972070373122, 'fold': 1}
[XGB] Fold 2: {'MAE': 0.027446514545804702, 'RMSE': np.float64(0.03193921868988246), 'MAPE(%)': np.float64(14765587.19956954), 'sMAPE(%)': np.float64(166.04949090762145), 'R2': -12.787444631164352, 'fold': 2}
[XGB] Fold 3: {'MAE': 0.012293784986413045, 'RMSE': np.float64(0.01572817583031001), 'MAPE(%)': np.float64(357.9528569266651), 'sMAPE(%)': np.float64(138.97637175037315), 'R2': -0.38063762310979143, 'fold': 3}
[XGB] Fold 4: {'MAE': 0.007442168874487667, 'RMSE': np.float64(0.009214281899519818), 'MAPE(%)': np.float64(725451.0796239701), 'sMAPE(%)': np.float64(152.66339148970937), 'R2': -0.4539630315044103, 'fold': 4}
[XGB] Fold 5: {'MAE': 0.008825900765474988, 'RMSE': np.float64(0.012269746062163871), 'MAPE(%)': np.float64(1154347.708133735), 'sMAPE(%)': np

KeyboardInterrupt: 

In [None]:
# =========================
# 11) FIT ON FULL TRAIN & PREDICT TEST
#     (Create a submission file with columns: join keys + each target prediction)
# =========================

submission = test.copy()
# Names of the prediction columns, recorded as they are created so the final
# column selection does not rely on name-prefix matching (which would also pick
# up any pre-existing column whose name merely starts with a target name).
pred_cols = []

for target in target_cols:
    print(f"\nFitting full models for TEST predictions: {target}")

    # Rows without a label for this target cannot be used for training
    df_t = train_fe.dropna(subset=[target]).copy()

    # Re-run the CV training for XGB and LGBM. Only the returned models are used
    # below; the OOF predictions and metrics are not needed in this cell.
    xgb_oof, xgb_metrics, xgb_models = train_eval_single_target(
        df_t, target, model_kind="xgb"
    )
    lgbm_oof, lgbm_metrics, lgbm_models = train_eval_single_target(
        df_t, target, model_kind="lgbm"
    )

    # Predict test
    xgb_pred_test = fit_predict_test(train_fe, test_fe, xgb_models)
    lgbm_pred_test = fit_predict_test(train_fe, test_fe, lgbm_models)
    # NOTE(review): ARIMA receives the full `train_fe` here, whereas the
    # evaluation loop fits it on time-sorted rows with NaN targets dropped.
    # Confirm arima_fit_forecast handles unsorted / NaN-target input.
    arima_pred_test = arima_fit_forecast(train_fe, test_fe, target_col=target)

    # Per-model predictions plus their simple (unweighted) average
    target_preds = {
        f"{target}_xgb": xgb_pred_test,
        f"{target}_lgbm": lgbm_pred_test,
        f"{target}_arima": arima_pred_test,
        f"{target}_ens": (xgb_pred_test + lgbm_pred_test + arima_pred_test) / 3.0,
    }
    for pred_col, pred_values in target_preds.items():
        submission[pred_col] = pred_values
        pred_cols.append(pred_col)

    # Free memory (same per-target cleanup as the evaluation loop)
    del xgb_oof, lgbm_oof
    gc.collect()

# Keep only useful columns for the final file:
# any id columns present (to help evaluation on your side) ...
id_cols = [
    c
    for c in (
        "row_id",
        "id",
        "time_id",
        "timestamp",
        "symbol",
        "asset",
        "ticker",
        "date",
        "datetime",
    )
    if c in submission.columns
]
# ... followed by the prediction columns (de-duplicated, creation order kept)
keep_cols = id_cols + list(dict.fromkeys(pred_cols))

submission_final = submission[keep_cols].copy()

# Make sure the output directory exists so a fresh run does not fail on write
os.makedirs(os.path.dirname(SUBMISSION_PATH) or ".", exist_ok=True)
submission_final.to_csv(SUBMISSION_PATH, index=False)
print(f"\nWrote predictions to: {SUBMISSION_PATH}")
display(submission_final.head())



Fitting full models for TEST predictions: target_0
[XGB] Fold 1: {'MAE': 0.010927124079082366, 'RMSE': np.float64(0.019158447139537994), 'MAPE(%)': np.float64(4336670.624559823), 'sMAPE(%)': np.float64(138.79446918794778), 'R2': -0.04395972070373122, 'fold': 1}
[XGB] Fold 2: {'MAE': 0.027446514545804702, 'RMSE': np.float64(0.03193921868988246), 'MAPE(%)': np.float64(14765587.19956954), 'sMAPE(%)': np.float64(166.04949090762145), 'R2': -12.787444631164352, 'fold': 2}
[XGB] Fold 3: {'MAE': 0.012293784986413045, 'RMSE': np.float64(0.01572817583031001), 'MAPE(%)': np.float64(357.9528569266651), 'sMAPE(%)': np.float64(138.97637175037315), 'R2': -0.38063762310979143, 'fold': 3}
[XGB] Fold 4: {'MAE': 0.007442168874487667, 'RMSE': np.float64(0.009214281899519818), 'MAPE(%)': np.float64(725451.0796239701), 'sMAPE(%)': np.float64(152.66339148970937), 'R2': -0.4539630315044103, 'fold': 4}
[XGB] Fold 5: {'MAE': 0.008825900765474988, 'RMSE': np.float64(0.012269746062163871), 'MAPE(%)': np.float64(

KeyboardInterrupt: 