
# MITSUI & CO. Commodity Prediction Challenge — Non-Neural Baseline
**Models:** Linear/Ridge/Lasso/ElasticNet, RandomForest, HistGradientBoosting, with optional XGBoost / LightGBM  
**Metric:** Competition Sharpe-like (mean Spearman over time groups divided by std)  
**Notes:**  
- Internet disabled; pure Python + installed libs only.  
- Time-series-safe CV with `TimeSeriesSplit`.  
- Minimal leakage by sorting on time and avoiding future data in CV.  
- LightGBM/XGBoost are attempted if available; otherwise skipped automatically.  
- No neural networks are used.


In [2]:

# --- Imports & Setup ---
import os
import sys
import json
import math
import time
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd

# SciPy for Spearman
try:
    from scipy.stats import spearmanr
except Exception as e:
    # Fallback: simple Spearman via pandas corr(method='spearman') if SciPy not present
    spearmanr = None

# Sklearn models (non-NN)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Optional models
try:
    from xgboost import XGBRegressor
    XGB_OK = True
except Exception:
    XGB_OK = False

try:
    from lightgbm import LGBMRegressor
    LGBM_OK = True
except Exception:
    LGBM_OK = False

DATA_DIR = Path("./dataset")
print("Files in /mnt/data:", [p.name for p in DATA_DIR.glob("*")])


Files in /mnt/data: ['playground.ipynb', 'target_pairs.csv', 'test.csv', 'train.csv', 'train_labels.csv']


## 1) Load Data

In [3]:

# Try to load the provided files
train_path = DATA_DIR / "train.csv"
labels_path = DATA_DIR / "train_labels.csv"
test_path  = DATA_DIR / "test.csv"
pairs_path = DATA_DIR / "target_pairs.csv"

train = pd.read_csv(train_path) if train_path.exists() else None
labels = pd.read_csv(labels_path) if labels_path.exists() else None
test  = pd.read_csv(test_path) if test_path.exists() else None
pairs = pd.read_csv(pairs_path) if pairs_path.exists() else None

for name, df in [("train", train), ("train_labels", labels), ("test", test), ("target_pairs", pairs)]:
    if df is not None:
        display(df.head(3))
        print(name, "shape:", df.shape)
        print(name, "columns:", list(df.columns))
        print()
    else:
        print(f"WARNING: {name} not found at {DATA_DIR}")


Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,...,FX_GBPCAD,FX_CADCHF,FX_NZDCAD,FX_NZDCHF,FX_ZAREUR,FX_NOKGBP,FX_NOKCHF,FX_ZARCHF,FX_NOKJPY,FX_ZARGBP
0,0,2264.5,7205.0,2570.0,3349.0,,,,,,...,1.699987,0.776874,0.888115,0.689954,0.066653,0.090582,0.11963,0.078135,13.82274,0.059163
1,1,2228.0,7147.0,2579.0,3327.0,,,,,,...,1.695279,0.778682,0.889488,0.692628,0.067354,0.091297,0.12052,0.079066,13.888146,0.059895
2,2,2250.0,7188.5,2587.0,3362.0,4684.0,4691.0,4684.0,3363.0,3367.0,...,1.692724,0.780186,0.894004,0.69749,0.067394,0.091478,0.120809,0.079287,13.983675,0.060037


train shape: (1961, 558)
train columns: ['date_id', 'LME_AH_Close', 'LME_CA_Close', 'LME_PB_Close', 'LME_ZS_Close', 'JPX_Gold_Mini_Futures_Open', 'JPX_Gold_Rolling-Spot_Futures_Open', 'JPX_Gold_Standard_Futures_Open', 'JPX_Platinum_Mini_Futures_Open', 'JPX_Platinum_Standard_Futures_Open', 'JPX_RSS3_Rubber_Futures_Open', 'JPX_Gold_Mini_Futures_High', 'JPX_Gold_Rolling-Spot_Futures_High', 'JPX_Gold_Standard_Futures_High', 'JPX_Platinum_Mini_Futures_High', 'JPX_Platinum_Standard_Futures_High', 'JPX_RSS3_Rubber_Futures_High', 'JPX_Gold_Mini_Futures_Low', 'JPX_Gold_Rolling-Spot_Futures_Low', 'JPX_Gold_Standard_Futures_Low', 'JPX_Platinum_Mini_Futures_Low', 'JPX_Platinum_Standard_Futures_Low', 'JPX_RSS3_Rubber_Futures_Low', 'JPX_Gold_Mini_Futures_Close', 'JPX_Gold_Rolling-Spot_Futures_Close', 'JPX_Gold_Standard_Futures_Close', 'JPX_Platinum_Mini_Futures_Close', 'JPX_Platinum_Standard_Futures_Close', 'JPX_RSS3_Rubber_Futures_Close', 'JPX_Gold_Mini_Futures_Volume', 'JPX_Gold_Rolling-Spot_Futur

Unnamed: 0,date_id,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,...,target_414,target_415,target_416,target_417,target_418,target_419,target_420,target_421,target_422,target_423
0,0,0.005948,-0.002851,-0.004675,-0.000639,,,-0.006729,0.006066,,...,,0.021239,-0.005595,,-0.004628,0.033793,,0.038234,,0.02731
1,1,0.005783,-0.024118,-0.007052,-0.018955,-0.031852,-0.019452,0.003002,-0.006876,-0.002042,...,0.003377,0.021372,-0.001517,0.012846,0.010547,0.030527,-0.000764,0.025021,0.003548,0.02094
2,2,0.001048,0.023836,-0.008934,-0.02206,,,0.037449,0.007658,,...,-0.006712,0.009308,0.001857,-0.012761,-0.002345,0.017529,-0.005394,0.004835,-0.009075,0.001706


train_labels shape: (1961, 425)
train_labels columns: ['date_id', 'target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8', 'target_9', 'target_10', 'target_11', 'target_12', 'target_13', 'target_14', 'target_15', 'target_16', 'target_17', 'target_18', 'target_19', 'target_20', 'target_21', 'target_22', 'target_23', 'target_24', 'target_25', 'target_26', 'target_27', 'target_28', 'target_29', 'target_30', 'target_31', 'target_32', 'target_33', 'target_34', 'target_35', 'target_36', 'target_37', 'target_38', 'target_39', 'target_40', 'target_41', 'target_42', 'target_43', 'target_44', 'target_45', 'target_46', 'target_47', 'target_48', 'target_49', 'target_50', 'target_51', 'target_52', 'target_53', 'target_54', 'target_55', 'target_56', 'target_57', 'target_58', 'target_59', 'target_60', 'target_61', 'target_62', 'target_63', 'target_64', 'target_65', 'target_66', 'target_67', 'target_68', 'target_69', 'target_70', 'target_71', 'target_

Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,...,FX_CADCHF,FX_NZDCAD,FX_NZDCHF,FX_ZAREUR,FX_NOKGBP,FX_NOKCHF,FX_ZARCHF,FX_NOKJPY,FX_ZARGBP,is_scored
0,1827,2684.5,9190.0,1967.0,2942.0,13623.0,13920.0,13618.0,4696.0,4692.0,...,0.631633,0.808485,0.510666,0.051733,0.071654,0.079797,0.048828,13.631347,0.043845,True
1,1828,2691.5,9275.0,1985.0,2963.5,13640.0,13922.0,13634.0,4613.0,4613.0,...,0.633526,0.812571,0.514785,0.051802,0.071793,0.080214,0.048912,13.743387,0.043778,True
2,1829,2646.0,9284.5,1971.0,2914.0,13634.0,13923.0,13638.0,4647.0,4632.0,...,0.632156,0.811948,0.513278,0.051902,0.07163,0.080134,0.048971,13.766241,0.043774,True


test shape: (134, 559)
test columns: ['date_id', 'LME_AH_Close', 'LME_CA_Close', 'LME_PB_Close', 'LME_ZS_Close', 'JPX_Gold_Mini_Futures_Open', 'JPX_Gold_Rolling-Spot_Futures_Open', 'JPX_Gold_Standard_Futures_Open', 'JPX_Platinum_Mini_Futures_Open', 'JPX_Platinum_Standard_Futures_Open', 'JPX_RSS3_Rubber_Futures_Open', 'JPX_Gold_Mini_Futures_High', 'JPX_Gold_Rolling-Spot_Futures_High', 'JPX_Gold_Standard_Futures_High', 'JPX_Platinum_Mini_Futures_High', 'JPX_Platinum_Standard_Futures_High', 'JPX_RSS3_Rubber_Futures_High', 'JPX_Gold_Mini_Futures_Low', 'JPX_Gold_Rolling-Spot_Futures_Low', 'JPX_Gold_Standard_Futures_Low', 'JPX_Platinum_Mini_Futures_Low', 'JPX_Platinum_Standard_Futures_Low', 'JPX_RSS3_Rubber_Futures_Low', 'JPX_Gold_Mini_Futures_Close', 'JPX_Gold_Rolling-Spot_Futures_Close', 'JPX_Gold_Standard_Futures_Close', 'JPX_Platinum_Mini_Futures_Close', 'JPX_Platinum_Standard_Futures_Close', 'JPX_RSS3_Rubber_Futures_Close', 'JPX_Gold_Mini_Futures_Volume', 'JPX_Gold_Rolling-Spot_Futures_

Unnamed: 0,target,lag,pair
0,target_0,1,US_Stock_VT_adj_close
1,target_1,1,LME_PB_Close - US_Stock_VT_adj_close
2,target_2,1,LME_CA_Close - LME_ZS_Close


target_pairs shape: (424, 3)
target_pairs columns: ['target', 'lag', 'pair']



## 2) Column Inference & Merge with Labels

In [4]:

def guess_time_col(df: pd.DataFrame):
    if df is None: return None
    # Common time-like names
    for c in df.columns:
        lc = str(c).lower()
        if lc in ("date","timestamp","time","time_id","datetime"):
            return c
    # Try to detect by dtype
    for c in df.columns:
        try:
            if np.issubdtype(df[c].dtype, np.datetime64):
                return c
        except Exception:
            pass
    return None

def ensure_datetime(df, col):
    if df is None or col is None: return df, col
    if not np.issubdtype(df[col].dtype, np.datetime64):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                df[col] = pd.to_datetime(df[col], errors="ignore")
            except Exception:
                pass
    return df, col

def guess_pair_col(df: pd.DataFrame):
    if df is None: return None
    # look for pair_id or asset_id-like keys
    for key in ["pair_id", "asset_id", "pair", "symbol", "ticker", "pair_code"]:
        if key in df.columns: return key
    return None

def guess_row_id_col(df: pd.DataFrame):
    if df is None: return None
    for c in ["row_id","id","rowid","pred_id","sample_id"]:
        if c in df.columns: return c
    return None

def guess_target_col(df: pd.DataFrame):
    if df is None: return None
    candidates = [c for c in df.columns if str(c).lower() in ("target","y","ret","return","label")]
    if candidates:
        return candidates[0]
    # Otherwise guess last non-key numeric column
    key_like = {"row_id","id","date","timestamp","time","time_id","datetime","pair","pair_id","asset_id","symbol","ticker"}
    numeric = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and str(c).lower() not in key_like]
    if numeric:
        return numeric[-1]
    return None

# Try to infer keys
tcol_train = guess_time_col(train)
tcol_test  = guess_time_col(test)
train, tcol_train = ensure_datetime(train, tcol_train)
test,  tcol_test  = ensure_datetime(test,  tcol_test)

pcol_train = guess_pair_col(train)
pcol_test  = guess_pair_col(test)

rid_test = guess_row_id_col(test)

print("Inferred time column (train,test):", tcol_train, tcol_test)
print("Inferred pair column (train,test):", pcol_train, pcol_test)
print("Inferred test row_id:", rid_test)

# Merge labels if provided and not already in train
ycol = None
if labels is not None and train is not None:
    # Find common merge keys between train and labels
    common = [c for c in train.columns if c in labels.columns]
    # Prefer a minimal set of keys
    preferred_keys = ["row_id","id"]
    keys = [k for k in preferred_keys if k in common] or common
    if keys:
        print("Merging labels on keys:", keys[:3], "(showing up to 3)")
        train_merged = train.merge(labels, on=keys, how="left")
    else:
        print("No shared merge keys found with labels; assuming labels already in train.")
        train_merged = train.copy()
else:
    print("No separate labels file, assuming target is in train.")
    train_merged = train.copy() if train is not None else None

ycol = guess_target_col(train_merged) if train_merged is not None else None
print("Inferred target column:", ycol)

if ycol is None:
    raise ValueError("Could not infer a target column. Please rename your target to one of: target, y, ret, return, label.")


Inferred time column (train,test): None None
Inferred pair column (train,test): None None
Inferred test row_id: None
Merging labels on keys: ['date_id'] (showing up to 3)
Inferred target column: target_423


## 3) Simple Feature Engineering (Safe Defaults)

In [5]:

def add_time_features(df: pd.DataFrame, tcol: str):
    if df is None or tcol is None: 
        return df
    out = df.copy()
    if not np.issubdtype(out[tcol].dtype, np.datetime64):
        try:
            out[tcol] = pd.to_datetime(out[tcol], errors="coerce")
        except Exception:
            return out
    out["year"]  = out[tcol].dt.year
    out["month"] = out[tcol].dt.month
    out["day"]   = out[tcol].dt.day
    out["dow"]   = out[tcol].dt.dayofweek
    out["dom"]   = out[tcol].dt.day
    out["hour"]  = out[tcol].dt.hour
    return out

def add_group_lags(df: pd.DataFrame, tcol: str, gcol: str, num_lags=3, roll_windows=(3,7)):
    if df is None or tcol is None or gcol is None:
        return df
    out = df.copy()
    # Find numeric columns to lag (exclude keys and target)
    key_like = {tcol, gcol, "row_id","id"}
    if "target" in out.columns:
        key_like.add("target")
    num_cols = [c for c in out.columns if pd.api.types.is_numeric_dtype(out[c]) and c not in key_like]
    # Sort by group,time for proper lags
    out = out.sort_values([gcol, tcol]).reset_index(drop=True)
    for c in num_cols:
        for L in range(1, num_lags+1):
            out[f"{c}_lag{L}"] = out.groupby(gcol, dropna=False)[c].shift(L)
        for w in roll_windows:
            out[f"{c}_rmean{w}"] = out.groupby(gcol, dropna=False)[c].shift(1).rolling(w).mean().reset_index(level=0, drop=True)
            out[f"{c}_rstd{w}"]  = out.groupby(gcol, dropna=False)[c].shift(1).rolling(w).std().reset_index(level=0, drop=True)
    return out

train_fe = add_time_features(train_merged, tcol_train)
test_fe  = add_time_features(test, tcol_test)

train_fe = add_group_lags(train_fe, tcol_train, pcol_train, num_lags=3, roll_windows=(3,7))
test_fe  = add_group_lags(test_fe,  tcol_test,  pcol_test,  num_lags=3, roll_windows=(3,7))

print("Feature-engineered shapes:", None if train_fe is None else train_fe.shape, None if test_fe is None else test_fe.shape)
display(train_fe.head(3) if train_fe is not None else "No train data")


Feature-engineered shapes: (1961, 982) (134, 559)


Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,...,target_414,target_415,target_416,target_417,target_418,target_419,target_420,target_421,target_422,target_423
0,0,2264.5,7205.0,2570.0,3349.0,,,,,,...,,0.021239,-0.005595,,-0.004628,0.033793,,0.038234,,0.02731
1,1,2228.0,7147.0,2579.0,3327.0,,,,,,...,0.003377,0.021372,-0.001517,0.012846,0.010547,0.030527,-0.000764,0.025021,0.003548,0.02094
2,2,2250.0,7188.5,2587.0,3362.0,4684.0,4691.0,4684.0,3363.0,3367.0,...,-0.006712,0.009308,0.001857,-0.012761,-0.002345,0.017529,-0.005394,0.004835,-0.009075,0.001706


## 4) Build CV Folds (TimeSeriesSplit)

In [6]:

# Sort by time globally to avoid peeking
if train_fe is not None and tcol_train is not None and tcol_train in train_fe.columns:
    train_fe = train_fe.sort_values(tcol_train).reset_index(drop=True)

# Define features/target
key_cols = set([c for c in [tcol_train, pcol_train, "row_id","id"] if (c is not None and c in (train_fe.columns if train_fe is not None else []))])
X_cols = [c for c in (train_fe.columns if train_fe is not None else []) if c != ycol and c not in key_cols and pd.api.types.is_numeric_dtype(train_fe[c])]

print("Number of feature columns:", len(X_cols))
print("First 15 feature columns:", X_cols[:15])

X = train_fe[X_cols].fillna(0.0).astype(float)
y = train_fe[ycol].astype(float)

# TimeSeriesSplit
N_SPLITS = 5 if len(train_fe) > 1000 else 3
tscv = TimeSeriesSplit(n_splits=N_SPLITS)


Number of feature columns: 981
First 15 feature columns: ['date_id', 'LME_AH_Close', 'LME_CA_Close', 'LME_PB_Close', 'LME_ZS_Close', 'JPX_Gold_Mini_Futures_Open', 'JPX_Gold_Rolling-Spot_Futures_Open', 'JPX_Gold_Standard_Futures_Open', 'JPX_Platinum_Mini_Futures_Open', 'JPX_Platinum_Standard_Futures_Open', 'JPX_RSS3_Rubber_Futures_Open', 'JPX_Gold_Mini_Futures_High', 'JPX_Gold_Rolling-Spot_Futures_High', 'JPX_Gold_Standard_Futures_High', 'JPX_Platinum_Mini_Futures_High']


## 5) Define Competition Metric (Sharpe-like of Spearman)

In [7]:

def sharpe_like_spearman(y_true: np.ndarray, y_pred: np.ndarray, groups: np.ndarray=None):
    """
    Compute mean Spearman correlation across groups divided by the std across groups.
    If groups is None, compute once over all data with std=1 to avoid division by zero.
    """
    if groups is None:
        # Single group fallback
        if spearmanr is not None:
            rho = spearmanr(y_true, y_pred, nan_policy='omit').correlation
        else:
            rho = pd.Series(y_true).corr(pd.Series(y_pred), method='spearman')
        return float(rho) if rho is not None and not np.isnan(rho) else 0.0

    df = pd.DataFrame({"y": y_true, "p": y_pred, "g": groups})
    rhos = []
    for g, sub in df.groupby("g"):
        if len(sub) < 3:
            continue
        if spearmanr is not None:
            r = spearmanr(sub["y"], sub["p"], nan_policy='omit').correlation
        else:
            r = sub["y"].corr(sub["p"], method='spearman')
        if r is not None and not np.isnan(r):
            rhos.append(r)
    if len(rhos) == 0:
        return 0.0
    rhos = np.array(rhos, dtype=float)
    mean_r = rhos.mean()
    std_r  = rhos.std(ddof=1) if len(rhos) > 1 else 1.0
    return float(mean_r / std_r if std_r != 0 else 0.0)

# Choose grouping key for the metric: prefer time, else pair, else None
group_key_for_metric = tcol_train if (tcol_train is not None and tcol_train in train_fe.columns) else (pcol_train if (pcol_train is not None and pcol_train in train_fe.columns) else None)
groups_array = train_fe[group_key_for_metric].values if (group_key_for_metric is not None and train_fe is not None) else None

print("Grouping key for metric:", group_key_for_metric)


Grouping key for metric: None


## 6) Models (No Neural Nets) & CV Evaluation

In [8]:

models = []

# Linear family
models.append(("Linear", Pipeline([("scaler", StandardScaler(with_mean=False)), ("reg", LinearRegression())])))
models.append(("Ridge",  Pipeline([("scaler", StandardScaler(with_mean=False)), ("reg", Ridge(alpha=1.0, random_state=42))])))
models.append(("Lasso",  Pipeline([("scaler", StandardScaler(with_mean=False)), ("reg", Lasso(alpha=1e-3, random_state=42, max_iter=10000))])))
models.append(("ElasticNet", Pipeline([("scaler", StandardScaler(with_mean=False)), ("reg", ElasticNet(alpha=1e-3, l1_ratio=0.2, random_state=42, max_iter=10000))])))

# Tree-based
models.append(("RandomForest", RandomForestRegressor(n_estimators=300, max_depth=None, n_jobs=-1, random_state=42)))
models.append(("HistGB", HistGradientBoostingRegressor(max_depth=None, learning_rate=0.05, max_iter=500, random_state=42)))

if XGB_OK:
    models.append(("XGB", XGBRegressor(
        n_estimators=800, 
        max_depth=6, 
        learning_rate=0.04, 
        subsample=0.8, 
        colsample_bytree=0.8, 
        reg_alpha=0.0, 
        reg_lambda=1.0, 
        random_state=42,
        tree_method="hist",
        n_jobs=-1
    )))
else:
    print("XGBoost not installed; skipping XGBRegressor.")

if LGBM_OK:
    models.append(("LGBM", LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )))
else:
    print("LightGBM not installed; skipping LGBMRegressor.")

results = []
oof_preds = {}

for name, model in models:
    print(f"\n=== CV: {name} ===")
    fold_scores = []
    preds_all = np.zeros(len(X))
    for fold, (tr_idx, va_idx) in enumerate(tscv.split(X)):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        # Fit
        model.fit(X_tr, y_tr)
        p = model.predict(X_va)
        preds_all[va_idx] = p

        # Metric by groups within validation
        grp_va = groups_array[va_idx] if groups_array is not None else None
        score = sharpe_like_spearman(y_va.values, p, grp_va)
        fold_scores.append(score)
        print(f"  Fold {fold}: Sharpe-like Spearman = {score:.5f}")

    mean_score = float(np.mean(fold_scores)) if fold_scores else 0.0
    std_score  = float(np.std(fold_scores, ddof=1)) if len(fold_scores) > 1 else 0.0
    print(f"  {name} CV mean={mean_score:.5f}, std={std_score:.5f}")
    results.append({"model": name, "cv_mean": mean_score, "cv_std": std_score})
    oof_preds[name] = preds_all

results_df = pd.DataFrame(results).sort_values("cv_mean", ascending=False).reset_index(drop=True)
display(results_df)


XGBoost not installed; skipping XGBRegressor.
LightGBM not installed; skipping LGBMRegressor.

=== CV: Linear ===


ValueError: Input y contains NaN.

## 7) Fit Best Model on Full Train & Predict Test

In [None]:

best_row = results_df.iloc[0]
best_name = best_row["model"]
print("Best model by CV:", best_name)

# Retrieve the trained object definition for consistency
best_model = None
for name, model in models:
    if name == best_name:
        best_model = model
        break

# Refit on full training
best_model.fit(X, y)

# Prepare test features
test_pred = None
if test_fe is not None and isinstance(test_fe, pd.DataFrame) and len(test_fe) > 0:
    # align columns
    missing = [c for c in X_cols if c not in test_fe.columns]
    for m in missing:
        test_fe[m] = 0.0
    X_test = test_fe[X_cols].fillna(0.0).astype(float)
    test_pred = best_model.predict(X_test)
else:
    print("No test set found; skipping prediction.")


## 8) Build Submission File

In [None]:

sub = None
if test_pred is not None:
    # Row id
    if rid_test is not None and rid_test in test.columns:
        sub = pd.DataFrame({rid_test: test[rid_test], "prediction": test_pred})
    else:
        # fallback to integer index (Kaggle API may remap later; adjust if sample submission provided)
        sub = pd.DataFrame({"row_id": np.arange(len(test_pred)), "prediction": test_pred})
    sub_path = (Path("/mnt/data") / "submission.csv")
    sub.to_csv(sub_path, index=False)
    print("Saved submission to:", sub_path)
    display(sub.head())
else:
    print("Submission not created because test predictions were not produced.")


## 9) Save Artifacts

In [None]:

import joblib
ART_DIR = Path("/mnt/data") / "artifacts"
ART_DIR.mkdir(exist_ok=True)

# Save best model
model_path = ART_DIR / f"best_model_{best_name}.joblib"
joblib.dump(best_model, model_path)

# Save feature columns for inference compatibility
with open(ART_DIR / "feature_columns.json", "w") as f:
    json.dump(X_cols, f)

# Save CV leaderboard
results_csv = ART_DIR / "cv_results.csv"
results_df.to_csv(results_csv, index=False)

print("Saved:", model_path)
print("Saved:", results_csv)
print("Saved feature columns list.")


## 10) Next Steps & Tips


- Verify that `train_labels.csv` actually contains the target and the right join keys. If not, rename your columns:
  - Target name suggestions the notebook can auto-detect: `target`, `y`, `ret`, `return`, or `label`.
  - Keys like `row_id` or `id` help merging labels safely.
- Ensure time ordering by **true event time** (not file order). If your file uses `time_id`, keep it increasing.
- For correctness with the challenge metric:
  - Prefer grouping the metric by **time** so each timestamp computes a Spearman across pairs, then apply mean/std.
  - If no time column exists, consider adding one or approximate with an integer `time_id` in order.
- Feature ideas that remain non-leaky:
  - More lags and rolling stats (volatility, autocorr), per pair.
  - Cross-asset spread features using `target_pairs.csv` (e.g., lagged differences).
  - Calendar effects (month, dow, holidays).
- Hyperparameters:
  - For XGBoost/LightGBM, keep `tree_method='hist'` (XGB) and moderate `n_estimators` to fit under 8h.
  - Use early stopping if you have a proper validation split (not used here to keep the code simple).
- Ensembling:
  - Average predictions from top 2–3 models (e.g., LGBM, XGB, HistGB) using validation-weighted means.
- Submission:
  - This notebook writes `/mnt/data/submission.csv`. The official challenge uses an evaluation API; adapt the final
    format to the API requirements (row ids, column names) or mirror the example notebook from the organizers.
