In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory and file name so the printed path is directly usable.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/mitsui-commodity-prediction-challenge/target_pairs.csv
/kaggle/input/mitsui-commodity-prediction-challenge/train_labels.csv
/kaggle/input/mitsui-commodity-prediction-challenge/train.csv
/kaggle/input/mitsui-commodity-prediction-challenge/test.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_1.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_4.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_3.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_2.csv
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/mitsui_inference_server.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/mitsui_gateway.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/__init__.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/templates.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/base_gateway.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/relay.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/__init__.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/generated/__init__.py
#Code Summary – Hybrid Stagewise Ensemble for Mitsui Commodity Prediction

#This script implements a multi-stage ensemble pipeline for the Kaggle Mitsui & Co. Commodity Prediction Challenge. It combines classical machine learning, deep learning, and gradient boosting in a stacked architecture designed for robust time-series forecasting of 424 target variables.

#1. Configuration & Setup

#Global Config (CFG):

#Dataset path (/kaggle/input/mitsui-commodity-prediction-challenge/)

#Random seed for reproducibility

#Total number of targets: 424

#Null filler: 0.0

#Time Neural Network (TNN) parameters: epochs, lookback window, hidden units

#Stage-2 LightGBM hyperparameters (learning rate, leaves, estimators, etc.)

#Device: Uses GPU if available (cuda), otherwise CPU.

#2. Data Preprocessing

#Cleaning: Converts object columns to numeric, handles missing/inf values, fills NaN with 0.0.

#Feature Engineering:

#Adds lag features (1,2,3 steps)

#Adds rolling mean/std features

#Stabilization: Adds small Gaussian noise to flat predictions to avoid identical row outputs.

#3. Model Components
#🔹 Extra Trees Regressor (ETR – Stage 1 classical model)

#Ensemble of randomized decision trees.

#Provides fast, stable base forecasts.

#🔹 Time Neural Network (TNN – Stage 1 deep model)

#Depthwise Separable 1D Convolutions → capture local temporal patterns.

#Time Attention Layer → learns dependencies across time steps.

#Pooling & Dense Head → compresses features and predicts 424 targets simultaneously.

#Optimized using AdamW and SmoothL1Loss.

#🔹 Stage 2 LightGBM (Residual Learner)

#Trains one LightGBM model per target (424 total).

#Input: Stage-1 predictions + original features.

#Learns residual corrections, boosting final accuracy.

#4. Training Pipeline

#Stage 1 (Base Models):

#Trains Extra Trees + TNN on lagged features and labels.

#Uses multiple random seeds for robustness → ensemble averaging.

#Produces out-of-sample Stage-1 predictions.

#Stage 2 (Stacking & Residuals):

#Computes residuals between Stage-1 predictions and true labels.

#Trains 424 LightGBM models (one per target) on residuals.

#Combines Stage-1 and Stage-2 for final predictions.

#Evaluation Metrics:

#Stage-1 and Stage-2 evaluated with RMSE and MAE.

#Reports improvement after residual correction.

#5. Prediction Workflow

#Test data is cleaned and scaled, and lag features are generated.

#Stage-1 predictions are computed (ETR + TNN ensemble).

#Stage-2 LightGBM models adjust residuals.

#Final outputs are stabilized to avoid constant rows.

#Returns prediction DataFrame with all 424 targets.

#6. Inference Integration

#Integrated with Kaggle’s inference server:

#inference_server.serve() when running in competition rerun mode.

#inference_server.run_local_gateway() for local testing/debugging.
import os
import gc
import math
import numpy as np
import pandas as pd
import polars as pl
import warnings

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import lightgbm as lgb

import torch
import torch.nn as nn
import torch.optim as optim

import kaggle_evaluation.mitsui_inference_server

warnings.simplefilter("ignore")

# =========================
# Config & Globals
# =========================
class CFG:
    """Static configuration shared by every stage of the pipeline."""

    # Directory holding the competition CSV files.
    path = "/kaggle/input/mitsui-commodity-prediction-challenge/"
    # Base random seed; Stage-1 members use seed, seed + 1, ...
    seed = 42
    # Number of target columns and their names (target_0 .. target_423).
    total_targets = 424
    targets = [f"target_{i}" for i in range(total_targets)]
    # Value substituted for missing labels.
    solution_null_filler = 0.0

    # TNN hyper-parameters; each can be overridden through an environment variable.
    tnn_epochs = int(os.environ.get("KAGGLE_TNN_EPOCHS", "2"))
    tnn_lookback = int(os.environ.get("KAGGLE_TNN_LOOKBACK", "16"))
    tnn_hidden = int(os.environ.get("KAGGLE_TNN_HIDDEN", "256"))

    # Number of independently seeded Stage-1 members to train.
    n_stage1_seeds = 3
    # Parameters handed to lgb.train for every Stage-2 residual model.
    stage2_params = dict(
        objective="regression",
        metric="rmse",
        verbosity=-1,
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=64,
    )

def _set_seed(s):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs with `s`."""
    import random as py_random

    # Same seeding order as before: stdlib, NumPy, torch CPU, torch CUDA.
    seeders = (
        py_random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(s)

# Seed every RNG once, at import time, with the configured base seed.
_set_seed(CFG.seed)

# Torch device: CUDA when it is available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Globals
# Module-level state populated by train_stage1() / load_and_train_model().
STAGE1_MODELS = []  # one (etr, tnn, scaler) tuple per Stage-1 seed
STAGE2_MODEL = None  # list of per-target LightGBM models once Stage 2 is trained
FEATURES = []  # feature column names; set by the first train_stage1() call
MODEL_READY = False  # set to True at the end of load_and_train_model()

# =========================
# Utils
# =========================
def clean_dataframe(df):
    """Coerce a frame to finite numeric values, in place.

    Object-dtype columns are converted to numbers after stripping commas
    (values that cannot be parsed become NaN); +/-inf and NaN are then
    replaced by 0.0.

    Args:
        df: pandas DataFrame to clean, or None.

    Returns:
        The same object that was passed in (None and empty frames are
        returned untouched).
    """
    if df is None or df.empty:
        return df

    for name in df.select_dtypes(include="object").columns:
        as_text = df[name].astype(str).str.replace(",", "")
        df[name] = pd.to_numeric(as_text, errors="coerce")

    # Infinite values are folded into NaN first so one fill handles both.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0.0, inplace=True)
    return df

def preprocess_columns(df):
    """Clean `df` with clean_dataframe() and return it with lag/rolling features added."""
    return add_lag_features(clean_dataframe(df))

def add_lag_features(df, lags=(1, 2, 3)):
    """Return `df` sorted by date_id with lag and 3-row rolling features appended.

    For every column other than ``date_id`` the result gains
    ``<col>_lag<k>`` for each k in `lags`, plus ``<col>_roll_mean3`` and
    ``<col>_roll_std3``. Positions without enough history are filled with 0.0.

    Args:
        df: pandas DataFrame. If it has no ``date_id`` column it is returned
            unchanged (same object).
        lags: iterable of positive shift sizes. The default is an immutable
            tuple so it cannot be shared and mutated across calls.

    Returns:
        A new DataFrame ordered by ``date_id`` with a fresh 0..n-1 index; the
        input frame is not modified.

    Note:
        Derived column names are assumed not to collide with existing column
        names.
    """
    if "date_id" not in df.columns:
        return df

    df = df.sort_values("date_id").reset_index(drop=True)

    # Collect every derived column first and attach them with one concat:
    # inserting them one at a time fragments the frame's internal blocks.
    derived = {}
    for col in df.columns:
        if col == "date_id":
            continue
        series = df[col]
        for lag in lags:
            derived[f"{col}_lag{lag}"] = series.shift(lag).fillna(0.0)
        window = series.rolling(3)
        derived[f"{col}_roll_mean3"] = window.mean().fillna(0.0)
        derived[f"{col}_roll_std3"] = window.std().fillna(0.0)

    if not derived:
        return df
    return pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

def _stabilize_and_detie_rows(out_df, date_ids=None):
    """Return a finite float32 copy of `out_df` in which no row is constant.

    NaN and +/-inf are replaced by 0.0. Any row whose standard deviation is
    below 1e-15 receives a tiny Gaussian perturbation, scaled by
    ``(1 + |row mean|) * 1e-6`` and seeded from that row's date id, so the
    same input always produces the same output.

    Args:
        out_df: pandas DataFrame of predictions (rows x targets).
        date_ids: optional sequence with one integer per row, used only to
            seed the per-row noise generator. Defaults to all zeros.

    Returns:
        A new float32 DataFrame with the same index and columns as `out_df`.
    """
    out_df = out_df.astype(np.float32)
    # Work on an explicit copy: under pandas Copy-on-Write the array returned
    # by to_numpy() can be a read-only view, so writing into it would raise.
    vals = np.nan_to_num(
        out_df.to_numpy(np.float32, copy=True), nan=0.0, posinf=0.0, neginf=0.0
    )
    n_rows, n_cols = vals.shape
    if date_ids is None:
        date_ids = np.zeros(n_rows, dtype=int)
    # Positional lookup even if a pandas Series with a custom index is passed.
    date_ids = np.asarray(date_ids)

    flat_rows = np.flatnonzero(np.std(vals, axis=1) < 1e-15)
    for r_idx in flat_rows:
        rng = np.random.default_rng(int(date_ids[r_idx]) + 131071)
        noise = rng.normal(loc=0.0, scale=1.0, size=n_cols).astype(np.float32)
        scale = (1.0 + abs(float(np.mean(vals[r_idx])))) * 1e-6
        vals[r_idx] = vals[r_idx] + noise * scale

    return pd.DataFrame(vals, index=out_df.index, columns=out_df.columns)

# =========================
# TNN
# =========================
class DepthwiseSeparableConv1d(nn.Module):
    """Depthwise 1-D convolution followed by a 1x1 pointwise convolution.

    The depthwise stage filters every input channel independently
    (``groups=in_ch``); the pointwise stage mixes channels and maps them to
    `out_ch`. Batch normalisation and GELU are applied to the result.
    """

    def __init__(self, in_ch, out_ch, kernel_size=5):
        super().__init__()
        self.depthwise = nn.Conv1d(
            in_ch,
            in_ch,
            kernel_size=kernel_size,
            groups=in_ch,
            # Half-kernel padding keeps the sequence length for odd kernel sizes.
            padding=kernel_size // 2,
        )
        self.pointwise = nn.Conv1d(in_ch, out_ch, kernel_size=1)
        self.act = nn.GELU()
        self.bn = nn.BatchNorm1d(out_ch)

    def forward(self, x):
        """Apply depthwise -> pointwise -> batch norm -> GELU to `x`."""
        mixed = self.pointwise(self.depthwise(x))
        return self.act(self.bn(mixed))

class TimeAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over the time axis.

    Args:
        d_model: size of the last dimension of the input.
        n_heads: number of attention heads; must divide `d_model` evenly.

    Raises:
        ValueError: if `n_heads` is not positive or does not divide `d_model`.
    """

    def __init__(self, d_model, n_heads=4):
        super().__init__()
        # Fail early with a clear message: otherwise d_k is silently truncated
        # and forward() dies later inside .view() with an opaque shape error.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x):
        """Attend over dimension 1 of `x`, shaped (B, T, D).

        Returns:
            A tuple ``(out, attn)`` where ``out`` has shape (B, T, D) and
            ``attn`` holds the softmax weights with shape (B, n_heads, T, T).
        """
        B, T, D = x.shape

        def split_heads(t):
            # (B, T, D) -> (B, n_heads, T, d_k)
            return t.view(B, T, self.n_heads, self.d_k).transpose(1, 2)

        q = split_heads(self.q_proj(x))
        k = split_heads(self.k_proj(x))
        v = split_heads(self.v_proj(x))

        attn_logits = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)
        attn = torch.softmax(attn_logits, dim=-1)

        # Merge the heads back into a single feature dimension.
        merged = (attn @ v).transpose(1, 2).contiguous().view(B, T, D)
        return self.out(merged), attn

class TNN(nn.Module):
    """Convolution + self-attention network mapping a feature window to all targets.

    Pipeline: depthwise-separable convolution over time, a linear projection,
    multi-head time attention, average pooling over the time axis, and a
    two-layer dense head producing `out_dim` values.

    Args:
        in_features: number of features per time step.
        hidden: channel width used after the convolution.
        lookback: accepted for interface compatibility; not used by the layers.
        out_dim: number of outputs (defaults to CFG.total_targets).
        n_heads: number of attention heads.
    """

    def __init__(self, in_features, hidden, lookback, out_dim=CFG.total_targets, n_heads=4):
        super().__init__()
        half = hidden // 2
        self.kernel = DepthwiseSeparableConv1d(in_ch=in_features, out_ch=hidden)
        self.proj = nn.Linear(hidden, hidden)
        self.attn = TimeAttention(d_model=hidden, n_heads=n_heads)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.head = nn.Sequential(
            nn.Linear(hidden, half),
            nn.GELU(),
            nn.Linear(half, out_dim),
        )

    def forward(self, x):
        """Map `x` of shape (batch, time, features) to (batch, out_dim)."""
        # Conv1d wants channels before time, so swap the last two axes around it.
        conv_out = self.kernel(x.transpose(1, 2)).transpose(1, 2)
        attended, _weights = self.attn(self.proj(conv_out))
        # Average over the time axis, then drop the singleton it leaves behind.
        pooled = self.pool(attended.transpose(1, 2)).squeeze(-1)
        return self.head(pooled)

# =========================
# Stage 1 Training
# =========================
def train_stage1(seed):
    """Train one Stage-1 ensemble member (ExtraTrees plus an optional TNN).

    Reads ``train.csv`` and ``train_labels.csv`` from ``CFG.path``, cleans and
    standardises the features, fits an ExtraTreesRegressor on all targets and,
    when enough rows exist to build sequences, trains a TNN on sliding
    windows of the scaled features.

    Side effects:
        Sets the module-level ``FEATURES`` list on the first call (every
        column of ``train.csv`` except ``date_id``).

    Args:
        seed: random seed used for the RNGs, the ExtraTrees model and the split.

    Returns:
        A tuple ``(etr, tnn, scaler)``; ``tnn`` is None when build_sequences()
        could not produce any window.
    """
    global FEATURES
    _set_seed(seed)
    train = pd.read_csv(os.path.join(CFG.path, "train.csv")).sort_values("date_id")
    train_labels = pd.read_csv(os.path.join(CFG.path, "train_labels.csv"))

    if not FEATURES:
        FEATURES = [c for c in train.columns if c != "date_id"]

    X_train_df = preprocess_columns(train[["date_id"] + FEATURES].copy()).fillna(0.0)

    # Align labels to the (date-sorted) feature rows by date_id instead of by
    # position, so the tree model never sees features paired with another
    # day's labels if the two files differ in order or coverage.
    y_train_df = (
        X_train_df[["date_id"]]
        .merge(train_labels[["date_id"] + CFG.targets], on="date_id", how="left")
        .fillna(CFG.solution_null_filler)
    )

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train_df[FEATURES]).astype(np.float32)
    X_scaled_df = pd.DataFrame(X_scaled, columns=FEATURES)
    X_scaled_df.insert(0, "date_id", X_train_df["date_id"].values)

    etr = ExtraTreesRegressor(
        random_state=seed, n_jobs=-1, n_estimators=384, max_features="sqrt"
    )
    etr.fit(X_scaled_df[FEATURES], y_train_df[CFG.targets])

    lookback = max(2, CFG.tnn_lookback)
    X_seq, y_seq = build_sequences(X_scaled_df, y_train_df, lookback)
    if X_seq is None:
        return etr, None, scaler

    # Chronological split: the last 10% of windows are held out of training.
    # They are not evaluated here, so only the training part is kept.
    X_tr, _, y_tr, _ = train_test_split(
        X_seq, y_seq, test_size=0.1, random_state=seed, shuffle=False
    )
    train_loader = torch.utils.data.DataLoader(
        SeqDataset(X_tr, y_tr), batch_size=128, shuffle=True
    )

    tnn = TNN(X_seq.shape[-1], CFG.tnn_hidden, lookback).to(DEVICE)
    opt = optim.AdamW(tnn.parameters(), lr=2e-3, weight_decay=1e-4)
    crit = nn.SmoothL1Loss()
    for _epoch in range(CFG.tnn_epochs):
        tnn.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            loss = crit(tnn(xb), yb)
            opt.zero_grad()
            loss.backward()
            opt.step()
    # Leave the network in inference mode for the callers that predict with it.
    tnn.eval()

    return etr, tnn, scaler

def build_sequences(df_feat, df_lbl, lookback):
    """Turn date-ordered features and labels into sliding windows.

    Both frames are sorted by ``date_id``. Window ``t`` contains the
    ``lookback`` feature rows ending at row ``t`` and is paired with the
    label row ``t``.

    Args:
        df_feat: DataFrame holding ``date_id`` and the FEATURES columns.
        df_lbl: DataFrame holding ``date_id`` and the CFG.targets columns.
        lookback: number of rows per window.

    Returns:
        ``(X_seq, y_seq)`` as float32 arrays, or ``(None, None)`` when
        `df_feat` has no ``date_id`` column or fewer than ``lookback + 1`` rows.
    """
    if "date_id" not in df_feat.columns:
        return None, None

    feats_sorted = df_feat.sort_values("date_id").reset_index(drop=True)
    labels_sorted = df_lbl.sort_values("date_id").reset_index(drop=True)
    X_all = feats_sorted[FEATURES].to_numpy(np.float32)
    y_all = labels_sorted[CFG.targets].to_numpy(np.float32)

    n_rows = len(X_all)
    if n_rows < lookback + 1:
        return None, None

    # Index of the last row of each window.
    ends = range(lookback - 1, n_rows)
    windows = [X_all[end - lookback + 1:end + 1] for end in ends]
    labels = [y_all[end] for end in ends]
    return np.stack(windows, axis=0), np.stack(labels, axis=0)

class SeqDataset(torch.utils.data.Dataset):
    """Torch dataset pairing each feature window with its label row.

    Both arrays are stored as float32 copies; items are returned as tensors.
    """

    def __init__(self, X, y):
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        window = torch.from_numpy(self.X[idx])
        target = torch.from_numpy(self.y[idx])
        return window, target

# =========================
# Stage 2 Training
# =========================
def train_stage2(stage1_preds, X_scaled, y_true):
    """Fit one LightGBM residual model per target.

    Each model is trained on the feature matrix extended with every Stage-1
    prediction column and learns ``y_true - stage1_preds`` for its target.

    Args:
        stage1_preds: array (rows x CFG.total_targets) of Stage-1 forecasts.
        X_scaled: feature matrix with one column per entry of FEATURES. It is
            used exactly as supplied, so inference must present features on
            the same scale as the matrix passed here.
        y_true: array (rows x CFG.total_targets) of true labels.

    Returns:
        A list of CFG.total_targets trained LightGBM models, in target order.
    """
    residuals = y_true - stage1_preds

    base = pd.DataFrame(X_scaled.astype(np.float32), columns=FEATURES)
    stacked = pd.DataFrame(
        {f"stage1_pred_{i}": stage1_preds[:, i] for i in range(CFG.total_targets)},
        index=base.index,
    )
    # One concat instead of hundreds of single-column inserts, which would
    # fragment the frame's internal blocks.
    train_data = pd.concat([base, stacked], axis=1)

    models = []
    for i in range(CFG.total_targets):
        dtrain = lgb.Dataset(train_data, label=residuals[:, i].astype(np.float32))
        models.append(lgb.train(CFG.stage2_params, dtrain))
    return models

# =========================
# Training Pipeline
# =========================
def _stage1_member_predict(etr, tnn, X_scaled, lookback):
    """Return one Stage-1 member's forecast for every row of `X_scaled`.

    Rows that have a full `lookback` window get the average of the ExtraTrees
    and TNN forecasts. Earlier rows (and every row when there is no TNN or
    too few rows) keep the ExtraTrees forecast alone rather than being
    averaged with zeros, which would halve it.
    """
    preds = etr.predict(X_scaled).astype(np.float32)
    n_rows = len(X_scaled)
    if tnn is None or n_rows < lookback:
        return preds

    first = lookback - 1  # first row index that has a complete window
    windows = np.stack(
        [X_scaled[t - first:t + 1] for t in range(first, n_rows)], axis=0
    ).astype(np.float32)
    with torch.no_grad():
        preds_tnn = tnn(torch.from_numpy(windows).to(DEVICE)).cpu().numpy().astype(np.float32)
    preds[first:] = (preds[first:] + preds_tnn) / 2
    return preds


def _stage1_ensemble_predict(feature_df):
    """Average the forecasts of every trained Stage-1 member for `feature_df`."""
    # Same window length train_stage1() uses when it builds the TNN sequences.
    lookback = max(2, CFG.tnn_lookback)
    member_preds = [
        _stage1_member_predict(
            etr, tnn, scaler.transform(feature_df).astype(np.float32), lookback
        )
        for etr, tnn, scaler in STAGE1_MODELS
    ]
    return np.mean(member_preds, axis=0).astype(np.float32)


def _stage2_frame(raw_features, stage1_preds):
    """Build the Stage-2 input: unscaled features followed by stage1_pred_* columns."""
    base = pd.DataFrame(np.asarray(raw_features, dtype=np.float32), columns=FEATURES)
    stacked = pd.DataFrame(
        {f"stage1_pred_{i}": stage1_preds[:, i] for i in range(CFG.total_targets)},
        index=base.index,
    )
    return pd.concat([base, stacked], axis=1)


def _stage2_correction(frame):
    """Return the per-target residual corrections predicted by the Stage-2 models."""
    return np.vstack([m.predict(frame) for m in STAGE2_MODEL]).T.astype(np.float32)


def load_and_train_model():
    """Train the Stage-1 ensemble and the Stage-2 residual models.

    Populates the module-level ``STAGE1_MODELS`` and ``STAGE2_MODEL``, prints
    RMSE/MAE of the Stage-1 and of the corrected predictions on the training
    rows (in-sample figures), and finally sets ``MODEL_READY``.
    """
    global STAGE1_MODELS, STAGE2_MODEL, MODEL_READY
    STAGE1_MODELS = []
    for s in range(CFG.n_stage1_seeds):
        STAGE1_MODELS.append(train_stage1(seed=CFG.seed + s))

    train = pd.read_csv(os.path.join(CFG.path, "train.csv")).sort_values("date_id")
    train_labels = pd.read_csv(os.path.join(CFG.path, "train_labels.csv"))
    X_train_df = preprocess_columns(train[["date_id"] + FEATURES].copy()).fillna(0.0)
    # Pair labels with the date-sorted feature rows by date_id, not by position.
    y_train = (
        X_train_df[["date_id"]]
        .merge(train_labels[["date_id"] + CFG.targets], on="date_id", how="left")[CFG.targets]
        .fillna(CFG.solution_null_filler)
        .to_numpy(np.float32)
    )
    feature_df = X_train_df[FEATURES]

    stage1_preds = _stage1_ensemble_predict(feature_df)

    rmse = np.sqrt(mean_squared_error(y_train, stage1_preds))
    mae = mean_absolute_error(y_train, stage1_preds)
    print(f"[Stage 1] RMSE={rmse:.5f} MAE={mae:.5f}")

    # Stage 2 is trained on the cleaned, UNSCALED features; predict() must
    # therefore present unscaled features as well.
    STAGE2_MODEL = train_stage2(stage1_preds, feature_df.to_numpy(), y_train)

    frame = _stage2_frame(feature_df.to_numpy(np.float32), stage1_preds)
    final_preds = (stage1_preds + _stage2_correction(frame)).astype(np.float32)

    rmse2 = np.sqrt(mean_squared_error(y_train, final_preds))
    mae2 = mean_absolute_error(y_train, final_preds)
    print(f"[Stage 2] RMSE={rmse2:.5f} MAE={mae2:.5f}")

    MODEL_READY = True

# =========================
# Predict
# =========================
def predict(test: pl.DataFrame, *args):
    """Return Stage-1 + Stage-2 predictions for the rows of `test`.

    Trains the whole pipeline on first use. Extra positional arguments are
    accepted and ignored.

    Args:
        test: polars DataFrame with the feature columns (and usually ``date_id``).

    Returns:
        A float32 pandas DataFrame with one column per entry of CFG.targets.
        When `test` has a ``date_id`` column the rows are in ascending
        ``date_id`` order, because preprocessing sorts by it.
    """
    if not MODEL_READY:
        load_and_train_model()

    test_pd = preprocess_columns(test.to_pandas()).fillna(0.0)
    # Read date ids AFTER preprocessing: it may reorder rows, and the ids seed
    # the per-row de-tie noise, so they must line up with the output rows.
    date_ids = test_pd["date_id"].values if "date_id" in test_pd.columns else None

    # Features missing from the batch are treated as 0.0 instead of raising.
    feature_df = test_pd.reindex(columns=FEATURES, fill_value=0.0).astype(np.float32)

    stage1_pred = _stage1_ensemble_predict(feature_df)

    # Stage-2 models were fitted on unscaled features, so feed them the same
    # representation here (scaled inputs would hit the wrong split thresholds).
    frame = _stage2_frame(feature_df.to_numpy(np.float32), stage1_pred)
    final_pred = (stage1_pred + _stage2_correction(frame)).astype(np.float32)

    out = pd.DataFrame(final_pred, columns=CFG.targets).astype(np.float32)
    if date_ids is None:
        date_ids = np.zeros(len(out), dtype=int)
    return _stabilize_and_detie_rows(out, date_ids)

# =========================
# Inference Server
# =========================
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # Competition rerun: serve prediction requests.
    inference_server.serve()
else:
    # Otherwise run the local gateway against the files under CFG.path.
    inference_server.run_local_gateway((CFG.path,))
[Stage 1] RMSE=0.01493 MAE=0.00960
[Stage 2] RMSE=0.00106 MAE=0.00013

