In [None]:

# ==== Cell 1: Imports & Config ==== #
import os
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import kaggle_evaluation.mitsui_inference_server

# ==== Global Config ==== #
# Number of target columns emitted per prediction row. `predict` uses it to
# size the prediction array and to name the output columns
# target_0 .. target_{NUM_TARGET_COLUMNS - 1}.
NUM_TARGET_COLUMNS = 424

def get_data_path():
    """
    Locate the competition data directory.

    Returns the Kaggle input mount when that directory exists on disk,
    otherwise the relative local folder "dataset".
    """
    candidates = (Path('/kaggle/input/mitsui-commodity-prediction-challenge'),)
    for candidate in candidates:
        if candidate.exists():
            return candidate
    # Not running on Kaggle: fall back to the local checkout layout.
    return Path("dataset")

# Resolved once when this cell runs; the final cell passes it to the local
# gateway when the notebook is not executing as a competition rerun.
data_path = get_data_path()


In [None]:

# ==== Cell 2: Feature Engineering (Fusion) ==== #
def create_advanced_features(df, feature_cols, enable_heavy_features=True):
    """
    Create advanced rolling/statistical features.

    For every column in ``feature_cols`` (except ``'date_id'``) the following
    columns are appended, computed over the frame's existing row order:

    * ``{col}_rolling_mean_{w}`` / ``{col}_rolling_std_{w}`` for w in 3, 5, 10, 20
    * ``{col}_lag_{k}`` for k in 1, 2, 3
    * ``{col}_annual_vol_20`` (20-row rolling std scaled by sqrt(252))

    When ``enable_heavy_features`` is true it additionally appends
    ``{col}_rolling_skew_10``, ``{col}_rolling_kurt_10``, ``{col}_autocorr_1``,
    ``{col}_autocorr_5`` (20-row windows), ``{col}_regime_trend_up`` and
    ``{col}_regime_high_vol``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a new frame is returned.
    feature_cols : iterable of str
        Names of columns in ``df`` to derive features from.
    enable_heavy_features : bool, default True
        Toggle for the slower skew / kurtosis / autocorrelation / regime block.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the generated ones, with every NaN
        (including NaNs in the original columns) replaced by 0.
    """
    # Features are collected here and attached with one concat at the end.
    # Inserting columns one at a time fragments the frame and triggers pandas'
    # PerformanceWarning on wide inputs.
    generated = {}

    for col in feature_cols:
        if col == 'date_id':
            continue

        series = df[col]
        roll_means = {}
        roll_stds = {}

        # Rolling stats
        for win in [3, 5, 10, 20]:
            window = series.rolling(win)
            roll_means[win] = window.mean()
            roll_stds[win] = window.std()
            generated[f'{col}_rolling_mean_{win}'] = roll_means[win]
            generated[f'{col}_rolling_std_{win}'] = roll_stds[win]

        # Lag features
        for lag in [1, 2, 3]:
            generated[f'{col}_lag_{lag}'] = series.shift(lag)

        # Annualized vol (reuses the 20-row rolling std computed above)
        generated[f'{col}_annual_vol_20'] = roll_stds[20] * np.sqrt(252)

        if enable_heavy_features:
            # Skew & Kurtosis
            window_10 = series.rolling(10)
            generated[f'{col}_rolling_skew_10'] = window_10.skew()
            generated[f'{col}_rolling_kurt_10'] = window_10.kurt()

            # Autocorr: raw=False so each window arrives as a Series, which
            # is what provides .autocorr().
            window_20 = series.rolling(20)
            generated[f'{col}_autocorr_1'] = window_20.apply(lambda x: x.autocorr(lag=1), raw=False)
            generated[f'{col}_autocorr_5'] = window_20.apply(lambda x: x.autocorr(lag=5), raw=False)

            # Regime flags (reuse the 10-row rolling mean/std). Comparisons
            # against NaN are False, so warm-up rows become 0.
            roll_mean = roll_means[10]
            roll_vol = roll_stds[10]
            generated[f'{col}_regime_trend_up'] = (roll_mean > 0).astype(int)
            # The threshold is the 75th percentile of the whole column's
            # rolling vol, i.e. it is computed over all rows of ``df``.
            generated[f'{col}_regime_high_vol'] = (roll_vol > roll_vol.quantile(0.75)).astype(int)

    if not generated:
        # Nothing to add (empty feature list or only 'date_id').
        return df.fillna(0)

    features = pd.concat(list(generated.values()), axis=1, keys=list(generated.keys()))
    result = pd.concat([df, features], axis=1)

    if result.columns.has_duplicates:
        # A generated name collided with an existing column: the generated
        # feature wins, as it did with in-place column assignment.
        result = result.loc[:, ~result.columns.duplicated(keep='last')]

    return result.fillna(0)


In [None]:

# ==== Cell 3: Stabilization (from Code 1) ==== #
def _stabilize_and_detie_rows(out_df, date_ids=None):
    """Ensure no flat rows in predictions, add small noise if needed."""
    out_df = out_df.astype(np.float32)
    out_df[:] = np.nan_to_num(out_df.values, nan=0.0, posinf=0.0, neginf=0.0)
    n_rows, n_cols = out_df.shape
    if date_ids is None:
        date_ids = np.zeros(n_rows, dtype=int)
    vals = out_df.to_numpy(np.float32)
    row_stds = np.std(vals, axis=1)
    flat_mask = row_stds < 1e-15
    if np.any(flat_mask):
        for r_idx in np.where(flat_mask)[0]:
            rng = np.random.default_rng(int(date_ids[r_idx]) + 131071)
            noise = rng.normal(loc=0.0, scale=1.0, size=n_cols).astype(np.float32)
            scale = (1.0 + abs(float(np.mean(vals[r_idx])))) * 1e-6
            vals[r_idx] = vals[r_idx] + noise * scale
        out_df.iloc[:, :] = vals
    return out_df


In [None]:

# ==== Cell 4: Model Training ==== #
def train_stacking_model(X, y):
    """
    Fit and return a stacked regressor.

    Two base learners -- an ordinary least-squares regression ('lr') and a
    LightGBM regressor ('lgb') -- are combined by a linear-regression
    meta-learner using 3-fold internal cross-validation.
    """
    linear_base = LinearRegression()
    boosted_base = lgb.LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=64,
        random_state=42,
    )

    stacked = StackingRegressor(
        estimators=[('lr', linear_base), ('lgb', boosted_base)],
        final_estimator=LinearRegression(),
        cv=3,
    )
    stacked.fit(X, y)
    return stacked


In [None]:

# ==== Cell 5: Evaluation ==== #
def evaluate_model(model, X, y):
    """
    Score a fitted model and report cross-validated error.

    Computes RMSE and MAE of ``model.predict(X)`` against ``y``, then runs a
    3-fold (unshuffled) cross-validation scored by negative MAE, prints both
    summaries and returns ``(rmse, mae, cv_scores)``. ``cv_scores`` holds the
    raw negative-MAE values, one per fold.
    """
    predictions = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    mae = mean_absolute_error(y, predictions)

    # Contiguous folds in row order (no shuffling).
    splitter = KFold(n_splits=3, shuffle=False)
    cv_scores = cross_val_score(
        model,
        X,
        y,
        cv=splitter,
        scoring="neg_mean_absolute_error",
    )

    print(f"RMSE: {rmse:.5f}, MAE: {mae:.5f}")
    print(f"CV MAE: {-np.mean(cv_scores):.5f} (+/- {np.std(cv_scores):.5f})")
    return rmse, mae, cv_scores


In [None]:

# ==== Cell 6: Kaggle Integration ==== #
def predict(
    test: pl.DataFrame,
    label_lags_1_batch: pl.DataFrame,
    label_lags_2_batch: pl.DataFrame,
    label_lags_3_batch: pl.DataFrame,
    label_lags_4_batch: pl.DataFrame,
) -> pl.DataFrame:
    """
    Kaggle predict hook: preprocess, feature engineering, predict, stabilize.

    Parameters
    ----------
    test : pl.DataFrame
        Feature batch handed over by the inference server.
    label_lags_1_batch .. label_lags_4_batch : pl.DataFrame
        Lagged-label batches supplied by the server; currently unused.

    Returns
    -------
    pl.DataFrame
        A single row with NUM_TARGET_COLUMNS float64 columns named
        ``target_0`` .. ``target_{NUM_TARGET_COLUMNS - 1}``.

    Notes
    -----
    The model step is still a placeholder that emits zeros. The row is passed
    through ``_stabilize_and_detie_rows`` so that the returned predictions are
    not all identical.
    """
    test_df = test.to_pandas()
    feature_cols = [c for c in test_df.columns if c != "date_id"]
    test_df = create_advanced_features(test_df, feature_cols)

    # TODO: load the pre-trained stacking models (one per target) and predict
    # from X_test; until then the predictions are a zero placeholder.
    # errors="ignore" keeps the hook alive if a batch arrives without date_id.
    X_test = test_df.drop(columns=["date_id"], errors="ignore")
    preds = np.zeros((1, NUM_TARGET_COLUMNS))  # placeholder
    out_df = pd.DataFrame(preds, columns=[f"target_{i}" for i in range(NUM_TARGET_COLUMNS)])

    # Seed the de-tie noise with the batch's date_id so the perturbation is
    # reproducible per date; fall back to the helper's default otherwise.
    date_ids = None
    if "date_id" in test_df.columns and len(test_df) > 0:
        date_ids = test_df["date_id"].to_numpy()[:len(out_df)]

    # The helper returns float32; cast back so the output dtype is unchanged.
    out_df = _stabilize_and_detie_rows(out_df, date_ids).astype(np.float64)

    return pl.DataFrame(out_df)

# Kaggle Inference Server: wraps `predict` so the evaluation gateway can call it.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    # Variable unset or empty: exercise the hook through the local gateway
    # against the files under data_path.
    inference_server.run_local_gateway((str(data_path),))
else:
    # Variable set: hand control to the server.
    inference_server.serve()
