In [108]:
# ================================================================
# 🧠 HULL TACTICAL MARKET PREDICTION — ENSEMBLE + SHARPEPENALTY
# ================================================================
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow warnings
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use only first GPU if multiple

from pathlib import Path
import numpy as np
import pandas as pd

import polars as pl
from typing import Dict 

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from scipy.stats import zscore
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingRegressor

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Import TensorFlow after setting environment variables
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Only show errors
from tensorflow import keras
from tensorflow.keras import layers

# Try to import kaggle_evaluation, handle if not available
try:
    import kaggle_evaluation.default_inference_server as kdeval
    KAGGLE_ENV = True
    print("Running in Kaggle competition environment")
except ImportError:
    KAGGLE_ENV = False
    print("Running in local environment - kaggle_evaluation not available")

Running in local environment - kaggle_evaluation not available


In [109]:
# ================================================================
# Data Loading & Initial Feature Preparation
# ================================================================

# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading
DATA_DIR = Path("01_data")
TARGET = "market_forward_excess_returns"
drop_cols = ["date_id", "forward_returns", "risk_free_rate"]
VOL_WINDOW = 20        # volatility window in days
VALIDATION_SIZE = 2700          # days, approx. 30% of data

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700):
    """Split data chronologically for time series validation."""
    df = df.sort_values('date_id').reset_index(drop=True)
    train_df = df.iloc[:-val_size].copy()
    val_df   = df.iloc[-val_size:].copy()
    return train_df, val_df

# Load train/test data using the KAGGLE_ENV variable from cell 1
if KAGGLE_ENV:
    print("Loading data from Kaggle environment")
    DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')
    train = pd.read_csv(DATA_DIR / "train.csv")
    test = pd.read_csv(DATA_DIR / "test.csv")
else:
    print("Loading data from local environment")
    # Try different possible local paths
    local_paths = [
        DATA_DIR / "train.csv",
        Path("01_data/train.csv"),
        Path("train.csv")
    ]
    
    train_path = None
    test_path = None
    
    for path in local_paths:
        if path.exists():
            train_path = path
            test_path = path.parent / "test.csv"
            break
    
    if train_path is None or not test_path.exists():
        raise FileNotFoundError("❌ Could not find train.csv and test.csv files in expected locations")
    
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

print(f"Data loaded successfully")
print(f"Train shape: {train.shape} | Test shape: {test.shape}")

# Basic preprocessing
train = train.sort_values("date_id").reset_index(drop=True)
test = test.sort_values("date_id").reset_index(drop=True)

# Handle missing values
train = train.fillna(0.0)
test = test.fillna(0.0)

# Base features (before advanced transformations)
base_features = [c for c in train.columns if c not in drop_cols + [TARGET]]

print(f"Base features available: {len(base_features)}")
print(f"Target variable: {TARGET}")

Loading data from local environment
Data loaded successfully
Train shape: (8990, 98) | Test shape: (10, 99)
Base features available: 94
Target variable: market_forward_excess_returns


In [110]:
def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    """
    Clean and prepare DataFrame by handling missing values intelligently.
    
    Strategy:
    - Use median imputation for numeric columns with some missing values
    - Use zero-fill for columns with very few missing values  
    - Only process existing columns (no synthetic data creation)

    Args:
    df: Input DataFrame
    median_map: Dictionary mapping column names to median values
    feature_cols: List of feature column names to process

    Returns:
    Cleaned DataFrame

    Median is much less sensitive to extreme values (outliers)
    Mean can be heavily skewed by a few very large or very small values
    """
    df = df.copy()
    
    # Only work with columns that actually exist in the DataFrame
    existing_cols = [col for col in feature_cols if col in df.columns]
    
    if not existing_cols:
        print("Warning: No feature columns found in DataFrame")
        return df
    
    # Calculate missing percentages for existing columns
    missing_pct = (df[existing_cols].isnull().sum() / len(df)) * 100
    
    # Categorize columns by missing percentage
    cols_fill_median = missing_pct[(missing_pct > 5) & (missing_pct <= 50)].index.tolist()
    cols_fill_zero = missing_pct[missing_pct <= 5].index.tolist()
    
    # Apply median imputation for moderately missing columns
    if cols_fill_median:
        for col in cols_fill_median:
            median_val = median_map.get(col, df[col].median())
            if pd.isna(median_val):  # Handle case where median is NaN
                median_val = 0.0
            df[col] = df[col].fillna(median_val)
    
    # Apply zero-fill for low missing columns
    if cols_fill_zero:
        df[cols_fill_zero] = df[cols_fill_zero].fillna(0)
    
    # Ensure all feature columns are numeric
    for col in existing_cols:
        if df[col].dtype == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Final cleanup - ensure no inf values
    df[existing_cols] = df[existing_cols].replace([np.inf, -np.inf], 0)
    
    return df

In [111]:
## Train / Validation Split and Median Imputation
train_df, val_df = time_split_train_val(train, val_size=VALIDATION_SIZE)
print(f"Data split: Train {train_df.shape[0]} | Validation {val_df.shape[0]} rows")

# Create median map from training portion only
median_map = {}
for c in base_features:
    if c in train_df.columns:
        if train_df[c].dtype.kind in 'fiu':  # numeric types
            median_val = train_df[c].median(skipna=True)
            median_map[c] = float(median_val) if not pd.isna(median_val) else 0.0
        else:
            median_map[c] = 0.0
    else:
        median_map[c] = 0.0

# Apply preprocessing to all splits
train_full = prepare_df(train_df, median_map, base_features)
val_full   = prepare_df(val_df, median_map, base_features)
test_full  = prepare_df(test, median_map, base_features)

# Extract only the base features (remove drop_cols and target)
final_features = [c for c in base_features if c in train_full.columns]
train_p = train_full[final_features].copy()
val_p   = val_full[final_features].copy()
test_p  = test_full[final_features].copy()

# Keep target and other columns separate for later use
train_target = train_full[TARGET].copy()
val_target   = val_full[TARGET].copy()

# Validation check
if not final_features:
    raise ValueError("No features available after preprocessing!")

print(f"Preprocessing complete")
print(f"Number of base features: {len(final_features)}")
print(f"Base features available: {final_features[:10]}..." if len(final_features) > 10 else f"Features: {final_features}")

print(f"Target variable '{TARGET}' extracted separately")

# For feature engineering, we need to combine features with target
train_for_engineering = train_p.copy()
train_for_engineering[TARGET] = train_target

Data split: Train 6290 | Validation 2700 rows
Preprocessing complete
Number of base features: 94
Base features available: ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1']...
Target variable 'market_forward_excess_returns' extracted separately


In [112]:
train_p.columns

Index(['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10',
       'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2',
       'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4',
       'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
       'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8',
       'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6',
       'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5',
       'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3',
       'V4', 'V5', 'V6', 'V7', 'V8', 'V9'],
      dtype='object')

In [113]:
train_for_engineering.columns

Index(['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10',
       'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2',
       'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4',
       'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
       'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8',
       'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6',
       'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5',
       'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3',
       'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'market_forward_excess_returns'],
      dtype='object')

In [114]:
final_features 

['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'E1',
 'E10',
 'E11',
 'E12',
 'E13',
 'E14',
 'E15',
 'E16',
 'E17',
 'E18',
 'E19',
 'E2',
 'E20',
 'E3',
 'E4',
 'E5',
 'E6',
 'E7',
 'E8',
 'E9',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'M1',
 'M10',
 'M11',
 'M12',
 'M13',
 'M14',
 'M15',
 'M16',
 'M17',
 'M18',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P1',
 'P10',
 'P11',
 'P12',
 'P13',
 'P2',
 'P3',
 'P4',
 'P5',
 'P6',
 'P7',
 'P8',
 'P9',
 'S1',
 'S10',
 'S11',
 'S12',
 'S2',
 'S3',
 'S4',
 'S5',
 'S6',
 'S7',
 'S8',
 'S9',
 'V1',
 'V10',
 'V11',
 'V12',
 'V13',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9']

In [115]:
# count of final features = len(final_features)
final_selected_features = [c for c in final_features if c in train_p.columns]
len(final_selected_features)

94

In [116]:
# ===== 🔧 Advanced Feature Factory (Enhanced) =====
def create_advanced_features(df,
                             top_features,
                             macro_prefixes=('mom','m','v','p','s'),
                             window_sizes=(5,10,20,60,120),
                             max_features_to_keep=50,
                             shift=1,  # Added shift parameter
                             inplace=False):
    """
    Create advanced features following a two-level approach:
      1) Lightweight Core Features (applied to `top_features`)
      2) Macro-Context Features (applied to columns starting with macro_prefixes)
    
    Args:
        df: Input DataFrame
        top_features: List of most important features for Level 1 processing
        macro_prefixes: Tuple of prefixes for Level 2 features
        window_sizes: Rolling window sizes
        max_features_to_keep: Maximum features to select
        shift: Number of periods to shift for avoiding data leakage
        inplace: Whether to modify DataFrame in place
    
    Returns:
        df_out: DataFrame with new features (and original columns)
        selected_features: list of top selected feature column names
    """
    if not inplace:
        df = df.copy()

    # Ensure datetime-like ordering by date_id if present
    if 'date_id' in df.columns:
        df = df.sort_values('date_id').reset_index(drop=True)

    # Helper: ensure numeric dtype for selected cols
    def _to_numeric(cols):
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0.0)

    # ------------- Level 1: Core Features (top_features) -------------
    def create_rolling_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling statistics with proper shift to avoid leakage"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll = shifted_col.rolling(window=w, min_periods=1)
                
                df[f"{c}_mean_{w}"] = roll.mean().astype('float32')
                df[f"{c}_std_{w}"] = roll.std().astype('float32').fillna(0.0)
                df[f"{c}_median_{w}"] = roll.median().astype('float32')

    def create_zscore_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling z-scores with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll_mean = shifted_col.rolling(window=w, min_periods=1).mean()
                roll_std = shifted_col.rolling(window=w, min_periods=1).std().fillna(0.0)
                
                df[f"{c}_z_{w}"] = ((df[c] - roll_mean) / (roll_std + 1e-9)).astype('float32')

    def create_spread_features(cols, shift_periods=shift):
        """Create spread and percentage change features"""
        for c in cols:
            if c not in df.columns:
                continue
            # Use proper shift for difference calculations
            df[f"{c}_diff_1"] = (df[c] - df[c].shift(shift_periods)).astype('float32')
            df[f"{c}_pctchg_1"] = (df[c].pct_change(periods=shift_periods).fillna(0.0)).astype('float32')

    # ------------- Level 2: Macro Features (selective) -------------
    def create_cumulative_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create cumulative sums with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                df[f"{c}_cumsum_{w}"] = shifted_col.rolling(window=w, min_periods=1).sum().astype('float32')

    def create_correlation_features(pairs=None, window=30, shift_periods=shift):
        """Create rolling correlations with proper shift"""
        if pairs is None:
            # Build pairs from top_features (limit to avoid explosion)
            cand = []
            for i in range(len(top_features)):
                for j in range(i+1, len(top_features)):
                    cand.append((top_features[i], top_features[j]))
            pairs = cand[:10]  # Limit to 10 pairs
        
        for a, b in pairs:
            if a not in df.columns or b not in df.columns:
                continue
            # Apply shift to both series
            a_shifted = df[a].shift(shift_periods)
            b_shifted = df[b].shift(shift_periods)
            corr = a_shifted.rolling(window=window, min_periods=1).corr(b_shifted)
            df[f"corr_{a}_{b}_{window}"] = corr.astype('float32').fillna(0.0)

    def create_volatility_features(cols=None, windows=(20,60), shift_periods=shift):
        """Create volatility spread features with proper shift"""
        if cols is None:
            cols = [c for c in df.columns if c.startswith('v')]
        
        # Limit to prevent feature explosion
        cols = cols[:8]
        
        for w in windows:
            vols = {}
            for c in cols:
                if c in df.columns:
                    shifted_col = df[c].shift(shift_periods)
                    vols[c] = shifted_col.rolling(window=w, min_periods=1).std().astype('float32').fillna(0.0)
            
            # Create spread between consecutive volatilities
            vol_keys = list(vols.keys())
            for i in range(len(vol_keys) - 1):
                a, b = vol_keys[i], vol_keys[i + 1]
                df[f"volspread_{a}_{b}_{w}"] = (vols[a] - vols[b]).astype('float32')

    def create_extremes_features(cols, windows=(20,60,120), shift_periods=shift):
        """Create high/low ratio features with proper shift"""
        # Limit columns to prevent explosion
        cols = [c for c in cols if c in df.columns][:10]
        
        for c in cols:
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                roll_max = shifted_col.rolling(window=w, min_periods=1).max()
                roll_min = shifted_col.rolling(window=w, min_periods=1).min()
                
                df[f"{c}_high_ratio_{w}"] = (df[c] / (roll_max + 1e-9)).astype('float32')
                df[f"{c}_low_ratio_{w}"] = (df[c] / (roll_min + 1e-9)).astype('float32')

    # Execute feature creation
    print("🔧 Creating Level 1 features (Core)...")
    _to_numeric(top_features)
    create_rolling_features(top_features)
    create_zscore_features(top_features)
    create_spread_features(top_features)

    print("🔧 Creating Level 2 features (Macro)...")
    macro_cols = [c for c in df.columns if any(c.startswith(pref) for pref in macro_prefixes)]
    _to_numeric(macro_cols)
    
    create_cumulative_features(macro_cols, windows=(5,10,20))
    create_correlation_features(window=30)
    create_volatility_features(windows=(20,60))
    create_extremes_features([c for c in df.columns if c.startswith(('m','p'))], windows=(20,60,120))

    # Clean data
    print("🧹 Cleaning and selecting features...")
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0.0, inplace=True)

    # Downcast to save memory
    float_cols = df.select_dtypes(include=['float64']).columns
    if len(float_cols) > 0:
        df[float_cols] = df[float_cols].astype('float32')

    # Feature selection
    def select_top_features(df_in, target_col='market_forward_excess_returns', top_k=max_features_to_keep):
        """Select top features using tree-based importance"""
        X_full = df_in.drop(columns=[target_col]) if target_col in df_in.columns else df_in.copy()
        X_num = X_full.select_dtypes(include=[np.number]).copy()
        
        # Remove zero variance features
        vt = VarianceThreshold(threshold=1e-6)
        try:
            vt.fit(X_num)
            cols_kept = X_num.columns[vt.get_support()].tolist()
        except Exception:
            cols_kept = X_num.columns.tolist()

        X_sel = X_num[cols_kept].copy()
        y_sel = df_in[target_col].values if target_col in df_in.columns else None

        # Use tree-based importance if target available
        if y_sel is not None and len(y_sel) == len(X_sel) and len(np.unique(y_sel)) > 1:
            try:
                gb = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
                gb.fit(X_sel, y_sel)
                imp = pd.Series(gb.feature_importances_, index=X_sel.columns).sort_values(ascending=False)
                top_feats = imp.head(top_k).index.tolist()
            except Exception:
                # Fallback to variance-based selection
                var_series = X_sel.var().sort_values(ascending=False)
                top_feats = var_series.head(top_k).index.tolist()
        else:
            # Fallback to variance-based selection
            var_series = X_sel.var().sort_values(ascending=False)
            top_feats = var_series.head(top_k).index.tolist()

        return top_feats

    selected = select_top_features(df, target_col='market_forward_excess_returns', top_k=max_features_to_keep)

    # Final DataFrame
    keep_cols = ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
    keep_cols = [c for c in keep_cols if c in df.columns] + selected
    df_out = df[keep_cols].copy()

    print(f"✅ Feature engineering complete. Selected {len(selected)} features.")
    return df_out, selected

In [117]:
# To cancel when the modifications in the next cell are solved

# # ===== Feature Engineering & Data Preparation =====

# # Top features identified (Level 1 - Core)
# top_features = ['m4', 'v13', 'm11', 's2', 'd4', 'd1', 'd2', 'e8', 'p6', 'm2', 
#                 'd8', 'm9', 'p8', 'p7', 's12', 'p13', 'v9', 'd5', 'p1', 's8']

# print("🔧 Creating advanced features for training data...")
# # Create engineered features and select the top ones
# train_enh, selected_features = create_advanced_features(
#     train,
#     top_features=top_features,
#     window_sizes=(5, 10, 20, 60, 120),   # 1w, 2w, 1m, 3m, 6m
#     max_features_to_keep=50,
#     shift=1  # Explicitly set shift to avoid data leakage
# )

# print("🔧 Applying same transformation to test data...")
# # Apply the same transformation to test set (use same selected features)
# test_enh, _ = create_advanced_features(
#     test,
#     top_features=top_features,
#     window_sizes=(5, 10, 20, 60, 120),
#     max_features_to_keep=50,
#     shift=1
# )

# # Ensure test set has same features as training set
# missing_features = [f for f in selected_features if f not in test_enh.columns]
# if missing_features:
#     print(f"Adding missing features to test set: {len(missing_features)}")
#     for feature in missing_features:
#         test_enh[feature] = 0.0  # Fill missing features with 0

# # Final feature matrices
# X = train_enh[selected_features].astype('float32')
# y = train_enh[TARGET].astype('float32')
# X_test = test_enh[selected_features].astype('float32')

# print(f"\nData preparation complete!")
# print(f"Final selected features: {len(selected_features)}")
# print(f"Train shape: {X.shape}")
# print(f"Test shape: {X_test.shape}")
# print(f"Target shape: {y.shape}")
# print(f"Selected features preview: {selected_features[:10]}...")

In [118]:
# Feature Engineering & Data Preparation

# Top features identified (Level 1 - Core)
top_features = ['m4', 'v13', 'm11', 's2', 'd4', 'd1', 'd2', 'e8', 'p6', 'm2', 
                'd8', 'm9', 'p8', 'p7', 's12', 'p13', 'v9', 'd5', 'p1', 's8']

print("Creating advanced features for training data only...")

# Apply feature engineering only to training data (features + target)
train_enh, selected_features = create_advanced_features(
    train_for_engineering,  # This contains only final_features + TARGET
    top_features=top_features,
    window_sizes=(5, 10, 20, 60, 120),   # 1w, 2w, 1m, 3m, 6m
    max_features_to_keep=50,
    shift=1  # Explicitly set shift to avoid data leakage
)

# Separate engineered features from original base features
original_features_in_selection = [f for f in selected_features if f in final_features]
new_engineered_features = [f for f in selected_features if f not in final_features]

print(f"\nFeature Engineering Results:")
print(f"Original base features available: {len(final_features)}")
print(f"Original features selected: {len(original_features_in_selection)}")
print(f"New engineered features created: {len(new_engineered_features)}")
print(f"Total features for modeling: {len(selected_features)}")

print(f"\nNew engineered features added:")
for i, feat in enumerate(new_engineered_features, 1):
    print(f"{i:2d}. {feat}")

print(f"\nAll 50 selected features:")
for i, feat in enumerate(selected_features, 1):
    feat_type = "ORIGINAL" if feat in final_features else "ENGINEERED"
    print(f"{i:2d}. {feat:<25} [{feat_type}]")

# Final feature matrices - only use training data that was engineered
X = train_enh[selected_features].astype('float32')
y = train_enh[TARGET].astype('float32')

print(f"\nFinal Training Data Shapes:")
print(f"Training set shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Features selected: {len(selected_features)}")

# Store for later use in inference
final_selected_features = selected_features

Creating advanced features for training data only...
🔧 Creating Level 1 features (Core)...
🔧 Creating Level 2 features (Macro)...
🧹 Cleaning and selecting features...
✅ Feature engineering complete. Selected 50 features.

Feature Engineering Results:
Original base features available: 94
Original features selected: 30
New engineered features created: 20
Total features for modeling: 50

New engineered features added:
 1. market_forward_excess_returns_low_ratio_120
 2. market_forward_excess_returns_high_ratio_120
 3. market_forward_excess_returns_low_ratio_20
 4. market_forward_excess_returns_low_ratio_60
 5. market_forward_excess_returns_high_ratio_20
 6. market_forward_excess_returns_high_ratio_60
 7. market_forward_excess_returns_cumsum_20
 8. market_forward_excess_returns_cumsum_20_high_ratio_20
 9. market_forward_excess_returns_cumsum_5_high_ratio_20
10. market_forward_excess_returns_cumsum_5
11. market_forward_excess_returns_cumsum_5_low_ratio_20
12. market_forward_excess_returns_cu

In [119]:
final_selected_features

['market_forward_excess_returns_low_ratio_120',
 'market_forward_excess_returns_high_ratio_120',
 'market_forward_excess_returns_low_ratio_20',
 'V7',
 'M18',
 'market_forward_excess_returns_low_ratio_60',
 'V9',
 'P10',
 'E8',
 'M9',
 'M5',
 'market_forward_excess_returns_high_ratio_20',
 'E19',
 'P8',
 'M17',
 'E12',
 'P11',
 'market_forward_excess_returns_high_ratio_60',
 'P9',
 'V8',
 'E3',
 'M12',
 'market_forward_excess_returns_cumsum_20',
 'S1',
 'M3',
 'market_forward_excess_returns_cumsum_20_high_ratio_20',
 'M15',
 'E2',
 'market_forward_excess_returns_cumsum_5_high_ratio_20',
 'market_forward_excess_returns_cumsum_5',
 'M8',
 'M10',
 'E17',
 'market_forward_excess_returns_cumsum_5_low_ratio_20',
 'E16',
 'market_forward_excess_returns_cumsum_10_high_ratio_20',
 'V12',
 'market_forward_excess_returns_cumsum_20_high_ratio_120',
 'market_forward_excess_returns_cumsum_5_high_ratio_60',
 'market_forward_excess_returns_cumsum_10',
 'market_forward_excess_returns_cumsum_20_low_rati

In [120]:
# Prepare test data separately (without feature engineering)
print("Preparing test data with base features only...")

# Test set only gets the original base features that we can compute
test_base_features = [f for f in final_features if f in test_p.columns]
X_test_base = test_p[test_base_features].astype('float32')

print(f"Test set base features shape: {X_test_base.shape}")
print(f"Base features available for test: {len(test_base_features)}")

# Note: During inference, we would need to either:
# 1. Only use original features that exist in both train and test, OR  
# 2. Create a feature engineering pipeline that can be applied to single rows

Preparing test data with base features only...
Test set base features shape: (10, 94)
Base features available for test: 94


In [121]:
# Data matrices are already prepared in previous cell
# X, y, and X_test are ready for model training

print(f"Data prepared successfully")
print(f"Final selected features: {len(final_selected_features)}")
print(f"Train shape: {X.shape}")
# print(f"Test shape: {X_test.shape}")

Data prepared successfully
Final selected features: 50
Train shape: (6290, 50)


In [122]:
# ===== CELL: Model Training with Selected Features =====

# 1. Feature Engineering using your new function
top_features = ['m4','v13','m11','s2','d4','d1','d2','e8','p6','m2','d8','m9','p8','p7','s12','p13','v9','d5','p1','s8']

df_features, selected_features = create_advanced_features(
    train,
    top_features,
    window_sizes=(5,10,20,60,120),
    max_features_to_keep=50
)

# Display summary
print(f"Selected features ({len(selected_features)}): {selected_features[:10]} ...")

# Prepare X, y
X = df_features[selected_features].astype('float32')
y = df_features['market_forward_excess_returns'].astype('float32')


# 2. Define CatBoost model (Ensemble part 1)
cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=False
)

cat_model = CatBoostRegressor(**cat_params)

kf = KFold(n_splits=5, shuffle=False)
cat_preds = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_val, y_val)

    cat_model.fit(train_pool, eval_set=val_pool, verbose=False)
    preds = cat_model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"Fold {fold} RMSE: {rmse:.6f}")

    cat_preds[val_idx] = preds


# 3. Define Neural Network model (Ensemble part 2)
def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

nn_preds = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    nn_model = build_nn(X.shape[1])
    early_stop = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor='val_loss', verbose=0)
    nn_model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=128,
        verbose=0,
        callbacks=[early_stop]
    )

    preds = nn_model.predict(X_val).ravel()
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"[NN] Fold {fold} RMSE: {rmse:.6f}")

    nn_preds[val_idx] = preds


# 4. Ensemble Combination (Weighted Average)
ensemble_preds = 0.6 * cat_preds + 0.4 * nn_preds
rmse_ensemble = mean_squared_error(y, ensemble_preds, squared=False)
print(f"\n✅ Ensemble RMSE: {rmse_ensemble:.6f}")


# 5. Diagnostics and Sanity Checks
metrics = pd.DataFrame({
    "Model": ["CatBoost", "NeuralNet", "Ensemble"],
    "RMSE": [
        mean_squared_error(y, cat_preds, squared=False),
        mean_squared_error(y, nn_preds, squared=False),
        rmse_ensemble
    ]
})
display(metrics)

print("\nFeature importance snapshot (CatBoost):")
imp_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': cat_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
display(imp_df.head(15))


🔧 Creating Level 1 features (Core)...
🔧 Creating Level 2 features (Macro)...
🧹 Cleaning and selecting features...


KeyboardInterrupt: 

In [None]:
# # Feature Importance Preview (Optional)

# # ================================================================
# # 2️⃣ Feature Importance Preview (Optional Diagnostic)
# # ================================================================
# """
# Quick diagnostic cell to preview which engineered features
# are most informative for predicting market_forward_excess_returns.

# You can toggle the mode:
#   - mode = "fast" → uses Mutual Information (no model training)
#   - mode = "catboost" → trains a quick CatBoostRegressor for ranking
# """

# from sklearn.feature_selection import mutual_info_regression
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Select mode
# mode = "fast"  # "fast" or "catboost"

# # Mutual Information Mode (fast)
# if mode == "fast":
#     print("⚡ Running Mutual Information Importance (fast mode)...")
#     mi = mutual_info_regression(X, y, random_state=42)
#     mi_df = pd.DataFrame({'feature': X.columns, 'importance': mi})
#     mi_df = mi_df.sort_values(by='importance', ascending=False).head(20)

#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=mi_df, x='importance', y='feature', color='steelblue')
#     plt.title("Top 20 Features by Mutual Information")
#     plt.tight_layout()
#     plt.show()

# # CatBoost Mode (more precise)
# elif mode == "catboost":
#     from catboost import CatBoostRegressor

#     print("🐱 Running CatBoost Feature Importance (model-based)...")
#     model = CatBoostRegressor(
#         iterations=300,
#         learning_rate=0.05,
#         depth=6,
#         random_seed=42,
#         verbose=False
#     )
#     model.fit(X, y)

#     fi = model.get_feature_importance(prettified=True)
#     fi = fi.sort_values(by='Importances', ascending=False).head(20)

#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=fi, x='Importances', y='Feature Id', color='darkorange')
#     plt.title("Top 20 Features by CatBoost Importance")
#     plt.tight_layout()
#     plt.show()

# else:
#     print("Invalid mode. Choose 'fast' or 'catboost'.")

# print("✅ Feature importance preview complete.")


In [None]:
# ================================================================
# 2️⃣ CatBoost Base Model (GridSearch + TimeSeriesSplit)
# ================================================================

print("⏳ Training CatBoost model with TimeSeries CV...")

tscv = TimeSeriesSplit(n_splits=5)

# check here random_state = 42 for reproducibility!
cbc = CatBoostRegressor(loss_function='RMSE', verbose=0, random_state=42)

param_grid = {
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'iterations': [300, 500],
    'l2_leaf_reg': [2, 5]
}

grid = GridSearchCV(
    estimator=cbc,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)
best_cbc = grid.best_estimator_
print(f"✅ Best Params: {grid.best_params_}")

⏳ Training CatBoost model with TimeSeries CV...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: 
All the 80 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 2275, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, graph,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 1513, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, graph=graph, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 855, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 1491, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "_catboost.pyx", line 4329, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4352, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 2310, in _catboost._init_features_layout
_catboost.CatBoostError: catboost/libs/data/features_layout.cpp:124: All feature names should be different, but 'forward_returns' used more than once.


In [None]:
# ================================================================
# 3️⃣ Neural Network Model (Feedforward Regressor)
# ================================================================

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn(X_scaled.shape[1])
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# last 20% time-based validation
date_cut = train["date_id"].quantile(0.8)
train_idx = train["date_id"] <= date_cut
val_idx = train["date_id"] > date_cut

X_train, y_train = X_scaled[train_idx], y[train_idx]
X_val, y_val = X_scaled[val_idx], y[val_idx]

nn_model.fit(X_train, y_train, validation_data=(X_val, y_val),
             epochs=100, batch_size=256, verbose=0, callbacks=[es])
print("✅ Neural Network trained successfully.")

✅ Neural Network trained successfully.


In [None]:
# ================================================================
# 4️⃣ Ensemble Prediction (0.X × CatBoost + 0.XX × NN)
# ================================================================
ensemble_cat_pct = 0.8
ensemble_nn_pct = 0.2

val_cat = best_cbc.predict(X.loc[val_idx])
val_nn = nn_model.predict(X_scaled[val_idx]).ravel()

val_ensemble = ensemble_cat_pct * val_cat + ensemble_nn_pct * val_nn
val_df = train.loc[val_idx].copy()
val_df["pred"] = val_ensemble

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# ===== Corrected evaluation: use mapped weights and official formula =====
def compute_strategy_stats(weights, forward_returns, risk_free_rate):
    """
    Compute strategy daily returns and Sharpe (annualized).
    weights: array-like positions in [0,2]
    forward_returns, risk_free_rate: arrays aligned
    """
    # Ensure numpy arrays
    w = np.asarray(weights)
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)

    # Strategy return per day: rf*(1 - w) + w * forward_returns
    # Strategy excess over rf:
    strat_ret = rf * (1.0 - w) + w * fr
    strat_excess = strat_ret - rf   # == w * (fr - rf)
    # annualized sharpe
    mean = np.nanmean(strat_excess)
    std = np.nanstd(strat_excess)
    sharpe = (mean / (std + 1e-12)) * np.sqrt(252) if std > 0 else 0.0
    # annualized vol of strategy returns
    vol_ann = std * np.sqrt(252)
    return {
        'sharpe': sharpe,
        'vol_ann': vol_ann,
        'mean_daily_excess': mean,
        'std_daily_excess': std,
        'strat_ret_series': strat_ret,
        'strat_excess_series': strat_excess
    }

def sharpe_penalty_official(weights, forward_returns, risk_free_rate):
    """
    Compute adjusted Sharpe like the official metric:
    - compute strategy sharpe
    - compute market vol and strategy vol, form vol_penalty = 1 + max(0, strategy_vol/market_vol - 1.2)
    - compute return_gap penalty like (max(0, (market_mean_excess - strat_mean_excess) * 100 * 252))**2 / 100 etc.
    Returns adjusted_sharpe (float) and components.
    """
    # strategy stats
    stats = compute_strategy_stats(weights, forward_returns, risk_free_rate)
    strat_excess = stats['strat_excess_series']
    strat_sharpe = stats['sharpe']
    strat_vol = stats['vol_ann']
    # market stats
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)
    market_excess = fr - rf
    market_mean_excess = ( (1 + market_excess).prod() ) ** (1.0 / len(market_excess)) - 1 if len(market_excess)>0 else 0.0
    # fallback simpler mean if product fails
    # but safer to use mean:
    market_mean_excess = np.nanmean(market_excess)
    market_std = np.nanstd(fr)
    market_vol = market_std * np.sqrt(252) if market_std>0 else 1e-9

    # volatility penalty
    excess_vol = max(0.0, (strat_vol / (market_vol + 1e-12)) - 1.2)
    vol_penalty = 1.0 + excess_vol

    # return gap penalty (use squared scaled gap similar to demo code)
    strat_mean_excess = np.nanmean(strat_excess)
    return_gap = max(0.0, (market_mean_excess - strat_mean_excess) * 100 * 252)  # percent annualized gap
    return_penalty = 1.0 + (return_gap**2) / 100.0

    adjusted_sharpe = strat_sharpe / (vol_penalty * return_penalty + 1e-12)
    return {
        'adjusted_sharpe': adjusted_sharpe,
        'strat_sharpe': strat_sharpe,
        'vol_penalty': vol_penalty,
        'return_penalty': return_penalty,
        'strat_vol': strat_vol,
        'market_vol': market_vol,
        'return_gap': return_gap
    }

# ===== Use it on validation properly mapping raw preds to weights =====

# val_ensemble is your raw ensemble prediction (unmapped)
# First map to weights using your mapping function (or revised mapping)
def robust_signal_to_weight(sig, lower=0.0, upper=2.0):
    """
    Map raw signals to weights robustly using percentile clipping and stable scaling.
    If distribution is degenerate, fallback to standard scaling.
    """
    sig = np.asarray(sig)
    lo = np.nanpercentile(sig, 5)
    hi = np.nanpercentile(sig, 95)
    if np.isclose(hi, lo):
        # fallback: z-score and sigmoid mapping
        sig_z = (sig - np.nanmean(sig)) / (np.nanstd(sig) + 1e-12)
        # map z to [0,2] via logistic
        w = 2.0 / (1.0 + np.exp(-sig_z))
    else:
        w = (sig - lo) / (hi - lo + 1e-12) * (upper - lower) + lower
    return np.clip(w, lower, upper)

# compute mapped weights
val_weights = robust_signal_to_weight(val_ensemble)   # or pass val_cat/val_nn separately

# compute official adjusted sharpe and components
res = sharpe_penalty_official(val_weights, val_df['forward_returns'].to_numpy(), val_df['risk_free_rate'].to_numpy())

print("Mapped weights stats:", np.nanmin(val_weights), np.nanpercentile(val_weights,5), np.nanmedian(val_weights), np.nanpercentile(val_weights,95), np.nanmax(val_weights))
print("Strategy raw Sharpe:", res['strat_sharpe'])
print("Adjusted Sharpe:", res['adjusted_sharpe'])
print("Vol penalty:", res['vol_penalty'], "Return penalty:", res['return_penalty'], "Return gap:", res['return_gap'])


Mapped weights stats: 0.0 0.00011407995932442048 0.6734031891719277 1.999445759525307 2.0
Strategy raw Sharpe: 2.5628806391535512
Adjusted Sharpe: 2.562880639150988
Vol penalty: 1.0 Return penalty: 1.0 Return gap: 0.0


In [None]:
# # ================================================================
# # 6️⃣ Competition-Compliant Inference Function
# # ================================================================
# _cat_model = best_cbc
# _nn_model = nn_model
# _scaler = scaler
# _feat_cols = features

# """
#     Check if is really necessary exchange from pl to pd and back to pl?
#     pl.DataFrame (we convert to pandas inside)
# """
# def predict(pl_df):
#     """Competition inference function."""
#     pdf = pl_df.to_pandas().fillna(0.0)
#     for f in _feat_cols:
#         if f not in pdf.columns:
#             pdf[f] = 0.0
#     Xp = pdf[_feat_cols].values
#     Xp_scaled = _scaler.transform(Xp)
#     pred_cat = _cat_model.predict(pdf[_feat_cols])
#     pred_nn = _nn_model.predict(Xp_scaled, verbose=0).ravel()
#     preds = ensemble_cat_pct * pred_cat + ensemble_nn_pct * pred_nn
#     lo, hi = np.percentile(preds, [5, 95])
#     weights = np.clip((preds - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
#     return pd.DataFrame({"prediction": weights.astype("float32")})

In [None]:
# ================================================================
# 6️⃣ Competition-Compliant Inference Function
# ================================================================
_cat_model = best_cbc
_nn_model = nn_model
_scaler = scaler
_feat_cols = features
_history_returns = list(train.loc[val_idx, 'forward_returns'].iloc[-VOL_WINDOW:].tolist())

def predict(pl_df: pl.DataFrame) -> float:
    """Competition inference function - returns single float allocation."""
    global _history_returns
    
    # Convert Polars to Pandas and handle missing values
    pdf = pl_df.to_pandas().fillna(0.0)
    
    # Ensure all required features are present
    for f in _feat_cols:
        if f not in pdf.columns:
            pdf[f] = 0.0
    
    # Get features in correct format
    X_features = pdf[_feat_cols].values
    X_scaled = _scaler.transform(X_features)
    
    # Make predictions from both models
    pred_cat = _cat_model.predict(pdf[_feat_cols])[0]  # Get first prediction
    pred_nn = _nn_model.predict(X_scaled, verbose=0).ravel()[0]  # Get first prediction
    
    # Ensemble prediction
    pred = ensemble_cat_pct * pred_cat + ensemble_nn_pct * pred_nn
    
    # Estimate rolling volatility for scaling
    vol_est = np.std(_history_returns) if len(_history_returns) > 1 else 1e-3
    
    # Scale prediction to allocation with volatility adjustment
    allocation = float(np.clip((best_k * pred) / (vol_est + 1e-9), 0, 2))
    
    # Update history for rolling volatility estimation
    if 'lagged_forward_returns' in pl_df.columns:
        try:
            _history_returns.append(float(pl_df['lagged_forward_returns'][0]))
        except:
            _history_returns.append(0.0)
    else:
        _history_returns.append(0.0)
    
    # Keep only last VOL_WINDOW entries
    _history_returns = _history_returns[-VOL_WINDOW:]
    
    return allocation

In [None]:
"""
NEXT STEPS, IMPORTANT FOR IMPROVEMENT:

Stronger feature scaling

PCA optional

Rolling retrain or time-based CV for robustness out of sample

Optimization of the mix (CatBoost vs NN) to dynamically find the optimal weight based on your adjusted Sharpe
Eventually to be extended to more models in the ensemble

"""

'\nNEXT STEPS, IMPORTANT FOR IMPROVEMENT:\n\nStronger feature scaling\n\nPCA optional\n\nRolling retrain or time-based CV for robustness out of sample\n\nOptimization of the mix (CatBoost vs NN) to dynamically find the optimal weight based on your adjusted Sharpe\nEventually to be extended to more models in the ensemble\n\n'

In [None]:
# # ================================================================
# # 7️⃣ Kaggle Evaluation Server / Local Submission
# # ================================================================

# if KAGGLE_ENV:
#     # Kaggle competition environment
#     server = kdeval.DefaultInferenceServer(predict)
    
#     if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#         server.serve()
#     else:
#         server.run_local_gateway((str(DATA_DIR),))
        
# else:
#     # Local environment - generate submission file
#     print("🔧 Local mode - generating submission file...")
    
#     # Generate predictions for test set
#     test_pred_cat = best_cbc.predict(X_test)
#     test_pred_nn = nn_model.predict(scaler.transform(X_test), verbose=0).ravel()
#     preds = ensemble_cat_pct * test_pred_cat + ensemble_nn_pct * test_pred_nn
    
#     # Apply same scaling logic as validation
#     test_exposures = np.clip(best_k * preds, 0, 2)
    
#     # Apply smoothing like in the working example
#     alpha = 0.8
#     smoothed_allocation = []
#     prev = 0.0
#     for x in test_exposures:
#         s = alpha * x + (1 - alpha) * prev
#         smoothed_allocation.append(s)
#         prev = s
#     smoothed_allocation = np.array(smoothed_allocation)
    
#     # Create submission
#     submission = pd.DataFrame({
#         'date_id': test['date_id'],
#         'prediction': smoothed_allocation.astype('float32')
#     })
    
#     submission.to_csv('submission_ensemble.csv', index=False)
#     print("📁 Saved submission_ensemble.csv")
#     print(f"📊 Prediction range: [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")