In [76]:
# ================================================================
#  HULL TACTICAL MARKET PREDICTION — ENSEMBLE + SHARPEPENALTY
# ================================================================
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow warnings
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use only first GPU if multiple

from pathlib import Path
import numpy as np
import pandas as pd

import polars as pl
from typing import Dict 

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from scipy.stats import zscore
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.preprocessing import StandardScaler
import time

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Import TensorFlow after setting environment variables
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Only show errors
from tensorflow import keras
from tensorflow.keras import layers

# Try to import kaggle_evaluation, handle if not available
try:
    import kaggle_evaluation.default_inference_server as kdeval
    KAGGLE_ENV = True
    print("Running in Kaggle competition environment")
except ImportError:
    KAGGLE_ENV = False
    print("Running in local environment - kaggle_evaluation not available")

Running in local environment - kaggle_evaluation not available


In [77]:
# ================================================================
# Data Loading & Initial Feature Preparation
# ================================================================

# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading
DATA_DIR = Path("01_data")
TARGET = "market_forward_excess_returns"
drop_cols = ["date_id", "forward_returns", "risk_free_rate"]
VOL_WINDOW = 20        # volatility window in days
VALIDATION_SIZE = 2700          # days, approx. 30% of data

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700):
    """Split data chronologically for time series validation."""
    df = df.sort_values('date_id').reset_index(drop=True)
    train_df = df.iloc[:-val_size].copy()
    val_df   = df.iloc[-val_size:].copy()
    return train_df, val_df

# Load train/test data using the KAGGLE_ENV variable from cell 1
if KAGGLE_ENV:
    print("Loading data from Kaggle environment")
    DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')
    train = pd.read_csv(DATA_DIR / "train.csv")
    test = pd.read_csv(DATA_DIR / "test.csv")
else:
    print("Loading data from local environment")
    # Try different possible local paths
    local_paths = [
        DATA_DIR / "train.csv",
        Path("01_data/train.csv"),
        Path("train.csv")
    ]
    
    train_path = None
    test_path = None
    
    for path in local_paths:
        if path.exists():
            train_path = path
            test_path = path.parent / "test.csv"
            break
    
    if train_path is None or not test_path.exists():
        raise FileNotFoundError("Could not find train.csv and test.csv files in expected locations")
    
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

print(f"Data loaded successfully")
print(f"Train shape: {train.shape} | Test shape: {test.shape}")

# Basic preprocessing
train = train.sort_values("date_id").reset_index(drop=True)
test = test.sort_values("date_id").reset_index(drop=True)

# Handle missing values
train = train.fillna(0.0)
test = test.fillna(0.0)

# Base features (before advanced transformations)
base_features = [c for c in train.columns if c not in drop_cols + [TARGET]]

print(f"Base features available: {len(base_features)}")
print(f"Target variable: {TARGET}")

Loading data from local environment
Data loaded successfully
Train shape: (8990, 98) | Test shape: (10, 99)
Base features available: 94
Target variable: market_forward_excess_returns


In [78]:
def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    """
    Clean and prepare DataFrame by handling missing values intelligently.
    
    Strategy:
    - Use median imputation for numeric columns with some missing values
    - Use zero-fill for columns with very few missing values  
    - Only process existing columns (no synthetic data creation)

    Args:
    df: Input DataFrame
    median_map: Dictionary mapping column names to median values
    feature_cols: List of feature column names to process

    Returns:
    Cleaned DataFrame

    Median is much less sensitive to extreme values (outliers)
    Mean can be heavily skewed by a few very large or very small values
    """
    df = df.copy()
    
    # Only work with columns that actually exist in the DataFrame
    existing_cols = [col for col in feature_cols if col in df.columns]
    
    if not existing_cols:
        print("Warning: No feature columns found in DataFrame")
        return df
    
    # Calculate missing percentages for existing columns
    missing_pct = (df[existing_cols].isnull().sum() / len(df)) * 100
    
    # Categorize columns by missing percentage
    cols_fill_median = missing_pct[(missing_pct > 5) & (missing_pct <= 50)].index.tolist()
    cols_fill_zero = missing_pct[missing_pct <= 5].index.tolist()
    
    # Apply median imputation for moderately missing columns
    if cols_fill_median:
        for col in cols_fill_median:
            median_val = median_map.get(col, df[col].median())
            if pd.isna(median_val):  # Handle case where median is NaN
                median_val = 0.0
            df[col] = df[col].fillna(median_val)
    
    # Apply zero-fill for low missing columns
    if cols_fill_zero:
        df[cols_fill_zero] = df[cols_fill_zero].fillna(0)
    
    # Ensure all feature columns are numeric
    for col in existing_cols:
        if df[col].dtype == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Final cleanup - ensure no inf values
    df[existing_cols] = df[existing_cols].replace([np.inf, -np.inf], 0)
    
    return df

In [79]:
## Train / Validation Split and Median Imputation
train_df, val_df = time_split_train_val(train, val_size=VALIDATION_SIZE)
print(f"Data split: Train {train_df.shape[0]} | Validation {val_df.shape[0]} rows")

# Create median map from training portion only
median_map = {}
for c in base_features:
    if c in train_df.columns:
        if train_df[c].dtype.kind in 'fiu':  # numeric types
            median_val = train_df[c].median(skipna=True)
            median_map[c] = float(median_val) if not pd.isna(median_val) else 0.0
        else:
            median_map[c] = 0.0
    else:
        median_map[c] = 0.0

# Apply preprocessing to all splits
train_full = prepare_df(train_df, median_map, base_features)
val_full   = prepare_df(val_df, median_map, base_features)
test_full  = prepare_df(test, median_map, base_features)

# Extract only the base features (remove drop_cols and target)
final_features = [c for c in base_features if c in train_full.columns]
train_p = train_full[final_features].copy()
val_p   = val_full[final_features].copy()
test_p  = test_full[final_features].copy()

# Keep target and other columns separate for later use
train_target = train_full[TARGET].copy()
val_target   = val_full[TARGET].copy()

# Validation check
if not final_features:
    raise ValueError("No features available after preprocessing!")

print(f"Preprocessing complete")
print(f"Number of base features: {len(final_features)}")
print(f"Base features available: {final_features[:10]}..." if len(final_features) > 10 else f"Features: {final_features}")

print(f"Target variable '{TARGET}' extracted separately")

# REMOVE THIS - causes data leakage:
# train_for_engineering = train_p.copy()
# train_for_engineering[TARGET] = train_target

# CORRECT APPROACH - keep features and target separate
print("Features and target prepared separately to avoid data leakage")

Data split: Train 6290 | Validation 2700 rows
Preprocessing complete
Number of base features: 94
Base features available: ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1']...
Target variable 'market_forward_excess_returns' extracted separately
Features and target prepared separately to avoid data leakage


In [80]:
base_features

['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'E1',
 'E10',
 'E11',
 'E12',
 'E13',
 'E14',
 'E15',
 'E16',
 'E17',
 'E18',
 'E19',
 'E2',
 'E20',
 'E3',
 'E4',
 'E5',
 'E6',
 'E7',
 'E8',
 'E9',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'M1',
 'M10',
 'M11',
 'M12',
 'M13',
 'M14',
 'M15',
 'M16',
 'M17',
 'M18',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P1',
 'P10',
 'P11',
 'P12',
 'P13',
 'P2',
 'P3',
 'P4',
 'P5',
 'P6',
 'P7',
 'P8',
 'P9',
 'S1',
 'S10',
 'S11',
 'S12',
 'S2',
 'S3',
 'S4',
 'S5',
 'S6',
 'S7',
 'S8',
 'S9',
 'V1',
 'V10',
 'V11',
 'V12',
 'V13',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9']

In [81]:
train_full.columns # correct

Index(['date_id', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1',
       'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19',
       'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3',
       'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13',
       'M14', 'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7',
       'M8', 'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5',
       'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4',
       'S5', 'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2',
       'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'forward_returns',
       'risk_free_rate', 'market_forward_excess_returns'],
      dtype='object')

In [82]:
train_p.columns # correct

Index(['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10',
       'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2',
       'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4',
       'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14',
       'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8',
       'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6',
       'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5',
       'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3',
       'V4', 'V5', 'V6', 'V7', 'V8', 'V9'],
      dtype='object')

In [83]:
train_target # correct

0      -0.003038
1      -0.009114
2      -0.010243
3       0.004046
4      -0.012301
          ...   
6285   -0.016339
6286    0.004761
6287   -0.016470
6288   -0.007177
6289   -0.008327
Name: market_forward_excess_returns, Length: 6290, dtype: float64

In [84]:
final_features # correct, this are the columns of train_p

['D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'E1',
 'E10',
 'E11',
 'E12',
 'E13',
 'E14',
 'E15',
 'E16',
 'E17',
 'E18',
 'E19',
 'E2',
 'E20',
 'E3',
 'E4',
 'E5',
 'E6',
 'E7',
 'E8',
 'E9',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'M1',
 'M10',
 'M11',
 'M12',
 'M13',
 'M14',
 'M15',
 'M16',
 'M17',
 'M18',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P1',
 'P10',
 'P11',
 'P12',
 'P13',
 'P2',
 'P3',
 'P4',
 'P5',
 'P6',
 'P7',
 'P8',
 'P9',
 'S1',
 'S10',
 'S11',
 'S12',
 'S2',
 'S3',
 'S4',
 'S5',
 'S6',
 'S7',
 'S8',
 'S9',
 'V1',
 'V10',
 'V11',
 'V12',
 'V13',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9']

In [85]:
# count of final features = len(final_features)
final_selected_features = [c for c in final_features if c in train_p.columns]
len(final_selected_features)

94

In [None]:
# ===== Advanced Feature Factory (Enhanced) =====
def create_advanced_features(df,
                             top_features,
                             macro_prefixes=('M','V','P','S'),
                             window_sizes=(5,10,20,60,120),
                             shift=1,  # Added shift parameter
                             inplace=False):
    """
    Create advanced features following a two-level approach:
      1) Lightweight Core Features (applied to `top_features`)
      2) Macro-Context Features (applied to columns starting with macro_prefixes)
    
    Args:
        df: Input DataFrame
        top_features: List of most important features for Level 1 processing
        macro_prefixes: Tuple of prefixes for Level 2 features
        window_sizes: Rolling window sizes
        shift: Number of periods to shift for avoiding data leakage
        inplace: Whether to modify DataFrame in place
    
    Returns:
        df_out: DataFrame with new features (and original columns)
    """
    if not inplace:
        df = df.copy()

    # Ensure datetime-like ordering by date_id if present
    if 'date_id' in df.columns:
        df = df.sort_values('date_id').reset_index(drop=True)

    # Helper: ensure numeric dtype for selected cols
    def _to_numeric(cols):
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0.0)

    # ------------- Level 1: Core Features (top_features) -------------
    def create_rolling_and_distance_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling statistics and distance features efficiently using shared roll object"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll = shifted_col.rolling(window=w, min_periods=1)  # Create roll object once
            
                # Calculate all rolling statistics from the same roll object
                roll_mean = roll.mean()
                roll_std = roll.std().fillna(0.0)
                roll_median = roll.median()
                roll_max = roll.max()
                roll_min = roll.min()
            
                # Store rolling features
                df[f"{c}_rolling_mean_{w}"] = roll_mean.astype('float32')
                df[f"{c}_rolling_std_{w}"] = roll_std.astype('float32')
                df[f"{c}_rolling_median_{w}"] = roll_median.astype('float32')
                df[f"{c}_rolling_max_{w}"] = roll_max.astype('float32')
                df[f"{c}_rolling_min_{w}"] = roll_min.astype('float32')
            
                # Calculate distance to rolling mean using the same roll_mean
                df[f"{c}_dist_to_rolling_mean_{w}"] = (df[c] - roll_mean).astype('float32')

    def create_zscore_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling z-scores with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll_mean = shifted_col.rolling(window=w, min_periods=1).mean()
                roll_std = shifted_col.rolling(window=w, min_periods=1).std().fillna(0.0)
                
                df[f"{c}_z_{w}"] = ((df[c] - roll_mean) / (roll_std + 1e-9)).astype('float32')

    def create_spread_features(cols, shift_periods=shift):
        """Create spread and percentage change features"""
        for c in cols:
            if c not in df.columns:
                continue
            # Use proper shift for difference calculations
            df[f"{c}_diff_1"] = (df[c] - df[c].shift(shift_periods)).astype('float32')
            df[f"{c}_pctchg_1"] = (df[c].pct_change(periods=shift_periods).fillna(0.0)).astype('float32')

    # Function to calculate numerical PACF values to be added as additional features
    def create_pacf_features(cols, nlags=10, shift_periods=shift):
        """Create PACF features for selected columns"""
        from statsmodels.tsa.stattools import pacf
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods).fillna(0.0)
            # Change from 'ywunbiased' to 'yw' (Yule-Walker method)
            pacf_values = pacf(shifted_col, nlags=nlags, method='yw')
            for lag in range(1, nlags + 1):
                df[f"{c}_pacf_{lag}"] = pacf_values[lag]

    # Function to calculate numerical ACF values to be added as additional features
    def create_acf_features(cols, nlags=10, shift_periods=shift):
        """Create ACF features for selected columns"""
        from statsmodels.tsa.stattools import acf
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods).fillna(0.0)
            acf_values = acf(shifted_col, nlags=nlags, fft=False)
            for lag in range(1, nlags + 1):
                df[f"{c}_acf_{lag}"] = acf_values[lag]

    # Function to calculate pandas autocorr values to be added as additional features
    def create_autocorr_features(cols, lags=10, shift_periods=shift):
        """Create autocorrelation features for selected columns"""
        for c in cols:
            if c not in df.columns:
                continue
            for lag in range(1, lags + 1):
                df[f"{c}_autocorr_{lag}"] = df[c].autocorr(lag=lag)

    # Function to calculate skewness and kurtosis features
    def create_skewness_kurtosis_features(cols, shift_periods=shift):
        """Create skewness and kurtosis features for selected columns"""
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods)
            df[f"{c}_skewness"] = shifted_col.rolling(window=30, min_periods=1).skew().astype('float32').fillna(0.0)
            df[f"{c}_kurtosis"] = shifted_col.rolling(window=30, min_periods=1).kurt().astype('float32').fillna(0.0)

    # Function to calculate momentum features
    def create_momentum_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create momentum features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                df[f"{c}_momentum_{w}"] = (shifted_col - shifted_col.shift(w)).astype('float32')

    # Function to calculate distance from value to momentum
    def create_distance_to_momentum_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create distance to momentum features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                momentum = shifted_col - shifted_col.shift(w)
                df[f"{c}_dist_to_momentum_{w}"] = (df[c] - momentum).astype('float32')

    # Function to calculate difference series
    def create_difference_features(cols, lags=(1,5,10), shift_periods=shift):
        """Create difference features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for lag in lags:
                df[f"{c}_diff_{lag}"] = (df[c] - df[c].shift(lag + shift_periods)).astype('float32')

    # Function to calculate normalized series
    def create_normalized_features(cols, shift_periods=shift):
        """Create normalized features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods)
            df[f"{c}_normalized"] = (shifted_col - shifted_col.mean()) / shifted_col.std()

    # ------------- Level 2: Macro Features (selective) -------------
    def create_cumulative_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create cumulative sums with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                df[f"{c}_macro_cumsum_{w}"] = shifted_col.rolling(window=w, min_periods=1).sum().astype('float32')

    def create_correlation_features(pairs=None, window=30, shift_periods=shift):
        """Create rolling correlations with proper shift"""
        if pairs is None:
            # Build pairs from top_features (limit to avoid explosion)
            cand = []
            for i in range(len(top_features)):
                for j in range(i+1, len(top_features)):
                    cand.append((top_features[i], top_features[j]))
            pairs = cand[:10]  # Limit to 10 pairs
        
        for a, b in pairs:
            if a not in df.columns or b not in df.columns:
                continue
            # Apply shift to both series
            a_shifted = df[a].shift(shift_periods)
            b_shifted = df[b].shift(shift_periods)
            corr = a_shifted.rolling(window=window, min_periods=1).corr(b_shifted)
            df[f"macro_corr_{a}_{b}_{30}"] = corr.astype('float32').fillna(0.0)

    def create_volatility_features(cols=None, windows=(20,60), shift_periods=shift):
        """Create volatility spread features with proper shift"""
        if cols is None:
            cols = [c for c in df.columns if c.startswith('v')]
        
        # Limit to prevent feature explosion
        cols = cols[:8]
        
        for w in windows:
            vols = {}
            for c in cols:
                if c in df.columns:
                    shifted_col = df[c].shift(shift_periods)
                    vols[c] = shifted_col.rolling(window=w, min_periods=1).std().astype('float32').fillna(0.0)
            
            # Create spread between consecutive volatilities
            vol_keys = list(vols.keys())
            for i in range(len(vol_keys) - 1):
                a, b = vol_keys[i], vol_keys[i + 1]
                df[f"macro_volspread_{a}_{b}_{w}"] = (vols[a] - vols[b]).astype('float32')

    def create_extremes_features(cols, windows=(20,60,120), shift_periods=shift):
        """Create high/low ratio features with proper shift"""
        # Limit columns to prevent explosion
        cols = [c for c in cols if c in df.columns][:10]
        
        for c in cols:
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                roll_max = shifted_col.rolling(window=w, min_periods=1).max()
                roll_min = shifted_col.rolling(window=w, min_periods=1).min()

                df[f"{c}_macro_high_ratio_{w}"] = (df[c] / (roll_max + 1e-9)).astype('float32')
                df[f"{c}_macro_low_ratio_{w}"] = (df[c] / (roll_min + 1e-9)).astype('float32')

    # Execute feature creation
    print("Creating Level 1 features (Core)...")
    _to_numeric(top_features)
    create_rolling_and_distance_features(top_features)
    create_zscore_features(top_features)
    create_spread_features(top_features)
    create_pacf_features(top_features)
    create_acf_features(top_features)
    create_autocorr_features(top_features)
    create_skewness_kurtosis_features(top_features)
    create_momentum_features(top_features)
    create_distance_to_momentum_features(top_features)
    create_difference_features(top_features)
    create_normalized_features(top_features)

    print("Creating Level 2 features (Macro)...")
    macro_cols = [c for c in df.columns if any(c.startswith(pref) for pref in macro_prefixes)]
    _to_numeric(macro_cols)
    print('Macro columns for Level 2 features:', macro_cols)
    
    create_cumulative_features(macro_cols, windows=(5,10,20))
    create_correlation_features(window=30)
    create_volatility_features(windows=(20,60))
    create_extremes_features([c for c in df.columns if c.startswith(('m','p'))], windows=(20,60,120))

    # Clean data
    print("Cleaning and selecting features...")
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0.0, inplace=True)

    # Downcast to save memory
    float_cols = df.select_dtypes(include=['float64']).columns
    if len(float_cols) > 0:
        df[float_cols] = df[float_cols].astype('float32')

    print(f"Feature engineering complete. Created {len(df.columns)} total columns.")
    return df

In [87]:
# # Working
# # substitute with def enhanced_feature_selection()
# # keep here for reference, in case needed later

# # Feature Engineering & Data Preparation

# # Fix the case sensitivity - use uppercase to match your DataFrame columns
# top_features = ['M4', 'V13', 'M11', 'S2', 'D4', 'D1', 'D2', 'E8', 'P6', 'M2', 
#                 'D8', 'M9', 'P8', 'P7', 'S12', 'P13', 'V9', 'D5', 'P1', 'S8']

# print("Creating advanced features for training data...")

# # CORRECT: Create DataFrame with date_id + features but WITHOUT target columns to prevent data leakage
# # date_id is present because we need it for time-based operations in create_advanced_features()
# columns_to_exclude = ["market_forward_excess_returns", "forward_returns", "risk_free_rate"]
# columns_to_include = ['date_id'] + [col for col in final_features if col in train_full.columns]

# train_for_engineering = train_full[columns_to_include].copy()

# print(f"Columns for feature engineering: {len(columns_to_include)}")
# print(f"Excluded columns (prevent leakage): {columns_to_exclude}")

# train_enh = create_advanced_features(
#     train_for_engineering,
#     top_features=top_features,  # Now with correct case
#     window_sizes=(5, 10, 20, 60, 120),
#     shift=1
# )

# # Add target back AFTER feature engineering for supervised selection
# train_enh[TARGET] = train_full[TARGET].values

# # Now do supervised feature selection with target present
# # Get only the engineered feature columns (exclude date_id and target)
# feature_columns = [c for c in train_enh.columns if c not in ['date_id', TARGET]]
# # print feature_columns
# print(f"Feature columns for selection: {feature_columns}")

# # Supervised feature selection
# X_features = train_enh[feature_columns]
# y_target = train_enh[TARGET]

# # Remove zero variance features
# vt = VarianceThreshold(threshold=1e-6)
# X_filtered = X_features.loc[:, vt.fit(X_features).get_support()]

# # Tree-based feature importance
# gb = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)
# gb.fit(X_filtered, y_target)

# # Select top 50 features
# importance_scores = pd.Series(gb.feature_importances_, index=X_filtered.columns)
# selected_features = importance_scores.nlargest(50).index.tolist()

# # Separate engineered features from original base features
# original_features_in_selection = [f for f in selected_features if f in final_features]
# new_engineered_features = [f for f in selected_features if f not in final_features]

# print(f"\nFeature Engineering Results:")
# print(f"Original base features available: {len(final_features)}")
# print(f"Original features selected: {len(original_features_in_selection)}")
# print(f"New engineered features created: {len(new_engineered_features)}")
# print(f"Total features for modeling: {len(selected_features)}")

# print(f"\nNew engineered features added:")
# for i, feat in enumerate(new_engineered_features, 1):
#     print(f"{i:2d}. {feat}")

# print(f"\nAll 50 selected features:")
# for i, feat in enumerate(selected_features, 1):
#     feat_type = "ORIGINAL" if feat in final_features else "ENGINEERED"
#     print(f"{i:2d}. {feat:<25} [{feat_type}]")

# # Final feature matrices
# X = train_enh[selected_features].astype('float32')
# y = train_enh[TARGET].astype('float32')

# print(f"\nFinal Training Data Shapes:")
# print(f"Training set shape: {X.shape}")
# print(f"Target shape: {y.shape}")
# print(f"Features selected: {len(selected_features)}")

# # Store for later use in inference
# final_selected_features = selected_features

In [None]:
# ===== Enhanced Ensemble Feature Selection (Replaces the old selection method) =====

def enhanced_feature_selection(X_features, y_target, final_features, n_features=100, verbose=True):
    """
    Enhanced ensemble feature selection combining multiple methods.
    
    Uses dynamic random states based on current time to ensure
    different feature selections on each run for robustness testing.
    
    Args:
        X_features: Feature DataFrame (from train_enh after feature engineering)
        y_target: Target Series (from train_enh[TARGET])
        final_features: List of original base features for categorization
        n_features: Number of top features to select
        verbose: Print progress information
    
    Returns:
        list: Selected feature names using ensemble voting
        dict: Detailed results from each method
    """
    
    # Generate dynamic random state based on current time
    dynamic_seed = int(time.time() * 1000) % 100000
    if verbose:
        print(f"Using dynamic seed: {dynamic_seed}")
    
    # Remove zero variance features first
    vt = VarianceThreshold(threshold=1e-6)
    X_filtered = X_features.loc[:, vt.fit(X_features).get_support()]
    
    if verbose:
        print(f"Features after variance filtering: {X_filtered.shape[1]}")
    
    feature_scores = {}
    selected_features_by_method = {}
    
    # Method 1: Gradient Boosting Importance (Dynamic Random State)
    if verbose:
        print("Method 1: Gradient Boosting Feature Importance...")
    
    gb = GradientBoostingRegressor(
        n_estimators=100, 
        max_depth=3, 
        random_state=dynamic_seed,  # Dynamic instead of 42
        subsample=0.8,
        learning_rate=0.1
    )
    gb.fit(X_filtered, y_target)
    gb_scores = pd.Series(gb.feature_importances_, index=X_filtered.columns)
    gb_top = gb_scores.nlargest(n_features).index.tolist()
    
    feature_scores['gradient_boosting'] = gb_scores
    selected_features_by_method['gradient_boosting'] = gb_top
    
    # Method 2: Random Forest Importance (Dynamic Random State)
    if verbose:
        print("Method 2: Random Forest Feature Importance...")
    
    rf = RandomForestRegressor(
        n_estimators=100, 
        max_depth=5, 
        random_state=dynamic_seed + 1,  # Dynamic + offset
        n_jobs=-1,
        bootstrap=True
    )
    rf.fit(X_filtered, y_target)
    rf_scores = pd.Series(rf.feature_importances_, index=X_filtered.columns)
    rf_top = rf_scores.nlargest(n_features).index.tolist()
    
    feature_scores['random_forest'] = rf_scores
    selected_features_by_method['random_forest'] = rf_top
    
    # Method 3: F-test Statistical Significance  
    if verbose:
        print("Method 3: F-test Statistical Selection...")
    
    f_selector = SelectKBest(score_func=f_regression, k=min(n_features, X_filtered.shape[1]))
    f_selector.fit(X_filtered, y_target)
    f_scores = pd.Series(f_selector.scores_, index=X_filtered.columns)
    f_top = f_scores.nlargest(n_features).index.tolist()
    
    feature_scores['f_test'] = f_scores
    selected_features_by_method['f_test'] = f_top
    
    # Method 4: Mutual Information (Dynamic Random State)
    if verbose:
        print("Method 4: Mutual Information Selection...")
    
    mi_scores = mutual_info_regression(
        X_filtered, y_target, 
        random_state=dynamic_seed + 2  # Dynamic + offset
    )
    mi_scores_series = pd.Series(mi_scores, index=X_filtered.columns)
    mi_top = mi_scores_series.nlargest(n_features).index.tolist()
    
    feature_scores['mutual_info'] = mi_scores_series
    selected_features_by_method['mutual_info'] = mi_top
    
    # Ensemble Voting: Features selected by multiple methods
    if verbose:
        print("Ensemble Voting: Combining all methods...")
    
    # Count votes for each feature
    feature_votes = {}
    all_features = set()
    
    for method, features in selected_features_by_method.items():
        all_features.update(features)
        for feature in features:
            feature_votes[feature] = feature_votes.get(feature, 0) + 1
    
    # Sort by votes, then by average score across methods
    def get_average_score(feature):
        scores = []
        for method, score_series in feature_scores.items():
            if feature in score_series.index:
                # Normalize scores to [0,1] for fair averaging
                normalized = (score_series[feature] - score_series.min()) / (score_series.max() - score_series.min() + 1e-10)
                scores.append(normalized)
        return np.mean(scores) if scores else 0.0
    
    # Create ensemble ranking
    ensemble_ranking = []
    for feature in all_features:
        votes = feature_votes.get(feature, 0)
        avg_score = get_average_score(feature)
        ensemble_ranking.append({
            'feature': feature,
            'votes': votes,
            'avg_score': avg_score,
            'ensemble_score': votes + avg_score  # Hybrid scoring
        })
    
    # Sort by ensemble score (votes + normalized average)
    ensemble_ranking.sort(key=lambda x: x['ensemble_score'], reverse=True)
    
    # Select top features
    ensemble_features = [item['feature'] for item in ensemble_ranking[:n_features]]
    
    if verbose:
        # Separate engineered features from original base features for reporting
        original_features_in_selection = [f for f in ensemble_features if f in final_features]
        new_engineered_features = [f for f in ensemble_features if f not in final_features]
        
        print(f"\nEnsemble Feature Selection Results:")
        print(f"   Total unique features considered: {len(all_features)}")
        print(f"   Selected by 4 methods: {sum(1 for f in all_features if feature_votes.get(f, 0) == 4)}")
        print(f"   Selected by 3 methods: {sum(1 for f in all_features if feature_votes.get(f, 0) == 3)}")
        print(f"   Selected by 2 methods: {sum(1 for f in all_features if feature_votes.get(f, 0) == 2)}")
        print(f"   Selected by 1 method:  {sum(1 for f in all_features if feature_votes.get(f, 0) == 1)}")
        print(f"   Final ensemble selection: {len(ensemble_features)} features")
        
        print(f"\nFeature Engineering Results:")
        print(f"Original base features available: {len(final_features)}")
        print(f"Original features selected: {len(original_features_in_selection)}")
        print(f"New engineered features created: {len(new_engineered_features)}")
        print(f"Total features for modeling: {len(ensemble_features)}")

        print(f"\nNew engineered features added:")
        for i, feat in enumerate(new_engineered_features, 1):
            print(f"{i:2d}. {feat}")

        print(f"\nAll {len(ensemble_features)} selected features:")
        for i, feat in enumerate(ensemble_features, 1):
            feat_type = "ORIGINAL" if feat in final_features else "ENGINEERED"
            print(f"{i:2d}. {feat:<25} [{feat_type}]")
        
        # Show top 10 features with vote details
        print(f"\nTop 10 Ensemble Features by Score:")
        for i, item in enumerate(ensemble_ranking[:10], 1):
            feat_type = "ORIGINAL" if item['feature'] in final_features else "ENGINEERED"
            print(f"   {i:2d}. {item['feature']:<25} | Votes: {item['votes']} | Score: {item['avg_score']:.3f} | [{feat_type}]")
    
    results = {
        'ensemble_features': ensemble_features,
        'method_features': selected_features_by_method,
        'feature_scores': feature_scores,
        'ensemble_ranking': ensemble_ranking,
        'dynamic_seed': dynamic_seed,
        'original_features_selected': [f for f in ensemble_features if f in final_features],
        'engineered_features_selected': [f for f in ensemble_features if f not in final_features]
    }
    
    return ensemble_features, results

# ===== REPLACE THE OLD FEATURE SELECTION SECTION =====

# Feature Engineering & Data Preparation
top_features = ['M4', 'V13', 'M11', 'S2', 'D4', 'D1', 'D2', 'E8', 'P6', 'M2', 
                'D8', 'M9', 'P8', 'P7', 'S12', 'P13', 'V9', 'D5', 'P1', 'S8']

print("Creating advanced features for training data...")

# CORRECT: Create DataFrame with date_id + features but WITHOUT target columns to prevent data leakage
columns_to_exclude = ["market_forward_excess_returns", "forward_returns", "risk_free_rate"]
columns_to_include = ['date_id'] + [col for col in final_features if col in train_full.columns]

train_for_engineering = train_full[columns_to_include].copy()

print(f"Columns for feature engineering (count): {len(columns_to_include)}")
# print name of columns included
print(f"Included columns names: {columns_to_include}")
# print length of excluded columns
print(f"Excluded columns (count): {len(columns_to_exclude)}")
print(f"Excluded columns (prevent leakage) names: {columns_to_exclude}")

train_enh = create_advanced_features(
    train_for_engineering,
    top_features=top_features,
    window_sizes=(5, 10, 20, 60, 120),
    shift=1
)

# Add target back AFTER feature engineering for supervised selection
train_enh[TARGET] = train_full[TARGET].values

# Now do ENHANCED supervised feature selection with target present
feature_columns = [c for c in train_enh.columns if c not in ['date_id', TARGET]]
print(f"Feature columns for selection: {len(feature_columns)} total features available")

# Supervised feature selection using ENHANCED method
X_features = train_enh[feature_columns]
y_target = train_enh[TARGET]

# Apply Enhanced Feature Selection (replaces the old single-method approach)
print("\n" + "="*60)
print("ENHANCED ENSEMBLE FEATURE SELECTION")
print("="*60)

selected_features, selection_results = enhanced_feature_selection(
    X_features, y_target, final_features,
    n_features=100,  
    verbose=True
)

# Final feature matrices
X = train_enh[selected_features].astype('float32')
y = train_enh[TARGET].astype('float32')

print(f"\nFinal Training Data Shapes:")
print(f"Training set shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Features selected: {len(selected_features)}")

# Store for later use in inference
final_selected_features = selected_features

print("\nEnhanced feature selection complete!")
print("Ready for model training with dynamically selected features")

Creating advanced features for training data...
Columns for feature engineering (count): 95
Included columns names: ['date_id', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14', 'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
Excluded columns (count): 3
Excluded columns (prevent leakage) names: ['market_forward_excess_returns', 'forward_returns', 'risk_free_rate']
Creating Level 1 features (Core)...
Creating Level 2 features (Macro)...
Macro columns for Level 2 features: ['M1', 'M10', 'M11', 'M12',

In [None]:
"""
We can add also MC shuffle of.... or synthetic data generation from existing data to increase training set size

!!!!!!!!!!!!!!!!
synthetic data generation should be used carefully to avoid data leakage, 
overfitting and unrealistic patterns.
!!!!!!!!!!!!!!!!

""";

In [90]:
final_selected_features

['M4',
 'V9_diff_1',
 'V13_diff_1',
 'V13',
 'M4_diff_1',
 'V13_diff_1_macro_cumsum_5',
 'S2',
 'S8',
 'P8_diff_1',
 'E19',
 'V13_diff_1_macro_cumsum_10',
 'M11_rolling_std_5_macro_cumsum_5',
 'M4_diff_1_macro_cumsum_10',
 'V9',
 'E2',
 'M4_rolling_mean_120_macro_cumsum_5',
 'V9_rolling_std_10_macro_cumsum_10',
 'M11_rolling_mean_120_macro_cumsum_20',
 'E16',
 'M4_rolling_mean_60_macro_cumsum_10',
 'S5_macro_cumsum_20',
 'E12',
 'P6_rolling_std_20_macro_cumsum_10',
 'I2',
 'P10',
 'M3_macro_cumsum_5',
 'M11_diff_1',
 'P7_rolling_median_5',
 'V9_diff_1_macro_cumsum_10',
 'V9_rolling_mean_5',
 'M3',
 'M11_z_120',
 'V13_z_120',
 'V7',
 'V13_rolling_std_5',
 'M4_diff_1_macro_cumsum_20',
 'P8',
 'M11_diff_1_macro_cumsum_5',
 'V13_rolling_std_5_macro_cumsum_5',
 'V13_rolling_std_5_macro_cumsum_10',
 'M4_rolling_mean_60',
 'V13_rolling_mean_20_macro_cumsum_20',
 'M4_rolling_std_60',
 'V13_rolling_std_10',
 'V7_macro_cumsum_5',
 'E20',
 'M4_rolling_std_60_macro_cumsum_5',
 'M11_rolling_std_10_

In [91]:
# Now your models train on the BEST features, not fixed ones
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Generate dynamic random state for models
model_seed = int(time.time() * 1000) % 100000
print(f"Using dynamic random seed for models: {model_seed}")

# CatBoost with BEST features and dynamic random state
catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_seed=model_seed,  # Dynamic instead of fixed 42
    verbose=False
)

# Use the correctly selected features (X instead of undefined X_best)
catboost_model.fit(X, y)

# Neural Network with BEST features  
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(len(selected_features),)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2), 
    Dense(1)
])
nn_model.compile(optimizer='adam', loss='mse')

# Set TensorFlow random seed for reproducibility within this run
tf.random.set_seed(model_seed)
nn_model.fit(X, y, epochs=100, validation_split=0.2, verbose=0)

print("Model training complete with dynamically selected features")
print(f"CatBoost trained on {X.shape[1]} features")
print(f"Neural Network trained on {X.shape[1]} features")
print(f"Selected features: {selected_features[:10]}..." if len(selected_features) > 10 else f"Selected features: {selected_features}")

Using dynamic random seed for models: 39044
Model training complete with dynamically selected features
CatBoost trained on 50 features
Neural Network trained on 50 features
Selected features: ['M4', 'V9_diff_1', 'V13_diff_1', 'V13', 'M4_diff_1', 'V13_diff_1_macro_cumsum_5', 'S2', 'S8', 'P8_diff_1', 'E19']...


In [92]:
# ===== CELL: Model Training with Selected Features =====

# 1. Feature Engineering using your new function
top_features = ['m4','v13','m11','s2','d4','d1','d2','e8','p6','m2','d8','m9','p8','p7','s12','p13','v9','d5','p1','s8']

df_features, selected_features = create_advanced_features(
    train,
    top_features,
    window_sizes=(5,10,20,60,120),
    max_features_to_keep=50
)

# Display summary
print(f"Selected features ({len(selected_features)}): {selected_features[:10]} ...")

# Prepare X, y
X = df_features[selected_features].astype('float32')
y = df_features['market_forward_excess_returns'].astype('float32')


# 2. Define CatBoost model (Ensemble part 1)
cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=False
)

cat_model = CatBoostRegressor(**cat_params)

kf = KFold(n_splits=5, shuffle=False)
cat_preds = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_val, y_val)

    cat_model.fit(train_pool, eval_set=val_pool, verbose=False)
    preds = cat_model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"Fold {fold} RMSE: {rmse:.6f}")

    cat_preds[val_idx] = preds


# 3. Define Neural Network model (Ensemble part 2)
def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

nn_preds = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    nn_model = build_nn(X.shape[1])
    early_stop = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor='val_loss', verbose=0)
    nn_model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=128,
        verbose=0,
        callbacks=[early_stop]
    )

    preds = nn_model.predict(X_val).ravel()
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"[NN] Fold {fold} RMSE: {rmse:.6f}")

    nn_preds[val_idx] = preds


# 4. Ensemble Combination (Weighted Average)
ensemble_preds = 0.6 * cat_preds + 0.4 * nn_preds
rmse_ensemble = mean_squared_error(y, ensemble_preds, squared=False)
print(f"\n Ensemble RMSE: {rmse_ensemble:.6f}")


# 5. Diagnostics and Sanity Checks
metrics = pd.DataFrame({
    "Model": ["CatBoost", "NeuralNet", "Ensemble"],
    "RMSE": [
        mean_squared_error(y, cat_preds, squared=False),
        mean_squared_error(y, nn_preds, squared=False),
        rmse_ensemble
    ]
})
display(metrics)

print("\nFeature importance snapshot (CatBoost):")
imp_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': cat_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
display(imp_df.head(15))


TypeError: create_advanced_features() got an unexpected keyword argument 'max_features_to_keep'

In [None]:
# # Feature Importance Preview (Optional)

# # ================================================================
# #  Feature Importance Preview (Optional Diagnostic)
# # ================================================================
# """
# Quick diagnostic cell to preview which engineered features
# are most informative for predicting market_forward_excess_returns.

# You can toggle the mode:
#   - mode = "fast" → uses Mutual Information (no model training)
#   - mode = "catboost" → trains a quick CatBoostRegressor for ranking
# """

# from sklearn.feature_selection import mutual_info_regression
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Select mode
# mode = "fast"  # "fast" or "catboost"

# # Mutual Information Mode (fast)
# if mode == "fast":
#     print("Running Mutual Information Importance (fast mode)...")
#     mi = mutual_info_regression(X, y, random_state=42)
#     mi_df = pd.DataFrame({'feature': X.columns, 'importance': mi})
#     mi_df = mi_df.sort_values(by='importance', ascending=False).head(20)

#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=mi_df, x='importance', y='feature', color='steelblue')
#     plt.title("Top 20 Features by Mutual Information")
#     plt.tight_layout()
#     plt.show()

# # CatBoost Mode (more precise)
# elif mode == "catboost":
#     from catboost import CatBoostRegressor

#     print(" Running CatBoost Feature Importance (model-based)...")
#     model = CatBoostRegressor(
#         iterations=300,
#         learning_rate=0.05,
#         depth=6,
#         random_seed=42,
#         verbose=False
#     )
#     model.fit(X, y)

#     fi = model.get_feature_importance(prettified=True)
#     fi = fi.sort_values(by='Importances', ascending=False).head(20)

#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=fi, x='Importances', y='Feature Id', color='darkorange')
#     plt.title("Top 20 Features by CatBoost Importance")
#     plt.tight_layout()
#     plt.show()

# else:
#     print("Invalid mode. Choose 'fast' or 'catboost'.")

# print(" Feature importance preview complete.")


In [None]:
# ================================================================
#  CatBoost Base Model (GridSearch + TimeSeriesSplit)
# ================================================================

print("Training CatBoost model with TimeSeries CV...")

tscv = TimeSeriesSplit(n_splits=5)

# check here random_state = 42 for reproducibility!
cbc = CatBoostRegressor(loss_function='RMSE', verbose=0, random_state=42)

param_grid = {
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'iterations': [300, 500],
    'l2_leaf_reg': [2, 5]
}

grid = GridSearchCV(
    estimator=cbc,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)
best_cbc = grid.best_estimator_
print(f" Best Params: {grid.best_params_}")

⏳ Training CatBoost model with TimeSeries CV...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: 
All the 80 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 2275, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, graph,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 1513, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, graph=graph, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 855, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 1491, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "_catboost.pyx", line 4329, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4352, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 2310, in _catboost._init_features_layout
_catboost.CatBoostError: catboost/libs/data/features_layout.cpp:124: All feature names should be different, but 'forward_returns' used more than once.


In [None]:
# ================================================================
#  Neural Network Model (Feedforward Regressor)
# ================================================================

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn(X_scaled.shape[1])
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# last 20% time-based validation
date_cut = train["date_id"].quantile(0.8)
train_idx = train["date_id"] <= date_cut
val_idx = train["date_id"] > date_cut

X_train, y_train = X_scaled[train_idx], y[train_idx]
X_val, y_val = X_scaled[val_idx], y[val_idx]

nn_model.fit(X_train, y_train, validation_data=(X_val, y_val),
             epochs=100, batch_size=256, verbose=0, callbacks=[es])
print(" Neural Network trained successfully.")

✅ Neural Network trained successfully.


In [None]:
# ================================================================
# Ensemble Prediction (0.X × CatBoost + 0.XX × NN)
# ================================================================
ensemble_cat_pct = 0.8
ensemble_nn_pct = 0.2

val_cat = best_cbc.predict(X.loc[val_idx])
val_nn = nn_model.predict(X_scaled[val_idx]).ravel()

val_ensemble = ensemble_cat_pct * val_cat + ensemble_nn_pct * val_nn
val_df = train.loc[val_idx].copy()
val_df["pred"] = val_ensemble

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# ===== Corrected evaluation: use mapped weights and official formula =====
def compute_strategy_stats(weights, forward_returns, risk_free_rate):
    """
    Compute strategy daily returns and Sharpe (annualized).
    weights: array-like positions in [0,2]
    forward_returns, risk_free_rate: arrays aligned
    """
    # Ensure numpy arrays
    w = np.asarray(weights)
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)

    # Strategy return per day: rf*(1 - w) + w * forward_returns
    # Strategy excess over rf:
    strat_ret = rf * (1.0 - w) + w * fr
    strat_excess = strat_ret - rf   # == w * (fr - rf)
    # annualized sharpe
    mean = np.nanmean(strat_excess)
    std = np.nanstd(strat_excess)
    sharpe = (mean / (std + 1e-12)) * np.sqrt(252) if std > 0 else 0.0
    # annualized vol of strategy returns
    vol_ann = std * np.sqrt(252)
    return {
        'sharpe': sharpe,
        'vol_ann': vol_ann,
        'mean_daily_excess': mean,
        'std_daily_excess': std,
        'strat_ret_series': strat_ret,
        'strat_excess_series': strat_excess
    }

def sharpe_penalty_official(weights, forward_returns, risk_free_rate):
    """
    Compute adjusted Sharpe like the official metric:
    - compute strategy sharpe
    - compute market vol and strategy vol, form vol_penalty = 1 + max(0, strategy_vol/market_vol - 1.2)
    - compute return_gap penalty like (max(0, (market_mean_excess - strat_mean_excess) * 100 * 252))**2 / 100 etc.
    Returns adjusted_sharpe (float) and components.
    """
    # strategy stats
    stats = compute_strategy_stats(weights, forward_returns, risk_free_rate)
    strat_excess = stats['strat_excess_series']
    strat_sharpe = stats['sharpe']
    strat_vol = stats['vol_ann']
    # market stats
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)
    market_excess = fr - rf
    market_mean_excess = ( (1 + market_excess).prod() ) ** (1.0 / len(market_excess)) - 1 if len(market_excess)>0 else 0.0
    # fallback simpler mean if product fails
    # but safer to use mean:
    market_mean_excess = np.nanmean(market_excess)
    market_std = np.nanstd(fr)
    market_vol = market_std * np.sqrt(252) if market_std>0 else 1e-9

    # volatility penalty
    excess_vol = max(0.0, (strat_vol / (market_vol + 1e-12)) - 1.2)
    vol_penalty = 1.0 + excess_vol

    # return gap penalty (use squared scaled gap similar to demo code)
    strat_mean_excess = np.nanmean(strat_excess)
    return_gap = max(0.0, (market_mean_excess - strat_mean_excess) * 100 * 252)  # percent annualized gap
    return_penalty = 1.0 + (return_gap**2) / 100.0

    adjusted_sharpe = strat_sharpe / (vol_penalty * return_penalty + 1e-12)
    return {
        'adjusted_sharpe': adjusted_sharpe,
        'strat_sharpe': strat_sharpe,
        'vol_penalty': vol_penalty,
        'return_penalty': return_penalty,
        'strat_vol': strat_vol,
        'market_vol': market_vol,
        'return_gap': return_gap
    }

# ===== Use it on validation properly mapping raw preds to weights =====

# val_ensemble is your raw ensemble prediction (unmapped)
# First map to weights using your mapping function (or revised mapping)
def robust_signal_to_weight(sig, lower=0.0, upper=2.0):
    """
    Map raw signals to weights robustly using percentile clipping and stable scaling.
    If distribution is degenerate, fallback to standard scaling.
    """
    sig = np.asarray(sig)
    lo = np.nanpercentile(sig, 5)
    hi = np.nanpercentile(sig, 95)
    if np.isclose(hi, lo):
        # fallback: z-score and sigmoid mapping
        sig_z = (sig - np.nanmean(sig)) / (np.nanstd(sig) + 1e-12)
        # map z to [0,2] via logistic
        w = 2.0 / (1.0 + np.exp(-sig_z))
    else:
        w = (sig - lo) / (hi - lo + 1e-12) * (upper - lower) + lower
    return np.clip(w, lower, upper)

# compute mapped weights
val_weights = robust_signal_to_weight(val_ensemble)   # or pass val_cat/val_nn separately

# compute official adjusted sharpe and components
res = sharpe_penalty_official(val_weights, val_df['forward_returns'].to_numpy(), val_df['risk_free_rate'].to_numpy())

print("Mapped weights stats:", np.nanmin(val_weights), np.nanpercentile(val_weights,5), np.nanmedian(val_weights), np.nanpercentile(val_weights,95), np.nanmax(val_weights))
print("Strategy raw Sharpe:", res['strat_sharpe'])
print("Adjusted Sharpe:", res['adjusted_sharpe'])
print("Vol penalty:", res['vol_penalty'], "Return penalty:", res['return_penalty'], "Return gap:", res['return_gap'])


Mapped weights stats: 0.0 0.00011407995932442048 0.6734031891719277 1.999445759525307 2.0
Strategy raw Sharpe: 2.5628806391535512
Adjusted Sharpe: 2.562880639150988
Vol penalty: 1.0 Return penalty: 1.0 Return gap: 0.0


In [None]:
# # ================================================================
# #  Competition-Compliant Inference Function
# # ================================================================
# _cat_model = best_cbc
# _nn_model = nn_model
# _scaler = scaler
# _feat_cols = features

# """
#     Check if is really necessary exchange from pl to pd and back to pl?
#     pl.DataFrame (we convert to pandas inside)
# """
# def predict(pl_df):
#     """Competition inference function."""
#     pdf = pl_df.to_pandas().fillna(0.0)
#     for f in _feat_cols:
#         if f not in pdf.columns:
#             pdf[f] = 0.0
#     Xp = pdf[_feat_cols].values
#     Xp_scaled = _scaler.transform(Xp)
#     pred_cat = _cat_model.predict(pdf[_feat_cols])
#     pred_nn = _nn_model.predict(Xp_scaled, verbose=0).ravel()
#     preds = ensemble_cat_pct * pred_cat + ensemble_nn_pct * pred_nn
#     lo, hi = np.percentile(preds, [5, 95])
#     weights = np.clip((preds - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
#     return pd.DataFrame({"prediction": weights.astype("float32")})

In [None]:
# ================================================================
#  Competition-Compliant Inference Function
# ================================================================
_cat_model = best_cbc
_nn_model = nn_model
_scaler = scaler
_feat_cols = features
_history_returns = list(train.loc[val_idx, 'forward_returns'].iloc[-VOL_WINDOW:].tolist())

def predict(pl_df: pl.DataFrame) -> float:
    """Competition inference function - returns single float allocation."""
    global _history_returns
    
    # Convert Polars to Pandas and handle missing values
    pdf = pl_df.to_pandas().fillna(0.0)
    
    # Ensure all required features are present
    for f in _feat_cols:
        if f not in pdf.columns:
            pdf[f] = 0.0
    
    # Get features in correct format
    X_features = pdf[_feat_cols].values
    X_scaled = _scaler.transform(X_features)
    
    # Make predictions from both models
    pred_cat = _cat_model.predict(pdf[_feat_cols])[0]  # Get first prediction
    pred_nn = _nn_model.predict(X_scaled, verbose=0).ravel()[0]  # Get first prediction
    
    # Ensemble prediction
    pred = ensemble_cat_pct * pred_cat + ensemble_nn_pct * pred_nn
    
    # Estimate rolling volatility for scaling
    vol_est = np.std(_history_returns) if len(_history_returns) > 1 else 1e-3
    
    # Scale prediction to allocation with volatility adjustment
    allocation = float(np.clip((best_k * pred) / (vol_est + 1e-9), 0, 2))
    
    # Update history for rolling volatility estimation
    if 'lagged_forward_returns' in pl_df.columns:
        try:
            _history_returns.append(float(pl_df['lagged_forward_returns'][0]))
        except:
            _history_returns.append(0.0)
    else:
        _history_returns.append(0.0)
    
    # Keep only last VOL_WINDOW entries
    _history_returns = _history_returns[-VOL_WINDOW:]
    
    return allocation

In [None]:
"""
NEXT STEPS, IMPORTANT FOR IMPROVEMENT:

Stronger feature scaling

PCA optional

Rolling retrain or time-based CV for robustness out of sample

Optimization of the mix (CatBoost vs NN) to dynamically find the optimal weight based on your adjusted Sharpe
Eventually to be extended to more models in the ensemble

"""

'\nNEXT STEPS, IMPORTANT FOR IMPROVEMENT:\n\nStronger feature scaling\n\nPCA optional\n\nRolling retrain or time-based CV for robustness out of sample\n\nOptimization of the mix (CatBoost vs NN) to dynamically find the optimal weight based on your adjusted Sharpe\nEventually to be extended to more models in the ensemble\n\n'

In [None]:
# # ================================================================
# # Kaggle Evaluation Server / Local Submission
# # ================================================================

# if KAGGLE_ENV:
#     # Kaggle competition environment
#     server = kdeval.DefaultInferenceServer(predict)
    
#     if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#         server.serve()
#     else:
#         server.run_local_gateway((str(DATA_DIR),))
        
# else:
#     # Local environment - generate submission file
#     print("🔧 Local mode - generating submission file...")
    
#     # Generate predictions for test set
#     test_pred_cat = best_cbc.predict(X_test)
#     test_pred_nn = nn_model.predict(scaler.transform(X_test), verbose=0).ravel()
#     preds = ensemble_cat_pct * test_pred_cat + ensemble_nn_pct * test_pred_nn
    
#     # Apply same scaling logic as validation
#     test_exposures = np.clip(best_k * preds, 0, 2)
    
#     # Apply smoothing like in the working example
#     alpha = 0.8
#     smoothed_allocation = []
#     prev = 0.0
#     for x in test_exposures:
#         s = alpha * x + (1 - alpha) * prev
#         smoothed_allocation.append(s)
#         prev = s
#     smoothed_allocation = np.array(smoothed_allocation)
    
#     # Create submission
#     submission = pd.DataFrame({
#         'date_id': test['date_id'],
#         'prediction': smoothed_allocation.astype('float32')
#     })
    
#     submission.to_csv('submission_ensemble.csv', index=False)
#     print(" Saved submission_ensemble.csv")
#     print(f" Prediction range: [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")