In [1]:
# ================================================================
#  HULL TACTICAL MARKET PREDICTION — 200 TOP FEATURES + CAT BOOST
# ================================================================
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from typing import Dict 

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import zscore
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold

import time

# Try to import kaggle_evaluation, handle if not available
try:
    import kaggle_evaluation.default_inference_server as kdeval
    KAGGLE_ENV = True
    print("Running in Kaggle competition environment")
except ImportError:
    KAGGLE_ENV = False
    print("Running in local environment - kaggle_evaluation not available")

Running in local environment - kaggle_evaluation not available


In [2]:
# ================================================================
# Data Loading & Initial Feature Preparation
# ================================================================

## Configuration and Data Loading
# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')
DATA_DIR = Path("01_data")

TARGET = "market_forward_excess_returns"
drop_cols = ["date_id", "forward_returns", "risk_free_rate"]
VOL_WINDOW = 20        # volatility window in days
VALIDATION_SIZE = 2700          # days, approx. 30% of data

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700):
    """Split data chronologically for time series validation."""
    df = df.sort_values('date_id').reset_index(drop=True)
    train_df = df.iloc[:-val_size].copy()
    val_df   = df.iloc[-val_size:].copy()
    return train_df, val_df

# Load train/test data using the KAGGLE_ENV variable from cell 1
if KAGGLE_ENV:
    print("Loading data from Kaggle environment")
    DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')
    train = pd.read_csv(DATA_DIR / "train.csv")
    test = pd.read_csv(DATA_DIR / "test.csv")
else:
    print("Loading data from local environment")
    # Try different possible local paths
    local_paths = [
        DATA_DIR / "train.csv",
        Path("01_data/train.csv"),
        Path("train.csv")
    ]
    
    train_path = None
    test_path = None
    
    for path in local_paths:
        if path.exists():
            train_path = path
            test_path = path.parent / "test.csv"
            break
    
    if train_path is None or not test_path.exists():
        raise FileNotFoundError("Could not find train.csv and test.csv files in expected locations")
    
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

print(f"Data loaded successfully")
print(f"Train shape: {train.shape} | Test shape: {test.shape}")

# Basic preprocessing
train = train.sort_values("date_id").reset_index(drop=True)
test = test.sort_values("date_id").reset_index(drop=True)

# Handle missing values
train = train.fillna(0.0)
test = test.fillna(0.0)

# Base features (before advanced transformations)
base_features = [c for c in train.columns if c not in drop_cols + [TARGET]]

print(f"Base features available: {len(base_features)}")
print(f"Target variable: {TARGET}")

Loading data from local environment
Data loaded successfully
Train shape: (9021, 98) | Test shape: (10, 99)
Base features available: 94
Target variable: market_forward_excess_returns


In [3]:
def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    """
    Clean and prepare DataFrame by handling missing values intelligently.
    
    Strategy:
    - Use median imputation for numeric columns with some missing values
    - Use zero-fill for columns with very few missing values  
    - Only process existing columns (no synthetic data creation)

    Args:
    df: Input DataFrame
    median_map: Dictionary mapping column names to median values
    feature_cols: List of feature column names to process

    Returns:
    Cleaned DataFrame

    Median is much less sensitive to extreme values (outliers)
    Mean can be heavily skewed by a few very large or very small values
    """
    df = df.copy()
    
    # Only work with columns that actually exist in the DataFrame
    existing_cols = [col for col in feature_cols if col in df.columns]
    
    if not existing_cols:
        print("Warning: No feature columns found in DataFrame")
        return df
    
    # Calculate missing percentages for existing columns
    missing_pct = (df[existing_cols].isnull().sum() / len(df)) * 100
    
    # Categorize columns by missing percentage
    cols_fill_median = missing_pct[(missing_pct > 5) & (missing_pct <= 50)].index.tolist()
    cols_fill_zero = missing_pct[missing_pct <= 5].index.tolist()
    
    # Apply median imputation for moderately missing columns
    if cols_fill_median:
        for col in cols_fill_median:
            median_val = median_map.get(col, df[col].median())
            if pd.isna(median_val):  # Handle case where median is NaN
                median_val = 0.0
            df[col] = df[col].fillna(median_val)
    
    # Apply zero-fill for low missing columns
    if cols_fill_zero:
        df[cols_fill_zero] = df[cols_fill_zero].fillna(0)
    
    # Ensure all feature columns are numeric
    for col in existing_cols:
        if df[col].dtype == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # Final cleanup - ensure no inf values
    df[existing_cols] = df[existing_cols].replace([np.inf, -np.inf], 0)
    
    return df

In [4]:
## Train / Validation Split and Median Imputation
train_df, val_df = time_split_train_val(train, val_size=VALIDATION_SIZE)
print(f"Data split: Train {train_df.shape[0]} | Validation {val_df.shape[0]} rows")

# Create median map from training portion only
median_map = {}
for c in base_features:
    if c in train_df.columns:
        if train_df[c].dtype.kind in 'fiu':  # numeric types
            median_val = train_df[c].median(skipna=True)
            median_map[c] = float(median_val) if not pd.isna(median_val) else 0.0
        else:
            median_map[c] = 0.0
    else:
        median_map[c] = 0.0

# Apply preprocessing to all splits
train_full = prepare_df(train_df, median_map, base_features)
val_full   = prepare_df(val_df, median_map, base_features)
test_full  = prepare_df(test, median_map, base_features)

# Extract only the base features (remove drop_cols and target)
final_features = [c for c in base_features if c in train_full.columns]
train_p = train_full[final_features].copy()
val_p   = val_full[final_features].copy()
test_p  = test_full[final_features].copy()

# Keep target and other columns separate for later use
train_target = train_full[TARGET].copy()
val_target   = val_full[TARGET].copy()

# Validation check
if not final_features:
    raise ValueError("No features available after preprocessing!")

print(f"Preprocessing complete")
print(f"Number of base features: {len(final_features)}")
print(f"Base features available: {final_features[:10]}..." if len(final_features) > 10 else f"Features: {final_features}")

print(f"Target variable '{TARGET}' extracted separately")
print("Features and target prepared separately to avoid data leakage")

Data split: Train 6321 | Validation 2700 rows
Preprocessing complete
Number of base features: 94
Base features available: ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1']...
Target variable 'market_forward_excess_returns' extracted separately
Features and target prepared separately to avoid data leakage


In [5]:
# ===== Advanced Feature Factory (Enhanced) =====
def create_advanced_features(df,
                             top_features,
                             macro_prefixes=('M','V','P','S'),
                             window_sizes=(5,10),
                             shift=1,  # Added shift parameter
                             inplace=False):
    """
    Create advanced features following a two-level approach:
      1) Lightweight Core Features (applied to `top_features`)
      2) Macro-Context Features (applied to columns starting with macro_prefixes)
    
    Args:
        df: Input DataFrame
        top_features: List of most important features for Level 1 processing
        macro_prefixes: Tuple of prefixes for Level 2 features
        window_sizes: Rolling window sizes
        shift: Number of periods to shift for avoiding data leakage
        inplace: Whether to modify DataFrame in place
    
    Returns:
        df_out: DataFrame with new features (and original columns)
    """
    if not inplace:
        df = df.copy()

    # Ensure datetime-like ordering by date_id if present
    if 'date_id' in df.columns:
        df = df.sort_values('date_id').reset_index(drop=True)

    # Helper: ensure numeric dtype for selected cols
    def _to_numeric(cols):
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0.0)

    # ------------- Level 1: Core Features (top_features) -------------
    # Function to calculate rolling statistics and distance to rolling mean
    def create_rolling_and_distance_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling statistics and distance features efficiently using shared roll object"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll = shifted_col.rolling(window=w, min_periods=1)  # Create roll object once
            
                # Calculate all rolling statistics from the same roll object
                roll_mean = roll.mean()
                roll_std = roll.std().fillna(0.0)
                roll_median = roll.median()
                roll_max = roll.max()
                roll_min = roll.min()
            
                # Store rolling features
                df[f"{c}_rolling_mean_{w}"] = roll_mean.astype('float32')
                df[f"{c}_rolling_std_{w}"] = roll_std.astype('float32')
                df[f"{c}_rolling_median_{w}"] = roll_median.astype('float32')
                df[f"{c}_rolling_max_{w}"] = roll_max.astype('float32')
                df[f"{c}_rolling_min_{w}"] = roll_min.astype('float32')
            
                # Calculate distance to rolling mean using the same roll_mean
                df[f"{c}_dist_to_rolling_mean_{w}"] = (df[c] - roll_mean).astype('float32')
    
    # Function to calculate variance features
    def create_variance_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling variance features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll_var = shifted_col.rolling(window=w, min_periods=1).var().fillna(0.0)
                
                df[f"{c}_rolling_var_{w}"] = roll_var.astype('float32')

    # Function to calculate z-score features
    def create_zscore_features(cols, windows=window_sizes, shift_periods=shift):
        """Create rolling z-scores with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                # Apply shift BEFORE rolling operations
                shifted_col = df[c].shift(shift_periods)
                roll_mean = shifted_col.rolling(window=w, min_periods=1).mean()
                roll_std = shifted_col.rolling(window=w, min_periods=1).std().fillna(0.0)
                
                df[f"{c}_z_{w}"] = ((df[c] - roll_mean) / (roll_std + 1e-9)).astype('float32')

    # function to calculate zscore from scipy.stats
    def create_scipy_zscore_features(cols, shift_periods=shift):
        """Create z-score features using scipy.stats.zscore with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods).fillna(0.0)
            zscored = zscore(shifted_col)
            df[f"{c}_scipy_zscore"] = zscored.astype('float32').fillna(0.0)

    # Function to calculate spread and percentage change features
    def create_spread_features(cols, shift_periods=shift):
        """Create spread and percentage change features"""
        for c in cols:
            if c not in df.columns:
                continue
            # Use proper shift for difference calculations
            df[f"{c}_diff_1"] = (df[c] - df[c].shift(shift_periods)).astype('float32')
            df[f"{c}_pctchg_1"] = (df[c].pct_change(periods=shift_periods).fillna(0.0)).astype('float32')

    # Function to calculate numerical PACF values to be added as additional features
    def create_pacf_features(cols, nlags=10, shift_periods=shift):
        """Create PACF features for selected columns"""
        from statsmodels.tsa.stattools import pacf
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods).fillna(0.0)
            # Change from 'ywunbiased' to 'yw' (Yule-Walker method)
            pacf_values = pacf(shifted_col, nlags=nlags, method='yw')
            for lag in range(1, nlags + 1):
                df[f"{c}_pacf_{lag}"] = pacf_values[lag]

    # Function to calculate numerical ACF values to be added as additional features
    def create_acf_features(cols, nlags=10, shift_periods=shift):
        """Create ACF features for selected columns"""
        from statsmodels.tsa.stattools import acf
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods).fillna(0.0)
            acf_values = acf(shifted_col, nlags=nlags, fft=False)
            for lag in range(1, nlags + 1):
                df[f"{c}_acf_{lag}"] = acf_values[lag]

    # Function to calculate pandas autocorr values to be added as additional features
    def create_autocorr_features(cols, lags=10, shift_periods=shift):
        """Create autocorrelation features for selected columns"""
        for c in cols:
            if c not in df.columns:
                continue
            for lag in range(1, lags + 1):
                df[f"{c}_autocorr_{lag}"] = df[c].autocorr(lag=lag)

    # Function to calculate skewness and kurtosis features
    def create_skewness_kurtosis_features(cols, shift_periods=shift):
        """Create skewness and kurtosis features for selected columns"""
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods)
            df[f"{c}_skewness"] = shifted_col.rolling(window=30, min_periods=1).skew().astype('float32').fillna(0.0)
            df[f"{c}_kurtosis"] = shifted_col.rolling(window=30, min_periods=1).kurt().astype('float32').fillna(0.0)

    # Function to calculate momentum features
    def create_momentum_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create momentum features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                df[f"{c}_momentum_{w}"] = (shifted_col - shifted_col.shift(w)).astype('float32')

    # Function to calculate distance from value to momentum
    def create_distance_to_momentum_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create distance to momentum features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                momentum = shifted_col - shifted_col.shift(w)
                df[f"{c}_dist_to_momentum_{w}"] = (df[c] - momentum).astype('float32')

    # Function to calculate difference series
    def create_difference_features(cols, lags=(1,5,10), shift_periods=shift):
        """Create difference features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for lag in lags:
                df[f"{c}_diff_{lag}"] = (df[c] - df[c].shift(lag + shift_periods)).astype('float32')

    # Function to calculate normalized series
    def create_normalized_features(cols, shift_periods=shift):
        """Create normalized features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods)
            df[f"{c}_normalized"] = (shifted_col - shifted_col.mean()) / shifted_col.std()

    # Function to calculate rolling sums features
    def create_rolling_sum_features(cols, windows=(5,10,20), shift_periods=shift):
        """Create rolling sums with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                df[f"{c}_macro_rolling_sum_{w}"] = shifted_col.rolling(window=w, min_periods=1).sum().astype('float32')            

    # Function to calculate cumsum features
    def create_cumsum_features(cols, shift_periods=shift):
        """Create cumulative sum features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods)
            df[f"{c}_cumsum"] = shifted_col.cumsum().astype('float32')

    # Function to calculate Hurst exponent features
    def create_hurst_features(cols, shift_periods=shift):
        """Create Hurst exponent features with proper shift"""
        """
        The Hurst exponent uses lags to measure the long-term memory of the time series. 
        For each lag in the range, calculate the standard deviation of the differenced series. 
        Then calculate the slope of the log lags versus the standard deviations. 
        You can do this by returning the first value from NumPy’s polyfit function 
        which fits a first-degree polynomial function.
        
        The Hurst exponent ranges between 0 and 1.

        If the Hurst exponent is below 0.5, the market is mean reverting. 
        Reversal strategies win in these markets.

        If the Hurst exponent of 0.5 means the market is random. 
        In this case, a trading strategy that relies on the market direction will lose money.

        If the Hurst exponent is above 0.5 the market is trending. 
        Markets with a high Hurst exponent are perfect for trend-following strategies.
        """
        def hurst_exponent(ts):
            lags = range(2, 20)
            tau = [np.std(np.subtract(ts[lag:], ts[:-lag])) for lag in lags]
            poly = np.polyfit(np.log(lags), np.log(tau), 1)
            return poly[0] * 2.0
        
        for c in cols:
            if c not in df.columns:
                continue
            shifted_col = df[c].shift(shift_periods).fillna(0.0)
            df[f"{c}_hurst"] = hurst_exponent(shifted_col)

    # Function to calculate lagged observations from the past
    def create_lagged_features(cols, lags=(1,5,10), shift_periods=shift):
        """Create lagged features with proper shift"""
        for c in cols:
            if c not in df.columns:
                continue
            for lag in lags:
                df[f"{c}_lag_{lag}"] = df[c].shift(lag + shift_periods).astype('float32')

    # ------------- Level 2: Macro Features (selective) -------------
    # Function to calculate correlation features
    def create_correlation_features(pairs=None, window=30, shift_periods=shift):
        """Create rolling correlations with proper shift"""
        if pairs is None:
            # Build pairs from top_features (limit to avoid explosion)
            cand = []
            for i in range(len(top_features)):
                for j in range(i+1, len(top_features)):
                    cand.append((top_features[i], top_features[j]))
            pairs = cand[:10]  # Limit to 10 pairs
        
        for a, b in pairs:
            if a not in df.columns or b not in df.columns:
                continue
            # Apply shift to both series
            a_shifted = df[a].shift(shift_periods)
            b_shifted = df[b].shift(shift_periods)
            corr = a_shifted.rolling(window=window, min_periods=1).corr(b_shifted)
            df[f"macro_corr_{a}_{b}_{30}"] = corr.astype('float32').fillna(0.0)

    # Function to calculate volatility spread features
    def create_volatility_features(cols=None, windows=(20,60), shift_periods=shift):
        """Create volatility spread features with proper shift"""
        if cols is None:
            cols = [c for c in df.columns if c.startswith('v')]
        
        # Limit to prevent feature explosion
        cols = cols[:8]
        
        for w in windows:
            vols = {}
            for c in cols:
                if c in df.columns:
                    shifted_col = df[c].shift(shift_periods)
                    vols[c] = shifted_col.rolling(window=w, min_periods=1).std().astype('float32').fillna(0.0)
            
            # Create spread between consecutive volatilities
            vol_keys = list(vols.keys())
            for i in range(len(vol_keys) - 1):
                a, b = vol_keys[i], vol_keys[i + 1]
                df[f"macro_volspread_{a}_{b}_{w}"] = (vols[a] - vols[b]).astype('float32')

    # Function to calculate high/low ratio features
    def create_extremes_features(cols, windows=(20,60,120), shift_periods=shift):
        """Create high/low ratio features with proper shift"""
        # Limit columns to prevent explosion
        cols = [c for c in cols if c in df.columns][:10]
        
        for c in cols:
            for w in windows:
                shifted_col = df[c].shift(shift_periods)
                roll_max = shifted_col.rolling(window=w, min_periods=1).max()
                roll_min = shifted_col.rolling(window=w, min_periods=1).min()

                df[f"{c}_macro_high_ratio_{w}"] = (df[c] / (roll_max + 1e-9)).astype('float32')
                df[f"{c}_macro_low_ratio_{w}"] = (df[c] / (roll_min + 1e-9)).astype('float32')

    # Execute feature creation
    print("Creating Level 1 features (Core)...")
    _to_numeric(top_features)
    create_rolling_and_distance_features(top_features)
    create_variance_features(top_features)
    create_zscore_features(top_features)
    create_scipy_zscore_features(top_features)
    create_spread_features(top_features)
    create_pacf_features(top_features)
    create_acf_features(top_features)
    create_autocorr_features(top_features)
    create_skewness_kurtosis_features(top_features)
    create_momentum_features(top_features)
    create_distance_to_momentum_features(top_features)
    create_difference_features(top_features)
    create_normalized_features(top_features)
    create_rolling_sum_features(top_features)
    create_cumsum_features(top_features)
    create_hurst_features(top_features)
    create_lagged_features(top_features)

    print("Creating Level 2 features (Macro)...")
    macro_cols = [c for c in df.columns if any(c.startswith(pref) for pref in macro_prefixes)]
    _to_numeric(macro_cols)
    print('Macro columns for Level 2 features:', macro_cols)

    create_correlation_features(window=30)
    create_volatility_features(windows=(20,60))
    create_extremes_features([c for c in df.columns if c.startswith(('m','p'))], windows=(20,60,120))

    # Clean data
    print("Cleaning and selecting features...")
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0.0, inplace=True)

    # Downcast to save memory
    float_cols = df.select_dtypes(include=['float64']).columns
    if len(float_cols) > 0:
        df[float_cols] = df[float_cols].astype('float32')

    print(f"Feature engineering complete. Created {len(df.columns)} total columns.")
    return df

In [6]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [7]:
# ===== Enhanced Ensemble Feature Selection (Replaces the old selection method) =====
from xml.sax.handler import all_features


def enhanced_feature_selection(X_features, y_target, final_features, n_features=200, verbose=True):
    """
    Enhanced ensemble feature selection combining multiple methods.
    
    Uses dynamic random states based on current time to ensure
    different feature selections on each run for robustness testing.
    
    Args:
        X_features: Feature DataFrame (from train_enh after feature engineering)
        y_target: Target Series (from train_enh[TARGET])
        final_features: List of original base features for categorization
        n_features: Number of top features to select
        verbose: Print progress information
    
    Returns:
        list: Selected feature names using ensemble voting
        dict: Detailed results from each method
    """
    
    # Generate dynamic random state based on current time
    dynamic_seed = int(time.time() * 1000) % 100000
    if verbose:
        print(f"Using dynamic seed: {dynamic_seed}")
    
    # Remove zero variance features first
    vt = VarianceThreshold(threshold=1e-6)
    X_filtered = X_features.loc[:, vt.fit(X_features).get_support()]
    
    if verbose:
        print(f"Features after variance filtering: {X_filtered.shape[1]}")
    
    feature_scores = {}
    selected_features_by_method = {}
    
    # Method 1a: Gradient Boosting Importance (Dynamic Random State)
    if verbose:
        print("Method 1a: Gradient Boosting Feature Importance...")
    
    gb = GradientBoostingRegressor(
        n_estimators=100, 
        max_depth=3, 
        random_state=dynamic_seed,  # Dynamic instead of 42
        subsample=0.8,
        learning_rate=0.1
    )
    gb.fit(X_filtered, y_target)
    gb_scores = pd.Series(gb.feature_importances_, index=X_filtered.columns)
    gb_top = gb_scores.nlargest(n_features).index.tolist()
    
    feature_scores['gradient_boosting'] = gb_scores
    selected_features_by_method['gradient_boosting'] = gb_top

    # Method 1b: Bagging Regressor Importance
    if verbose:
        print("Method 1b: Bagging Regressor Feature Importance...")

    br = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=100,
        max_samples=0.8,
        random_state=dynamic_seed,
        n_jobs=-1,
        bootstrap=True
    )
    br.fit(X_filtered, y_target)

    # Aggregate feature importances from fitted base estimators
    _importances = np.zeros(X_filtered.shape[1], dtype=float)
    count = 0
    for est in br.estimators_:
        est_imp = getattr(est, "feature_importances_", None)
        if est_imp is not None:
            _importances += est_imp
            count += 1

    if count > 0:
        _importances /= count
    else:
        _importances = np.zeros(X_filtered.shape[1], dtype=float)

    br_scores = pd.Series(_importances, index=X_filtered.columns)
    br_top = br_scores.nlargest(n_features).index.tolist()

    feature_scores['bagging_regressor'] = br_scores
    selected_features_by_method['bagging_regressor'] = br_top
    
    # Ensemble Voting: Features selected by multiple methods
    if verbose:
        print("Ensemble Voting: Combining all methods...")
    
    # Count votes for each feature
    feature_votes = {}
    all_features = set()
    
    for method, features in selected_features_by_method.items():
        all_features.update(features)
        for feature in features:
            feature_votes[feature] = feature_votes.get(feature, 0) + 1
    
    # Sort by votes, then by average score across methods
    def get_average_score(feature):
        scores = []
        for method, score_series in feature_scores.items():
            if feature in score_series.index:
                # Normalize scores to [0,1] for fair averaging
                normalized = (score_series[feature] - score_series.min()) / (score_series.max() - score_series.min() + 1e-10)
                scores.append(normalized)
        return np.mean(scores) if scores else 0.0
    
    # Create ensemble ranking
    ensemble_ranking = []
    for feature in all_features:
        votes = feature_votes.get(feature, 0)
        avg_score = get_average_score(feature)
        ensemble_ranking.append({
            'feature': feature,
            'votes': votes,
            'avg_score': avg_score,
            'ensemble_score': votes + avg_score  # Hybrid scoring
        })
    
    # Sort by ensemble score (votes + normalized average)
    ensemble_ranking.sort(key=lambda x: x['ensemble_score'], reverse=True)
    
    # Select top features
    ensemble_features = [item['feature'] for item in ensemble_ranking[:n_features]]
    
    if verbose:
        # Separate engineered features from original base features for reporting
        original_features_in_selection = [f for f in ensemble_features if f in final_features]
        new_engineered_features = [f for f in ensemble_features if f not in final_features]
        
        print(f"\nEnsemble Feature Selection Results:")
        print(f"   Total unique features considered: {len(all_features)}")
        num_1a = sum(1 for f in all_features if f in selected_features_by_method.get('gradient_boosting', []))
        num_1b = sum(1 for f in all_features if f in selected_features_by_method.get('bagging_regressor', []))

        print(f"   Selected by 1a (gradient boosting): {num_1a}")
        print(f"   Selected by 1b (bagging regressor):  {num_1b}")
        print(f"   Final ensemble selection: {len(ensemble_features)} features")

        print(f"   Final ensemble selection: {len(ensemble_features)} features")
        
        print(f"\nFeature Engineering Results:")
        print(f"Original base features available: {len(final_features)}")
        print(f"Original features selected: {len(original_features_in_selection)}")
        print(f"New engineered features created: {len(new_engineered_features)}")
        print(f"Total features for modeling: {len(ensemble_features)}")

        print(f"\nNew engineered features added:")
        for i, feat in enumerate(new_engineered_features, 1):
            print(f"{i:2d}. {feat}")

        print(f"\nAll {len(ensemble_features)} selected features:")
        for i, feat in enumerate(ensemble_features, 1):
            feat_type = "ORIGINAL" if feat in final_features else "ENGINEERED"
            print(f"{i:2d}. {feat:<25} [{feat_type}]")
        
        # Show top 10 features with vote details
        print(f"\nTop 10 Ensemble Features by Score:")
        for i, item in enumerate(ensemble_ranking[:10], 1):
            feat_type = "ORIGINAL" if item['feature'] in final_features else "ENGINEERED"
            print(f"   {i:2d}. {item['feature']:<25} | Votes: {item['votes']} | Score: {item['avg_score']:.3f} | [{feat_type}]")
    
    results = {
        'ensemble_features': ensemble_features,
        'method_features': selected_features_by_method,
        'feature_scores': feature_scores,
        'ensemble_ranking': ensemble_ranking,
        'dynamic_seed': dynamic_seed,
        'original_features_selected': [f for f in ensemble_features if f in final_features],
        'engineered_features_selected': [f for f in ensemble_features if f not in final_features]
    }
    
    return ensemble_features, results

# Feature Engineering & Data Preparation
top_features = ['M4', 'V13', 'M11', 'S2', 'D4', 'D1', 'D2', 'E8', 'P6', 'M2', 
                'D8', 'M9', 'P8', 'P7', 'S12', 'P13', 'V9', 'D5', 'P1', 'S8']

print("Creating advanced features for training data...")

# CORRECT: Create DataFrame with date_id + features but WITHOUT target columns to prevent data leakage
columns_to_exclude = ["market_forward_excess_returns", "forward_returns", "risk_free_rate"]
columns_to_include = ['date_id'] + [col for col in final_features if col in train_full.columns]

train_for_engineering = train_full[columns_to_include].copy()

print(f"Columns for feature engineering (count): {len(columns_to_include)}")
# print name of columns included
print(f"Included columns names: {columns_to_include}")
# print length of excluded columns
print(f"Excluded columns (count): {len(columns_to_exclude)}")
print(f"Excluded columns (prevent leakage) names: {columns_to_exclude}")

train_enh = create_advanced_features(
    train_for_engineering,
    top_features=top_features,
    window_sizes=(5, 10, 20, 60, 120),
    shift=1
)

# Add target back AFTER feature engineering for supervised selection
train_enh[TARGET] = train_full[TARGET].values

# Now do ENHANCED supervised feature selection with target present
feature_columns = [c for c in train_enh.columns if c not in ['date_id', TARGET]]
print(f"Feature columns for selection: {len(feature_columns)} total features available")

# Supervised feature selection using ENHANCED method
X_features = train_enh[feature_columns]
y_target = train_enh[TARGET]

# Apply Enhanced Feature Selection (replaces the old single-method approach)
print("\n" + "="*60)
print("ENHANCED ENSEMBLE FEATURE SELECTION")
print("="*60)

selected_features, selection_results = enhanced_feature_selection(
    X_features, y_target, final_features,
    n_features=200,  
    verbose=True
)

# Final feature matrices
X = train_enh[selected_features].astype('float32')
y = train_enh[TARGET].astype('float32')

print(f"\nFinal Training Data Shapes:")
print(f"Training set shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Features selected: {len(selected_features)}")

# Store for later use in inference
final_selected_features = selected_features

print("\nEnhanced feature selection complete!")
print("Ready for model training with dynamically selected features")

Creating advanced features for training data...
Columns for feature engineering (count): 95
Included columns names: ['date_id', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14', 'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
Excluded columns (count): 3
Excluded columns (prevent leakage) names: ['market_forward_excess_returns', 'forward_returns', 'risk_free_rate']
Creating Level 1 features (Core)...
Creating Level 2 features (Macro)...
Macro columns for Level 2 features: ['M1', 'M10', 'M11', 'M12',

In [8]:
# ================================================================
#  CatBoost Base Model 
# ================================================================
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

# Generate dynamic random state for models
model_seed = int(time.time() * 1000) % 100000

# Initialize CatBoostRegressor with BEST features and dynamic random state
ml_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_seed=model_seed,
    verbose=False,
)

# Fit the model on full training data
ml_model.fit(X, y)

# Create validation split for evaluation (use time series split - no shuffle)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# Get validation indices to retrieve forward_returns and risk_free_rate
val_idx = X_val.index

# Generate predictions on validation set
val_cat = ml_model.predict(X_val)

# print(f"Model trained on {X.shape[0]} samples")
# print(f"Validation split: {X_val.shape[0]} samples")

In [9]:
# ===== OFFICIAL Competition Metric Implementation =====
def compute_official_score(positions, forward_returns, risk_free_rate):
    """
    Exact implementation of the official Kaggle competition metric.
    Uses GEOMETRIC mean (compounding) like the competition.
    
    Args:
        positions: array of allocations in [0, 2]
        # # forward_returns: actual market returns
        risk_free_rate: risk-free rate
    
    Returns:
        dict with adjusted_sharpe and all components
    """
    positions = np.asarray(positions)
    forward_returns = np.asarray(forward_returns)
    risk_free_rate = np.asarray(risk_free_rate)
    
    n = len(positions)
    trading_days_per_yr = 252
    
    # Strategy returns: rf*(1-position) + position*forward_returns
    strategy_returns = risk_free_rate * (1 - positions) + positions * forward_returns
    
    # Strategy excess returns and GEOMETRIC mean
    strategy_excess_returns = strategy_returns - risk_free_rate
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = strategy_excess_cumulative ** (1 / n) - 1
    strategy_std = np.std(strategy_returns)
    
    if strategy_std == 0:
        return {'adjusted_sharpe': 0, 'error': 'strategy_std is zero'}
    
    # Strategy Sharpe ratio
    sharpe = (strategy_mean_excess_return / strategy_std) * np.sqrt(trading_days_per_yr)
    strategy_volatility = strategy_std * np.sqrt(trading_days_per_yr) * 100
    
    # Market excess returns and GEOMETRIC mean
    market_excess_returns = forward_returns - risk_free_rate
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = market_excess_cumulative ** (1 / n) - 1
    market_std = np.std(forward_returns)
    market_volatility = market_std * np.sqrt(trading_days_per_yr) * 100
    
    if market_volatility == 0:
        return {'adjusted_sharpe': 0, 'error': 'market_volatility is zero'}
    
    # Volatility penalty: penalize if strategy vol > 1.2 * market vol
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
    vol_penalty = 1 + excess_vol
    
    # Return gap penalty: penalize underperforming the market (quadratic)
    return_gap = max(0, (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap ** 2) / 100
    
    # Adjusted Sharpe
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    
    return {
        'adjusted_sharpe': float(adjusted_sharpe),
        'raw_sharpe': float(sharpe),
        'vol_penalty': float(vol_penalty),
        'return_penalty': float(return_penalty),
        'strategy_volatility': float(strategy_volatility),
        'market_volatility': float(market_volatility),
        'strategy_geo_return': float(strategy_mean_excess_return * 100 * trading_days_per_yr),  # annualized %
        'market_geo_return': float(market_mean_excess_return * 100 * trading_days_per_yr),  # annualized %
        'return_gap': float(return_gap)
    }


def robust_signal_to_weight(sig, lower=0.0, upper=2.0):
    """
    Map raw signals to weights robustly using percentile clipping.
    """
    sig = np.asarray(sig)
    lo = np.nanpercentile(sig, 5)
    hi = np.nanpercentile(sig, 95)
    if np.isclose(hi, lo):
        sig_z = (sig - np.nanmean(sig)) / (np.nanstd(sig) + 1e-12)
        w = 2.0 / (1.0 + np.exp(-sig_z))
    else:
        w = (sig - lo) / (hi - lo + 1e-12) * (upper - lower) + lower
    return np.clip(w, lower, upper)


# Evaluate on validation set
val_df = train.loc[val_idx].copy()
val_weights = robust_signal_to_weight(val_cat)

res = compute_official_score(
    val_weights, 
    val_df['forward_returns'].to_numpy(), 
    val_df['risk_free_rate'].to_numpy()
)

print("=" * 50)
print("OFFICIAL COMPETITION METRIC RESULTS")
print("=" * 50)
print(f"Adjusted Sharpe:      {res['adjusted_sharpe']:.4f}")
print(f"Raw Sharpe:           {res['raw_sharpe']:.4f}")
print(f"Vol Penalty:          {res['vol_penalty']:.4f}")
print(f"Return Penalty:       {res['return_penalty']:.4f}")
print(f"Strategy Vol:         {res['strategy_volatility']:.2f}%")
print(f"Market Vol:           {res['market_volatility']:.2f}%")
print(f"Strategy Return:      {res['strategy_geo_return']:.2f}% ann.")
print(f"Market Return:        {res['market_geo_return']:.2f}% ann.")
print(f"Return Gap:           {res['return_gap']:.2f}")

OFFICIAL COMPETITION METRIC RESULTS
Adjusted Sharpe:      5.9732
Raw Sharpe:           6.1268
Vol Penalty:          1.0257
Return Penalty:       1.0000
Strategy Vol:         24.05%
Market Vol:           19.62%
Strategy Return:      147.38% ann.
Market Return:        6.89% ann.
Return Gap:           0.00


In [10]:
# ===== Volatility Scaling Calibration - OPTIMIZED FOR COMPETITION METRIC =====
from scipy.optimize import minimize_scalar

VOL_MULTIPLIER_LIMIT = 1.19  # Slightly below 1.9 for safety buffer

def competition_objective(k, predictions, forward_returns, risk_free_rate):
    """
    Objective function: negative adjusted sharpe (for minimization).
    Directly optimizes the official competition metric.
    """
    exposures = np.clip(k * predictions, 0, 2)
    result = compute_official_score(exposures, forward_returns, risk_free_rate)
    
    if 'error' in result:
        return 1e9  # Return large value if error
    
    # Check volatility constraint
    if result['strategy_volatility'] > VOL_MULTIPLIER_LIMIT * result['market_volatility']:
        return 1e9  # Penalize exceeding vol limit
    
    return -result['adjusted_sharpe']  # Negative for minimization


# Get validation data
val_pred = ml_model.predict(X_val)
val_forward_returns = train.loc[val_idx, 'forward_returns'].values
val_risk_free_rate = train.loc[val_idx, 'risk_free_rate'].values

# Method 1: Scipy optimization
print("Optimizing scaling factor k using scipy...")
result = minimize_scalar(
    lambda k: competition_objective(k, val_pred, val_forward_returns, val_risk_free_rate),
    bounds=(0.01, 5.0),
    method='bounded'
)
scipy_best_k = result.x
scipy_best_score = -result.fun

# Method 2: Grid search (as backup/verification)
print("Verifying with grid search...")
best_k, best_adjusted_sharpe = 0.1, -1e9
best_result = None

for k in np.linspace(0.01, 5.0, 200):
    exposures = np.clip(k * val_pred, 0, 2)
    res = compute_official_score(exposures, val_forward_returns, val_risk_free_rate)
    
    if 'error' in res:
        continue
    
    # Check volatility constraint
    if res['strategy_volatility'] <= VOL_MULTIPLIER_LIMIT * res['market_volatility']:
        if res['adjusted_sharpe'] > best_adjusted_sharpe:
            best_k = k
            best_adjusted_sharpe = res['adjusted_sharpe']
            best_result = res

# Use the better result
if scipy_best_score > best_adjusted_sharpe:
    best_k = scipy_best_k
    best_adjusted_sharpe = scipy_best_score
    exposures = np.clip(best_k * val_pred, 0, 2)
    best_result = compute_official_score(exposures, val_forward_returns, val_risk_free_rate)

print("\n" + "=" * 50)
print("CALIBRATION RESULTS")
print("=" * 50)
print(f"Optimal k:            {best_k:.4f}")
print(f"Adjusted Sharpe:      {best_adjusted_sharpe:.4f}")
print(f"Raw Sharpe:           {best_result['raw_sharpe']:.4f}")
print(f"Vol Penalty:          {best_result['vol_penalty']:.4f}")
print(f"Return Penalty:       {best_result['return_penalty']:.4f}")
print(f"Strategy Vol:         {best_result['strategy_volatility']:.2f}%")
print(f"Market Vol:           {best_result['market_volatility']:.2f}%")
print(f"Vol Ratio:            {best_result['strategy_volatility']/best_result['market_volatility']:.2f}x")

Optimizing scaling factor k using scipy...
Verifying with grid search...

CALIBRATION RESULTS
Optimal k:            4.4475
Adjusted Sharpe:      4.9390
Raw Sharpe:           4.9390
Vol Penalty:          1.0000
Return Penalty:       1.0000
Strategy Vol:         1.43%
Market Vol:           19.62%
Vol Ratio:            0.07x


In [None]:
# ===== Test Predictions + Smoothing (OPTIMIZED) =====

# Apply feature engineering to test set (same as training)
print("Applying feature engineering to test set...")
test_for_engineering = test[columns_to_include].copy()

test_enh = create_advanced_features(
    test_for_engineering,
    top_features=top_features,
    window_sizes=(5, 10, 20, 60, 120),  # Use SAME windows as training!
    shift=1
)

# Extract same selected features used in training
X_test = test_enh[selected_features].astype('float32')
print(f"Test set feature engineering complete: {X_test.shape}")

# Generate predictions
test_pred = ml_model.predict(X_test)

# Apply optimal k scaling
scaled_pred = np.clip(best_k * test_pred, 0, 2)

# Apply exponential smoothing to reduce volatility
# Lower alpha = more smoothing = lower volatility
alpha = 0.7  # Reduced from 0.8 for more smoothing

smoothed_allocation = []
prev = 1.0  # Start at market exposure (safe default)

for x in scaled_pred:
    s = alpha * x + (1 - alpha) * prev
    smoothed_allocation.append(s)
    prev = s

smoothed_allocation = np.array(smoothed_allocation)

# Safety check: ensure mean is close to 1.0 (market exposure)
# This helps avoid the return penalty
mean_alloc = smoothed_allocation.mean()
print(f"\nPre-adjustment mean allocation: {mean_alloc:.4f}")

# Optional: Bias toward market exposure if model is uncertain
# Uncomment below if you're getting penalized for underperforming market
# if mean_alloc < 0.9:
#     smoothed_allocation = smoothed_allocation * (1.0 / mean_alloc) * 0.95
#     smoothed_allocation = np.clip(smoothed_allocation, 0, 2)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'date_id': test['date_id'],
    'prediction': smoothed_allocation.astype('float32')
})

# Save submission file
submission_df.to_csv("submission.csv", index=False)

print("\n" + "=" * 50)
print("SUBMISSION STATISTICS")
print("=" * 50)
print(f"File saved: submission.csv")
print(f"Range:      [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")
print(f"Mean:       {smoothed_allocation.mean():.4f}")
print(f"Median:     {np.median(smoothed_allocation):.4f}")
print(f"Std:        {smoothed_allocation.std():.4f}")
print(f"Total:      {len(smoothed_allocation)} predictions")

# Show distribution
print(f"\nAllocation Distribution:")
print(f"  < 0.5:    {(smoothed_allocation < 0.5).sum()} ({(smoothed_allocation < 0.5).mean()*100:.1f}%)")
print(f"  0.5-1.0:  {((smoothed_allocation >= 0.5) & (smoothed_allocation < 1.0)).sum()} ({((smoothed_allocation >= 0.5) & (smoothed_allocation < 1.0)).mean()*100:.1f}%)")
print(f"  1.0-1.5:  {((smoothed_allocation >= 1.0) & (smoothed_allocation < 1.5)).sum()} ({((smoothed_allocation >= 1.0) & (smoothed_allocation < 1.5)).mean()*100:.1f}%)")
print(f"  >= 1.5:   {(smoothed_allocation >= 1.5).sum()} ({(smoothed_allocation >= 1.5).mean()*100:.1f}%)")

Applying feature engineering to test set...
Creating Level 1 features (Core)...


ValueError: Can only compute partial correlations for lags up to 50% of the sample size. The requested nlags 10 must be < 5.

In [None]:
# Added to stop here the running of the code
-

In [None]:
# # ================================================================
# #  Competition-Compliant Inference Function
# # ================================================================
# _ml_model = ml_model  # Use the CatBoost model trained in Cell 9
# _feat_cols = selected_features  # Use selected features from Cell 8

# def predict(pl_df):
#     """Competition inference function - returns DataFrame with predictions."""
#     # Convert Polars to Pandas and handle missing values
#     pdf = pl_df.to_pandas().fillna(0.0)
    
#     # Ensure all required features are present
#     for f in _feat_cols:
#         if f not in pdf.columns:
#             pdf[f] = 0.0
    
#     # Make predictions from CatBoost model
#     preds = _ml_model.predict(pdf[_feat_cols])
    
#     # Map predictions to weights [0, 2] using percentile scaling
#     lo, hi = np.percentile(preds, [5, 95])
#     weights = np.clip((preds - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
    
#     return pd.DataFrame({"prediction": weights.astype("float32")})

In [None]:
# # ================================================================
# # Kaggle Evaluation Server
# # ================================================================

# server = kdeval.DefaultInferenceServer(predict)

# if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
#     server.serve()
# else:
#     server.run_local_gateway((DATA_DIR,))

In [None]:
# selected_features

In [None]:
# # ================================================================
# #  Competition-Compliant Inference Function (Single Float Return)
# # ================================================================
# _ml_model = ml_model  # Use the CatBoost model trained in Cell 9
# _feat_cols = selected_features  # Use selected features from Cell 8
# _history_returns = list(train.loc[val_idx, 'forward_returns'].iloc[-VOL_WINDOW:].tolist())

# def predict(pl_df: pl.DataFrame) -> float:
#     """Competition inference function - returns single float allocation."""
#     global _history_returns
    
#     # Convert Polars to Pandas and handle missing values
#     pdf = pl_df.to_pandas().fillna(0.0)
    
#     # Ensure all required features are present
#     for f in _feat_cols:
#         if f not in pdf.columns:
#             pdf[f] = 0.0
    
#     # Make prediction from CatBoost model
#     pred = _ml_model.predict(pdf[_feat_cols])[0]  # Get first prediction
    
#     # Estimate rolling volatility for scaling
#     vol_est = np.std(_history_returns) if len(_history_returns) > 1 else 1e-3
    
#     # Map prediction to weight using robust scaling
#     lo = np.nanpercentile([pred], 5)
#     hi = np.nanpercentile([pred], 95)
    
#     if np.isclose(hi, lo):
#         # Fallback: simple clipping
#         weight = np.clip(pred, 0, 2)
#     else:
#         weight = (pred - lo) / (hi - lo + 1e-9) * 2.0
    
#     # Apply volatility adjustment and clip to [0, 2]
#     allocation = float(np.clip(weight / (vol_est + 1e-9), 0, 2))
    
#     # Update history for rolling volatility estimation
#     if 'lagged_forward_returns' in pl_df.columns:
#         try:
#             _history_returns.append(float(pl_df['lagged_forward_returns'][0]))
#         except:
#             _history_returns.append(0.0)
#     else:
#         _history_returns.append(0.0)
    
#     # Keep only last VOL_WINDOW entries
#     _history_returns = _history_returns[-VOL_WINDOW:]
    
#     return allocation

In [None]:
# selected_features

In [None]:
# print("Total selected features for inference:", len(selected_features))

In [None]:
# # ================================================================
# # Kaggle Evaluation Server / Local Submission
# # ================================================================

# if KAGGLE_ENV:
#     # Kaggle competition environment
#     server = kdeval.DefaultInferenceServer(predict)
    
#     if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#         server.serve()
#     else:
#         server.run_local_gateway((str(DATA_DIR),))
        
# else:
#     # Local environment - generate submission file
#     print("Local mode - generating submission file...")
#     print(f"Test set size: {len(test)} rows")
    
#     # Apply same feature engineering pipeline as training
#     print("Applying feature engineering to test set...")
    
#     # Prepare test data for feature engineering
#     test_for_engineering = test[columns_to_include].copy()
    
#     # Apply same feature engineering pipeline
#     test_enh = create_advanced_features(
#         test_for_engineering,
#         top_features=top_features,
#         window_sizes=(5, 10, 20, 60, 120),
#         shift=1
#     )
    
#     # Extract same selected features
#     X_test = test_enh[selected_features].astype('float32')
    
#     print(f"Feature engineering complete: {X_test.shape[1]} features")
    
#     # Generate predictions using trained CatBoost model
#     print("\nGenerating predictions with CatBoost model...")
#     test_pred = ml_model.predict(X_test)
    
#     # Map predictions to weights [0, 2] using robust scaling
#     lo, hi = np.percentile(test_pred, [5, 95])
#     test_weights = np.clip((test_pred - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
    
#     # Apply exponential smoothing
#     alpha = 0.8
#     smoothed_allocation = []
#     prev = 0.0
#     for x in test_weights:
#         s = alpha * x + (1 - alpha) * prev
#         smoothed_allocation.append(s)
#         prev = s
#     smoothed_allocation = np.array(smoothed_allocation)
    
#     # Create submission DataFrame
#     submission = pd.DataFrame({
#         'date_id': test['date_id'],
#         'prediction': smoothed_allocation.astype('float32')
#     })
    
#     # Save to CSV
#     submission.to_csv('submission.csv', index=False)
    
#     print("\nSubmission file saved: submission.csv")
#     print(f"Prediction statistics:")
#     print(f"  Range: [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")
#     print(f"  Mean: {smoothed_allocation.mean():.4f}")
#     print(f"  Median: {np.median(smoothed_allocation):.4f}")
#     print(f"  Std: {smoothed_allocation.std():.4f}")
#     print(f"  Total predictions: {len(smoothed_allocation)}")
    
#     # Display submission preview
#     print("\nSubmission preview:")
#     print(submission.head(10))

In [None]:
# X_test.sample(10)

In [None]:
"""
Kaggle Submission Version
""";

In [None]:
VOL_MULTIPLIER_LIMIT = 1.2
VOL_WINDOW = 20 # volatility window in days

In [None]:
## Volatility Scaling Calibration
def strategy_stats(returns, exposures):
    strategy_returns = exposures * returns
    mean = np.nanmean(strategy_returns)
    std  = np.nanstd(strategy_returns)
    sharpe = (mean / (std + 1e-9)) * np.sqrt(252)
    vol = std * np.sqrt(252)
    return {'sharpe': sharpe, 'vol': vol}

# Remove num_iteration parameter - not needed without early stopping
val_pred = ml_model.predict(X_val)

# Get forward_returns from validation data
val_forward_returns = train.loc[val_idx, 'forward_returns'].values
market_vol = np.nanstd(val_forward_returns) * np.sqrt(252)

best_k, best_sharpe = 0.1, -1e9
for k in np.linspace(0.01, 5.0, 100):
    exposures = np.clip((k * val_pred), 0, 2)
    stats = strategy_stats(val_forward_returns, exposures)
    if stats['vol'] <= VOL_MULTIPLIER_LIMIT * market_vol and stats['sharpe'] > best_sharpe:
        best_k = k
        best_sharpe = stats['sharpe']

print(f"Chosen scaling factor k={best_k:.3f} with Sharpe={best_sharpe:.2f}")

Chosen scaling factor k=5.000 with Sharpe=4.95


In [None]:
## Test Predictions + Smoothing

# Apply feature engineering to test set (same as training)
print("Applying feature engineering to test set...")
test_for_engineering = test[columns_to_include].copy()

test_enh = create_advanced_features(
    test_for_engineering,
    top_features=top_features,
    window_sizes=(5), # Reduced window sizes for faster inference on test set only
    shift=1
)

# Extract same selected features used in training
X_test = test_enh[selected_features].astype('float32')

print(f"Test set feature engineering complete: {X_test.shape}")

# Generate predictions (remove num_iteration parameter)
test_pred = ml_model.predict(X_test)

# Apply exponential smoothing with best_k scaling
alpha = 0.8
smoothed_allocation = []
prev = 0.0
for x in np.clip(best_k * test_pred, 0, 2):
    s = alpha * x + (1 - alpha) * prev
    smoothed_allocation.append(s)
    prev = s
smoothed_allocation = np.array(smoothed_allocation)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'date_id': test['date_id'],
    'prediction': smoothed_allocation.astype('float32')
})

# Save submission file
submission_df.to_csv("submission_cat_200feat.csv", index=False)
print(f"\nSaved submission_cat_200feat.csv")
print(f"Prediction statistics:")
print(f"  Range: [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")
print(f"  Mean: {smoothed_allocation.mean():.4f}")
print(f"  Median: {np.median(smoothed_allocation):.4f}")
print(f"  Total predictions: {len(smoothed_allocation)}")

Applying feature engineering to test set...
Creating Level 1 features (Core)...


TypeError: 'int' object is not iterable

In [None]:
#   Change window_sizes=(5, 10) for test set ONLY

In [None]:
## Test Predictions + Smoothing

# Apply feature engineering to test set (same as training)
print("Applying feature engineering to test set...")
test_for_engineering = test[columns_to_include].copy()

test_enh = create_advanced_features(
    test_for_engineering,
    top_features=top_features,
    window_sizes=(5, 10),
    shift=1
)

# Extract same selected features used in training
X_test = test_enh[selected_features].astype('float32')

print(f"Test set feature engineering complete: {X_test.shape}")

# Generate predictions (remove num_iteration parameter)
test_pred = ml_model.predict(X_test)

# Apply exponential smoothing with best_k scaling
alpha = 0.8
smoothed_allocation = []
prev = 0.0
for x in np.clip(best_k * test_pred, 0, 2):
    s = alpha * x + (1 - alpha) * prev
    smoothed_allocation.append(s)
    prev = s
smoothed_allocation = np.array(smoothed_allocation)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'date_id': test['date_id'],
    'prediction': smoothed_allocation.astype('float32')
})

# Save submission file
submission_df.to_csv("submission_cat_200feat.csv", index=False)
print(f"\nSaved submission_cat_200feat.csv")
print(f"Prediction statistics:")
print(f"  Range: [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")
print(f"  Mean: {smoothed_allocation.mean():.4f}")
print(f"  Median: {np.median(smoothed_allocation):.4f}")
print(f"  Total predictions: {len(smoothed_allocation)}")