In [None]:
# ================================================================
# 🧠 HULL TACTICAL MARKET PREDICTION — ENSEMBLE + SHARPEPENALTY
# ================================================================
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress TensorFlow warnings
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use only first GPU if multiple

from pathlib import Path
import numpy as np
import pandas as pd

import polars as pl

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from scipy.stats import zscore
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingRegressor

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Import TensorFlow after setting environment variables
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # Only show errors
from tensorflow import keras
from tensorflow.keras import layers

# Try to import kaggle_evaluation, handle if not available
try:
    import kaggle_evaluation.default_inference_server as kdeval
    KAGGLE_ENV = True
    print("✅ Running in Kaggle competition environment")
except ImportError:
    KAGGLE_ENV = False
    print("⚠️ Running in local environment - kaggle_evaluation not available")

⚠️ Running in local environment - kaggle_evaluation not available


In [None]:
# ================================================================
# 1️⃣ Data Loading & Initial Feature Preparation
# ================================================================

# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading (local version only)
DATA_DIR = Path("01_data")
TARGET = "market_forward_excess_returns"
drop_cols = ["date_id", "forward_returns", "risk_free_rate"]
VOL_WINDOW = 20        # volatility window in days
VALIDATION_SIZE = 2700          # days, approx. 30% of data

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700):
    """Split data chronologically for time series validation."""
    df = df.sort_values('date_id').reset_index(drop=True)
    train_df = df.iloc[:-val_size].copy()
    val_df   = df.iloc[-val_size:].copy()
    return train_df, val_df

# Load train/test data
if (DATA_DIR / "train.csv").exists():
    print("✅ Running in Kaggle environment")
    train = pd.read_csv(DATA_DIR / "train.csv")
    test = pd.read_csv(DATA_DIR / "test.csv")
else:
    print("✅ Running locally")
    train = pd.read_csv("01_data/train.csv")
    test = pd.read_csv("01_data/test.csv")

print(f"Train shape: {train.shape} | Test shape: {test.shape}")


# Basic preprocessing
train = train.sort_values("date_id").reset_index(drop=True)
test = test.sort_values("date_id").reset_index(drop=True)

# handle missing values
train = train.fillna(0.0)
test = test.fillna(0.0)

# Base features (before advanced transformations)
base_features = [c for c in train.columns if c not in drop_cols + [TARGET]]

print(f"Base features available: {len(base_features)}")
print(f"Target variable: {TARGET}")

In [None]:
def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    """
    Clean and prepare DataFrame by handling missing values intelligently.
    
    Strategy:
    - Use median imputation for numeric columns with some missing values
    - Use zero-fill for columns with very few missing values  
    - Only process existing columns (no synthetic data creation)

    Median is much less sensitive to extreme values (outliers)
    Mean can be heavily skewed by a few very large or very small values
    """
    df = df.copy()
    
    # Only work with columns that actually exist in the DataFrame
    existing_cols = [col for col in feature_cols if col in df.columns]
    
    if not existing_cols:
        return df
    
    # Calculate missing percentages for existing columns
    missing_pct = (df[existing_cols].isnull().sum() / len(df)) * 100
    
    # Categorize columns by missing percentage
    cols_fill_median = missing_pct[(missing_pct > 5) & (missing_pct <= 50)].index.tolist()
    cols_fill_zero = missing_pct[missing_pct <= 5].index.tolist()
    
    # Apply median imputation for moderately missing columns
    if cols_fill_median:
        for col in cols_fill_median:
            median_val = median_map.get(col, df[col].median())
            df[col] = df[col].fillna(median_val)
    
    # Apply zero-fill for low missing columns
    if cols_fill_zero:
        df[cols_fill_zero] = df[cols_fill_zero].fillna(0)
    
    # Ensure all columns are numeric
    for col in existing_cols:
        if df[col].dtype == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    
    return df

In [None]:
## Train / Validation Split and Median Imputation
train_df, val_df = time_split_train_val(train, val_size=VALIDATION_SIZE)

median_map = {c: float(train_df[c].median(skipna=True)) if train_df[c].dtype.kind in 'fiu' else 0.0 
              for c in base_features}

train_p = prepare_df(train_df, median_map, base_features)
val_p   = prepare_df(val_df, median_map, base_features)
test_p  = prepare_df(test, median_map, base_features)

# Use only the actual feature columns (no synthetic _was_na columns)
final_features = [c for c in base_features if c in train_p.columns]
print("Number of features:", len(final_features))

In [None]:
# ===== Advanced Feature Factory =====
# Insert this cell before model training and call create_advanced_features(train, top_features, window_sizes=...)
def create_advanced_features(df,
                             top_features,
                             macro_prefixes=('mom','m','v','p','s'),
                             window_sizes=(5,10,20,60,120),
                             max_features_to_keep=50,
                             inplace=False):
    """
    Create advanced features following a two-level approach:
      1) Lightweight Core Features (applied to `top_features`)
      2) Macro-Context Features (applied to columns starting with macro_prefixes)
    Returns:
      df_out: DataFrame with new features (and original columns)
      selected_features: list of top selected feature column names (keeps up to max_features_to_keep)
    """
    if not inplace:
        df = df.copy()

    # ensure datetime-like ordering by date_id if present
    if 'date_id' in df.columns:
        df = df.sort_values('date_id').reset_index(drop=True)

    # helper: ensure numeric dtype for selected cols
    def _to_numeric(cols):
        for c in cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce')

    # ------------- Subfunction 1: rolling statistics -------------
    def create_rolling_features(cols, windows=window_sizes, shift=1):
        """
        For each col in cols create:
          - {col}_mean_{w}: rolling mean (past w days)
          - {col}_std_{w}: rolling std
          - {col}_median_{w}: rolling median
        Use shift to avoid leakage (default shift=1).
        """
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                roll = df[c].shift(shift).rolling(window=w, min_periods=1)
                df[f"{c}_mean_{w}"] = roll.mean().astype('float32')
                df[f"{c}_std_{w}"] = roll.std().astype('float32').fillna(0.0)
                df[f"{c}_median_{w}"] = roll.median().astype('float32')

    # ------------- Subfunction 2: returns features -------------
    def create_returns_features(base_col='forward_returns', windows=(5,20,60), shift=1):
        """
        Create rolling cumulative returns and last-N returns features.
        If forward_returns missing, skip.
        """
        if base_col not in df.columns:
            return
        for w in windows:
            # rolling sum of past w returns (use shift to exclude current day)
            df[f"{base_col}_sum_{w}"] = df[base_col].shift(shift).rolling(window=w, min_periods=1).sum().astype('float32')
            df[f"{base_col}_mean_{w}"] = df[base_col].shift(shift).rolling(window=w, min_periods=1).mean().astype('float32')
            df[f"{base_col}_std_{w}"] = df[base_col].shift(shift).rolling(window=w, min_periods=1).std().astype('float32').fillna(0.0)

    # ------------- Subfunction 3: cumulative difference from mean -------------
    def create_diff_features(base_col='forward_returns', windows=(10,30,90), shift=1):
        """
        Create cumulative difference from rolling mean:
        cumsum_t - rolling_mean_t * n_obs
        """
        if base_col not in df.columns:
            return
        cumsum = df[base_col].shift(shift).cumsum()
        for w in windows:
            roll_mean = df[base_col].shift(shift).rolling(window=w, min_periods=1).mean()
            # distance between cumulative sum and what cumulative would be if at rolling mean
            df[f"{base_col}_cumsum_dist_mean_{w}"] = (cumsum - roll_mean * np.arange(1, len(df)+1)).astype('float32')

    # ------------- Subfunction 4: z-score features -------------
    def create_zscore_features(cols, windows=window_sizes, shift=1):
        """
        Create rolling z-scores: (value - rolling_mean) / rolling_std
        Use shift to avoid leakage.
        """
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                roll_mean = df[c].shift(shift).rolling(window=w, min_periods=1).mean()
                roll_std = df[c].shift(shift).rolling(window=w, min_periods=1).std().fillna(0.0)
                df[f"{c}_z_{w}"] = ((df[c] - roll_mean) / (roll_std + 1e-9)).astype('float32')

    # ------------- Subfunction 5: spread features -------------
    def create_spread_features(cols, shift=1):
        """
        Spread with previous value: current - previous
        """
        for c in cols:
            if c not in df.columns:
                continue
            df[f"{c}_diff_1"] = (df[c] - df[c].shift(1)).astype('float32')
            # percentage change
            df[f"{c}_pctchg_1"] = (df[c].pct_change(periods=1).fillna(0.0)).astype('float32')

    # ------------- Subfunction 6: correlation features (selective) -------------
    def create_corr_features(pairs=None, window=30, shift=1):
        """
        Create rolling correlation for given pairs list of tuples (a,b).
        If pairs None, build pairs from top_features combos but limited to reasonable count.
        """
        if pairs is None:
            # build candidate pairs from top_features (unique combinations) but limit to top 10 pairs
            cand = []
            for i in range(len(top_features)):
                for j in range(i+1, len(top_features)):
                    cand.append((top_features[i], top_features[j]))
            pairs = cand[:10]
        for a,b in pairs:
            if a not in df.columns or b not in df.columns:
                continue
            df[f"corr_{a}_{b}_{window}"] = df[a].shift(shift).rolling(window=window, min_periods=1).corr(df[b].shift(shift)).astype('float32').fillna(0.0)

    # ------------- Subfunction 7: cumulative sums (various windows) -------------
    def create_cumm_sum_features(cols, windows=(5,10,20), shift=1):
        """
        Cumulative sums of columns (shifted) over rolling windows
        """
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                df[f"{c}_cumsum_{w}"] = df[c].shift(shift).rolling(window=w, min_periods=1).sum().astype('float32')

    # ------------- Subfunction 8: volatility spreads -------------
    def create_vol_spreads(cols=None, windows=(20,60), shift=1):
        """
        Compute spread between volatilities of pairs in `cols`.
        If cols None, use a subset of volatility-related columns (v*)
        """
        if cols is None:
            cols = [c for c in df.columns if c.startswith('v')]
        # compute rolling vol for each col
        for w in windows:
            vols = {}
            for c in cols:
                vols[c] = df[c].shift(shift).rolling(window=w, min_periods=1).std().astype('float32').fillna(0.0)
            # compute spreads for a small subset to limit growth
            keys = list(vols.keys())[:6]
            for i in range(len(keys)-1):
                a,b = keys[i], keys[i+1]
                df[f"volspread_{a}_{b}_{w}"] = (vols[a] - vols[b]).astype('float32')

    # ------------- Subfunction 9: highs / lows ratios -------------
    def create_high_low_ratios(cols, windows=(20,60,120), shift=1):
        """
        For each col create ratio to rolling max/min:
          {col}_high_ratio_w = current / rolling_max_w
          {col}_low_ratio_w  = current / rolling_min_w
        """
        for c in cols:
            if c not in df.columns:
                continue
            for w in windows:
                roll_max = df[c].shift(shift).rolling(window=w, min_periods=1).max()
                roll_min = df[c].shift(shift).rolling(window=w, min_periods=1).min()
                df[f"{c}_high_{w}"] = (df[c] / (roll_max + 1e-9)).astype('float32')
                df[f"{c}_low_{w}"] = (df[c] / (roll_min + 1e-9)).astype('float32')

    # -------------------- Execute Level 1: Lightweight Core Features --------------------
    # Ensure numeric conversion for top_features
    _to_numeric(top_features)
    create_rolling_features(top_features, windows=window_sizes)
    # create_returns_features(base_col='forward_returns', windows=(5,20,60))
    # create_diff_features(base_col='forward_returns', windows=(10,30,90))
    create_zscore_features(top_features, windows=window_sizes)
    create_spread_features(top_features)

    # -------------------- Execute Level 2: Macro-Context Features (selective) --------------------
    # Gather macro-context columns starting with the given prefixes
    macro_cols = [c for c in df.columns if any(c.startswith(pref) for pref in macro_prefixes)]
    # ensure numeric
    _to_numeric(macro_cols)
    # apply selected transforms on macro set (limited to avoid explosion)
    create_cumm_sum_features(macro_cols, windows=(5,10,20))
    # build selective correlation pairs from the most frequent macro cols (limit)
    corr_candidates = []
    if len(macro_cols) > 1:
        # pick top 8 macro_cols to form pairs
        msel = macro_cols[:8]
        for i in range(len(msel)):
            for j in range(i+1, len(msel)):
                corr_candidates.append((msel[i], msel[j]))
    create_corr_features(pairs=corr_candidates, window=30)
    create_vol_spreads(cols=[c for c in df.columns if c.startswith('v')][:8], windows=(20,60))
    create_high_low_ratios([c for c in df.columns if c.startswith(('m','p'))][:10], windows=(20,60,120))

    # -------------------- Feature cleaning: replace inf/nan and downcast --------------------
    # replace inf and NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0.0, inplace=True)

    # downcast floats to float32 to save memory
    float_cols = df.select_dtypes(include=['float64']).columns
    df[float_cols] = df[float_cols].astype('float32')

    # -------------------- Feature selection: keep top features --------------------
    def select_top_features(df_in, target_col='market_forward_excess_returns', top_k=max_features_to_keep):
        """
        1) Remove low-variance features
        2) Fit a fast tree model (GradientBoosting) to get importances
        3) Return list of top_k feature names
        """
        X_full = df_in.drop(columns=[target_col]) if target_col in df_in.columns else df_in.copy()
        # drop non-numeric columns
        X_num = X_full.select_dtypes(include=[np.number]).copy()
        # remove columns with zero variance
        vt = VarianceThreshold(threshold=1e-6)
        try:
            vt.fit(X_num)
            cols_kept = X_num.columns[vt.get_support()].tolist()
        except Exception:
            cols_kept = X_num.columns.tolist()

        X_sel = X_num[cols_kept].copy()
        y_sel = df_in[target_col].values if target_col in df_in.columns else None

        # If target is available, compute importances, else fall back to variance ranking
        if y_sel is not None and len(y_sel) == len(X_sel) and np.unique(y_sel).shape[0] > 1:
            gb = GradientBoostingRegressor(n_estimators=100, max_depth=3)
            gb.fit(X_sel, y_sel)
            imp = pd.Series(gb.feature_importances_, index=X_sel.columns).sort_values(ascending=False)
            top_feats = imp.head(top_k).index.tolist()
        else:
            # fallback: pick top_k by variance
            var_series = X_sel.var().sort_values(ascending=False)
            top_feats = var_series.head(top_k).index.tolist()

        return top_feats

    selected = select_top_features(df, target_col='market_forward_excess_returns', top_k=max_features_to_keep)

    # final df_out contains only selected features + key columns
    keep_cols = ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns'] if 'date_id' in df.columns else ['forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
    # add selected
    keep_cols = [c for c in keep_cols if c in df.columns] + selected
    df_out = df[keep_cols].copy()

    return df_out, selected

# End of create_advanced_features cell


In [None]:
# # call the feature factory (example)
# top_features = ['m4',  'v13', 'm11', 's2', 'd4', 'd1', 'd2', 'e8', 'p6', 'm2', 'd8', 'm9', 'p8', 'p7', 's12', 'p13', 'v9', 'd5', 'p1', 's8']
# df_features, selected_features = create_advanced_features(train, top_features, window_sizes=(5,10,20,60,120), max_features_to_keep=50)

# # Build X, y from df_features (keeping original index alignment)
# # Note: df_features includes date_id, forward_returns, risk_free_rate, market_forward_excess_returns
# X = df_features[selected_features]
# y = df_features[TARGET]

In [None]:
# Apply advanced feature engineering (2-level approach)
# Top features (Level 1 - Core)
top_features = ['m4', 'v13', 'm11', 's2', 'd4', 'd1', 'd2', 'e8', 'p6', 'm2', 'd8', 'm9', 'p8', 'p7', 's12', 'p13', 'v9', 'd5', 'p1', 's8']

# Create engineered features and select the top ones
train_enh, selected_features = create_advanced_features(
    train,
    top_features=top_features,
    window_sizes=[5, 10, 20, 60, 120],   # 1w, 2w, 1m, 3m, 6m
    max_features_to_keep=50
)

# Apply the same transformation to test set
test_enh, _ = create_advanced_features(
    test,
    top_features=top_features,
    window_sizes=[5, 10, 20, 60, 120],
    max_features_to_keep=50
)


In [None]:
# Final feature selection for training
X = train_enh[selected_features].astype('float32')
y = train_enh[TARGET].astype('float32')
X_test = test_enh[selected_features].astype('float32')

print(f"\n✅ Data prepared successfully")
print(f"Final selected features: {len(selected_features)}")
print(f"Train shape after feature creation: {X.shape}")

In [None]:
# ===== CELL: Model Training with Selected Features =====

# 1. Feature Engineering using your new function
top_features = ['m4','v13','m11','s2','d4','d1','d2','e8','p6','m2','d8','m9','p8','p7','s12','p13','v9','d5','p1','s8']

df_features, selected_features = create_advanced_features(
    train,
    top_features,
    window_sizes=(5,10,20,60,120),
    max_features_to_keep=50
)

# Display summary
print(f"Selected features ({len(selected_features)}): {selected_features[:10]} ...")

# Prepare X, y
X = df_features[selected_features].astype('float32')
y = df_features['market_forward_excess_returns'].astype('float32')


# 2. Define CatBoost model (Ensemble part 1)
cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    random_seed=42,
    verbose=False
)

cat_model = CatBoostRegressor(**cat_params)

kf = KFold(n_splits=5, shuffle=False)
cat_preds = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    train_pool = Pool(X_train, y_train)
    val_pool = Pool(X_val, y_val)

    cat_model.fit(train_pool, eval_set=val_pool, verbose=False)
    preds = cat_model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"Fold {fold} RMSE: {rmse:.6f}")

    cat_preds[val_idx] = preds


# 3. Define Neural Network model (Ensemble part 2)
def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

nn_preds = np.zeros(len(X))

for fold, (trn_idx, val_idx) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    nn_model = build_nn(X.shape[1])
    early_stop = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True, monitor='val_loss', verbose=0)
    nn_model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=100,
        batch_size=128,
        verbose=0,
        callbacks=[early_stop]
    )

    preds = nn_model.predict(X_val).ravel()
    rmse = mean_squared_error(y_val, preds, squared=False)
    print(f"[NN] Fold {fold} RMSE: {rmse:.6f}")

    nn_preds[val_idx] = preds


# 4. Ensemble Combination (Weighted Average)
ensemble_preds = 0.6 * cat_preds + 0.4 * nn_preds
rmse_ensemble = mean_squared_error(y, ensemble_preds, squared=False)
print(f"\n✅ Ensemble RMSE: {rmse_ensemble:.6f}")


# 5. Diagnostics and Sanity Checks
metrics = pd.DataFrame({
    "Model": ["CatBoost", "NeuralNet", "Ensemble"],
    "RMSE": [
        mean_squared_error(y, cat_preds, squared=False),
        mean_squared_error(y, nn_preds, squared=False),
        rmse_ensemble
    ]
})
display(metrics)

print("\nFeature importance snapshot (CatBoost):")
imp_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': cat_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
display(imp_df.head(15))


In [None]:
# # Feature Importance Preview (Optional)

# # ================================================================
# # 2️⃣ Feature Importance Preview (Optional Diagnostic)
# # ================================================================
# """
# Quick diagnostic cell to preview which engineered features
# are most informative for predicting market_forward_excess_returns.

# You can toggle the mode:
#   - mode = "fast" → uses Mutual Information (no model training)
#   - mode = "catboost" → trains a quick CatBoostRegressor for ranking
# """

# from sklearn.feature_selection import mutual_info_regression
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Select mode
# mode = "fast"  # "fast" or "catboost"

# # Mutual Information Mode (fast)
# if mode == "fast":
#     print("⚡ Running Mutual Information Importance (fast mode)...")
#     mi = mutual_info_regression(X, y, random_state=42)
#     mi_df = pd.DataFrame({'feature': X.columns, 'importance': mi})
#     mi_df = mi_df.sort_values(by='importance', ascending=False).head(20)

#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=mi_df, x='importance', y='feature', color='steelblue')
#     plt.title("Top 20 Features by Mutual Information")
#     plt.tight_layout()
#     plt.show()

# # CatBoost Mode (more precise)
# elif mode == "catboost":
#     from catboost import CatBoostRegressor

#     print("🐱 Running CatBoost Feature Importance (model-based)...")
#     model = CatBoostRegressor(
#         iterations=300,
#         learning_rate=0.05,
#         depth=6,
#         random_seed=42,
#         verbose=False
#     )
#     model.fit(X, y)

#     fi = model.get_feature_importance(prettified=True)
#     fi = fi.sort_values(by='Importances', ascending=False).head(20)

#     plt.figure(figsize=(10, 6))
#     sns.barplot(data=fi, x='Importances', y='Feature Id', color='darkorange')
#     plt.title("Top 20 Features by CatBoost Importance")
#     plt.tight_layout()
#     plt.show()

# else:
#     print("Invalid mode. Choose 'fast' or 'catboost'.")

# print("✅ Feature importance preview complete.")


In [7]:
# ================================================================
# 2️⃣ CatBoost Base Model (GridSearch + TimeSeriesSplit)
# ================================================================

print("⏳ Training CatBoost model with TimeSeries CV...")

tscv = TimeSeriesSplit(n_splits=5)

# check here random_state = 42 for reproducibility!
cbc = CatBoostRegressor(loss_function='RMSE', verbose=0, random_state=42)

param_grid = {
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'iterations': [300, 500],
    'l2_leaf_reg': [2, 5]
}

grid = GridSearchCV(
    estimator=cbc,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)
best_cbc = grid.best_estimator_
print(f"✅ Best Params: {grid.best_params_}")

⏳ Training CatBoost model with TimeSeries CV...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


ValueError: 
All the 80 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 2275, in _prepare_train_params
    train_pool = _build_train_pool(X, y, cat_features, text_features, embedding_features, pairs, graph,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 1513, in _build_train_pool
    train_pool = Pool(X, y, cat_features=cat_features, text_features=text_features, embedding_features=embedding_features, pairs=pairs, graph=graph, weight=sample_weight, group_id=group_id,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 855, in __init__
    self._init(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "c:\Users\calli\miniconda3\envs\ml\Lib\site-packages\catboost\core.py", line 1491, in _init
    self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, graph, weight,
  File "_catboost.pyx", line 4329, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 4352, in _catboost._PoolBase._init_pool
  File "_catboost.pyx", line 2310, in _catboost._init_features_layout
_catboost.CatBoostError: catboost/libs/data/features_layout.cpp:124: All feature names should be different, but 'forward_returns' used more than once.


In [None]:
# ================================================================
# 3️⃣ Neural Network Model (Feedforward Regressor)
# ================================================================

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn(X_scaled.shape[1])
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# last 20% time-based validation
date_cut = train["date_id"].quantile(0.8)
train_idx = train["date_id"] <= date_cut
val_idx = train["date_id"] > date_cut

X_train, y_train = X_scaled[train_idx], y[train_idx]
X_val, y_val = X_scaled[val_idx], y[val_idx]

nn_model.fit(X_train, y_train, validation_data=(X_val, y_val),
             epochs=100, batch_size=256, verbose=0, callbacks=[es])
print("✅ Neural Network trained successfully.")

✅ Neural Network trained successfully.


In [None]:
# ================================================================
# 4️⃣ Ensemble Prediction (0.X × CatBoost + 0.XX × NN)
# ================================================================
ensemble_cat_pct = 0.8
ensemble_nn_pct = 0.2

val_cat = best_cbc.predict(X.loc[val_idx])
val_nn = nn_model.predict(X_scaled[val_idx]).ravel()

val_ensemble = ensemble_cat_pct * val_cat + ensemble_nn_pct * val_nn
val_df = train.loc[val_idx].copy()
val_df["pred"] = val_ensemble

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [None]:
# ===== Corrected evaluation: use mapped weights and official formula =====
def compute_strategy_stats(weights, forward_returns, risk_free_rate):
    """
    Compute strategy daily returns and Sharpe (annualized).
    weights: array-like positions in [0,2]
    forward_returns, risk_free_rate: arrays aligned
    """
    # Ensure numpy arrays
    w = np.asarray(weights)
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)

    # Strategy return per day: rf*(1 - w) + w * forward_returns
    # Strategy excess over rf:
    strat_ret = rf * (1.0 - w) + w * fr
    strat_excess = strat_ret - rf   # == w * (fr - rf)
    # annualized sharpe
    mean = np.nanmean(strat_excess)
    std = np.nanstd(strat_excess)
    sharpe = (mean / (std + 1e-12)) * np.sqrt(252) if std > 0 else 0.0
    # annualized vol of strategy returns
    vol_ann = std * np.sqrt(252)
    return {
        'sharpe': sharpe,
        'vol_ann': vol_ann,
        'mean_daily_excess': mean,
        'std_daily_excess': std,
        'strat_ret_series': strat_ret,
        'strat_excess_series': strat_excess
    }

def sharpe_penalty_official(weights, forward_returns, risk_free_rate):
    """
    Compute adjusted Sharpe like the official metric:
    - compute strategy sharpe
    - compute market vol and strategy vol, form vol_penalty = 1 + max(0, strategy_vol/market_vol - 1.2)
    - compute return_gap penalty like (max(0, (market_mean_excess - strat_mean_excess) * 100 * 252))**2 / 100 etc.
    Returns adjusted_sharpe (float) and components.
    """
    # strategy stats
    stats = compute_strategy_stats(weights, forward_returns, risk_free_rate)
    strat_excess = stats['strat_excess_series']
    strat_sharpe = stats['sharpe']
    strat_vol = stats['vol_ann']
    # market stats
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)
    market_excess = fr - rf
    market_mean_excess = ( (1 + market_excess).prod() ) ** (1.0 / len(market_excess)) - 1 if len(market_excess)>0 else 0.0
    # fallback simpler mean if product fails
    # but safer to use mean:
    market_mean_excess = np.nanmean(market_excess)
    market_std = np.nanstd(fr)
    market_vol = market_std * np.sqrt(252) if market_std>0 else 1e-9

    # volatility penalty
    excess_vol = max(0.0, (strat_vol / (market_vol + 1e-12)) - 1.2)
    vol_penalty = 1.0 + excess_vol

    # return gap penalty (use squared scaled gap similar to demo code)
    strat_mean_excess = np.nanmean(strat_excess)
    return_gap = max(0.0, (market_mean_excess - strat_mean_excess) * 100 * 252)  # percent annualized gap
    return_penalty = 1.0 + (return_gap**2) / 100.0

    adjusted_sharpe = strat_sharpe / (vol_penalty * return_penalty + 1e-12)
    return {
        'adjusted_sharpe': adjusted_sharpe,
        'strat_sharpe': strat_sharpe,
        'vol_penalty': vol_penalty,
        'return_penalty': return_penalty,
        'strat_vol': strat_vol,
        'market_vol': market_vol,
        'return_gap': return_gap
    }

# ===== Use it on validation properly mapping raw preds to weights =====

# val_ensemble is your raw ensemble prediction (unmapped)
# First map to weights using your mapping function (or revised mapping)
def robust_signal_to_weight(sig, lower=0.0, upper=2.0):
    """
    Map raw signals to weights robustly using percentile clipping and stable scaling.
    If distribution is degenerate, fallback to standard scaling.
    """
    sig = np.asarray(sig)
    lo = np.nanpercentile(sig, 5)
    hi = np.nanpercentile(sig, 95)
    if np.isclose(hi, lo):
        # fallback: z-score and sigmoid mapping
        sig_z = (sig - np.nanmean(sig)) / (np.nanstd(sig) + 1e-12)
        # map z to [0,2] via logistic
        w = 2.0 / (1.0 + np.exp(-sig_z))
    else:
        w = (sig - lo) / (hi - lo + 1e-12) * (upper - lower) + lower
    return np.clip(w, lower, upper)

# compute mapped weights
val_weights = robust_signal_to_weight(val_ensemble)   # or pass val_cat/val_nn separately

# compute official adjusted sharpe and components
res = sharpe_penalty_official(val_weights, val_df['forward_returns'].to_numpy(), val_df['risk_free_rate'].to_numpy())

print("Mapped weights stats:", np.nanmin(val_weights), np.nanpercentile(val_weights,5), np.nanmedian(val_weights), np.nanpercentile(val_weights,95), np.nanmax(val_weights))
print("Strategy raw Sharpe:", res['strat_sharpe'])
print("Adjusted Sharpe:", res['adjusted_sharpe'])
print("Vol penalty:", res['vol_penalty'], "Return penalty:", res['return_penalty'], "Return gap:", res['return_gap'])


Mapped weights stats: 0.0 0.00011407995932442048 0.6734031891719277 1.999445759525307 2.0
Strategy raw Sharpe: 2.5628806391535512
Adjusted Sharpe: 2.562880639150988
Vol penalty: 1.0 Return penalty: 1.0 Return gap: 0.0


In [None]:
# # ================================================================
# # 6️⃣ Competition-Compliant Inference Function
# # ================================================================
# _cat_model = best_cbc
# _nn_model = nn_model
# _scaler = scaler
# _feat_cols = features

# """
#     Check if is really necessary exchange from pl to pd and back to pl?
#     pl.DataFrame (we convert to pandas inside)
# """
# def predict(pl_df):
#     """Competition inference function."""
#     pdf = pl_df.to_pandas().fillna(0.0)
#     for f in _feat_cols:
#         if f not in pdf.columns:
#             pdf[f] = 0.0
#     Xp = pdf[_feat_cols].values
#     Xp_scaled = _scaler.transform(Xp)
#     pred_cat = _cat_model.predict(pdf[_feat_cols])
#     pred_nn = _nn_model.predict(Xp_scaled, verbose=0).ravel()
#     preds = ensemble_cat_pct * pred_cat + ensemble_nn_pct * pred_nn
#     lo, hi = np.percentile(preds, [5, 95])
#     weights = np.clip((preds - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
#     return pd.DataFrame({"prediction": weights.astype("float32")})

In [None]:
# ================================================================
# 6️⃣ Competition-Compliant Inference Function
# ================================================================
_cat_model = best_cbc
_nn_model = nn_model
_scaler = scaler
_feat_cols = features
_history_returns = list(train.loc[val_idx, 'forward_returns'].iloc[-VOL_WINDOW:].tolist())

def predict(pl_df: pl.DataFrame) -> float:
    """Competition inference function - returns single float allocation."""
    global _history_returns
    
    # Convert Polars to Pandas and handle missing values
    pdf = pl_df.to_pandas().fillna(0.0)
    
    # Ensure all required features are present
    for f in _feat_cols:
        if f not in pdf.columns:
            pdf[f] = 0.0
    
    # Get features in correct format
    X_features = pdf[_feat_cols].values
    X_scaled = _scaler.transform(X_features)
    
    # Make predictions from both models
    pred_cat = _cat_model.predict(pdf[_feat_cols])[0]  # Get first prediction
    pred_nn = _nn_model.predict(X_scaled, verbose=0).ravel()[0]  # Get first prediction
    
    # Ensemble prediction
    pred = ensemble_cat_pct * pred_cat + ensemble_nn_pct * pred_nn
    
    # Estimate rolling volatility for scaling
    vol_est = np.std(_history_returns) if len(_history_returns) > 1 else 1e-3
    
    # Scale prediction to allocation with volatility adjustment
    allocation = float(np.clip((best_k * pred) / (vol_est + 1e-9), 0, 2))
    
    # Update history for rolling volatility estimation
    if 'lagged_forward_returns' in pl_df.columns:
        try:
            _history_returns.append(float(pl_df['lagged_forward_returns'][0]))
        except:
            _history_returns.append(0.0)
    else:
        _history_returns.append(0.0)
    
    # Keep only last VOL_WINDOW entries
    _history_returns = _history_returns[-VOL_WINDOW:]
    
    return allocation

In [None]:
"""
NEXT STEPS, IMPORTANT FOR IMPROVEMENT:

Stronger feature scaling

PCA optional

Rolling retrain or time-based CV for robustness out of sample

Optimization of the mix (CatBoost vs NN) to dynamically find the optimal weight based on your adjusted Sharpe
Eventually to be extended to more models in the ensemble

"""

'\nNEXT STEPS, IMPORTANT FOR IMPROVEMENT:\n\nStronger feature scaling\n\nPCA optional\n\nRolling retrain or time-based CV for robustness out of sample\n\nOptimization of the mix (CatBoost vs NN) to dynamically find the optimal weight based on your adjusted Sharpe\nEventually to be extended to more models in the ensemble\n\n'

In [None]:
# # ================================================================
# # 7️⃣ Kaggle Evaluation Server / Local Submission
# # ================================================================

# if KAGGLE_ENV:
#     # Kaggle competition environment
#     server = kdeval.DefaultInferenceServer(predict)
    
#     if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#         server.serve()
#     else:
#         server.run_local_gateway((str(DATA_DIR),))
        
# else:
#     # Local environment - generate submission file
#     print("🔧 Local mode - generating submission file...")
    
#     # Generate predictions for test set
#     test_pred_cat = best_cbc.predict(X_test)
#     test_pred_nn = nn_model.predict(scaler.transform(X_test), verbose=0).ravel()
#     preds = ensemble_cat_pct * test_pred_cat + ensemble_nn_pct * test_pred_nn
    
#     # Apply same scaling logic as validation
#     test_exposures = np.clip(best_k * preds, 0, 2)
    
#     # Apply smoothing like in the working example
#     alpha = 0.8
#     smoothed_allocation = []
#     prev = 0.0
#     for x in test_exposures:
#         s = alpha * x + (1 - alpha) * prev
#         smoothed_allocation.append(s)
#         prev = s
#     smoothed_allocation = np.array(smoothed_allocation)
    
#     # Create submission
#     submission = pd.DataFrame({
#         'date_id': test['date_id'],
#         'prediction': smoothed_allocation.astype('float32')
#     })
    
#     submission.to_csv('submission_ensemble.csv', index=False)
#     print("📁 Saved submission_ensemble.csv")
#     print(f"📊 Prediction range: [{smoothed_allocation.min():.4f}, {smoothed_allocation.max():.4f}]")