In [41]:
# ================================================================
# 🧠 HULL TACTICAL MARKET PREDICTION — ENSEMBLE + SHARPE PENALTY
# ================================================================
import os
from pathlib import Path
import numpy as np
import pandas as pd
# from typing import Tuple, Dict

import polars as pl

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from tensorflow import keras
from tensorflow.keras import layers

# import kaggle_evaluation.default_inference_server as kdeval

In [42]:
# ================================================================
# 1️⃣ Data Loading
# ================================================================

# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading (local version only)
DATA_DIR = Path("01_data")

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

TARGET = "market_forward_excess_returns"
drop_cols = ["date_id", "forward_returns", "risk_free_rate"]
features = [c for c in train.columns if c not in drop_cols + [TARGET]]

train = train.fillna(0.0)
test = test.fillna(0.0)

X = train[features]
y = train[TARGET]
X_test = test[features]

In [43]:
# features
# TARGET
# drop_cols

In [44]:
# ================================================================
# 2️⃣ CatBoost Base Model (GridSearch + TimeSeriesSplit)
# ================================================================

print("⏳ Training CatBoost model with TimeSeries CV...")

tscv = TimeSeriesSplit(n_splits=5)
cbc = CatBoostRegressor(loss_function='RMSE', verbose=0, random_state=42)

param_grid = {
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'iterations': [300, 500],
    'l2_leaf_reg': [2, 5]
}

grid = GridSearchCV(
    estimator=cbc,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)
best_cbc = grid.best_estimator_
print(f"✅ Best Params: {grid.best_params_}")

⏳ Training CatBoost model with TimeSeries CV...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
✅ Best Params: {'depth': 6, 'iterations': 300, 'l2_leaf_reg': 5, 'learning_rate': 0.05}


In [45]:
# ================================================================
# 3️⃣ Neural Network Model (Feedforward Regressor)
# ================================================================

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn(X_scaled.shape[1])
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# last 20% time-based validation
date_cut = train["date_id"].quantile(0.8)
train_idx = train["date_id"] <= date_cut
val_idx = train["date_id"] > date_cut

X_train, y_train = X_scaled[train_idx], y[train_idx]
X_val, y_val = X_scaled[val_idx], y[val_idx]

nn_model.fit(X_train, y_train, validation_data=(X_val, y_val),
             epochs=100, batch_size=256, verbose=0, callbacks=[es])
print("✅ Neural Network trained successfully.")

✅ Neural Network trained successfully.


In [46]:
# ================================================================
# 4️⃣ Ensemble Prediction (0.X × CatBoost + 0.XX × NN)
# ================================================================
ensemble_cat_pct = 0
ensemble_nn_pct = 1

val_cat = best_cbc.predict(X.loc[val_idx])
val_nn = nn_model.predict(X_scaled[val_idx]).ravel()

val_ensemble = ensemble_cat_pct * val_cat + ensemble_nn_pct * val_nn
val_df = train.loc[val_idx].copy()
val_df["pred"] = val_ensemble

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [47]:
# ================================================================
# 5️⃣ Sharpe Penalty Formula (Official Metric Proxy)
# ================================================================
def sharpe_ratio(returns):
    mean = np.mean(returns)
    vol = np.std(returns)
    return (mean / (vol + 1e-9)) * np.sqrt(252)

def sharpe_penalty_score(pred, fwd_ret, risk_free_rate, penalty_weight=0.1):
    """
    Penalize models with excessive volatility relative to returns.
    """
    strat_ret = pred * fwd_ret - risk_free_rate
    raw_sharpe = sharpe_ratio(strat_ret)
    vol = np.std(strat_ret)
    penalty = penalty_weight * vol
    adj_sharpe = raw_sharpe - penalty
    return adj_sharpe, raw_sharpe, penalty

adj_sharpe, raw_sharpe, penalty = sharpe_penalty_score(
    val_df["pred"], val_df["forward_returns"], val_df["risk_free_rate"]
)

print(f"📊 Validation Sharpe: {raw_sharpe:.3f} | Penalty: {penalty:.4f} | Adjusted: {adj_sharpe:.3f}")

📊 Validation Sharpe: -1.913 | Penalty: 0.0001 | Adjusted: -1.913


In [48]:
# ================================================================
# 6️⃣ Competition-Compliant Inference Function
# ================================================================
_cat_model = best_cbc
_nn_model = nn_model
_scaler = scaler
_feat_cols = features

def predict(pl_df):
    """Competition inference function."""
    pdf = pl_df.to_pandas().fillna(0.0)
    for f in _feat_cols:
        if f not in pdf.columns:
            pdf[f] = 0.0
    Xp = pdf[_feat_cols].values
    Xp_scaled = _scaler.transform(Xp)
    pred_cat = _cat_model.predict(pdf[_feat_cols])
    pred_nn = _nn_model.predict(Xp_scaled, verbose=0).ravel()
    preds = 0.4 * pred_cat + 0.6 * pred_nn
    lo, hi = np.percentile(preds, [5, 95])
    weights = np.clip((preds - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
    return pd.DataFrame({"prediction": weights.astype("float32")})

In [51]:
# ===== Corrected evaluation: use mapped weights and official formula =====
import numpy as np

def compute_strategy_stats(weights, forward_returns, risk_free_rate):
    """
    Compute strategy daily returns and Sharpe (annualized).
    weights: array-like positions in [0,2]
    forward_returns, risk_free_rate: arrays aligned
    """
    # Ensure numpy arrays
    w = np.asarray(weights)
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)

    # Strategy return per day: rf*(1 - w) + w * forward_returns
    # Strategy excess over rf:
    strat_ret = rf * (1.0 - w) + w * fr
    strat_excess = strat_ret - rf   # == w * (fr - rf)
    # annualized sharpe
    mean = np.nanmean(strat_excess)
    std = np.nanstd(strat_excess)
    sharpe = (mean / (std + 1e-12)) * np.sqrt(252) if std > 0 else 0.0
    # annualized vol of strategy returns
    vol_ann = std * np.sqrt(252)
    return {
        'sharpe': sharpe,
        'vol_ann': vol_ann,
        'mean_daily_excess': mean,
        'std_daily_excess': std,
        'strat_ret_series': strat_ret,
        'strat_excess_series': strat_excess
    }

def sharpe_penalty_official(weights, forward_returns, risk_free_rate):
    """
    Compute adjusted Sharpe like the official metric:
    - compute strategy sharpe
    - compute market vol and strategy vol, form vol_penalty = 1 + max(0, strategy_vol/market_vol - 1.2)
    - compute return_gap penalty like (max(0, (market_mean_excess - strat_mean_excess) * 100 * 252))**2 / 100 etc.
    Returns adjusted_sharpe (float) and components.
    """
    # strategy stats
    stats = compute_strategy_stats(weights, forward_returns, risk_free_rate)
    strat_excess = stats['strat_excess_series']
    strat_sharpe = stats['sharpe']
    strat_vol = stats['vol_ann']
    # market stats
    fr = np.asarray(forward_returns)
    rf = np.asarray(risk_free_rate)
    market_excess = fr - rf
    market_mean_excess = ( (1 + market_excess).prod() ) ** (1.0 / len(market_excess)) - 1 if len(market_excess)>0 else 0.0
    # fallback simpler mean if product fails
    # but safer to use mean:
    market_mean_excess = np.nanmean(market_excess)
    market_std = np.nanstd(fr)
    market_vol = market_std * np.sqrt(252) if market_std>0 else 1e-9

    # volatility penalty
    excess_vol = max(0.0, (strat_vol / (market_vol + 1e-12)) - 1.2)
    vol_penalty = 1.0 + excess_vol

    # return gap penalty (use squared scaled gap similar to demo code)
    strat_mean_excess = np.nanmean(strat_excess)
    return_gap = max(0.0, (market_mean_excess - strat_mean_excess) * 100 * 252)  # percent annualized gap
    return_penalty = 1.0 + (return_gap**2) / 100.0

    adjusted_sharpe = strat_sharpe / (vol_penalty * return_penalty + 1e-12)
    return {
        'adjusted_sharpe': adjusted_sharpe,
        'strat_sharpe': strat_sharpe,
        'vol_penalty': vol_penalty,
        'return_penalty': return_penalty,
        'strat_vol': strat_vol,
        'market_vol': market_vol,
        'return_gap': return_gap
    }

# ===== Use it on validation properly mapping raw preds to weights =====

# val_ensemble is your raw ensemble prediction (unmapped)
# First map to weights using your mapping function (or revised mapping)
def robust_signal_to_weight(sig, lower=0.0, upper=2.0):
    """
    Map raw signals to weights robustly using percentile clipping and stable scaling.
    If distribution is degenerate, fallback to standard scaling.
    """
    sig = np.asarray(sig)
    lo = np.nanpercentile(sig, 5)
    hi = np.nanpercentile(sig, 95)
    if np.isclose(hi, lo):
        # fallback: z-score and sigmoid mapping
        sig_z = (sig - np.nanmean(sig)) / (np.nanstd(sig) + 1e-12)
        # map z to [0,2] via logistic
        w = 2.0 / (1.0 + np.exp(-sig_z))
    else:
        w = (sig - lo) / (hi - lo + 1e-12) * (upper - lower) + lower
    return np.clip(w, lower, upper)

# compute mapped weights
val_weights = robust_signal_to_weight(val_ensemble)   # or pass val_cat/val_nn separately

# compute official adjusted sharpe and components
res = sharpe_penalty_official(val_weights, val_df['forward_returns'].to_numpy(), val_df['risk_free_rate'].to_numpy())

print("Mapped weights stats:", np.nanmin(val_weights), np.nanpercentile(val_weights,5), np.nanmedian(val_weights), np.nanpercentile(val_weights,95), np.nanmax(val_weights))
print("Strategy raw Sharpe:", res['strat_sharpe'])
print("Adjusted Sharpe:", res['adjusted_sharpe'])
print("Vol penalty:", res['vol_penalty'], "Return penalty:", res['return_penalty'], "Return gap:", res['return_gap'])


Mapped weights stats: 0.0 0.0002454032158233926 0.7244656132242924 1.9997560089382562 2.0
Strategy raw Sharpe: 0.5348811070399059
Adjusted Sharpe: 0.48137230953453
Vol penalty: 1.0 Return penalty: 1.1111588607093656 Return gap: 3.334049500372867


In [49]:
# ================================================================
# 7️⃣ Kaggle Evaluation Server
# ================================================================

server = kdeval.DefaultInferenceServer(predict)

if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    server.serve()
else:
    server.run_local_gateway((DATA_DIR,))

NameError: name 'kdeval' is not defined