In [None]:
# ================================================================
# 🧠 HULL TACTICAL MARKET PREDICTION — ENSEMBLE + SHARPE PENALTY
# ================================================================
import os
from pathlib import Path
import numpy as np
import pandas as pd
# from typing import Tuple, Dict

import polars as pl

import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from tensorflow import keras
from tensorflow.keras import layers

# import kaggle_evaluation.default_inference_server as kdeval

In [None]:
# ================================================================
# 1️⃣ Data Loading
# ================================================================

# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading (local version only)
DATA_DIR = Path("01_data")

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

TARGET = "market_forward_excess_returns"
drop_cols = ["date_id", "forward_returns", "risk_free_rate"]
features = [c for c in train.columns if c not in drop_cols + [TARGET]]

train = train.fillna(0.0)
test = test.fillna(0.0)

X = train[features]
y = train[TARGET]
X_test = test[features]

In [None]:
# ================================================================
# 2️⃣ CatBoost Base Model (GridSearch + TimeSeriesSplit)
# ================================================================

print("⏳ Training CatBoost model with TimeSeries CV...")

tscv = TimeSeriesSplit(n_splits=5)
cbc = CatBoostRegressor(loss_function='RMSE', verbose=0, random_state=42)

param_grid = {
    'depth': [4, 6],
    'learning_rate': [0.05, 0.1],
    'iterations': [300, 500],
    'l2_leaf_reg': [2, 5]
}

grid = GridSearchCV(
    estimator=cbc,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid.fit(X, y)
best_cbc = grid.best_estimator_
print(f"✅ Best Params: {grid.best_params_}")

In [None]:
# ================================================================
# 3️⃣ Neural Network Model (Feedforward Regressor)
# ================================================================

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def build_nn(input_dim):
    model = keras.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse', metrics=['mae'])
    return model

nn_model = build_nn(X_scaled.shape[1])
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# last 20% time-based validation
date_cut = train["date_id"].quantile(0.8)
train_idx = train["date_id"] <= date_cut
val_idx = train["date_id"] > date_cut

X_train, y_train = X_scaled[train_idx], y[train_idx]
X_val, y_val = X_scaled[val_idx], y[val_idx]

nn_model.fit(X_train, y_train, validation_data=(X_val, y_val),
             epochs=100, batch_size=256, verbose=0, callbacks=[es])
print("✅ Neural Network trained successfully.")

In [None]:
# ================================================================
# 4️⃣ Ensemble Prediction (0.4 × CatBoost + 0.6 × NN)
# ================================================================

val_cat = best_cbc.predict(X.loc[val_idx])
val_nn = nn_model.predict(X_scaled[val_idx]).ravel()

val_ensemble = 0.4 * val_cat + 0.6 * val_nn
val_df = train.loc[val_idx].copy()
val_df["pred"] = val_ensemble

In [None]:
# ================================================================
# 5️⃣ Sharpe Penalty Formula (Official Metric Proxy)
# ================================================================

def sharpe_ratio(returns):
    mean = np.mean(returns)
    vol = np.std(returns)
    return (mean / (vol + 1e-9)) * np.sqrt(252)

def sharpe_penalty_score(pred, fwd_ret, risk_free_rate, penalty_weight=0.1):
    """
    Penalize models with excessive volatility relative to returns.
    """
    strat_ret = pred * fwd_ret - risk_free_rate
    raw_sharpe = sharpe_ratio(strat_ret)
    vol = np.std(strat_ret)
    penalty = penalty_weight * vol
    adj_sharpe = raw_sharpe - penalty
    return adj_sharpe, raw_sharpe, penalty

adj_sharpe, raw_sharpe, penalty = sharpe_penalty_score(
    val_df["pred"], val_df["forward_returns"], val_df["risk_free_rate"]
)

print(f"📊 Validation Sharpe: {raw_sharpe:.3f} | Penalty: {penalty:.4f} | Adjusted: {adj_sharpe:.3f}")

In [None]:
# ================================================================
# 6️⃣ Competition-Compliant Inference Function
# ================================================================

_cat_model = best_cbc
_nn_model = nn_model
_scaler = scaler
_feat_cols = features

def predict(pl_df):
    """Competition inference function."""
    pdf = pl_df.to_pandas().fillna(0.0)
    for f in _feat_cols:
        if f not in pdf.columns:
            pdf[f] = 0.0
    Xp = pdf[_feat_cols].values
    Xp_scaled = _scaler.transform(Xp)
    pred_cat = _cat_model.predict(pdf[_feat_cols])
    pred_nn = _nn_model.predict(Xp_scaled, verbose=0).ravel()
    preds = 0.4 * pred_cat + 0.6 * pred_nn
    lo, hi = np.percentile(preds, [5, 95])
    weights = np.clip((preds - lo) / (hi - lo + 1e-9) * 2.0, 0, 2)
    return pd.DataFrame({"prediction": weights.astype("float32")})

In [None]:
# ================================================================
# 7️⃣ Kaggle Evaluation Server
# ================================================================

server = kdeval.DefaultInferenceServer(predict)

if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    server.serve()
else:
    server.run_local_gateway((DATA_DIR,))

In [None]:
# ## Imports
# import os
# from pathlib import Path
# import numpy as np
# import pandas as pd
# from typing import Tuple, Dict

# import polars as pl

# import warnings
# warnings.filterwarnings('ignore')

In [41]:
## Configuration and Data Loading (kaggle_evaluation only)
# import kaggle_evaluation.default_inference_server as kdeval
# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading (local version only)
DATA_DIR = Path("01_data")

# Read CSV files from data_path
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH  = DATA_DIR / 'test.csv'

VALIDATION_SIZE = 2700          # days, approx. 30% of data
# RANDOM_SEED = 42

import random
RANDOM_SEED = random.randint(1, 10000)

VOL_MULTIPLIER_LIMIT = 1.2
VOL_WINDOW = 20

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700):
    df = df.sort_values('date_id').reset_index(drop=True)
    train_df = df.iloc[:-val_size].copy()
    val_df   = df.iloc[-val_size:].copy()
    return train_df, val_df

train_raw = pd.read_csv(TRAIN_PATH)
test_raw  = pd.read_csv(TEST_PATH)
train_raw.shape, test_raw.shape

((8990, 98), (10, 99))

In [42]:
## Feature Preparation
excluded = {'date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns'}
feature_cols = [c for c in train_raw.columns if c not in excluded]
feature_cols = [c for c in feature_cols if c in test_raw.columns]

In [43]:
def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    df = df.copy()
    for c in feature_cols:
        if c not in df.columns:
            df[c] = 0.0
            df[f'{c}_was_na'] = 1
            continue
        if df[c].dtype.kind in 'fiu':
            med = median_map.get(c, 0.0)
            was_na = df[c].isna().astype(int)
            df[c] = df[c].fillna(med)
            df[f'{c}_was_na'] = was_na
        else:
            df[c] = pd.to_numeric(df[c], errors='coerce')
            med = median_map.get(c, 0.0)
            was_na = df[c].isna().astype(int)
            df[c] = df[c].fillna(med)
            df[f'{c}_was_na'] = was_na
    return df

In [44]:
## Train / Validation Split and Median Imputation
train_df, val_df = time_split_train_val(train_raw, val_size=VALIDATION_SIZE)

median_map = {c: float(train_df[c].median(skipna=True)) if train_df[c].dtype.kind in 'fiu' else 0.0 
              for c in feature_cols}

train_p = prepare_df(train_df, median_map, feature_cols)
val_p   = prepare_df(val_df, median_map, feature_cols)
test_p  = prepare_df(test_raw, median_map, feature_cols)

final_features = [f for c in feature_cols for f in (c, f"{c}_was_na")]
print("Number of features:", len(final_features))

Number of features: 188


In [45]:
# final_features

In [46]:
# === Bayesian Ridge Training ===
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error

# Prepara i dati (usando gli stessi DataFrame)
X_train = train_p[final_features]
y_train = train_p['forward_returns']
X_val   = val_p[final_features]
y_val   = val_p['forward_returns']

# Check for NaN values and handle them
print(f"NaN values in X_train: {X_train.isnull().sum().sum()}")
print(f"NaN values in y_train: {y_train.isnull().sum()}")

# Fill any remaining NaN values
X_train = X_train.fillna(0)
y_train = y_train.fillna(0)
X_val = X_val.fillna(0)
y_val = y_val.fillna(0)

# Crea e allena il modello
model = BayesianRidge()
model.fit(X_train, y_train)

# Valutazione sul validation set
val_pred = model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, val_pred))
print(f"Validation RMSE: {rmse_val:.6f}")

NaN values in X_train: 6290
NaN values in y_train: 0
Validation RMSE: 0.010272


In [47]:
# === Volatility Scaling Calibration (BayesianRidge version) ===
def strategy_stats(returns, exposures):
    """Compute Sharpe and volatility for a given exposure series."""
    strat = exposures * returns
    mean = np.nanmean(strat)
    std  = np.nanstd(strat)
    sharpe = (mean / (std + 1e-9)) * np.sqrt(252)
    vol = std * np.sqrt(252)
    return {'sharpe': sharpe, 'vol': vol}

# Ensure validation data has no NaN values
val_features_clean = val_p[final_features].fillna(0)

# Predict mean and std (Bayesian posterior uncertainty)
val_pred_mean, val_pred_std = model.predict(val_features_clean, return_std=True)

# Define market volatility for scaling reference
market_vol = np.nanstd(train_p['forward_returns']) * np.sqrt(252)
print(f"Market annualized volatility: {market_vol:.4f}")

# We'll use a dynamic confidence weight: confidence = 1 / (1 + std)
confidence = 1 / (1 + val_pred_std)
val_conf_adj = val_pred_mean * confidence  # lower exposure when uncertainty is high

# Grid search for best scaling factor k (Sharpe ratio under vol constraint)
best_k, best_sharpe = 0.1, -1e9
for k in np.linspace(0.01, 5.0, 100):
    exposures = np.clip((k * val_conf_adj), 0, 2)
    stats = strategy_stats(val_p['forward_returns'], exposures)
    if stats['vol'] <= VOL_MULTIPLIER_LIMIT * market_vol and stats['sharpe'] > best_sharpe:
        best_k = k
        best_sharpe = stats['sharpe']

print(f"Chosen scaling factor k={best_k:.3f} | Validation Sharpe={best_sharpe:.2f}")

Market annualized volatility: 0.1698
Chosen scaling factor k=5.000 | Validation Sharpe=0.69


In [None]:
## Test Predictions + Smoothing
test_features_clean = test_p[final_features].fillna(0)
test_pred = model.predict(test_features_clean)

alpha = 0.8
smoothed_allocation = []
prev = 0.0
for x in np.clip(best_k * test_pred, 0, 2):
    s = alpha * x + (1 - alpha) * prev
    smoothed_allocation.append(s)
    prev = s
smoothed_allocation = np.array(smoothed_allocation)

# replace in final submission
submission_df = pd.DataFrame({
    'date_id': test_p['date_id'],
    'prediction': smoothed_allocation  
})
# submission_df.to_csv("submission_br.csv", index=False)
print("Saved submission_br.csv")

Saved submission_br.csv


In [49]:
"""
Kaggle Evaluation Metric:

strategy_returns = risk_free_rate * (1 - position) + position * forward_returns

if position = 0 → invest in risk-free asset,

if position = 1 → invest like the market,

if position = 2 → you are leveraged ×2 on the market.


def score():

strategy_returns = rf * (1 - pos) + pos * fwd_returns

In the code, the calibration seeks the best Sharpe of the portfolio exposed to pos by calculating:

strat = exposures * returns
""";

In [50]:
# # === Kaggle Inference Server Wrapper (BayesianRidge version) ===

# _model = model                     # BayesianRidge fitted model
# _best_k = best_k                   # scaling factor from validation calibration
# _history_returns = list(train_p['forward_returns'].iloc[-VOL_WINDOW:].tolist())

# def predict(pl_df: pl.DataFrame) -> float:
#     """Predict allocation for one timestep (Kaggle competition API)."""
#     global _history_returns
#     # Convert Polars → Pandas
#     pdf = pl_df.to_pandas()
    
#     # Apply same preprocessing
#     pdf_p = prepare_df(pdf, median_map, feature_cols)
#     for f in final_features:
#         if f not in pdf_p.columns:
#             pdf_p[f] = 0.0
    
#     # Convert to NumPy and predict
#     x = pdf_p[final_features].to_numpy()

#     """
#     added standard deviation based confidence adjustment
#     to reduce allocation when uncertainty is high
#     """

#     pred, std = model.predict(x, return_std=True)
    
#     # Compute rolling volatility estimate
#     vol_est = np.std(_history_returns) or 1e-3
#     confidence = 1 / (1 + std)
#     alloc = float(np.clip(_best_k * pred * confidence / (vol_est + 1e-9), 0, 2))
    
#     # Update history (for volatility tracking)
#     if 'lagged_forward_returns' in pl_df.columns:
#         try:
#             _history_returns.append(float(pl_df['lagged_forward_returns'][0]))
#         except:
#             _history_returns.append(0.0)
#     else:
#         _history_returns.append(0.0)
    
#     # Keep only the last VOL_WINDOW entries
#     _history_returns = _history_returns[-VOL_WINDOW:]
#     return alloc

# # Instantiate the Kaggle inference server
# server = kdeval.DefaultInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     server.serve()
# else:
#     server.run_local_gateway((str(DATA_DIR),))
