In [None]:
## Imports
import os
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Tuple, Dict

import lightgbm as lgb

import polars as pl

import warnings
warnings.filterwarnings('ignore')

In [81]:
## Configuration and Data Loading (kaggle_evaluation only)
# import kaggle_evaluation.default_inference_server as kdeval
# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading (local version only)
DATA_DIR = Path("01_data")

# Read CSV files from data_path
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH  = DATA_DIR / 'test.csv'

VALIDATION_SIZE = 2700          # days, approx. 30% of data

# Seed handling: by default a fresh seed is drawn on every run (as before), but
# it can be pinned by setting the RANDOM_SEED environment variable, and the
# value actually used is always printed so that any run can be reproduced.
import random
_seed_override = os.environ.get("RANDOM_SEED")
RANDOM_SEED = int(_seed_override) if _seed_override else random.randint(1, 10000)
print(f"RANDOM_SEED = {RANDOM_SEED}")

# Cap on strategy volatility, expressed as a multiple of market volatility.
VOL_MULTIPLIER_LIMIT = 1.2
# Number of trailing returns kept for the volatility estimate in the
# (currently commented-out) inference wrapper.
VOL_WINDOW = 20

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a frame chronologically into train and validation parts.

    The rows are ordered by ``date_id`` and the last ``val_size`` rows become
    the validation set; everything before them is the training set.

    Args:
        df: Frame containing a ``date_id`` column. It is not modified.
        val_size: Number of trailing rows to hold out. ``0`` yields an empty
            validation set; a value larger than ``len(df)`` yields an empty
            training set.

    Returns:
        ``(train_df, val_df)`` as independent copies with a fresh 0..n-1 index
        inherited from the sorted frame.

    Raises:
        ValueError: If ``val_size`` is negative.
    """
    if val_size < 0:
        raise ValueError(f"val_size must be non-negative, got {val_size}")
    df = df.sort_values('date_id').reset_index(drop=True)
    # Use an explicit split position: slicing with ``-val_size`` breaks for
    # val_size == 0 because ``iloc[:-0]`` is empty and ``iloc[-0:]`` is everything.
    split_at = max(len(df) - val_size, 0)
    train_df = df.iloc[:split_at].copy()
    val_df   = df.iloc[split_at:].copy()
    return train_df, val_df

# Read both CSV files into pandas frames, then show their shapes as the cell output.
train_raw, test_raw = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
train_raw.shape, test_raw.shape

((8990, 98), (10, 99))

In [None]:
## Feature Preparation
# Columns that must never be used as model inputs.
excluded = {'date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns'}
# Keep a training column only if it is not excluded and is also present in the
# test frame, preserving the training frame's column order.
feature_cols = [
    name
    for name in train_raw.columns
    if name not in excluded and name in test_raw.columns
]

'\nThe third line performs a crucial validation step by ensuring feature consistency between training and test datasets. \nIt filters the feature list to include only columns that exist in both the training data and the test data \n(test_raw.columns). This step is essential because machine learning models require identical feature structures \nduring training and prediction phases. If a feature exists in training data but not in test data, \nthe model would fail during inference. \nThis defensive programming approach prevents runtime errors and ensures that the model can successfully \nmake predictions on the test set.\n\nThis two-step filtering process - first removing inappropriate columns, \nthen ensuring train-test consistency - represents a best practice in machine learning pipelines. \nIt creates a robust feature set that avoids data leakage while maintaining compatibility across different data splits, \nwhich is particularly important in time-series financial prediction tasks wh

In [None]:
def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    """Impute feature columns and add a ``<col>_was_na`` indicator for each one.

    Args:
        df: Input frame; a copy is processed, the argument is left untouched.
        median_map: Fill value per feature column; ``0.0`` is used for any
            column without an entry.
        feature_cols: Feature column names to process, in order.

    Returns:
        A new frame in which, for every name in ``feature_cols``:
        * a column absent from ``df`` is created as ``0.0`` with its indicator set to 1;
        * a column whose dtype kind is not float/int/uint is first coerced with
          ``pd.to_numeric(errors='coerce')``;
        * missing values are replaced by the fill value, and the indicator
          records (as 0/1 ints) which rows were missing before filling.
    """
    out = df.copy()
    for col in feature_cols:
        flag_col = f'{col}_was_na'

        # Feature absent altogether: synthesize it and flag every row.
        if col not in out.columns:
            out[col] = 0.0
            out[flag_col] = 1
            continue

        values = out[col]
        if values.dtype.kind not in 'fiu':
            # Non-numeric storage: unparseable entries become NaN and are imputed below.
            values = pd.to_numeric(values, errors='coerce')

        # Capture missingness before it is overwritten by the fill value.
        missing = values.isna().astype(int)
        out[col] = values.fillna(median_map.get(col, 0.0))
        out[flag_col] = missing
    return out

"\nThis function implements a robust data preprocessing pipeline that handles missing values and data type inconsistencies \nwhile preserving information about missingness patterns - a crucial technique in machine learning feature engineering.\n\nThe function begins by creating a copy of the input DataFrame to avoid modifying the original data, \nfollowing defensive programming principles. It then iterates through each feature column to apply consistent \npreprocessing steps. The first conditional check handles the case where an expected feature column \nis completely missing from the DataFrame. Rather than failing, it gracefully creates the missing column filled \nwith zeros and immediately creates a corresponding indicator variable set to 1, \nsignaling that this entire feature was absent. \nThis approach ensures model compatibility across datasets with different column structures.\n\nFor columns that exist in the DataFrame, the function employs different strategy_returnsegies based 

In [84]:
## Train / Validation Split and Median Imputation
train_df, val_df = time_split_train_val(train_raw, val_size=VALIDATION_SIZE)

# Fill values are computed from the training split only; columns whose dtype
# kind is not float/int/uint get 0.0 instead of a median.
median_map = {
    col: (
        float(train_df[col].median(skipna=True))
        if train_df[col].dtype.kind in 'fiu'
        else 0.0
    )
    for col in feature_cols
}

# Apply the same imputation to all three frames.
train_p, val_p, test_p = (
    prepare_df(frame, median_map, feature_cols)
    for frame in (train_df, val_df, test_raw)
)

# Model inputs: every feature immediately followed by its missingness indicator.
final_features = [
    name
    for col in feature_cols
    for name in (col, f"{col}_was_na")
]
print("Number of features:", len(final_features))

Number of features: 188


In [85]:
# final_features

In [86]:
## LightGBM Training
# Both datasets use the prepared feature matrix with forward_returns as the label.
train_data, val_data = (
    lgb.Dataset(frame[final_features], label=frame['forward_returns'])
    for frame in (train_p, val_p)
)

params = dict(
    objective='regression',
    metric='rmse',
    learning_rate=0.05,
    num_leaves=63,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=RANDOM_SEED,
    n_jobs=-1,
    verbose=-1,
)

# Train for up to 2000 rounds, stopping once validation RMSE has not improved
# for 100 rounds; the evaluation metric is logged every 100 rounds.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.0108706
Early stopping, best iteration is:
[2]	valid_0's rmse: 0.0101998


In [87]:
## Volatility Scaling Calibration
def strategy_stats(returns, exposures):
    """Annualized Sharpe ratio and volatility of an exposure-weighted return series.

    Args:
        returns: Per-period returns.
        exposures: Per-period position sizes, multiplied element-wise with ``returns``.

    Returns:
        Dict with ``'sharpe'`` (mean / std, scaled by sqrt(252)) and ``'vol'``
        (std scaled by sqrt(252)). NaN entries in the product are ignored, and
        1e-9 is added to the std in the Sharpe denominator to avoid division by zero.
    """
    annualizer = np.sqrt(252)
    pnl = exposures * returns
    daily_mean, daily_std = np.nanmean(pnl), np.nanstd(pnl)
    return {
        'sharpe': (daily_mean / (daily_std + 1e-9)) * annualizer,
        'vol': daily_std * annualizer,
    }

# Validation-set predictions from the best early-stopped iteration.
val_pred = model.predict(val_p[final_features], num_iteration=model.best_iteration)
# Annualized volatility of the training split's forward returns; used as the
# reference for the volatility cap below.
market_vol = np.nanstd(train_p['forward_returns']) * np.sqrt(252)

# Grid-search the scaling factor k: exposure = clip(k * prediction, 0, 2).
# Keep the k with the highest Sharpe among those whose strategy volatility
# stays within VOL_MULTIPLIER_LIMIT times market_vol. If no k qualifies, the
# initial values (k=0.1, Sharpe=-1e9) are left in place.
best_k, best_sharpe = 0.1, -1e9
for k in np.linspace(0.01, 5.0, 100):
    exposures = np.clip((k * val_pred), 0, 2)
    stats = strategy_stats(val_p['forward_returns'], exposures)
    if stats['vol'] <= VOL_MULTIPLIER_LIMIT * market_vol:
        if stats['sharpe'] > best_sharpe:
            best_k, best_sharpe = k, stats['sharpe']

print(f"Chosen scaling factor k={best_k:.3f} with Sharpe={best_sharpe:.2f}")

Chosen scaling factor k=5.000 with Sharpe=0.57


In [None]:
## Test Predictions + Smoothing
test_pred = model.predict(test_p[final_features], num_iteration=model.best_iteration)

# Exponentially smooth the clipped allocation: each value is a blend of
# alpha * current and (1 - alpha) * previous smoothed value, starting from 0.0.
alpha = 0.8
raw_allocation = np.clip(best_k * test_pred, 0, 2)
ema_path = []
prev = 0.0
for x in raw_allocation:
    prev = alpha * x + (1 - alpha) * prev
    ema_path.append(prev)
smoothed_allocation = np.array(ema_path)

# Write the smoothed allocations, keyed by date_id, as the submission file.
submission_df = pd.DataFrame(
    {
        'date_id': test_p['date_id'],
        'prediction': smoothed_allocation,
    }
)
submission_df.to_csv("submission_lgb_fixed.csv", index=False)
print("Saved submission_lgb_fixed.csv")

Saved submission_lgb_fixed.csv


In [None]:
"""
Kaggle Evaluation Metric:

strategy_returns = risk_free_rate * (1 - position) + position * forward_returns

if position = 0 → invest in risk-free asset,

if position = 1 → invest like the market,

if position = 2 → you are leveraged ×2 on the market.


def score():

strategy_returns = rf * (1 - pos) + pos * fwd_returns

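In the code, the calibration seeks the best Sharpe of the portfolio exposed to pos by calculating:

strategy_returns = exposures * returns

Note: this drops the risk_free_rate * (1 - position) term of the formula above, so the calibrated Sharpe only approximates the metric described here.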
""";

'\nKaggle Evaluation Metric:\n\nstrategy_returns = risk_free_rate * (1 - position) + position * forward_returns\n\nif position = 0 → invest in risk-free asset,\n\nif position = 1 → invest like the market,\n\nif position = 2 → you are leveraged ×2 on the market.\n\n\ndef score():\n\nstrategy_returns = rf * (1 - pos) + pos * fwd_returns\n\nIn the code, the calibration seeks the best Sharpe of the portfolio exposed to pos by calculating:\n\nstrat = exposures * returns\n'

In [None]:
# ## Kaggle Inference Server Wrapper

# _model = model
# _best_k = best_k
# _history_returns = list(train_p['forward_returns'].iloc[-VOL_WINDOW:].tolist())

# def predict(pl_df: pl.DataFrame) -> float:
#     global _history_returns
#     pdf = pl_df.to_pandas()
#     pdf_p = prepare_df(pdf, median_map, feature_cols)
#     for f in final_features:
#         if f not in pdf_p.columns:
#             pdf_p[f] = 0.0
#     x = pdf_p[final_features].to_numpy()
#     pred = _model.predict(x, num_iteration=_model.best_iteration)[0]
#     vol_est = np.std(_history_returns) or 1e-3
#     allocation = float(np.clip((_best_k * pred) / (vol_est + 1e-9), 0, 2))
#     if 'lagged_forward_returns' in pl_df.columns:
#         try:
#             _history_returns.append(float(pl_df['lagged_forward_returns'][0]))
#         except:
#             _history_returns.append(0.0)
#     _history_returns = _history_returns[-VOL_WINDOW:]
#     return allocation

# inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
# else:
#     inference_server.run_local_gateway((str(DATA_DIR),))