# Simple Baseline (LightGBM Version) — Hull Tactical Market Prediction

This notebook provides a **safe, fast and competition-compliant
baseline** for the Hull Tactical Market Prediction challenge on Kaggle.

### Key Features

-   LightGBM regression model (fast and stable)
-   Chronological split (no data leakage)
-   Median imputation + missing indicators
-   Volatility-based signal scaling
-   Kaggle-compatible inference server wrapper

In [2]:
## Imports
# import os
from pathlib import Path
# import math
import numpy as np
import pandas as pd
from typing import Tuple, Dict

# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import mean_squared_error
import lightgbm as lgb

import polars as pl

In [None]:
## Configuration and Data Loading (kaggle_evaluation only)
# import kaggle_evaluation.default_inference_server as kdeval
# DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')

## Configuration and Data Loading (local version only)
# Folder that holds the competition CSV files for a local run.
DATA_DIR = Path("01_data")

# Locations of the training and test tables inside DATA_DIR.
TRAIN_PATH, TEST_PATH = (DATA_DIR.joinpath(name) for name in ('train.csv', 'test.csv'))

# Number of rows (days) held out for validation, approx. 30% of the data.
VALIDATION_SIZE = 2700
# Seed passed to LightGBM.
RANDOM_SEED = 42
# Upper bound on strategy volatility, expressed as a multiple of market volatility.
VOL_MULTIPLIER_LIMIT = 1.2
# Number of recent returns kept for the volatility estimate at inference time.
VOL_WINDOW = 20

def time_split_train_val(df: pd.DataFrame, val_size: int = 2700):
    """Split ``df`` chronologically into a training and a validation frame.

    Rows are ordered by ``date_id``; the ``val_size`` rows with the largest
    ``date_id`` form the validation frame and everything before them forms
    the training frame.

    Args:
        df: Table containing a ``date_id`` column. It is not modified.
        val_size: Number of trailing rows to hold out. ``0`` keeps every row
            for training; a value larger than ``len(df)`` puts every row in
            the validation frame.

    Returns:
        ``(train_df, val_df)``, both independent copies. The validation frame
        keeps the row positions of the sorted table as its index.

    Raises:
        ValueError: If ``val_size`` is negative.
    """
    if val_size < 0:
        raise ValueError(f"val_size must be non-negative, got {val_size}")
    df = df.sort_values('date_id').reset_index(drop=True)
    # Slice on an explicit position instead of a negative offset: with
    # val_size == 0, ``iloc[:-0]`` is ``iloc[:0]`` (empty training set) and
    # ``iloc[-0:]`` is the whole table, which is the opposite of the intent.
    split_at = max(len(df) - val_size, 0)
    train_df = df.iloc[:split_at].copy()
    val_df = df.iloc[split_at:].copy()
    return train_df, val_df

# Load the training table first, then the test table, as pandas DataFrames.
train_raw, test_raw = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))
# Notebook display: (rows, columns) of each table.
train_raw.shape, test_raw.shape

((8990, 98), (10, 99))

In [None]:
## Feature Preparation
# Columns that are never used as model inputs.
excluded = {'date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns'}
# Step 1: every training column that is not excluded, in table order.
feature_cols = [col for col in train_raw.columns if col not in excluded]
# Step 2: keep only the columns that the test table provides as well.
feature_cols = [col for col in feature_cols if col in test_raw.columns]

"""
The second list comprehension (the `test_raw.columns` filter) ensures feature consistency between the training and test datasets.
It filters the feature list to include only columns that exist in both the training data and the test data 
(test_raw.columns). This step is essential because machine learning models require identical feature structures 
during training and prediction phases. If a feature exists in training data but not in test data, 
the model would fail during inference. 
This defensive programming approach prevents runtime errors and ensures that the model can successfully 
make predictions on the test set.

This two-step filtering process - first removing inappropriate columns, 
then ensuring train-test consistency - represents a best practice in machine learning pipelines. 
It creates a robust feature set that avoids data leakage while maintaining compatibility across different data splits, 
which is particularly important in time-series financial prediction tasks where the test set represents 
future market conditions.
"""

def prepare_df(df: pd.DataFrame, median_map: Dict[str, float], feature_cols: list) -> pd.DataFrame:
    """Return a copy of ``df`` with every feature imputed and flagged.

    For each name ``c`` in ``feature_cols`` the result holds a numeric column
    ``c`` without missing values and an integer column ``{c}_was_na`` that is
    1 on rows where the value had to be filled in and 0 elsewhere.
    Non-numeric columns are converted with ``pd.to_numeric(errors='coerce')``
    first, so unparseable entries count as missing.

    Args:
        df: Input rows. The frame is not modified.
        median_map: Fill value per feature. A feature that is absent from the
            map, or mapped to NaN (the median of an all-missing column), is
            filled with 0.0 so that no NaN survives the imputation.
        feature_cols: Names of the features to prepare. A feature that is not
            a column of ``df`` is created as 0.0 and flagged on every row.

    Returns:
        A new DataFrame: the original columns in their original order,
        followed by the newly created columns in creation order.
    """
    df = df.copy()
    # Brand-new columns are collected here and attached with one concat.
    # Inserting them one by one fragments the frame, and pandas emits a
    # PerformanceWarning once enough columns have been added that way.
    new_cols = {}

    def _assign(name, values):
        # Overwrite in place when the column already exists (keeps its
        # position and avoids duplicate labels); otherwise defer to concat.
        if name in df.columns:
            df[name] = values
        else:
            new_cols[name] = values

    for c in feature_cols:
        flag = f'{c}_was_na'
        if c not in df.columns:
            new_cols[c] = 0.0
            _assign(flag, 1)
            continue

        col = df[c]
        if col.dtype.kind not in 'fiu':
            col = pd.to_numeric(col, errors='coerce')

        med = median_map.get(c, 0.0)
        if pd.isna(med):
            # fillna(NaN) would silently leave the gaps in place.
            med = 0.0

        _assign(flag, col.isna().astype(int))
        df[c] = col.fillna(med)

    if new_cols:
        df = pd.concat([df, pd.DataFrame(new_cols, index=df.index)], axis=1)
    return df

In [None]:
## Train / Validation Split and Median Imputation

# Chronological split: the VALIDATION_SIZE rows with the largest date_id are held out.
train_df, val_df = time_split_train_val(train_raw, val_size=VALIDATION_SIZE)

# Fill value per feature, computed on the training split only so that
# validation and test rows do not influence the imputation.
# Non-numeric columns get 0.0.
median_map = {}
for name in feature_cols:
    column = train_df[name]
    is_numeric = column.dtype.kind in 'fiu'
    median_map[name] = float(column.median(skipna=True)) if is_numeric else 0.0

# Apply the same imputation to the training, validation and test tables.
train_p, val_p, test_p = (
    prepare_df(frame, median_map, feature_cols)
    for frame in (train_df, val_df, test_raw)
)

# Model inputs: each feature immediately followed by its missing-value flag.
final_features = []
for name in feature_cols:
    final_features += [name, f"{name}_was_na"]
print("Number of features:", len(final_features))

In [None]:
## LightGBM Training

# Wrap the prepared frames for LightGBM; the regression target is `forward_returns`.
train_data = lgb.Dataset(train_p[final_features], label=train_p['forward_returns'])
# A validation Dataset should use the training Dataset as its reference.
val_data = lgb.Dataset(
    val_p[final_features],
    label=val_p['forward_returns'],
    reference=train_data,
)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 63,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': RANDOM_SEED,
    'n_jobs': -1,
    'verbose': -1
}

# Early stopping and periodic logging are passed as callbacks. The
# `early_stopping_rounds` and `verbose_eval` keyword arguments were removed
# from `lgb.train` in LightGBM 4.0 and raise a TypeError there.
model = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),  # stop after 100 rounds without RMSE improvement
        lgb.log_evaluation(period=100),           # print the validation metric every 100 rounds
    ],
)


In [None]:
## Volatility Scaling Calibration

def strategy_stats(returns, exposures):
    """Summarise a strategy that holds ``exposures`` against ``returns``.

    The per-period strategy return is the element-wise product of the two
    inputs; NaN products are ignored by the statistics.

    Returns:
        dict with
        ``'sharpe'``: mean / (std + 1e-9) of the strategy returns, times sqrt(252);
        ``'vol'``: std of the strategy returns, times sqrt(252).
    """
    annualizer = np.sqrt(252)
    pnl = exposures * returns
    daily_std = np.nanstd(pnl)
    daily_mean = np.nanmean(pnl)
    return {
        # The small epsilon keeps the ratio defined when the strategy is flat.
        'sharpe': (daily_mean / (daily_std + 1e-9)) * annualizer,
        'vol': daily_std * annualizer,
    }

# Validation predictions at the early-stopped iteration.
val_pred = model.predict(val_p[final_features], num_iteration=model.best_iteration)
# Annualised volatility of the training-split forward returns; reference for the cap below.
market_vol = np.nanstd(train_p['forward_returns']) * np.sqrt(252)

# Grid search over the scaling factor k. Exposure is clip(k * prediction, 0, 2).
# Among candidates whose annualised volatility stays within the cap, keep the
# first one with the highest Sharpe ratio. If none qualifies, the defaults remain.
vol_cap = VOL_MULTIPLIER_LIMIT * market_vol
best_k, best_sharpe = 0.1, -1e9
for k in np.linspace(0.01, 5.0, 100):
    candidate = strategy_stats(val_p['forward_returns'], np.clip((k * val_pred), 0, 2))
    # Written as `not (<=)` so that a NaN volatility or cap rejects the candidate.
    if not (candidate['vol'] <= vol_cap):
        continue
    if candidate['sharpe'] > best_sharpe:
        best_k, best_sharpe = k, candidate['sharpe']

print(f"Chosen scaling factor k={best_k:.3f} with Sharpe={best_sharpe:.2f}")

In [None]:
## Test Predictions + Smoothing

# Predictions for the test rows at the early-stopped iteration.
test_pred = model.predict(test_p[final_features], num_iteration=model.best_iteration)

# Exponential smoothing of the clipped allocations:
# s_t = alpha * x_t + (1 - alpha) * s_{t-1}, starting from s = 0.
alpha = 0.8
raw_alloc = np.clip(best_k * test_pred, 0, 2)
smoothed_alloc = np.empty(len(raw_alloc), dtype=float)
prev = 0.0
for i, x in enumerate(raw_alloc):
    prev = alpha * x + (1 - alpha) * prev
    smoothed_alloc[i] = prev

# One row per test date with its smoothed weight.
submission_columns = {
    'date_id': test_p['date_id'],
    'weight': smoothed_alloc,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission_lgb_fixed.csv", index=False)
print("Saved submission_lgb_fixed.csv")

In [None]:
## Kaggle Inference Server Wrapper

# Module-level state shared by successive `predict` calls.
_model = model
_best_k = best_k
# Rolling window of recent returns, seeded with the last VOL_WINDOW forward
# returns of the training split.
# NOTE(review): the validation rows are more recent than the training split —
# confirm that seeding from train_p is intended.
_history_returns = list(train_p['forward_returns'].iloc[-VOL_WINDOW:].tolist())

def predict(pl_df: pl.DataFrame) -> float:
    """Return the allocation in [0, 2] for the first row of ``pl_df``.

    The row is imputed with ``prepare_df``, scored by the trained model,
    multiplied by the calibrated factor, divided by a rolling volatility
    estimate and clipped to ``[0, 2]``. Afterwards the row's
    ``lagged_forward_returns`` value, when the column is present, is pushed
    into the rolling window (last ``VOL_WINDOW`` entries are kept).

    Args:
        pl_df: polars frame holding the features of the current step.

    Returns:
        The allocation as a Python float; 0.0 when the computation does not
        produce a finite number.
    """
    global _history_returns
    pdf_p = prepare_df(pl_df.to_pandas(), median_map, feature_cols)
    # Select the model inputs in training order; a column that is still
    # absent is created and filled with 0.0.
    x = pdf_p.reindex(columns=final_features, fill_value=0.0).to_numpy()
    pred = float(_model.predict(x, num_iteration=_model.best_iteration)[0])

    # Estimate volatility from finite history only: a single NaN would turn
    # np.std into NaN, and NaN is truthy, so an `or 1e-3` fallback never fires.
    history = np.asarray(_history_returns, dtype=float)
    history = history[np.isfinite(history)]
    vol_est = float(history.std()) if history.size else 0.0
    if vol_est == 0.0:
        vol_est = 1e-3

    # NOTE(review): k is calibrated on clip(k * pred, 0, 2) without dividing
    # by volatility, so this extra division changes the scale of the signal —
    # confirm that it is intended.
    alloc = float(np.clip((_best_k * pred) / (vol_est + 1e-9), 0, 2))
    if not np.isfinite(alloc):
        alloc = 0.0

    if 'lagged_forward_returns' in pl_df.columns:
        try:
            lagged = float(pl_df['lagged_forward_returns'][0])
        except (TypeError, ValueError):
            # Null or non-numeric entry: fall back to a neutral return.
            lagged = 0.0
        _history_returns.append(lagged if np.isfinite(lagged) else 0.0)
    _history_returns = _history_returns[-VOL_WINDOW:]
    return alloc

# `os` and `kdeval` are imported here because the corresponding imports at the
# top of the notebook are commented out, which made this cell fail with a
# NameError.
import os

try:
    import kaggle_evaluation.default_inference_server as kdeval
except ImportError:
    # The package is provided by the Kaggle environment and may be missing locally.
    kdeval = None

if kdeval is None:
    print("kaggle_evaluation is not installed; skipping inference server start-up.")
else:
    server = kdeval.DefaultInferenceServer(predict)

    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        # Competition rerun: serve predictions to the hidden test gateway.
        server.serve()
    else:
        # Interactive run: replay the files under DATA_DIR through the gateway.
        server.run_local_gateway((str(DATA_DIR),))

### Notebook Summary

| Feature        | Description                          |
|----------------|--------------------------------------|
| Model          | LightGBM (fast, robust)              |
| Validation     | Time-based (last 2700 days)          |
| Imputation     | Median + missing flags               |
| Signal control | Volatility scaling (Sharpe-based)    |
| Inference      | Kaggle-compatible `predict` function |
| Runtime        | \< 5 minutes on Kaggle GPU notebook  |