In [2]:
import os
from pathlib import Path
import datetime
from typing import List

from tqdm import tqdm
from dataclasses import dataclass, asdict

import polars as pl 
import numpy as np
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler

In [3]:
import os
for dirname, _, filenames in os.walk('/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/test.csv
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/train.csv
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/ka

## Configurations

In [4]:
# ============ PATHS ============
DATA_PATH: Path = Path('/Users/sudip/hull_tactical_market_prediction_using_hyperopt/input/hull-tactical-market-prediction/')

# ============ RETURNS TO SIGNAL CONFIGS ============
MIN_SIGNAL: float = 0.0                         # Minimum value for the daily signal 
MAX_SIGNAL: float = 2.0                         # Maximum value for the daily signal 
SIGNAL_MULTIPLIER: float = 400.0                # Multiplier of the OLS market forward excess returns predictions to signal 

# ============ MODEL CONFIGS ============
CV: int = 10                                    # Number of cross validation folds in the model fitting
L1_RATIO: float = 0.5                           # ElasticNet mixing parameter
ALPHAS: np.ndarray = np.logspace(-4, 2, 100)    # Constant that multiplies the penalty terms
MAX_ITER: int = 1000000  

# # Print results
# print("DATA_PATH:", DATA_PATH)
# print("MIN_SIGNAL:", MIN_SIGNAL)
# print("MAX_SIGNAL:", MAX_SIGNAL)
# print("SIGNAL_MULTIPLIER:", SIGNAL_MULTIPLIER)
# print("CV:", CV)
# print("L1_RATIO:", L1_RATIO)
# print("ALPHAS:", ALPHAS)
# print("MAX_ITER:", MAX_ITER)

## Dataclasses Helpers

In [5]:
# Dataset Output
@dataclass
class DatasetOutput:
    X_train : pl.DataFrame 
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler

# ElasticNet Parameters
@dataclass 
class ElasticNetParameters:
    l1_ratio : float 
    cv: int
    alphas: np.ndarray 
    max_iter: int 
    
    def __post_init__(self): 
        if self.l1_ratio < 0 or self.l1_ratio > 1: 
            raise ValueError("Wrong initializing value for ElasticNet l1_ratio")

# Ret to Signal Parameters
@dataclass(frozen=True)
class RetToSignalParameters:
    signal_multiplier: float 
    min_signal : float = MIN_SIGNAL
    max_signal : float = MAX_SIGNAL

# Print results
elastic_params = ElasticNetParameters(
    l1_ratio=0.5,
    cv=10,
    alphas=np.logspace(-4, 2, 100),
    max_iter=1_000_000
)

signal_params = RetToSignalParameters(signal_multiplier=400.0)

# print(elastic_params)
# print(signal_params)

## Set the Parameters

In [6]:
# Ret to Signal Parameters
ret_signal_params = RetToSignalParameters(
    signal_multiplier= SIGNAL_MULTIPLIER
)

enet_params = ElasticNetParameters(
    l1_ratio = L1_RATIO, 
    cv = CV, 
    alphas = ALPHAS, 
    max_iter = MAX_ITER
)

## Dataset Loading/Creating Helper Functions

In [7]:
def load_trainset() -> pl.DataFrame:
    """
    Loads and preprocesses the training dataset.

    Returns:
        pl.DataFrame: The preprocessed training DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "train.csv")
        .rename({'market_forward_excess_returns':'target'})
        .with_columns(
            pl.exclude('date_id').cast(pl.Float64, strict=False)
        )
        .head(-10)
    )

def load_testset() -> pl.DataFrame:
    """
    Loads and preprocesses the testing dataset.

    Returns:
        pl.DataFrame: The preprocessed testing DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "test.csv")
        .rename({'lagged_forward_returns':'target'})
        .with_columns(
            pl.exclude('date_id').cast(pl.Float64, strict=False)
        )
    )

def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """
    Creates new features and cleans a DataFrame.

    Args:
        df (pl.DataFrame): The input Polars DataFrame.

    Returns:
        pl.DataFrame: The DataFrame with new features, selected columns, and no null values.
    """
    vars_to_keep: List[str] = [
        # D Columns
        "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9",
        # E Columns
        "E1", "E2", "E3", "E4", "E5", "E6", "E7", "E8", "E9", "E10", "E11", "E12",
        "E13", "E14", "E15", "E16", "E17", "E18", "E19", "E20",
        # I Columns
        "I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9",
        # M Columns
        "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9", "M10", "M11", "M12",
        "M13", "M14", "M15", "M16", "M17", "M18",
        # P Columns
        "P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11", "P12", "P13",
        # S Columns
        "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S11", "S12",
        # V Columns
        "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13",
        # Derived features
        "U1", "U2"
    ]
    
    # Only keep columns that actually exist in the dataframe
    available_cols = df.columns
    vars_to_keep = [col for col in vars_to_keep if col in available_cols]

    return (
        df.with_columns(
            (pl.col("I2") - pl.col("I1")).alias("U1"),
            (pl.col("M11") / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3)).alias("U2")
        )
        .select(["date_id", "target"] + vars_to_keep)
        .with_columns([
            pl.col(col).fill_null(pl.col(col).ewm_mean(com=0.5))
            for col in vars_to_keep
        ])
        .drop_nulls()
    )
    
def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Joins two dataframes by common columns and concatenates them vertically.

    Args:
        train (pl.DataFrame): The training DataFrame.
        test (pl.DataFrame): The testing DataFrame.

    Returns:
        pl.DataFrame: A single DataFrame with vertically stacked data from common columns.
    """
    common_columns: list[str] = [col for col in train.columns if col in test.columns]
    
    return pl.concat([train.select(common_columns), test.select(common_columns)], how="vertical")

def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput: 
    """
    Splits the data into features (X) and target (y), and scales the features.

    Args:
        train (pl.DataFrame): The processed training DataFrame.
        test (pl.DataFrame): The processed testing DataFrame.
        features (list[str]): List of features to used in model. 

    Returns:
        DatasetOutput: A dataclass containing the scaled feature sets, target series, and the fitted scaler.
    """
    X_train = train.select(features)
    y_train = train.get_column('target')
    X_test = test.select(features)
    y_test = test.get_column('target')
    
    scaler = StandardScaler() 
    
    X_train_scaled_np = scaler.fit_transform(X_train)
    X_train = pl.from_numpy(X_train_scaled_np, schema=features)
    
    X_test_scaled_np = scaler.transform(X_test)
    X_test = pl.from_numpy(X_test_scaled_np, schema=features)
    
    
    return DatasetOutput(
        X_train = X_train,
        y_train = y_train, 
        X_test = X_test, 
        y_test = y_test,
        scaler = scaler
    )

## Converting Return Prediction to Signal

In [8]:
def convert_ret_to_signal(
    ret_arr: np.ndarray,
    params: RetToSignalParameters
) -> np.ndarray:
    """
    Converts raw model predictions (expected returns) into a trading signal.

    Args:
        ret_arr (np.ndarray): The array of predicted returns.
        params (RetToSignalParameters): Parameters for scaling and clipping the signal.

    Returns:
        np.ndarray: The resulting trading signal, clipped between min and max values.
    """
    return np.clip(
        ret_arr * params.signal_multiplier + 1, params.min_signal, params.max_signal
    )

In [9]:
def calculate_adjusted_sharpe(
    position: np.ndarray,
    forward_returns: np.ndarray,
    risk_free_rate: np.ndarray,
    min_investment: float = 0.0,
    max_investment: float = 2.0,
    trading_days_per_yr: int = 252
) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Args:
        position: The predicted position/signal (0 to 2).
        forward_returns: Forward returns of the market.
        risk_free_rate: Risk-free rate.
        min_investment: Minimum allowed position (default 0.0).
        max_investment: Maximum allowed position (default 2.0).
        trading_days_per_yr: Trading days per year (default 252).

    Returns:
        float: The calculated adjusted Sharpe ratio.
    """
    # Validate position range
    if position.max() > max_investment:
        raise ValueError(f'Position of {position.max()} exceeds maximum of {max_investment}')
    if position.min() < min_investment:
        raise ValueError(f'Position of {position.min()} below minimum of {min_investment}')
    
    # Calculate strategy returns
    strategy_returns = risk_free_rate * (1 - position) + position * forward_returns
    
    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = strategy_returns - risk_free_rate
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(forward_returns)) - 1
    strategy_std = strategy_returns.std()
    
    if strategy_std == 0:
        raise ValueError('Division by zero, strategy std is zero')
    
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)
    
    # Calculate market return and volatility
    market_excess_returns = forward_returns - risk_free_rate
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(forward_returns)) - 1
    market_std = forward_returns.std()
    
    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)
    
    if market_volatility == 0:
        raise ValueError('Division by zero, market std is zero')
    
    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol
    
    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100
    
    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [None]:
import pandas as pd

# Load raw data
train_raw = load_trainset()
test_raw = load_testset()

# Feature engineering
train = create_example_dataset(train_raw)
test = create_example_dataset(test_raw)

# Extract feature names
features = [col for col in train.columns if col not in ("date_id", "target")]

# Split and scale
dataset = split_dataset(train, test, features)

# Show full table previews
preview_dataset(dataset, n=30)


# # Print outputs
# print("==== DATASET OUTPUT ====")
# print("X_train shape:", dataset.X_train.shape)
# print("X_test shape:", dataset.X_test.shape)
# print("y_train length:", len(dataset.y_train))
# print("y_test length:", len(dataset.y_test))
# print("\nScaler mean (first 5 features):", dataset.scaler.mean_[:100])
# print("\nDataset dataclass summary:")
# print(dataset)

def preview_dataset(dataset: DatasetOutput, n: int = 12) -> None:
    print("=== TRAIN FEATURES (X_train) ===")
    display(dataset.X_train.to_pandas().head(n))
    print("\n")
    print("=== TRAIN TARGET (y_train length) ===")
    display(pd.DataFrame(dataset.y_train.to_numpy(), columns=["target"]).head(n))

def preview_dataset(dataset: DatasetOutput, n: int = 12) -> None:
    print("=== TEST FEATURES (X_test) ===")
    display(dataset.X_test.to_pandas().head(n))
    print("\n")
    print("=== TEST TARGET (y_test length) ===")
    display(pd.DataFrame(dataset.y_test.to_numpy(), columns=["target"]).head(n))

def preview_dataset(dataset: DatasetOutput, n: int = 12) -> None:
    print("\n")
    print("\n=== SCALER SUMMARY ===")
    print("Mean (first 5):", dataset.scaler.mean_[:5])
    print("Scale (first 5):", dataset.scaler.scale_[:5])

NameError: name 'preview_dataset' is not defined

In [25]:
# Load and prepare
train_raw = load_trainset()
test_raw = load_testset()

train = create_example_dataset(train_raw)
test = create_example_dataset(test_raw)

# Define features
features = [col for col in train.columns if col not in ("date_id", "target")]

# Split
dataset = split_dataset(train, test, features)

# Show full table previews
preview_dataset(dataset, n=10)


import polars as pl
import pandas as pd

# Assuming your earlier dataclasses and functions (DatasetOutput, load_trainset, etc.) are already defined





=== TRAIN FEATURES (X_train) ===


Unnamed: 0,S2,E2,E3,P9,S1,S5,I2,P8,P10,P12,P13,U1,U2
0,-0.285955,1.112088,1.674268,1.627662,0.16674,-0.129887,-0.821713,0.276656,0.635604,-0.138087,0.309654,-0.803976,-0.016005
1,-0.395515,1.123292,1.686506,1.629398,0.159627,-0.041275,-0.811496,0.305054,0.646199,-0.51678,0.308522,-0.794458,-0.014148
2,0.04564,1.144137,1.710228,1.631135,0.159589,-0.0331,-0.851134,0.312833,0.661098,-0.700966,0.30739,-0.842952,-0.014439
3,0.817345,1.141328,1.705303,1.632871,0.14779,-0.411799,-0.865743,0.306904,0.651978,-0.515,0.306258,-0.853212,-0.01507
4,0.597401,1.131427,1.691863,1.634607,0.128952,-0.139996,-0.888585,0.303112,0.643282,-0.514195,0.305126,-0.872823,-0.01442
5,0.487707,1.146221,1.708103,1.636344,0.114871,-0.064001,-0.911353,0.303532,0.647676,0.791638,0.303993,-0.900752,-0.014758
6,0.818884,1.194114,1.76371,1.63808,0.100811,0.031098,-0.934034,0.320816,0.682545,0.412286,0.302861,-0.930325,-0.013438
7,0.818492,1.195059,1.763027,1.639816,0.079709,-0.288987,-0.931795,0.322269,0.685368,-0.142893,0.301729,-0.925471,-0.013006
8,1.038009,1.149421,1.70692,1.641553,0.079824,-0.160765,-0.93784,0.305796,0.65409,-0.697855,0.300597,-0.934209,-0.013037
9,0.486466,1.133204,1.686006,1.643289,0.065811,0.104361,-0.877735,0.306325,0.652019,-0.141291,0.299464,-0.881453,-0.013886



=== TEST FEATURES (X_test) ===


Unnamed: 0,S2,E2,E3,P9,S1,S5,I2,P8,P10,P12,P13,U1,U2
0,0.071202,0.577252,0.979946,-0.884859,0.082628,-0.124857,1.193316,0.343976,0.791547,0.813018,1.002565,1.462864,-0.01466
1,0.259606,0.53172,0.931516,-0.874441,0.089617,0.052103,1.165067,0.324842,0.762751,-0.37985,0.717249,1.436112,-0.014608
2,0.32179,0.483057,0.879877,-0.877914,0.056102,0.227982,1.204837,0.302967,0.736316,0.345253,-0.990119,1.484485,-0.014716
3,-0.058333,0.514029,0.912258,-0.874441,0.016603,0.090467,1.244544,0.324163,0.750651,0.345485,0.544021,1.54094,-0.014149
4,-0.186197,0.566596,0.967275,-0.858814,0.036628,-0.021156,1.182242,0.345811,0.772634,0.598144,-0.599508,1.486083,-0.014469
5,-0.440632,0.542675,0.941613,-0.858814,0.050615,0.01777,1.160807,0.34501,0.760766,0.599486,0.422875,1.455728,-0.014867
6,-0.124694,0.553926,0.953023,-0.865759,0.085354,0.063544,1.098616,0.354122,0.765722,0.844027,0.813486,1.399496,-0.01467
7,-0.315846,0.569067,0.96852,-0.855341,0.137411,-0.057029,1.050089,0.35579,0.766012,-0.629474,1.052382,1.35065,-0.014263
8,-0.316883,0.586179,0.98603,-0.839714,0.16618,-0.03225,1.062804,0.355952,0.769013,1.335549,1.444125,1.363049,-0.01408
9,-0.000524,0.61443,1.015162,-0.837977,0.165597,-0.123211,1.034709,0.360814,0.789748,0.352787,1.316186,1.311524,-0.013962



=== TRAIN TARGET (y_train) ===


Unnamed: 0,target
0,0.003079
1,0.004344
2,-0.001013
3,-0.001524
4,0.000767
5,0.010164
6,0.002256
7,-0.010042
8,-0.002539
9,0.003291



=== TEST TARGET (y_test) ===


Unnamed: 0,target
0,0.003541
1,-0.005964
2,-0.00741
3,0.00542
4,0.008357
5,-0.002896
6,0.002457
7,0.002312
8,0.002891
9,0.00831



=== SCALER SUMMARY ===
Mean (first 5): [0.01165369 0.42743324 0.24506963 0.36680174 0.179122  ]
Scale (first 5): [1.04017828 1.44067354 1.4975419  0.38089654 1.39900821]


In [10]:
def main():
    """Main function to run the Hull Tactical Market Prediction model."""
    print("Starting Hull Tactical Market Prediction...")
    
    # Looking at the Data
    train: pl.DataFrame = load_trainset()
    test: pl.DataFrame = load_testset() 
    print("Training data shape:", train.shape)
    print("Test data shape:", test.shape)
    print(train.tail(3)) 
    print(test.head(3))

    # Store financial columns from training data before joining
    train_financial = train.select(['date_id', 'forward_returns', 'risk_free_rate'])
    
    # Generating the Train and Test
    df: pl.DataFrame = join_train_test_dataframes(train, test)
    df = create_example_dataset(df=df) 
    train_date_ids = train.get_column('date_id')
    test_date_ids = test.get_column('date_id')
    train: pl.DataFrame = df.filter(pl.col('date_id').is_in(train_date_ids.to_list()))
    test: pl.DataFrame = df.filter(pl.col('date_id').is_in(test_date_ids.to_list()))
    
    # Join financial columns back to train
    train = train.join(train_financial, on='date_id', how='left')

    # Exclude financial columns from features to avoid data leakage
    excluded_cols = ['date_id', 'target', 'forward_returns', 'risk_free_rate']
    FEATURES: list[str] = [col for col in test.columns if col not in excluded_cols]
    print(f"Features used: {FEATURES}")

    dataset: DatasetOutput = split_dataset(train=train, test=test, features=FEATURES) 

    X_train: pl.DataFrame = dataset.X_train
    X_test: pl.DataFrame = dataset.X_test
    y_train: pl.Series = dataset.y_train
    y_test: pl.Series = dataset.y_test
    scaler: StandardScaler = dataset.scaler 

    # Fitting the Model
    print("Fitting ElasticNet model with cross-validation...")
    model_cv: ElasticNetCV = ElasticNetCV(
        **asdict(enet_params)
    )
    model_cv.fit(X_train, y_train) 
            
    # Fit the final model using the best alpha found by cross-validation
    model: ElasticNet = ElasticNet(alpha=model_cv.alpha_, l1_ratio=enet_params.l1_ratio) 
    model.fit(X_train, y_train)
    
    print(f"Best alpha found: {model_cv.alpha_}")
    print(f"Model coefficients: {model.coef_}")
    
    # Make predictions
    predictions = model.predict(X_test)
    signals = convert_ret_to_signal(predictions, ret_signal_params)
    
    print(f"Predictions shape: {predictions.shape}")
    print(f"Signals shape: {signals.shape}")
    print(f"Signal range: [{signals.min():.4f}, {signals.max():.4f}]")
    
    # Calculate the adjusted Sharpe ratio score on training data (validation)
    if 'forward_returns' in train.columns and 'risk_free_rate' in train.columns:
        # Use the last portion of training data as validation
        val_size = min(1000, len(train))
        val_train = train.tail(val_size)
        val_X = val_train.select(FEATURES)
        val_X_scaled_np = scaler.transform(val_X)
        val_X_scaled = pl.from_numpy(val_X_scaled_np, schema=FEATURES)
        
        val_predictions = model.predict(val_X_scaled)
        val_signals = convert_ret_to_signal(val_predictions, ret_signal_params)
        
        forward_returns = val_train.get_column('forward_returns').to_numpy()
        risk_free_rate = val_train.get_column('risk_free_rate').to_numpy()
        
        try:
            adjusted_sharpe = calculate_adjusted_sharpe(
                position=val_signals,
                forward_returns=forward_returns,
                risk_free_rate=risk_free_rate
            )
            print(f"\nValidation Adjusted Sharpe Ratio: {adjusted_sharpe:.4f}")
        except Exception as e:
            print(f"\nError calculating validation score: {e}")
    
    return model, predictions, signals, test

## Main function

In [11]:
if __name__ == "__main__":
    model, predictions, signals, test_data = main()

Starting Hull Tactical Market Prediction...
Training data shape: (8980, 98)
Test data shape: (10, 99)
shape: (3, 98)
┌─────────┬─────┬─────┬─────┬───┬───────────┬─────────────────┬────────────────┬──────────┐
│ date_id ┆ D1  ┆ D2  ┆ D3  ┆ … ┆ V9        ┆ forward_returns ┆ risk_free_rate ┆ target   │
│ ---     ┆ --- ┆ --- ┆ --- ┆   ┆ ---       ┆ ---             ┆ ---            ┆ ---      │
│ i64     ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64       ┆ f64             ┆ f64            ┆ f64      │
╞═════════╪═════╪═════╪═════╪═══╪═══════════╪═════════════════╪════════════════╪══════════╡
│ 8977    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.708599 ┆ 0.004187        ┆ 0.000162       ┆ 0.003713 │
│ 8978    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.725858 ┆ 0.002279        ┆ 0.000162       ┆ 0.001805 │
│ 8979    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.720092 ┆ 0.003541        ┆ 0.000161       ┆ 0.003068 │
└─────────┴─────┴─────┴─────┴───┴───────────┴─────────────────┴────────────────┴──────────┘
shape: (3, 99)
┌─────────┬─────┬─────┬─────┬───┬───────