In [37]:
import polars as pl

base_dir = 'hull-tactical-market-prediction'
tr = base_dir + '/train.csv'
te = base_dir + '/test.csv'

In [38]:
train = pl.read_csv(tr)
test = pl.read_csv(te)

In [None]:
m_cols = [f'M{i}' for i in range(1, 19)]
p_cols = [f'P{i}' for i in range(1, 14)]
v_cols = [f'V{i}' for i in range(1, 14)] + ['forward_returns']
cols_to_cast = (
    [f'E{i}' for i in range(1, 21)] +
    [f'I{i}' for i in range(1, 10)] +
    m_cols +
    [f'S{i}' for i in range(1, 13)] +
    [f'V{i}' for i in range(1, 14)] +
    p_cols
)

train = train.with_columns(
    pl.col(cols_to_cast).cast(pl.Float64, strict=False)
)

windows = [5, 20, 100]
rolling_mean_exprs = [
    pl.col(col)
    .rolling_mean(window_size=window, min_samples=1)
    .alias(f'{col}_rolling_mean_{window}')
    for col in m_cols
    for window in windows
]
train = train.with_columns(rolling_mean_exprs)

ratio_exprs = [
    (pl.col(f'M{i}_rolling_mean_20') / pl.col(f'M{i}_rolling_mean_100'))
    .alias(f'M{i}_moving_average_ratio_20/100')
    for i in range(1, 19)
]
train = train.with_columns(ratio_exprs)

lag_exprs = [
    pl.col(col).shift(day)
    .alias(f'{col}_lag_{day}')
    for col in ['market_forward_excess_returns', 'forward_returns']
    for day in [1, 2, 5]
]

std_window = 20
rolling_std_exprs = [
    pl.col(col)
    .rolling_std(window_size=std_window, min_samples=2)
    .alias(f'{col}_rolling_std_{std_window}')
    for col in v_cols
]

z_window = 100
zscore_exprs = [
    ((pl.col(col) - pl.col(col).rolling_mean(window_size=z_window, min_samples=1))
     / pl.col(col).rolling_std(window_size=z_window, min_samples=2))
    .alias(f'{col}_zscore_{z_window}')
    for col in p_cols
]

independent_features = lag_exprs + rolling_std_exprs + zscore_exprs
train = train.with_columns(independent_features)

allocation_feature = (
    pl.when((pl.col('forward_returns_lag_1') > 0) & (pl.col('forward_returns_lag_2') > 0))
    .then(1.5)
    .when(pl.col('forward_returns_lag_1') < 0)
    .then(0.5)
    .otherwise(1.0)
    .alias('momentum_allocation_feature')
)

train = train.with_columns(allocation_feature)

train = train.fill_null(strategy="forward").fill_null(strategy="backward")

train

ColumnNotFoundError: unable to find column "forward_returns_lag_1"; valid columns: ["date_id", "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "E1", "E10", "E11", "E12", "E13", "E14", "E15", "E16", "E17", "E18", "E19", "E2", "E20", "E3", "E4", "E5", "E6", "E7", "E8", "E9", "I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "M1", "M10", "M11", "M12", "M13", "M14", "M15", "M16", "M17", "M18", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9", "P1", "P10", "P11", "P12", "P13", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "S1", "S10", "S11", "S12", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "V1", "V10", "V11", "V12", "V13", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "forward_returns", "risk_free_rate", "market_forward_excess_returns", "M1_rolling_mean_5", "M1_rolling_mean_20", "M1_rolling_mean_100", "M2_rolling_mean_5", "M2_rolling_mean_20", "M2_rolling_mean_100", "M3_rolling_mean_5", "M3_rolling_mean_20", "M3_rolling_mean_100", "M4_rolling_mean_5", "M4_rolling_mean_20", "M4_rolling_mean_100", "M5_rolling_mean_5", "M5_rolling_mean_20", "M5_rolling_mean_100", "M6_rolling_mean_5", "M6_rolling_mean_20", "M6_rolling_mean_100", "M7_rolling_mean_5", "M7_rolling_mean_20", "M7_rolling_mean_100", "M8_rolling_mean_5", "M8_rolling_mean_20", "M8_rolling_mean_100", "M9_rolling_mean_5", "M9_rolling_mean_20", "M9_rolling_mean_100", "M10_rolling_mean_5", "M10_rolling_mean_20", "M10_rolling_mean_100", "M11_rolling_mean_5", "M11_rolling_mean_20", "M11_rolling_mean_100", "M12_rolling_mean_5", "M12_rolling_mean_20", "M12_rolling_mean_100", "M13_rolling_mean_5", "M13_rolling_mean_20", "M13_rolling_mean_100", "M14_rolling_mean_5", "M14_rolling_mean_20", "M14_rolling_mean_100", "M15_rolling_mean_5", "M15_rolling_mean_20", "M15_rolling_mean_100", "M16_rolling_mean_5", "M16_rolling_mean_20", "M16_rolling_mean_100", "M17_rolling_mean_5", "M17_rolling_mean_20", "M17_rolling_mean_100", "M18_rolling_mean_5", "M18_rolling_mean_20", "M18_rolling_mean_100", "M1_moving_average_ratio_20/100", "M2_moving_average_ratio_20/100", "M3_moving_average_ratio_20/100", "M4_moving_average_ratio_20/100", "M5_moving_average_ratio_20/100", "M6_moving_average_ratio_20/100", "M7_moving_average_ratio_20/100", "M8_moving_average_ratio_20/100", "M9_moving_average_ratio_20/100", "M10_moving_average_ratio_20/100", "M11_moving_average_ratio_20/100", "M12_moving_average_ratio_20/100", "M13_moving_average_ratio_20/100", "M14_moving_average_ratio_20/100", "M15_moving_average_ratio_20/100", "M16_moving_average_ratio_20/100", "M17_moving_average_ratio_20/100", "M18_moving_average_ratio_20/100"]

date_id - An identifier for a single trading day.

M* - Market Dynamics/Technical features.

E* - Macro Economic features.

I* - Interest Rate features.

P* - Price/Valuation features.

V* - Volatility features.

S* - Sentiment features.

MOM* - Momentum features.

D* - Dummy/Binary features.

forward_returns - The returns from buying the S&P 500 and selling it a day later. Train set only.

risk_free_rate - The federal funds rate. Train set only. 

market_forward_excess_returns - Forward returns relative to expectations. 
Computed by subtracting the rolling five-year mean forward returns and winsorizing the result using a median absolute deviation (MAD) with a criterion of 4. Train set only.

In [28]:
#ridge model
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

features = train.select(
    pl.exclude([
        'date_id', 
        'market_forward_excess_returns', 
        'forward_returns',                 
        'risk_free_rate'                   
    ])
)
target = train.select(['market_forward_excess_returns'])

split = int(train.height * 0.8)

X_train = features.head(split).to_numpy()
y_train = target.head(split).to_numpy().ravel()
X_test = features.tail(-split).to_numpy()
y_test = target.tail(-split).to_numpy().ravel()

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

for a in [0.001, 0.01, 1, 10, 100, 1000, 10000]:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train, y_train)

    train_score = ridge.score(X_train, y_train)
    test_score = ridge.score(X_test, y_test)

    print(f'Train score at alpha {a}: {train_score}', flush = True)
    print(f'Test score at alpha {a}: {test_score}', flush = True)


Train score at alpha 0.001: 0.05395748906251929
Test score at alpha 0.001: -113.62522612308982
Train score at alpha 0.01: 0.053945960763281176
Test score at alpha 0.01: -94.38131150033021
Train score at alpha 1: 0.05300282581735061
Test score at alpha 1: -2.103509156204347
Train score at alpha 10: 0.05098960708762945
Test score at alpha 10: -11.436165348256411
Train score at alpha 100: 0.04582201880138426
Test score at alpha 100: -14.258967584923722
Train score at alpha 1000: 0.035692610936638225
Test score at alpha 1000: -8.472851243772427
Train score at alpha 10000: 0.019494855532804056
Test score at alpha 10000: -0.6152066704804546


In [29]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error

params = {
    'objective': 'regression',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'n_jobs': 1
}

lgbm = lgb.LGBMRegressor(**params)

lgbm.fit(
    X_train, y_train,
    eval_set = [(X_test, y_test)],
    eval_metric = 'rmse',
    callbacks = [lgb.early_stopping(100, verbose = True)]
)

y_pred = lgbm.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'RMSE: {rmse}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019750 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45638
[LightGBM] [Info] Number of data points in the train set: 7192, number of used features: 196
[LightGBM] [Info] Start training from score 0.000014
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.0111117	valid_0's l2: 0.00012347
RMSE: 0.011111692452367193




In [None]:
#raw predictions - y_pred

scaling_factors = range(0, 1000, 5)
solution_pl = train.tail(-split).select(['forward_returns', 'risk_free_rate'])

best_score = -np.inf
best_factor = 0
results = []

for factor in scaling_factors:
    allocations = np.clip((1 + (y_pred * factor)), 0, 2)
    #allocations = (2) / (1 + np.exp(-y_pred * factor))
    submission_pl = pl.DataFrame({
        'prediction': allocations
    })
    
    curr_score = score(
        solution = solution_pl.to_pandas(),
        submission = submission_pl.to_pandas(),
        row_id_column_name = ''
    )
    
    results.append({
        'factor': factor,
        'score': curr_score
    })
    
    if curr_score > best_score:
        best_score = curr_score
        best_factor = factor

print(f'Best scaling factor: {best_factor}')
print(f'Best score: {best_score}')

Best scaling factor: 730
Best score: 0.6477349220761722


In [30]:
#evaluation code

import numpy as np
import pandas as pd
import pandas.api.types

MIN_INVESTMENT = 0
MAX_INVESTMENT = 2


class ParticipantVisibleError(Exception):
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Returns:
        float: The calculated adjusted Sharpe ratio.
    """

    if not pandas.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    solution = solution
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        raise ParticipantVisibleError('Division by zero, strategy std is zero')
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        raise ParticipantVisibleError('Division by zero, market std is zero')

    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [None]:
days = [1, 2, 3, 4, 5, 10, 20]
col = 'forward_returns'
lagged_exprs = [
    pl.col(col).shift(day)
    .alias(f'{col}_lag_{day}')
    for day in days
]

alloc = train.with_columns(lagged_exprs).select([f'{col}_lag_{day}' for day in days])
alloc = alloc.fill_null(strategy = 'forward').fill_null(strategy = 'backward')
target = train.select('market_forward_excess_returns')


forward_returns_lag_1,forward_returns_lag_2,forward_returns_lag_3,forward_returns_lag_4,forward_returns_lag_5,forward_returns_lag_10,forward_returns_lag_20
f64,f64,f64,f64,f64,f64,f64
-0.002421,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421
-0.002421,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421
-0.008495,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421
-0.009624,-0.008495,-0.002421,-0.002421,-0.002421,-0.002421,-0.002421
0.004662,-0.009624,-0.008495,-0.002421,-0.002421,-0.002421,-0.002421
…,…,…,…,…,…,…
-0.002896,0.008357,0.00542,-0.00741,-0.005964,0.015341,0.007798
0.002457,-0.002896,0.008357,0.00542,-0.00741,-0.004386,-0.001977
0.002312,0.002457,-0.002896,0.008357,0.00542,0.004187,0.010646
0.002891,0.002312,0.002457,-0.002896,0.008357,0.002279,0.003423


In [None]:
from sklearn.linear_model import LogisticRegression

y_mini_train = np.sign(target.to_numpy().ravel())

mini_model = LogisticRegression()
mini_model.fit(alloc.to_numpy(), y_mini_train)

momentum_feature_values = mini_model.predict_proba(alloc.to_numpy())[:, 1]

print(momentum_feature_values)

array([0.51951711, 0.51951711, 0.52162108, ..., 0.51476374, 0.51416013,
       0.51317477], shape=(8990,))

In [None]:
from sklearn.metrics import roc_auc_score

split = int(alloc.height * 0.8)

X_mini_train = alloc.head(split)
y_train_full = target.head(split)
X_mini_valid = alloc.tail(-split)
y_valid_full = target.tail(-split)

y_train_binary = np.where(y_train_full.to_numpy().ravel() > 0, 1, 0)
y_valid_binary = np.where(y_valid_full.to_numpy().ravel() > 0, 1, 0)

mini_model = LogisticRegression()
mini_model.fit(X_mini_train.to_numpy(), y_train_binary)

valid_probabilities = mini_model.predict_proba(X_mini_valid.to_numpy())[:, 1]

auc_score = roc_auc_score(y_valid_binary, valid_probabilities)
print(f"ROC AUC Score of the mini-model on the validation set: {auc_score:.4f}")


ROC AUC Score of the mini-model on the validation set: 0.5191
