In [1]:
import pandas as pd
import lightgbm as lgb
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# --- Configuration ---
# IMPORTANT: Make sure these paths are correct for your local setup.
DATA_PATH = '/Users/rushilpatel/Downloads/hull-tactical-market-prediction/'
OUTPUT_PATH = '/Users/rushilpatel/Downloads/hull-tactical-market-prediction/'

TRAIN_FILE_PATH = os.path.join(DATA_PATH, 'train.csv')
FEATURES_OUTPUT_PATH = os.path.join(OUTPUT_PATH, 'selected_features.txt')
DAYS_PER_YEAR = 252

# --- STRATEGY HYPERPARAMETER ---
# A moderate scaling factor provides a good learning signal for the metric.
ALLOCATION_SCALING_FACTOR = 50.0 

# --- Custom Metric Implementation ---
def calculate_final_score(y_true, y_pred, df_for_metric):
    """Calculates the volatility- and return-penalized Sharpe ratio."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    market_returns = df_for_metric['forward_returns'].values
    risk_free_rate = df_for_metric['risk_free_rate'].values
    
    positions = np.clip(1.0 + y_pred * ALLOCATION_SCALING_FACTOR, 0, 2)
    strategy_returns = risk_free_rate * (1 - positions) + positions * market_returns
    strategy_excess_returns = strategy_returns - risk_free_rate
    
    clipped_ser = np.clip(strategy_excess_returns, -0.5, 0.5)
    if len(clipped_ser) < 2: return 0.0
    
    geo_mean_strategy = np.expm1(np.mean(np.log1p(clipped_ser)))
    vol_strategy = np.std(strategy_returns, ddof=1) * np.sqrt(DAYS_PER_YEAR)
    if vol_strategy < 1e-6: return 0.0

    market_excess_returns = market_returns - risk_free_rate
    clipped_mer = np.clip(market_excess_returns, -0.5, 0.5)
    geo_mean_market = np.expm1(np.mean(np.log1p(clipped_mer)))
    vol_market = np.std(market_returns, ddof=1) * np.sqrt(DAYS_PER_YEAR)
    
    sharpe_ratio = (geo_mean_strategy / vol_strategy) * np.sqrt(DAYS_PER_YEAR)
    vol_penalty = 1 + max(0, (vol_strategy / vol_market) - 1.2)
    annualized_return_gap = max(0, (geo_mean_market - geo_mean_strategy) * DAYS_PER_YEAR)
    return_penalty = 1 + (annualized_return_gap ** 2) / 100
    
    return min(sharpe_ratio / (vol_penalty * return_penalty), 1_000_000)

def sharpe_eval_metric(y_true, y_pred):
    score = calculate_final_score(y_true, y_pred, val_metric_df)
    return 'custom_sharpe', score, True

# --- Feature Engineering ---
def engineer_features(df):
    df = df.copy()
    print("Engineering features...")
    
    df['P1_div_V1'] = df['P1'] / (df['V1'] + 1e-6)
    df['I1_mul_E1'] = df['I1'] * df['E1']
    lags = [1, 5, 21]
    features_to_lag = ['M1', 'V1', 'P1', 'S1']
    for feature in features_to_lag:
        for lag in lags:
            df[f'{feature}_lag_{lag}'] = df[feature].shift(lag)
            
    return df

val_metric_df = None

def train_with_cross_validation():
    global val_metric_df
    print("--- Starting Time Series Cross-Validation Workflow ---")

    df_train = pd.read_csv(TRAIN_FILE_PATH, parse_dates=['date_id']).sort_values('date_id')

    base_features = [
        'M1', 'M5', 'M6', 'E1', 'E5', 'E7', 'I1', 'I5', 'I7',
        'P1', 'P5', 'P6', 'V1', 'V5', 'V8', 'V9',
        'S1', 'S5', 'S8', 'S10', 'D1', 'D5', 'D8'
    ]
    target = 'market_forward_excess_returns'
    
    df_train_engineered = engineer_features(df_train)
    
    engineered_feature_names = ['P1_div_V1', 'I1_mul_E1'] + [f'{f}_lag_{l}' for f in ['M1', 'V1', 'P1', 'S1'] for l in [1, 5, 21]]
    final_features = base_features + engineered_feature_names

    df_train_engineered.dropna(subset=[target], inplace=True)
    
    X = df_train_engineered[final_features].ffill().bfill()
    y = df_train_engineered[target]

    # --- Time Series Cross-Validation Setup ---
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    print(f"\n--- Starting training for {n_splits} folds ---")

    for fold, (train_index, val_index) in enumerate(tscv.split(X)):
        print(f"\n--- Fold {fold+1}/{n_splits} ---")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        val_metric_df = df_train_engineered.iloc[val_index]

        print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")
        
        lgbm = lgb.LGBMRegressor(
            objective='huber',
            random_state=42 + fold,
            n_estimators=3000,
            learning_rate=0.01,
            num_leaves=24,
            min_child_samples=100,
            subsample=0.7,
            colsample_bytree=0.7,
            reg_alpha=0.2,
            reg_lambda=0.2
        )

        lgbm.fit(X_train, y_train,
                 eval_set=[(X_val, y_val)],
                 eval_metric=sharpe_eval_metric,
                 callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(500)])
        
        if lgbm.best_score_:
            #print(f"Fold {fold+1} best score: {lgbm.best_score_['valid_0']['custom_sharpe']:.5f} at iteration {lgbm.best_iteration_}")
        
            model_output_path = os.path.join(OUTPUT_PATH, f'lgbm_model_fold_{fold}.pkl')
            print(f"Saving model for fold {fold+1} to {model_output_path}")
            with open(model_output_path, 'wb') as f:
                pickle.dump(lgbm, f)
        else:
            print(f"Fold {fold+1} did not produce a valid model.")

    print(f"\nSaving final feature list to: {FEATURES_OUTPUT_PATH}")
    with open(FEATURES_OUTPUT_PATH, 'w') as f:
        for feature in X.columns:
            f.write(f"{feature}\n")
    print("Feature list saved successfully.")

if __name__ == '__main__':
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    train_with_cross_validation()



--- Starting Time Series Cross-Validation Workflow ---
Engineering features...

--- Starting training for 5 folds ---

--- Fold 1/5 ---
Training data shape: (1500, 37), Validation data shape: (1498, 37)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5063
[LightGBM] [Info] Number of data points in the train set: 1500, number of used features: 22
[LightGBM] [Info] Start training from score 0.000231


  df_train = pd.read_csv(TRAIN_FILE_PATH, parse_dates=['date_id']).sort_values('date_id')


Saving model for fold 1 to /Users/rushilpatel/Downloads/hull-tactical-market-prediction/lgbm_model_fold_0.pkl

--- Fold 2/5 ---
Training data shape: (2998, 37), Validation data shape: (1498, 37)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5570
[LightGBM] [Info] Number of data points in the train set: 2998, number of used features: 24
[LightGBM] [Info] Start training from score -0.000073
Saving model for fold 2 to /Users/rushilpatel/Downloads/hull-tactical-market-prediction/lgbm_model_fold_1.pkl

--- Fold 3/5 ---
Training data shape: (4496, 37), Validation data shape: (1498, 37)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000477 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7117
[LightGBM] [Info] Number of data points in the train set: 4496, number o