In [17]:
import pandas as pd
import lightgbm as lgb
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# --- Configuration ---
# IMPORTANT: Make sure these paths are correct for your local setup.
DATA_PATH = '/Users/rushilpatel/Downloads/hull-tactical-market-prediction/'
OUTPUT_PATH = '/Users/rushilpatel/Downloads/hull-tactical-market-prediction/'

TRAIN_FILE_PATH = os.path.join(DATA_PATH, 'train.csv')
MODEL_OUTPUT_PATH = os.path.join(OUTPUT_PATH, 'lgbm_model.pkl')
FEATURES_OUTPUT_PATH = os.path.join(OUTPUT_PATH, 'selected_features.txt')
DAYS_PER_YEAR = 252

# --- SYNCHRONIZED WITH YOUR WORKING SUBMISSION SCRIPT ---
ALLOCATION_SCALING_FACTOR = 7.0

# --- Custom Metric Implementation ---
def calculate_final_score(y_true, y_pred, df_for_metric):
    """Calculates the volatility- and return-penalized Sharpe ratio."""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    market_returns = df_for_metric['forward_returns'].values
    risk_free_rate = df_for_metric['risk_free_rate'].values
    
    positions = np.clip(1.0 + y_pred * ALLOCATION_SCALING_FACTOR, 0, 2)
    strategy_returns = risk_free_rate * (1 - positions) + positions * market_returns
    strategy_excess_returns = strategy_returns - risk_free_rate
    
    clipped_ser = np.clip(strategy_excess_returns, -0.5, 0.5)
    if len(clipped_ser) < 2: return 0.0
    
    geo_mean_strategy = np.expm1(np.mean(np.log1p(clipped_ser)))
    vol_strategy = np.std(strategy_returns, ddof=1) * np.sqrt(DAYS_PER_YEAR)
    if vol_strategy < 1e-6: return 0.0

    market_excess_returns = market_returns - risk_free_rate
    clipped_mer = np.clip(market_excess_returns, -0.5, 0.5)
    geo_mean_market = np.expm1(np.mean(np.log1p(clipped_mer)))
    vol_market = np.std(market_returns, ddof=1) * np.sqrt(DAYS_PER_YEAR)
    
    sharpe_ratio = (geo_mean_strategy / vol_strategy) * np.sqrt(DAYS_PER_YEAR)
    vol_penalty = 1 + max(0, (vol_strategy / vol_market) - 1.2)
    annualized_return_gap = max(0, (geo_mean_market - geo_mean_strategy) * DAYS_PER_YEAR)
    return_penalty = 1 + (annualized_return_gap ** 2) / 100
    
    return min(sharpe_ratio / (vol_penalty * return_penalty), 1_000_000)

def sharpe_eval_metric(y_true, y_pred):
    score = calculate_final_score(y_true, y_pred, val_metric_df)
    return 'custom_sharpe', score, True

# --- SYNCHRONIZED WITH YOUR WORKING SUBMISSION SCRIPT ---
def engineer_features(df):
    df = df.copy()
    print("Engineering features to match submission script...")
    
    df['P1_div_V1'] = df['P1'] / (df['V1'] + 1e-6)
    df['I1_mul_E1'] = df['I1'] * df['E1']
    lags = [1, 3, 5, 10]
    features_to_lag = ['M1', 'V1', 'P1', 'S1']
    for feature in features_to_lag:
        for lag in lags:
            df[f'{feature}_lag_{lag}'] = df[feature].shift(lag)
            
    return df

val_metric_df = None

def train_and_evaluate_model():
    global val_metric_df
    print("--- Starting Single Model Training Workflow ---")

    df_train = pd.read_csv(TRAIN_FILE_PATH, parse_dates=['date_id'])
    df_train = df_train.sort_values('date_id')


    base_features = [
        'M1', 'M5', 'M6', 'E1', 'E5', 'E7', 'I1', 'I5', 'I7',
        'P1', 'P5', 'P6', 'V1', 'V5', 'V8', 'V9',
        'S1', 'S5', 'S8', 'S10', 'D1', 'D5', 'D8'
    ]
    target = 'market_forward_excess_returns'
    
    df_train_engineered = engineer_features(df_train)
    
    engineered_feature_names = ['P1_div_V1', 'I1_mul_E1'] + [f'{f}_lag_{l}' for f in ['M1', 'V1', 'P1', 'S1'] for l in [1, 3, 5, 10]]
    final_features = base_features + engineered_feature_names

    df_train_engineered.dropna(subset=[target], inplace=True)
    
    X = df_train_engineered[final_features].ffill().bfill()
    y = df_train_engineered[target]

    # --- NEW: More Robust Train/Validation Split ---
    validation_size = 252 * 3  # Use last 3 years for validation
    gap_size = 21 # 1-month gap to prevent data leakage
    
    train_end_index = len(X) - validation_size - gap_size
    val_start_index = len(X) - validation_size

    X_train, y_train = X.iloc[:train_end_index], y.iloc[:train_end_index]
    X_val, y_val = X.iloc[val_start_index:], y.iloc[val_start_index:]
    
    val_metric_df = df_train_engineered.iloc[val_start_index:]

    print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")
    
    # --- Final Model Configuration ---
    print("\n--- 5. Training Final Single Model ---")
    lgbm = lgb.LGBMRegressor(
        objective='huber',
        random_state=42,
        n_estimators=4000,
        learning_rate=0.005, # Slow learning
        num_leaves=16,
        min_child_samples=200, # Strong constraint
        subsample=0.6,
        colsample_bytree=0.6,
        reg_alpha=0.1,
        reg_lambda=0.1
    )

    lgbm.fit(X_train, y_train,
             eval_set=[(X_val, y_val)],
             eval_metric=sharpe_eval_metric,
             callbacks=[lgb.early_stopping(300, verbose=True), lgb.log_evaluation(300)])

    # --- Retrain and Save Final Model ---
    print("\n--- 6. Training Final Model on All Data ---")
    final_model = lgb.LGBMRegressor(**lgbm.get_params())
    best_iter = lgbm.best_iteration_ if lgbm.best_iteration_ is not None and lgbm.best_iteration_ > 0 else 1
    final_model.set_params(n_estimators=best_iter)
    final_model.fit(X, y)
    print("Final model training complete.")

    print(f"Saving final model to: {MODEL_OUTPUT_PATH}")
    with open(MODEL_OUTPUT_PATH, 'wb') as f:
        pickle.dump(final_model, f)
    print("Model saved successfully.")
    
    print(f"Saving feature list to: {FEATURES_OUTPUT_PATH}")
    with open(FEATURES_OUTPUT_PATH, 'w') as f:
        for feature in X.columns:
            f.write(f"{feature}\n")
    print("Feature list saved successfully.")

if __name__ == '__main__':
    if not os.path.exists(OUTPUT_PATH):
        os.makedirs(OUTPUT_PATH)
    train_and_evaluate_model()



--- Starting Single Model Training Workflow ---
Engineering features to match submission script...
Training data shape: (8213, 41), Validation data shape: (756, 41)

--- 5. Training Final Single Model ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9660
[LightGBM] [Info] Number of data points in the train set: 8213, number of used features: 41
[LightGBM] [Info] Start training from score 0.000022
Training until validation scores don't improve for 300 rounds


  df_train = pd.read_csv(TRAIN_FILE_PATH, parse_dates=['date_id'])


[300]	valid_0's huber: 3.30481e-05	valid_0's custom_sharpe: 0.0764093
Early stopping, best iteration is:
[2]	valid_0's huber: 3.29196e-05	valid_0's custom_sharpe: 0.0765416

--- 6. Training Final Model on All Data ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9671
[LightGBM] [Info] Number of data points in the train set: 8990, number of used features: 41
[LightGBM] [Info] Start training from score 0.000051
Final model training complete.
Saving final model to: /Users/rushilpatel/Downloads/hull-tactical-market-prediction/lgbm_model.pkl
Model saved successfully.
Saving feature list to: /Users/rushilpatel/Downloads/hull-tactical-market-prediction/selected_features.txt
Feature list saved successfully.
