### 1. Setup

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import scipy.stats as stats

from utils import data_prep as dp, feature_gen as fg, feature_select as fs, backtest as bt

### 2. Load & Clean Data

In [None]:
# Venue and date-range bounds (YYYYMMDD strings) for the data pull.
exchange = "binance"

start_date = "20230101"
end_date = "20241231"

# Both bounds share one format, so parse them together.
start_dt, end_dt = (
    dt.datetime.strptime(day, "%Y%m%d") for day in (start_date, end_date)
)

In [None]:
DATA_PATH = "/data/time_series/market_data/"

# The three datasets differ only in the data-kind argument, so fetch them in
# one pass (evaluated in order: level1, book, trade).
level1_data, book_data, trade_data = (
    dp.get_files(start_dt, end_dt, kind, exchange, "futures", "BTCUSDT", DATA_PATH)
    for kind in ("level1", "book", "trade")
)

# Display the shape of each loaded frame.
tuple(frame.shape for frame in (level1_data, book_data, trade_data))

### 2.1 Carryover Analysis & Handling

In [None]:
# Collect carryover statistics for each dataset, keyed by the given names.
carryover_stats = dp.analyze_carryover(
    level1_data,
    book_data,
    trade_data,
    names=["level1_data", "book_data", "trade_data"],
)

for name, status in carryover_stats.items():
    if "status" not in status:
        # Full statistics are available for this dataset.
        print(f"\n{name}:")
        print(f"  Total rows: {status['total_rows']:,}")
        print(f"  Missing original data (carryover != 0): {status['missing_original']:,} ({status['missing_original_pct']:.2f}%)")
        print(f"  Completely missing (carryover == -1): {status['completely_missing']:,} ({status['completely_missing_pct']:.2f}%)")
    else:
        # Entries with a "status" key only carry a message; print it and move on.
        print(f"\n{name}: {status['status']}")

In [None]:
# Replace the raw frames with their cleaned counterparts.
# NOTE(review): later cells read "is_carryover" and "sample_weight" from
# level1_data, presumably added here by dp.clean_carryover — confirm.
cleaning_options = {"carryover_weight": 0.5, "remove_completely_missing": True}
level1_data, book_data, trade_data = dp.clean_carryover(
    level1_data, book_data, trade_data, **cleaning_options
)

In [None]:
# Parse epoch values (interpreted as milliseconds) into datetimes.
for frame in (level1_data, trade_data):
    frame['ts_end'] = pd.to_datetime(frame['ts_end'], unit='ms')
book_ts_cols = ['ts_end', 'ts_book']
book_data[book_ts_cols] = book_data[book_ts_cols].apply(pd.to_datetime, unit='ms')

# Index every frame by ts_end (in place, so this cell cannot be re-run
# without reloading the data).
for frame in (level1_data, book_data, trade_data):
    frame.set_index('ts_end', inplace=True)

# Align the three series and get the bounds reported by dp.align_ts.
aligned_frames, start, end = dp.align_ts(level1_data, book_data, trade_data)
level1_data, book_data, trade_data = aligned_frames

# Shared 1-minute grid between those bounds.
time_idx = pd.date_range(start=start, end=end, freq='1min')

time_idx[[0, -1]]

In [None]:
# Log of the ratio between consecutive `close_mid` values; the first row has
# no predecessor and is therefore NaN.
close_mid = level1_data['close_mid']
level1_data['log_return'] = np.log(close_mid / close_mid.shift(1))

### 3. Build Features

In [None]:
# Window lengths handed to fg.build_features.
# NOTE(review): presumably lookbacks in minutes (the common grid is 1-minute) — confirm in feature_gen.
taus = [1, 5, 15, 30, 60, 120]
features_df = fg.build_features(level1_data, book_data, trade_data, taus)
# Display the last rows of the feature frame.
features_df.tail()

### 4. Build Target

In [None]:
# Build the target frame for a single horizon of 60.
# NOTE(review): the backtest cell reads a column named "target_rv_fwd60m", so this is
# presumably forward-looking 60-minute realized volatility — confirm in feature_gen.
target_df = fg.target_rv(level1_data, horizons=[60])
# Display the first rows of the target frame.
target_df.head()

### 5. Combine Features & Target

In [None]:
# Join features and target column-wise and keep only fully populated rows.
df = pd.concat([features_df, target_df], axis=1).dropna(axis=0)

# Attach the data-quality columns from level1_data (aligned on the index).
# They are added after dropna, so they play no part in the row filter.
# NOTE(review): these columns sit next to the features; check that
# fs.select_ic does not treat them as candidate features.
df = df.assign(
    is_carryover=level1_data["is_carryover"],
    sample_weight=level1_data["sample_weight"],
)

df.head()

### 6. Walk-Forward CV

In [None]:
# Walk-forward folds. Each tuple is (train_start, train_end, val_start, val_end)
# and is unpacked in that order by the CV cells below. Every fold pairs a
# nine-month training window with the following quarter as validation, and
# consecutive folds are shifted by one quarter.
folds = [
    ("2023-01-01", "2023-09-30", "2023-10-01", "2023-12-31"), # fold 1
    ("2023-04-01", "2023-12-31", "2024-01-01", "2024-03-31"), # fold 2
    ("2023-07-01", "2024-03-31", "2024-04-01", "2024-06-30"), # fold 3
]

# Name of the column to predict: the first column of target_df.
target = target_df.columns[0]

In [None]:
# Hyperparameter candidates; the grid is their Cartesian product with a
# fixed number of boosting rounds.
num_leaves_list = [31, 63, 127]
learning_rates  = [0.05, 0.10]

# num_leaves varies slowest, learning_rate fastest.
param_grid = [
    dict(num_leaves=leaves, learning_rate=rate, n_estimators=300)
    for leaves in num_leaves_list
    for rate in learning_rates
]

In [None]:
def train_lgb(train_df, val_df, feats, target, params, use_sample_weight=True):
    """Fit a LightGBM regressor on one window and score it on another.

    Args:
        train_df: DataFrame holding the feature columns, the target column
            and, optionally, a "sample_weight" column.
        val_df: DataFrame with the same feature and target columns; used
            only for scoring.
        feats: List of feature column names.
        target: Name of the target column.
        params: Dict providing "num_leaves", "learning_rate" and
            "n_estimators".
        use_sample_weight: When True and train_df has a "sample_weight"
            column, those values are passed to ``fit`` as sample weights.

    Returns:
        Tuple ``(rmse, mae, ic)`` computed on val_df, where ``ic`` is the
        Spearman rank correlation between predictions and the target.
    """
    model = lgb.LGBMRegressor(
        num_leaves=params["num_leaves"],
        learning_rate=params["learning_rate"],
        n_estimators=params["n_estimators"],
        subsample=0.9,
        # LightGBM only applies `subsample` when the bagging frequency is
        # non-zero; with the default subsample_freq=0 the 0.9 above is ignored.
        subsample_freq=1,
        colsample_bytree=0.9,
        verbose=-1,
        random_state=42
    )

    # Only pass weights when requested and available.
    fit_kwargs = {}
    if use_sample_weight and "sample_weight" in train_df.columns:
        fit_kwargs["sample_weight"] = train_df["sample_weight"].values
    model.fit(train_df[feats], train_df[target], **fit_kwargs)

    pred = model.predict(val_df[feats])

    rmse = mean_squared_error(val_df[target], pred) ** 0.5
    mae  = mean_absolute_error(val_df[target], pred)
    ic   = stats.spearmanr(pred, val_df[target]).correlation

    return rmse, mae, ic

In [None]:
# Fold-1 sanity check: compare training with and without carryover weights.
tr_start, tr_end, va_start, va_end = folds[0]
test_params = param_grid[0]  # Use first hyperparameter set

# Slice by label so the whole end day is included. Comparing the index
# against a bare date string (`df.index <= "2023-09-30"`) cuts at midnight
# and silently drops the last day of each window.
# NOTE(review): the target appears to look 60 minutes ahead (horizons=[60]),
# so the final hour of training labels may overlap the start of validation;
# consider an embargo if that matters.
train_df = df.loc[tr_start:tr_end]
val_df   = df.loc[va_start:va_end]

# Cast to bool so `~` is a logical NOT even if the flag is stored as 0/1.
carryover_mask = train_df["is_carryover"].astype(bool)

print(f"\nFold 1: Train {tr_start}→{tr_end}, Val {va_start}→{va_end}")
print(f"Train set: {len(train_df)} rows")
print(f"  - Original data (weight=1.0): {(~carryover_mask).sum()}")
print(f"  - Carryover data (weight=0.5): {carryover_mask.sum()}")

# Feature selection
feats_ic, ic_series = fs.select_ic(train_df, target)
feats, _, _ = fs.prune_corr(train_df, feats_ic, ic_series)

# Train WITHOUT sample weighting (original method)
print("METHOD 1: WITHOUT Sample Weighting (Original)")
rmse_no_weight, mae_no_weight, ic_no_weight = train_lgb(
    train_df, val_df, feats, target, test_params, use_sample_weight=False
)
print(f"Results: RMSE={rmse_no_weight:.6f}, MAE={mae_no_weight:.6f}, IC={ic_no_weight:.6f}")

print("METHOD 2: WITH Sample Weighting (Improved)")
rmse_with_weight, mae_with_weight, ic_with_weight = train_lgb(
    train_df, val_df, feats, target, test_params, use_sample_weight=True
)
print(f"Results: RMSE={rmse_with_weight:.6f}, MAE={mae_with_weight:.6f}, IC={ic_with_weight:.6f}")


def _relative_change_pct(delta, baseline):
    """Return delta as a percentage of |baseline|; NaN when the baseline is zero.

    Using the absolute baseline keeps the sign of the percentage equal to the
    sign of the change even when the baseline (e.g. an IC) is negative.
    """
    if baseline == 0:
        return float("nan")
    return 100 * delta / abs(baseline)


# Show differences
print("\n" + "=" * 80)
print("IMPACT OF SAMPLE WEIGHTING")
print("=" * 80)
rmse_diff = rmse_with_weight - rmse_no_weight
mae_diff = mae_with_weight - mae_no_weight
ic_diff = ic_with_weight - ic_no_weight

print(f"RMSE change: {rmse_diff:+.6f} ({_relative_change_pct(rmse_diff, rmse_no_weight):+.2f}%)")
print(f"MAE change:  {mae_diff:+.6f} ({_relative_change_pct(mae_diff, mae_no_weight):+.2f}%)")
print(f"IC change:   {ic_diff:+.6f} ({_relative_change_pct(ic_diff, ic_no_weight):+.2f}%)")

In [None]:
tuning_results = []

# Stage 1: prepare each fold once. The windows and the selected features do
# not depend on the model hyperparameters, so there is no need to recompute
# them for every hyperparameter set.
# NOTE(review): this assumes fs.select_ic / fs.prune_corr are deterministic.
fold_data = []
for f_i, (tr_start, tr_end, va_start, va_end) in enumerate(folds, 1):
    print(f"\nFold {f_i}")
    print(f"  Training window:   {tr_start} to {tr_end}")
    print(f"  Validation window: {va_start} to {va_end}")

    # Slice by label so the whole end day is included; comparing the index
    # against a bare date string cuts at midnight and drops the last day.
    train_df = df.loc[tr_start:tr_end]
    val_df   = df.loc[va_start:va_end]

    print("  Selecting features (IC ranking)...")
    feats_ic, ic_series = fs.select_ic(train_df, target)

    print("  Pruning correlated features...")
    feats, _, _ = fs.prune_corr(train_df, feats_ic, ic_series)

    print(f"  Features used: {len(feats)}")

    fold_data.append((f_i, train_df, val_df, feats))

# Stage 2: score every hyperparameter set on every prepared fold.
for p_i, params in enumerate(param_grid, 1):
    print(f"\nHyperparameter set {p_i}/{len(param_grid)}: {params}")
    fold_scores = []

    for f_i, train_df, val_df, feats in fold_data:
        print(f"  Fold {f_i}: training model...")
        rmse, mae, ic = train_lgb(train_df, val_df, feats, target, params, use_sample_weight=True)

        print(f"  Fold {f_i} RMSE={rmse:.4f}, MAE={mae:.4f}, IC={ic:.4f}")

        fold_scores.append({"rmse": rmse, "mae": mae, "ic": ic})

    # Average each metric across folds for this hyperparameter set.
    avg_rmse = np.mean([s["rmse"] for s in fold_scores])
    avg_mae  = np.mean([s["mae"]  for s in fold_scores])
    avg_ic   = np.mean([s["ic"]   for s in fold_scores])

    tuning_results.append({
        "params": params,
        "avg_rmse": avg_rmse,
        "avg_mae": avg_mae,
        "avg_ic": avg_ic
    })

    print(f"\nFinished hyperparameter set {p_i}")
    print(f"  Average RMSE={avg_rmse:.4f}, MAE={avg_mae:.4f}, IC={avg_ic:.4f}")

In [None]:
def _ic_rank_key(result):
    """Sort key: highest average IC first, NaN averages last.

    A fold with constant predictions yields a NaN Spearman IC, which makes the
    average NaN; sorting on a bare NaN key gives an undefined order and could
    promote such a configuration to "best".
    """
    avg_ic = result["avg_ic"]
    return (np.isnan(avg_ic), -avg_ic)


tuning_results_sorted = sorted(tuning_results, key=_ic_rank_key)
best_params = tuning_results_sorted[0]["params"]

print("\nBest hyperparameters based on IC:")
print(best_params)

### 7. Train Final Model

In [None]:
# Hold out everything from 2024-07-01 onwards as the out-of-sample test set.
# NOTE(review): the target appears to look 60 minutes ahead (horizons=[60]),
# so the last hour of pre-test labels may overlap the start of the test
# period; consider dropping those rows if a strict split is required.
pretest_df = df[: "2024-06-30"]
test_df    = df["2024-07-01":]

# Feature selection on ALL pre-test data
feats_ic, ic_series = fs.select_ic(pretest_df, target)
feats, _, _ = fs.prune_corr(pretest_df, feats_ic, ic_series)

# Final model training (with sample weighting for carryover data quality)
final_model = lgb.LGBMRegressor(
    num_leaves=best_params["num_leaves"],
    learning_rate=best_params["learning_rate"],
    n_estimators=best_params["n_estimators"],
    subsample=0.9,
    # LightGBM only applies `subsample` when the bagging frequency is
    # non-zero; with the default subsample_freq=0 the 0.9 above is ignored.
    subsample_freq=1,
    colsample_bytree=0.9,
    verbose=-1,
    random_state=42
)

# Weight each training row by its "sample_weight" value.
sample_weights = pretest_df["sample_weight"].values
final_model.fit(pretest_df[feats], pretest_df[target], sample_weight=sample_weights)
oos_pred = final_model.predict(test_df[feats])

# Out-of-sample metrics; IC is the Spearman rank correlation.
rmse = mean_squared_error(test_df[target], oos_pred)**0.5
mae  = mean_absolute_error(test_df[target], oos_pred)
ic   = stats.spearmanr(oos_pred, test_df[target]).correlation

print("\nFinal OOS Performance")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"IC  : {ic:.4f}")

In [None]:
# Spearman correlation of every selected feature with the target over the
# pre-test window (the target's self-correlation is dropped).
rank_corr_matrix = pretest_df[feats + [target]].corr(method="spearman")
spearman_corr = rank_corr_matrix[target].drop(target)

# One-column heatmap; the figure grows with the number of features.
fig, ax = plt.subplots(figsize=(6, max(6, len(feats)*0.3)))
heatmap_style = dict(annot=True, cmap="coolwarm", center=0, fmt=".2f", cbar=False)
sns.heatmap(spearman_corr.to_frame(), ax=ax, **heatmap_style)
ax.set_title("Spearman Correlation (Selected Features vs Target)")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

In [None]:
# Pairwise Pearson correlation among the selected features (pre-test window).
pearson_corr = pretest_df[feats].corr(method="pearson")

fig, ax = plt.subplots(figsize=(12, 10))
heatmap_style = dict(
    cmap="coolwarm", center=0, annot=True, fmt=".2f", square=True, linewidths=0.5
)
sns.heatmap(pearson_corr, ax=ax, **heatmap_style)
ax.set_title("Pearson Correlation Heatmap (Selected Features)")
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.dates as mdates

# Overlay predicted and realized values across the test period.
fig, ax = plt.subplots(figsize=(14,5))
ax.plot(test_df.index, oos_pred, label='Predicted Volatility', color='blue', alpha=0.5)
ax.plot(test_df.index, test_df[target], label='Actual Realized Volatility', color='orange', alpha=0.5)
ax.set_title('Predicted vs Actual Realized Volatility (Test Period)')
ax.set_xlabel('Time')
ax.set_ylabel('Volatility')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()

# One tick per month, labelled YYYY-MM.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.show()

In [None]:
# Overlaid, normalised histograms of realized vs predicted values (test period).
plt.figure(figsize=(10,5))
plt.hist(test_df[target], bins=150, alpha=0.6, label='Actual Volatility', color='orange',density=True)
plt.hist(oos_pred, bins=150, alpha=0.6, label='Predicted Volatility', color='blue',density=True)
plt.title('Distribution of Actual vs Predicted Volatility (Test Period)')
plt.xlabel('Volatility')
# density=True normalises the bars to a probability density, not raw counts.
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Same overlaid density histograms, shown on a logarithmic x-axis. The bins
# are computed before the axis is switched to log scale.
fig, ax = plt.subplots(figsize=(10,5))
hist_style = dict(bins=150, alpha=0.6, density=True)
ax.hist(test_df[target], label='Actual Volatility', color='orange', **hist_style)
ax.hist(oos_pred, label='Predicted Volatility', color='blue', **hist_style)

ax.set_xscale('log')
ax.set_title('Log-Scaled Distribution of Actual vs Predicted Volatility (Test Period)')
ax.set_xlabel('Volatility (log scale)')
ax.set_ylabel('Density')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Rank correlation between forecasts and realized values over the test period.
spearman_corr, _ = spearmanr(oos_pred, test_df[target])

# Scatter of predicted (x) against realized (y) values.
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(oos_pred, test_df[target], alpha=0.3, color='blue')
ax.set_xlabel('Predicted Volatility')
ax.set_ylabel('Actual Realized Volatility')
ax.set_title('Predicted vs Actual Volatility (Test Period)')

# Show the correlation in a boxed label in the top-left corner.
label_box = dict(boxstyle='round,pad=0.3', fc='white', ec='darkred', lw=2)
ax.annotate(
    f'Spearman ρ = {spearman_corr:.4f}',
    xy=(0.05, 0.95),
    xycoords='axes fraction',
    fontsize=14,
    color='darkred',
    ha='left',
    va='top',
    bbox=label_box,
)

ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Rolling 30-day Spearman IC

pred_series = pd.Series(oos_pred, index=test_df.index)
actual_series = test_df[target]

rolling_window = 30 * 24 * 60  # 30 days of 1-minute rows
# Evaluate one window every `rolling_step` rows. 1 gives a value per row;
# a larger step (e.g. 60) is much cheaper because every window re-ranks
# `rolling_window` observations.
rolling_step = 1

# A window starting at row s covers rows [s, s + rolling_window) and is
# stamped with the timestamp of its last row. The "+ 1" keeps the final full
# window, which a plain `len - window` bound would skip.
window_starts = range(0, len(test_df) - rolling_window + 1, rolling_step)

rolling_ic = np.array(
    [
        stats.spearmanr(
            pred_series.iloc[s:s + rolling_window],
            actual_series.iloc[s:s + rolling_window],
        ).correlation
        for s in window_starts
    ],
    dtype=float,
)
window_ends = np.array([s + rolling_window - 1 for s in window_starts], dtype=int)
rolling_dates = test_df.index[window_ends]

# Windows with constant input produce NaN; keep them out of the statistics.
valid_ic = rolling_ic[~np.isnan(rolling_ic)]

if valid_ic.size == 0:
    # Either the test set is shorter than one window or every window was degenerate.
    print(f'No valid rolling IC values: {len(test_df)} test rows, window of {rolling_window} rows.')
else:
    mean_ic = valid_ic.mean()

    # Plot rolling Spearman IC
    plt.figure(figsize=(14, 6))
    plt.plot(rolling_dates, rolling_ic, linewidth=2, color='blue', alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='--', alpha=0.5)
    plt.axhline(y=mean_ic, color='red', linestyle='--', linewidth=2, label=f'Mean IC: {mean_ic:.4f}')
    plt.fill_between(rolling_dates, rolling_ic, 0, where=(rolling_ic > 0), alpha=0.2, color='green', label='Positive IC')
    plt.fill_between(rolling_dates, rolling_ic, 0, where=(rolling_ic <= 0), alpha=0.2, color='red', label='Negative IC')
    plt.xlabel('Date')
    plt.ylabel('Spearman Correlation')
    plt.title('Rolling 30-Day Spearman IC (Predicted vs Actual Volatility)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Summary statistics (NaN windows excluded, including from the denominator).
    positive_count = int((valid_ic > 0).sum())
    print('Rolling 30-Day Spearman IC Statistics:')
    print(f'  Mean: {mean_ic:.4f}')
    print(f'  Median: {np.median(valid_ic):.4f}')
    print(f'  Std: {valid_ic.std():.4f}')
    print(f'  Min: {valid_ic.min():.4f}')
    print(f'  Max: {valid_ic.max():.4f}')
    print(f'  Positive IC periods: {positive_count}/{valid_ic.size} ({100*positive_count/valid_ic.size:.1f}%)')

In [None]:
# Plot the final model's feature importances using LightGBM's built-in helper.
lgb.plot_importance(final_model)
plt.show()

### Backtest

In [None]:
# Volatility target: the median of the pre-test target column.
pretest_rv = df["target_rv_fwd60m"].loc["2023-01-01":"2024-06-30"]

sigma_target = np.median(pretest_rv)
print(f"Median of pre-test realized vol: {sigma_target:.4f}\n")

# Give the predictions the test-set index so they align with the returns.
oos_pred = pd.Series(oos_pred, index=test_df.index)

# Shift by one row so each timestamp carries the return of the following row.
actual_returns = df.loc[test_df.index, "log_ret_1m"].shift(-1)

backtest_inputs = dict(
    returns=actual_returns,
    pred_vol=oos_pred,
    sigma_target=sigma_target,
)
strat_ret, base_ret, metrics, w = bt.vol_managed_backtest(**backtest_inputs)

# Tabulate the metrics, then draw the helper's standard backtest plot.
table = bt.metrics_to_table(metrics)
display(table.round(4))
bt.plot_backtest(strat_ret, base_ret, w)

In [None]:
def rolling_sharpe(returns, window=30 * 24 * 60):
    """Rolling mean-to-standard-deviation ratio of a return series.

    The ratio is computed per row and is not annualised.

    Args:
        returns: pandas Series of returns.
        window: Number of rows per rolling window. The default of 43,200
            equals 30 days when rows are one minute apart.

    Returns:
        Series aligned with ``returns``. Values are NaN until a full window
        is available and wherever the window's standard deviation is zero.
    """
    roll_mean = returns.rolling(window).mean()
    roll_std  = returns.rolling(window).std()
    # A flat window has zero dispersion; mask it so the ratio is NaN rather
    # than +/-inf, which would distort plots and summaries.
    roll_std = roll_std.where(roll_std > 0)
    return roll_mean / roll_std

# Rolling Sharpe for the strategy and the benchmark.
roll_sharpe_strat = rolling_sharpe(strat_ret)
roll_sharpe_base  = rolling_sharpe(base_ret)

# Growth of one unit, compounding each period's return.
# NOTE(review): if these are log returns (the input column is "log_ret_1m"),
# (1 + r).cumprod() only approximates compounding — confirm what
# bt.vol_managed_backtest returns.
cum_strat = (1 + strat_ret).cumprod()
cum_base  = (1 + base_ret).cumprod()

# Three stacked panels sharing the time axis.
fig, axs = plt.subplots(
    3, 1,
    figsize=(14, 12),
    sharex=True,
    gridspec_kw={'height_ratios': [2.2, 1.0, 1.3]},
)
ax_growth, ax_weight, ax_sharpe = axs

# Top: cumulative returns.
ax_growth.plot(cum_strat, label="Strategy", color="blue")
ax_growth.plot(cum_base, label="Buy & Hold", color="gray", alpha=0.7)
ax_growth.set_title("Cumulative Returns")
ax_growth.set_ylabel("Growth of $1")
ax_growth.legend()
ax_growth.grid(alpha=0.3)

# Middle: position weights.
ax_weight.plot(w, color="purple")
ax_weight.set_title("Position Weight (Vol Targeting)")
ax_weight.set_ylabel("Weight")
ax_weight.grid(alpha=0.3)

# Bottom: rolling Sharpe with a zero reference line.
ax_sharpe.plot(roll_sharpe_strat, label="Strategy Rolling Sharpe", color="blue")
ax_sharpe.plot(roll_sharpe_base,  label="Buy & Hold Rolling Sharpe", color="gray", alpha=0.7)
ax_sharpe.axhline(0, color="black", lw=1)
ax_sharpe.set_title("Rolling 30-day Sharpe Ratio")
ax_sharpe.set_ylabel("Sharpe")
ax_sharpe.set_xlabel("Time")
ax_sharpe.grid(alpha=0.3)
ax_sharpe.legend()

fig.tight_layout()
plt.show()