In [None]:
# @title 1. Environment Setup & Branch Checkout
import os
from google.colab import drive

# 1. Mount Drive to persist models/data
drive.mount('/content/drive')

# 2. Clone & Switch Branch
repo_path = "/content/VolSense"
if not os.path.exists(repo_path):
    !git clone https://github.com/rahulmkarthik/VolSense.git $repo_path
    %cd $repo_path
    !git fetch origin feature/volnetx
    !git checkout feature/volnetx
    print("‚úÖ Checked out branch: feature/volnetx")
else:
    %cd $repo_path
    !git pull origin feature/volnetx
    print("‚úÖ Repo updated.")

# 3. Install Dependencies (Editable mode to ensure local changes apply)
!pip install -r requirements.txt --quiet
!pip install -e . --quiet

# 4. Verify Structure
print("\nüìÇ Current Working Directory:", os.getcwd())
if not os.path.exists("models"):
    os.makedirs("models")
    print("   created 'models/' directory for artifacts.")

In [None]:
# @title 2. Load Engineered Dataset
import pandas as pd

# UPDATE THIS PATH to where you uploaded your file
DATASET_PATH = "/content/drive/MyDrive/VolNetX_training_data/unzipped/master_lstm_dataset_v2.csv"

# Fail fast when the dataset is missing. Merely printing a warning would leave
# `df` undefined (or, worse, stale from an earlier run of this cell), and the
# training cell would then crash obscurely or silently train on old data.
if not os.path.exists(DATASET_PATH):
    raise FileNotFoundError(f"‚ö†Ô∏è File not found at {DATASET_PATH}. Please upload it.")

print(f"üöÄ Loading dataset from {DATASET_PATH}...")
df = pd.read_csv(DATASET_PATH, parse_dates=["date"])

# Quick sanity check on size and ticker coverage
print(f"   Rows: {len(df):,}")
print(f"   Tickers: {df['ticker'].nunique()}")

# Spot-check that a few of the engineered feature columns are present
# (this is a warning only; it does not stop the run)
expected_cols = ["macro_Oil", "vol_entropy", "skew_scaled_return"]
missing = [c for c in expected_cols if c not in df.columns]
if missing:
    print(f"‚ùå WARNING: Missing features: {missing}")
else:
    print("‚úÖ All key engineered features found.")

🌍 Fetching market data: 100%|██████████| 4/4 [00:02<00:00,  1.50ticker/s]


In [None]:
# @title 3. VolNetX Configuration
from volsense_core.forecaster_core import VolSenseForecaster
import torch

# --- 1. Feature set: the 16 extra columns listed here (per the original note,
#        'return' and 'realized_vol' are used in addition as base features) ---
EXTRA_FEATURES = (
    ["vol_20d", "vol_60d", "vol_3d", "vol_10d"]                    # core trends
    + ["vol_vol", "vol_entropy", "vol_chg", "vol_ratio"]           # volatility dynamics
    + ["abs_return", "macd_diff", "rsi_14"]                        # price & momentum
    + ["macro_VIX", "macro_Oil", "market_stress", "vol_stress"]    # macro & stress
    + ["skew_scaled_return"]                                       # distribution
)

# --- 2. Hyperparameters ---
MODEL_VERSION = "v701_volnetx_v8"  # unique tag for this run; also prefixes saved artifacts

# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Keyword arguments later splatted into the forecaster constructor.
TRAIN_CONFIG = dict(
    window=65,                                   # ~3 months of context
    horizons=[1, 5, 10],                         # multi-horizon target
    loss_horizon_weights=[0.5, 0.3, 0.2],        # weight the 1-day horizon most
    hidden_dim=128,                              # d_model size
    num_layers=3,                                # depth (LSTM + Transformer blocks)
    epochs=50,                                   # max epochs; early stopping may end sooner
    batch_size=128,
    lr=6e-4,
    cosine_schedule=True,
    dropout=0.2,                                 # regularization
    grad_clip=1.0,
    weight_decay=2e-5,
    val_start="2023-06-01",                      # validation cutoff date
    use_transformer=True,                        # enable VolNetX hybrid mode
    use_feature_attention=True,                  # enable dynamic feature selection
    global_ckpt_path=f"models/{MODEL_VERSION}",  # save path relative to repo root
    patience=13,
)

print(f"‚öôÔ∏è configured VolNetX ({MODEL_VERSION}) on {DEVICE}")
print(f"   Features: {len(EXTRA_FEATURES)} explicit + base features")

Unnamed: 0,date,realized_vol_log,realized_vol,ticker,return,vol_vol,return_sharpe_20d,macd_diff,vol_3d,market_stress_1d_lag,...,vol_60d,vol_entropy,skew_5d,vol_kurt_20d,vol_ratio,ewma_vol_10d,vol_stress,rsi_14,vol_skew_20d,abs_return
0,2005-01-25,-0.706510,0.493362,AAPL,0.018231,0.000000,0.000000,0.000000,0.493362,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.999998,0.493362,0.017709,0.000000,0.000000,0.018231
1,2005-01-26,-0.705208,0.494005,AAPL,0.002776,0.000455,0.000000,-0.000986,0.493684,0.017709,...,0.000000,0.000000,0.000000,0.000000,0.999998,0.493479,0.032465,0.000000,0.000000,0.002776
2,2005-01-27,-0.704852,0.494181,AAPL,0.005398,0.000431,0.000000,-0.001383,0.493849,0.032465,...,0.000000,0.000000,0.628032,0.000000,0.999998,0.493607,0.012322,0.000000,0.000000,0.005398
3,2005-01-28,-0.704045,0.494579,AAPL,0.018447,0.000507,0.000000,-0.000721,0.494255,0.012322,...,0.000000,0.000000,-0.049140,0.000000,1.000450,0.493783,0.009239,0.000000,0.000000,0.018447
4,2005-01-31,-0.837282,0.432884,AAPL,0.039470,0.027349,0.000000,0.001080,0.473882,0.009235,...,0.000000,0.000000,0.660010,0.000000,0.983558,0.482711,0.014591,0.000000,0.000000,0.039470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20903,2025-10-27,-2.020124,0.132638,MSFT,0.015107,0.006880,0.185402,0.001274,0.142235,0.008716,...,0.167869,-1315.194138,0.848502,1.632760,0.950492,0.147616,0.010147,60.651309,-0.860415,0.015107
20904,2025-10-28,-1.909825,0.148105,MSFT,0.019849,0.006466,0.237702,0.001929,0.142801,0.010675,...,0.166809,-1320.893085,0.269403,1.735806,0.959574,0.147705,0.010649,58.571901,-0.889229,0.019849
20905,2025-10-29,-1.905825,0.148699,MSFT,-0.000959,0.005909,0.214903,0.000902,0.143147,0.011097,...,0.165532,-1339.605424,0.290157,1.902887,0.966849,0.147886,0.011354,51.533138,-0.936720,0.000959
20906,2025-10-30,-1.632705,0.195399,MSFT,-0.029157,0.016145,0.086663,-0.001599,0.164068,0.011743,...,0.164996,-600.297462,-0.895050,7.665364,1.078512,0.156524,0.029837,47.234082,2.032174,0.029157


In [None]:
# @title 4. Train VolNetX
import time
import shutil

# Build the forecaster in "volnetx" mode; TRAIN_CONFIG supplies the remaining
# keyword arguments (window, horizons, learning rate, checkpoint path, ...).
forecaster = VolSenseForecaster(
    method="volnetx",
    device=DEVICE,
    extra_features=EXTRA_FEATURES,
    **TRAIN_CONFIG
)

print("üöÄ Starting Training Run...")
start_time = time.time()

# Run fit on the loaded dataset (per the original note, dataset build, training
# and saving are handled inside `fit`).
forecaster.fit(df)

# Auto-backup to Drive so artifacts survive the Colab session
drive_save_dir = "/content/drive/MyDrive/Colab_files/VolSense/models"
os.makedirs(drive_save_dir, exist_ok=True)

print(f"üíæ Backing up artifacts to Drive: {drive_save_dir}...")
# Copy every entry in models/ whose name starts with the model version.
for artifact_name in os.listdir("models"):
    if not artifact_name.startswith(MODEL_VERSION):
        continue
    src_path = os.path.join("models", artifact_name)
    if os.path.isdir(src_path):
        # shutil.copy2 cannot copy directories; mirror the tree instead.
        # dirs_exist_ok lets a re-run refresh a previous backup in place.
        shutil.copytree(src_path, os.path.join(drive_save_dir, artifact_name), dirs_exist_ok=True)
    else:
        shutil.copy2(src_path, drive_save_dir)
    print(f"   ‚úÖ Copied {artifact_name}")

# The reported duration covers both training and the Drive backup.
end_time = time.time()
print(f"\n‚úÖ Training Complete in {(end_time - start_time)/60:.1f} minutes.")

In [None]:
# @title 6. Evaluate VolNetX Performance
from volsense_core.evaluation.evaluation import ModelEvaluator
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Forecast over the dataset in "eval" mode. Per the original note, `predict`
#    takes care of rolling windows and the inverse log-transform internally.
print("üîÆ Generating rolling forecasts on validation set...")
preds_df = forecaster.predict(df, mode="eval")

# Keep only forecasts issued on/after the validation cutoff, so the metrics
# are not computed on the training period.
val_start_date = pd.to_datetime(TRAIN_CONFIG["val_start"])
in_validation = preds_df['asof_date'] >= val_start_date
preds_df = preds_df.loc[in_validation].copy()

print(f"‚úÖ Generated {len(preds_df):,} predictions from {val_start_date.date()}")

# 2. Wrap the validation forecasts in an evaluator tagged with this run's version
evaluator = ModelEvaluator(preds_df, model_name=MODEL_VERSION)

# 3. Metrics first, then the summary (both are reused by later cells)
metrics_df = evaluator.compute_metrics()
summary_df = evaluator.summarize()

# 4. Visualization: True vs Pred (Linear Scale)
#    We look for tight clustering around the diagonal (red dashed line)
def plot_performance(df, horizon=1):
    """Plot forecast-vs-realized scatter and residual histogram for one horizon.

    Args:
        df: Predictions frame with 'horizon', 'realized_vol' and
            'forecast_vol' columns.
        horizon: Value of the 'horizon' column to plot (default 1).

    Returns:
        None. Shows a two-panel figure, or prints a notice and returns early
        when there are no rows for the requested horizon.
    """
    subset = df[df['horizon'] == horizon]

    # Nothing to draw: without this guard the diagonal limits would be NaN
    # and both panels would render empty.
    if subset.empty:
        print(f"No predictions available for horizon {horizon}d; skipping plot.")
        return

    fig, ax = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: scatter with a y = x reference line spanning the data range
    sns.scatterplot(data=subset, x='realized_vol', y='forecast_vol', alpha=0.1, ax=ax[0])
    max_val = max(subset['realized_vol'].max(), subset['forecast_vol'].max())
    ax[0].plot([0, max_val], [0, max_val], 'r--', label='Perfect Prediction')
    ax[0].set_title(f"Horizon {horizon}d: Forecast vs Realized")
    ax[0].set_xlabel("True Volatility")
    ax[0].set_ylabel("Predicted Volatility")
    ax[0].legend()

    # Right panel: distribution of errors (forecast minus realized), zero marked
    residuals = subset['forecast_vol'] - subset['realized_vol']
    sns.histplot(residuals, bins=50, kde=True, ax=ax[1])
    ax[1].set_title(f"Horizon {horizon}d: Residuals (Error)")
    ax[1].set_xlabel("Error (Pred - True)")
    ax[1].axvline(0, color='r', linestyle='--')

    plt.tight_layout()
    plt.show()

# Plot every configured horizon. Driving the loop from TRAIN_CONFIG keeps the
# plots in sync with what was actually trained (the 1-day plot was previously
# unconditional while 5d/10d were guarded, and other horizons were never shown).
for h in TRAIN_CONFIG["horizons"]:
    print(f"\nüìä Visualizing {h}-Day Horizon Performance:")
    plot_performance(preds_df, horizon=h)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1024x81 and 32x32)

In [None]:
# Persist per-run metrics next to the backed-up model artifacts.
# The f-prefix is required so the file is named after MODEL_VERSION rather than
# the literal text "{MODEL_VERSION}_metrics.csv" (which every run would overwrite).
metrics_df.to_csv(f"/content/drive/MyDrive/Colab_files/VolSense/models/{MODEL_VERSION}_metrics.csv")

In [None]:
# @title 7. Compare against v507 (Global LSTM)
from volsense_core.utils.scalers import TorchStandardScaler
from volsense_inference.model_loader import load_model
import types
import numpy as np


# 1. Load v507 Artifacts
MODEL_V507 = "v507"  # Ensure this matches the folder name in 'models/'
print(f"ü§ñ Loading Baseline Model: {MODEL_V507}...")


def _identity_scaler(feature_names):
    """Return a scaler with zero mean / unit scale over `feature_names`.

    Used as a fallback when no statistics can be fitted for a ticker.
    """
    sc = TorchStandardScaler()
    dim = len(feature_names)
    sc.mean_ = torch.zeros(dim)
    sc.scale_ = torch.ones(dim)
    sc.feature_names_in_ = feature_names
    return sc


try:
    # Load raw artifacts (the third returned element is not needed here)
    model_507, meta_507, _, t2i_507, feats_507 = load_model(MODEL_V507, "models", device=DEVICE)

    # 2. JIT Scaler Reconstruction
    #    Scalers are rebuilt per ticker over the features v507 reports, not the VolNetX set.
    print(f"   üõ†Ô∏è Reconstructing scalers for {len(t2i_507)} tickers...")
    scalers_507 = {}

    # Use the full 'df' to approximate scaling stats for inference
    # (Strictly, this should be training-only data, but for a quick check this works)
    df_grouped = df.groupby("ticker")

    for t in t2i_507:
        # Features of v507 that are actually present for this ticker; stays
        # empty when the ticker is absent from the current dataset.
        valid_feats = []
        if t in df_grouped.groups:
            t_df = df_grouped.get_group(t)
            valid_feats = [f for f in feats_507 if f in t_df.columns]

        if valid_feats:
            sc = TorchStandardScaler()
            data_vals = t_df[valid_feats].values.astype(float)
            data_vals = np.nan_to_num(data_vals)
            sc.fit(torch.tensor(data_vals, dtype=torch.float32))
            sc.feature_names_in_ = valid_feats  # the original author flags this as required by predict_next_day
        else:
            # Unknown ticker or no overlapping features: pass-through scaling
            sc = _identity_scaler(feats_507)

        scalers_507[t] = sc

    # 3. Hydrate Forecaster Wrapper with the loaded model and rebuilt scalers
    vf_507 = VolSenseForecaster(method="global_lstm", device=DEVICE)
    vf_507.model = model_507
    vf_507.global_ticker_to_id = t2i_507
    vf_507.global_scalers = scalers_507
    vf_507.global_window = meta_507.get("window", 40)

    # Mock config: only `horizons` is populated here
    vf_507.cfg = types.SimpleNamespace()
    vf_507.cfg.horizons = meta_507.get("horizons", [1, 5, 10])

    # 4. Predict
    print(f"üîÆ Generating v507 forecasts on validation set...")
    preds_507 = vf_507.predict(df, mode="eval")

    # Filter to same validation window as VolNetX for fair comparison
    preds_507 = preds_507[preds_507['asof_date'] >= val_start_date].copy()

    # 5. Evaluate
    eval_507 = ModelEvaluator(preds_507, model_name=f"Baseline_{MODEL_V507}")
    summary_507 = eval_507.summarize()

    # 6. Head-to-Head Comparison Table
    print("\nüèÜ Head-to-Head: VolNetX vs v507 (1-Day Horizon)")

    # Extract 1d rows
    res_volnet = summary_df[summary_df['horizon'] == 1].copy()
    res_volnet['Model'] = "VolNetX"

    res_507 = summary_507[summary_507['horizon'] == 1].copy()
    res_507['Model'] = "v507 (LSTM)"

    comp_table = pd.concat([res_volnet, res_507], ignore_index=True)
    cols = ['Model', 'RMSE', 'MAE', 'R2', 'Corr', 'DW']
    display(comp_table[cols].style.background_gradient(cmap="Greens", subset=['R2', 'Corr']).background_gradient(cmap="Reds_r", subset=['RMSE', 'MAE']))

    # 7. Visual Comparison (Scatter)
    print("\nüìä Visual Comparison (1-Day):")
    fig, ax = plt.subplots(1, 2, figsize=(14, 5), sharex=True, sharey=True)

    vx_data = preds_df[preds_df['horizon'] == 1]
    v5_data = preds_507[preds_507['horizon'] == 1]

    # Size the y = x reference line from the data. The panels share axes, so one
    # common upper bound is used; a fixed [0, 0.04] line would cover only a
    # sliver of the plot when volatilities are an order of magnitude larger.
    diag_max = max(
        vx_data['realized_vol'].max(), vx_data['forecast_vol'].max(),
        v5_data['realized_vol'].max(), v5_data['forecast_vol'].max(),
    )

    # VolNetX
    sns.scatterplot(data=vx_data, x='realized_vol', y='forecast_vol', alpha=0.15, ax=ax[0], color='blue')
    ax[0].plot([0, diag_max], [0, diag_max], 'k--', lw=1)
    ax[0].set_title(f"VolNetX (R2: {res_volnet['R2'].iloc[0]:.3f})")

    # v507
    sns.scatterplot(data=v5_data, x='realized_vol', y='forecast_vol', alpha=0.15, ax=ax[1], color='red')
    ax[1].plot([0, diag_max], [0, diag_max], 'k--', lw=1)
    ax[1].set_title(f"v507 (R2: {res_507['R2'].iloc[0]:.3f})")

    plt.tight_layout()
    plt.show()

except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook.
    print(f"‚ùå Comparison failed: {e}")
    print("Tip: Ensure 'v507' folder exists in models/ and contains .pth + .meta.json")

In [None]:
# @title 8. Multi-Horizon Analysis (5d & 10d)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Horizons to compare side by side
analyze_horizons = [5, 10]

# Inputs reused from earlier cells:
#   summary_df, preds_df    <-- VolNetX (evaluator.summarize() / forecaster.predict)
#   summary_507, preds_507  <-- v507 baseline (eval_507.summarize() / vf_507.predict)

for h in analyze_horizons:
    print(f"\n{'='*60}")
    print(f"üìÖ  {h}-DAY HORIZON COMPARISON")
    print(f"{'='*60}")

    try:
        # 1. Filter & merge metrics for this horizon
        row_volnet = summary_df[summary_df['horizon'] == h].copy()
        row_volnet['Model'] = "VolNetX"

        row_507 = summary_507[summary_507['horizon'] == h].copy()
        row_507['Model'] = "v507 (LSTM)"

        # Explicit check: a missing horizon yields empty frames rather than an
        # IndexError, so without this the cell would show an empty table and a
        # misleading R² of 0.000 instead of the warning below.
        if row_volnet.empty or row_507.empty:
            print(f"‚ö†Ô∏è Horizon {h} not found in results. (Did you configure horizons=[1,5,10]?)")
            continue

        comp_table = pd.concat([row_volnet, row_507], ignore_index=True)

        # Styled table: greens for higher-is-better, reversed reds for errors
        cols = ['Model', 'RMSE', 'MAE', 'R2', 'Corr', 'DW']
        display(comp_table[cols].style.background_gradient(cmap="Greens", subset=['R2', 'Corr'])
                                      .background_gradient(cmap="Reds_r", subset=['RMSE', 'MAE']))

        # 2. Visual comparison (shared axes so both panels are directly comparable)
        fig, ax = plt.subplots(1, 2, figsize=(14, 5), sharex=True, sharey=True)

        # VolNetX panel
        vx_data = preds_df[preds_df['horizon'] == h]
        sns.scatterplot(data=vx_data, x='realized_vol', y='forecast_vol',
                        alpha=0.2, ax=ax[0], color='blue')

        # Diagonal reference line; 0.05 is a lower bound (floor) on its length
        # so the line stays visible even when realized values are small.
        max_val = max(vx_data['realized_vol'].max(), 0.05)
        ax[0].plot([0, max_val], [0, max_val], 'k--', lw=1.5)
        r2_val = row_volnet['R2'].iloc[0]
        ax[0].set_title(f"VolNetX ({h}d) | R¬≤: {r2_val:.3f}")
        ax[0].set_xlabel("Realized Vol")
        ax[0].set_ylabel("Forecast Vol")
        ax[0].grid(True, alpha=0.3)

        # v507 panel (same diagonal as the VolNetX panel)
        v5_data = preds_507[preds_507['horizon'] == h]
        sns.scatterplot(data=v5_data, x='realized_vol', y='forecast_vol',
                        alpha=0.2, ax=ax[1], color='red')

        ax[1].plot([0, max_val], [0, max_val], 'k--', lw=1.5)
        r2_507 = row_507['R2'].iloc[0]
        ax[1].set_title(f"v507 ({h}d) | R¬≤: {r2_507:.3f}")
        ax[1].set_xlabel("Realized Vol")
        ax[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best-effort per horizon: report and move on to the next one
        # (this also covers the baseline cell having failed earlier).
        print(f"‚ùå Error generating {h}d comparison: {e}")