In [2]:
#!/usr/bin/env python3
"""
===============================================================================
ADD USD RISK VOLUME MEASURES TO FUTURES VOLATILITY DATA
===============================================================================

Runs AFTER: 4-futures_prices_volatility.ipynb

Reads per-instrument files from:
- 04-futures_price_volatility/norgate_continuous/volatility_measures/parquet/

Each file contains:
- date, close, volume, ewma32_std, etc.

Computes time-series columns:
1) rolling_avg_volume_60d  = 60-day rolling mean of volume
2) rolling_avg_volume_20d  = 20-day rolling mean of volume (more responsive)
3) annualized_vol          = ewma32_std × sqrt(trading_days_per_year)
4) usd_risk_volume         = FX × rolling_avg_volume_60d × annualized_vol × close × multiplier

Formula:
    USD Risk Volume = FX rate × Avg daily volume × σₐ × Price × Multiplier

Where:
- FX rate: Currency conversion (default 1.0 for USD)
- Avg daily volume: Rolling average volume in contracts/day
- σₐ: Annualized volatility (from ewma32_std)
- Price: Close price
- Multiplier: Contract multiplier (point value)

Outputs to NEW directory structure:
- 04a-risk_volume/norgate_continuous/parquet/<SYMBOL>.parquet
- 04a-risk_volume/norgate_continuous/csv/<SYMBOL>.csv

The backtest (step 5) should then read from this new directory instead of step 4.
===============================================================================
"""

import os
from pathlib import Path
import pandas as pd
import numpy as np

# ============================================================
# Configuration
# ============================================================

# Single source of settings for this step; every helper below reads from it.
CONFIG = {
    # --- Inputs: per-instrument volatility files written by step 4 ---
    "input_parquet_dir": Path(
        "04-futures_price_volatility", "norgate_continuous", "volatility_measures", "parquet"
    ),
    "input_csv_dir": Path(
        "04-futures_price_volatility", "norgate_continuous", "volatility_measures", "csv"
    ),
    # When both formats exist for a symbol, read the parquet one.
    "prefer_parquet_input": True,

    # --- Contract metadata: source of the per-symbol multiplier ---
    "contracts_file": Path("01-futures_universe", "futures_contracts_full.parquet"),

    # --- Outputs: parquet/ and csv/ sub-folders are created under this root ---
    "output_root": Path("04a-risk_volume", "norgate_continuous"),

    # --- Names of the columns expected in the step-4 files ---
    "date_col": "date",
    "close_col": "close",
    "volume_col": "volume",
    "vol_col": "ewma32_std",  # daily EWMA volatility produced by step 4

    # --- Calculation parameters ---
    "rolling_volume_window_long": 60,   # long rolling-mean window, in rows
    "rolling_volume_window_short": 20,  # short (more responsive) window, in rows
    "trading_days_per_year": 256,       # annualisation factor is sqrt of this
    "fx_rate": 1.0,                     # one conversion rate applied to every instrument

    # --- Columns appended by this step (listed in the run summary) ---
    "new_columns": [
        "rolling_avg_volume_60d",
        "rolling_avg_volume_20d",
        "annualized_vol",
        "usd_risk_volume",
    ],
}

# ============================================================
# Helper Functions
# ============================================================

def ensure_output_dirs() -> None:
    """Make sure the ``parquet`` and ``csv`` folders exist under the output root.

    Missing parent folders are created as well; folders that already exist
    are left alone. The resulting paths are echoed to stdout.
    """
    targets = [CONFIG["output_root"] / kind for kind in ("parquet", "csv")]

    for folder in targets:
        folder.mkdir(parents=True, exist_ok=True)

    print("✓ Output directories created:")
    for folder in targets:
        print(f"  - {folder}")

def load_contract_multipliers(contracts_path: Path) -> dict:
    """
    Load contract multipliers from a contracts parquet file.

    Column names are matched case-insensitively. The symbol column is the
    first of ``symbol, ticker, root, contract, instrument`` that exists; the
    multiplier column is the first of ``point_value, pointvalue, multiplier,
    contract_multiplier`` that exists.

    Rows are dropped when the symbol is missing or blank, or when the
    multiplier is not a finite number greater than zero. If a symbol occurs
    more than once, the last usable row wins.

    Parameters:
    -----------
    contracts_path : Path to the contracts parquet file

    Returns:
    --------
    dict: {symbol: multiplier}, with symbols stripped and upper-cased and
    multipliers as Python floats

    Raises:
    -------
    FileNotFoundError : the file does not exist
    ValueError : no recognised symbol or multiplier column is present
    """
    if not contracts_path.exists():
        raise FileNotFoundError(f"Contracts file not found: {contracts_path}")

    df = pd.read_parquet(contracts_path)
    df.columns = [str(c).strip().lower() for c in df.columns]

    # Find symbol column
    sym_col_candidates = ["symbol", "ticker", "root", "contract", "instrument"]
    sym_col = next((c for c in sym_col_candidates if c in df.columns), None)
    if sym_col is None:
        raise ValueError(f"Could not find symbol column. Columns: {list(df.columns)}")

    # Find multiplier column
    mult_col_candidates = ["point_value", "pointvalue", "multiplier", "contract_multiplier"]
    mult_col = next((c for c in mult_col_candidates if c in df.columns), None)
    if mult_col is None:
        raise ValueError(f"Could not find multiplier column. Columns: {list(df.columns)}")

    raw_symbols = df[sym_col]
    # Unparseable multipliers become NaN and are filtered out below.
    point_values = pd.to_numeric(df[mult_col], errors="coerce").astype("float64")

    # Test for missing symbols BEFORE stringifying: str(None) / str(nan) would
    # otherwise turn into bogus "NONE" / "NAN" dictionary keys.
    usable = (
        raw_symbols.notna() & point_values.gt(0) & np.isfinite(point_values)
    ).to_numpy()

    symbols = raw_symbols[usable].astype(str).str.strip().str.upper().tolist()
    values = point_values[usable].tolist()

    # `if sym` drops symbols that were only whitespace.
    return {sym: mult for sym, mult in zip(symbols, values) if sym}

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Strip and lower-case every column label, in place, and return ``df``."""
    as_text = map(str, df.columns)
    trimmed = map(str.strip, as_text)
    df.columns = list(map(str.lower, trimmed))
    return df

def ensure_date_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure the configured date column exists, is datetime-typed, and is sorted.

    If the frame has no date column but is indexed by a DatetimeIndex, the
    index is promoted to a column named CONFIG["date_col"] whatever the index
    was called (unnamed, "Date", "timestamp", ...). Values that cannot be
    parsed as dates are coerced to NaT and those rows are dropped. The result
    is sorted ascending by date.

    Parameters:
    -----------
    df : DataFrame holding a date column or a DatetimeIndex

    Returns:
    --------
    DataFrame sorted by the date column, with unparseable-date rows removed

    Raises:
    -------
    ValueError : there is neither a date column nor a DatetimeIndex
    """
    date_col = CONFIG["date_col"]

    # Name the index after the date column *before* resetting it, so the new
    # column always lands under date_col. Renaming only a column called
    # "index" misses named indexes such as "Date".
    if date_col not in df.columns and isinstance(df.index, pd.DatetimeIndex):
        df = df.rename_axis(index=date_col).reset_index()

    if date_col not in df.columns:
        raise ValueError(f"Missing '{date_col}' column. Found: {list(df.columns)}")

    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.dropna(subset=[date_col]).sort_values(date_col)
    return df

def add_risk_volume_measures(df: pd.DataFrame, multiplier: float) -> pd.DataFrame:
    """
    Append rolling-volume, annualised-volatility and USD-risk-volume columns.

    The close, volume and volatility columns named in CONFIG are coerced to
    numeric (unparseable values become NaN) and used to compute:

    - rolling_avg_volume_60d : rolling mean of volume over the long window
    - rolling_avg_volume_20d : rolling mean of volume over the short window
    - annualized_vol         : volatility column × sqrt(trading_days_per_year)
    - usd_risk_volume        : fx_rate × long rolling volume × annualized_vol
                               × close × multiplier

    Each rolling mean needs at least half its window of observations (never
    fewer than one) before it yields a value. Rows are assumed to already be
    in chronological order.

    Parameters:
    -----------
    df : DataFrame containing the configured close, volume and volatility columns
    multiplier : Contract multiplier (point value)

    Returns:
    --------
    The same DataFrame object, modified in place with the four new columns

    Raises:
    -------
    ValueError : one or more required columns are absent
    """
    settings = CONFIG
    price_name = settings["close_col"]
    volume_name = settings["volume_col"]
    sigma_name = settings["vol_col"]

    absent = [name for name in (price_name, volume_name, sigma_name) if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    def _numeric(name):
        # Non-numeric cells turn into NaN instead of raising.
        return pd.to_numeric(df[name], errors='coerce')

    price = _numeric(price_name)
    contracts = _numeric(volume_name)
    daily_sigma = _numeric(sigma_name)

    def _trailing_mean(window):
        # Half a window of data is enough to start producing averages.
        return contracts.rolling(window=window, min_periods=max(1, window // 2)).mean()

    avg_long = _trailing_mean(settings["rolling_volume_window_long"])
    avg_short = _trailing_mean(settings["rolling_volume_window_short"])

    sigma_annual = daily_sigma * np.sqrt(settings["trading_days_per_year"])

    # Factors are multiplied in the documented order: FX, volume, vol, price, multiplier.
    risk_volume = settings["fx_rate"] * avg_long * sigma_annual * price * multiplier

    derived = {
        "rolling_avg_volume_60d": avg_long,
        "rolling_avg_volume_20d": avg_short,
        "annualized_vol": sigma_annual,
        "usd_risk_volume": risk_volume,
    }
    for column_name, values in derived.items():
        df[column_name] = values

    return df

def reorder_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``df`` with the well-known columns moved to the front.

    Known columns that are present appear first, in the fixed order below;
    every other column follows in its original relative order.
    """
    preferred = (
        # prices and volume
        "date", "open", "high", "low", "close", "volume",
        # contract information
        "delivery month", "open interest",
        # daily changes
        "daily_prices_change_pts", "daily_prices_change_percent",
        # volatility measures
        "daily_std_full", "annualized_std_full", "ewma32_std",
        # columns added by this step
        "rolling_avg_volume_60d", "rolling_avg_volume_20d",
        "annualized_vol", "usd_risk_volume",
    )

    leading = [name for name in preferred if name in df.columns]
    already_placed = set(leading)
    trailing = [name for name in df.columns if name not in already_placed]

    return df[leading + trailing]

def read_instrument(symbol: str) -> pd.DataFrame:
    """
    Load one instrument's step-4 file and return it with a clean date column.

    Looks for ``<symbol>.parquet`` and ``<symbol>.csv`` in the configured
    input folders. Parquet is tried first when CONFIG["prefer_parquet_input"]
    is truthy, otherwise CSV is tried first; the other format is the
    fallback. The loaded frame then has its column names normalised and its
    date column validated.

    Raises:
    -------
    FileNotFoundError : neither file exists for ``symbol``
    """
    settings = CONFIG

    parquet_source = (settings["input_parquet_dir"] / f"{symbol}.parquet", pd.read_parquet)
    csv_source = (settings["input_csv_dir"] / f"{symbol}.csv", pd.read_csv)

    if settings["prefer_parquet_input"]:
        search_order = (parquet_source, csv_source)
    else:
        search_order = (csv_source, parquet_source)

    # Use the first candidate that exists on disk.
    for candidate, reader in search_order:
        if candidate.exists():
            frame = reader(candidate)
            break
    else:
        raise FileNotFoundError(
            f"No input file found for {symbol} in:\n"
            f"  - {settings['input_parquet_dir']}\n"
            f"  - {settings['input_csv_dir']}"
        )

    return ensure_date_column(normalize_columns(frame))

def write_instrument(symbol: str, df: pd.DataFrame) -> None:
    """
    Persist ``df`` as ``<symbol>.parquet`` and ``<symbol>.csv`` under the output root.

    The index is not written to either file; the parquet file uses snappy
    compression. The ``parquet`` and ``csv`` sub-folders must already exist.
    """
    root = CONFIG["output_root"]
    parquet_target = root / "parquet" / f"{symbol}.parquet"
    csv_target = root / "csv" / f"{symbol}.csv"

    df.to_parquet(parquet_target, index=False, compression="snappy")
    df.to_csv(csv_target, index=False)

def get_instrument_list() -> set:
    """
    Collect the symbols available in the input folders.

    A symbol is the file name without its extension. ``*.parquet`` files are
    taken from the parquet folder and ``*.csv`` files from the csv folder; a
    folder that does not exist contributes nothing.

    Returns:
    --------
    set of symbol strings (the union across both folders)
    """
    settings = CONFIG
    sources = (
        (settings["input_parquet_dir"], "*.parquet"),
        (settings["input_csv_dir"], "*.csv"),
    )

    found = set()
    for folder, pattern in sources:
        if folder.exists():
            found.update(entry.stem for entry in folder.glob(pattern))
    return found

# ============================================================
# Main
# ============================================================

def main():
    """
    Run step 4a: add USD risk volume columns to every instrument file.

    Checks that at least one input folder exists, creates the output folders,
    loads the contract multipliers, then for each symbol found in the input
    folders reads the file, adds the risk-volume columns, reorders columns
    and writes parquet + csv output. Symbols without a multiplier are
    skipped; an error on one symbol is reported and counted as skipped so the
    remaining symbols are still processed. Progress and a final summary are
    printed to stdout.

    Raises:
    -------
    FileNotFoundError : neither input folder exists, or the contracts file is missing
    RuntimeError : the input folders contain no instrument files
    """
    cfg = CONFIG

    print("=" * 80)
    print("USD RISK VOLUME CALCULATION - STEP 4a")
    print("=" * 80)
    print()

    # Check input directory exists
    if not cfg["input_parquet_dir"].exists() and not cfg["input_csv_dir"].exists():
        raise FileNotFoundError(
            f"Input directories not found:\n"
            f"  - {cfg['input_parquet_dir']}\n"
            f"  - {cfg['input_csv_dir']}\n"
            f"Please run 4-futures_prices_volatility.ipynb first."
        )

    # Create output directories
    ensure_output_dirs()
    print()

    # Load contract multipliers
    print(f"Loading contract multipliers from: {cfg['contracts_file']}")
    multipliers = load_contract_multipliers(cfg["contracts_file"])
    print(f"✓ Loaded {len(multipliers)} contract multipliers")
    print()

    # Get list of instruments
    symbols = get_instrument_list()
    if not symbols:
        raise RuntimeError("No instrument files found in input directories")

    print(f"Found {len(symbols)} instruments to process")
    print("Parameters:")
    print(f"  - Rolling volume windows: {cfg['rolling_volume_window_short']}d, {cfg['rolling_volume_window_long']}d")
    print(f"  - Trading days per year: {cfg['trading_days_per_year']}")
    print(f"  - FX rate: {cfg['fx_rate']}")
    print()

    # Process each instrument
    print("Processing instruments...")
    print("-" * 80)

    processed = 0
    skipped = 0

    for symbol in sorted(symbols):
        # load_contract_multipliers stores keys stripped and upper-cased, so
        # normalise the file stem the same way; otherwise "es.parquet" would
        # never match "ES". Checked before reading to avoid loading a file
        # that would be skipped anyway.
        multiplier = multipliers.get(symbol.strip().upper())
        if multiplier is None:
            print(f"  ⚠️  {symbol}: No multiplier found, skipping")
            skipped += 1
            continue

        try:
            # Load data
            df = read_instrument(symbol)

            # Add risk volume measures
            df = add_risk_volume_measures(df, multiplier)

            # Reorder columns
            df = reorder_columns(df)

            # Write output
            write_instrument(symbol, df)

            # An empty frame has no "latest" row; report it as N/A instead of
            # failing on .iloc[-1] after the files were already written.
            if df.empty or pd.isna(df['usd_risk_volume'].iloc[-1]):
                print(f"  ✓ {symbol:8s} Processed (USD risk N/A in recent data)")
            else:
                recent_usd_risk = df['usd_risk_volume'].iloc[-1]
                recent_vol_60 = df['rolling_avg_volume_60d'].iloc[-1]
                recent_price = df[cfg['close_col']].iloc[-1]
                print(f"  ✓ {symbol:8s} USD risk: ${recent_usd_risk:>15,.0f}   "
                      f"(vol60d={recent_vol_60:>8,.0f}, price=${recent_price:>8.2f})")

            processed += 1

        except Exception as e:
            # Deliberate best-effort boundary: one bad instrument must not
            # abort the whole batch. The failure is reported and counted.
            print(f"  ❌ {symbol}: Error - {e}")
            skipped += 1

    print("-" * 80)
    print()

    # Summary
    print("=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Successfully processed: {processed} instruments")
    print(f"Skipped: {skipped} instruments")
    print()
    print("Output written to:")
    print(f"  - {cfg['output_root'] / 'parquet'}")
    print(f"  - {cfg['output_root'] / 'csv'}")
    print()
    print("New columns added to each file:")
    for col in cfg["new_columns"]:
        print(f"  - {col}")
    print()
    print("=" * 80)
    print("NEXT STEPS")
    print("=" * 80)
    print("Update your backtest (step 5) CONFIG to use the new directory:")
    print()
    print('  "signal_vol_prices_dir": Path("./04a-risk_volume/norgate_continuous/parquet"),')
    print()
    print("This will give you access to time-series USD risk volume for filtering.")
    print("=" * 80)

# Entry point: run the pipeline only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()

USD RISK VOLUME CALCULATION - STEP 4a

✓ Output directories created:
  - 04a-risk_volume\norgate_continuous\parquet
  - 04a-risk_volume\norgate_continuous\csv

Loading contract multipliers from: 01-futures_universe\futures_contracts_full.parquet
✓ Loaded 105 contract multipliers

Found 105 instruments to process
Parameters:
  - Rolling volume windows: 20d, 60d
  - Trading days per year: 256
  - FX rate: 1.0

Processing instruments...
--------------------------------------------------------------------------------
  ✓ 6A       USD risk: $    366,061,351   (vol60d=  91,052, price=$    0.67)
  ✓ 6B       USD risk: $    378,021,810   (vol60d=  91,504, price=$    1.34)
  ✓ 6C       USD risk: $    227,891,981   (vol60d=  71,794, price=$    0.72)
  ✓ 6E       USD risk: $  1,452,077,944   (vol60d= 189,078, price=$    1.17)
  ✓ 6J       USD risk: $    824,876,137   (vol60d= 154,799, price=$    0.63)
  ✓ 6M       USD risk: $     90,211,480   (vol60d=  53,541, price=$    0.06)
  ✓ 6N       USD ri