# In [1]:
#!/usr/bin/env python3
import os
from pathlib import Path
import pandas as pd
import numpy as np

"""
===============================================================================
ADD DAILY CLOSE-TO-CLOSE CHANGE (POINTS + PERCENT) TO CONTINUOUS FUTURES FILES
===============================================================================

Input options:
1) Per-symbol files created by your notebook:
   ./02-futures_prices/norgate_continuous/parquet/<SYMBOL>.parquet
   ./02-futures_prices/norgate_continuous/csv/<SYMBOL>.csv

2) Consolidated file (if you enabled SAVE_ALL_* in the notebook):
   ./02-futures_prices/norgate_continuous/norgate_continuous_all.parquet (or .csv)

Outputs (per symbol):
- CSV:     OUTPUT_ROOT/csv/<SYMBOL>.csv
- Parquet: OUTPUT_ROOT/parquet/<SYMBOL>.parquet

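Appends (two new columns per symbol):
- daily_prices_change_pts      (close minus the previous row's close)
- daily_prices_change_percent  (fractional change vs. the previous close, e.g. 0.01 = +1%)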
"""

# ============================================================
# Configuration
# ============================================================

# Which input layout to read: "per_symbol_dir" or "consolidated_file".
INPUT_MODE = "per_symbol_dir"

# Shared root of all inputs (private helper for the paths below).
_NORGATE_INPUT_ROOT = Path("./02-futures_prices/norgate_continuous")

# Used when INPUT_MODE == "per_symbol_dir": one file per symbol in each folder.
INPUT_PARQUET_DIR = _NORGATE_INPUT_ROOT / "parquet"
INPUT_CSV_DIR = _NORGATE_INPUT_ROOT / "csv"
# True  -> read <SYMBOL>.parquet when it exists, otherwise fall back to CSV.
# False -> read <SYMBOL>.csv when it exists, otherwise fall back to parquet.
PREFER_PARQUET_INPUT = True

# Used when INPUT_MODE == "consolidated_file"; a .csv path is accepted as well.
CONSOLIDATED_PATH = _NORGATE_INPUT_ROOT / "norgate_continuous_all.parquet"

# Outputs go to a separate tree, so the original input files are never modified.
OUTPUT_ROOT = Path("./03-futures_price_changes/norgate_continuous/daily_changes")
OUTPUT_CSV_DIR = OUTPUT_ROOT.joinpath("csv")
OUTPUT_PARQUET_DIR = OUTPUT_ROOT.joinpath("parquet")

# Names of the columns the processing relies on (after lowercasing).
DATE_COL = "date"
CLOSE_COL = "close"

# Optional columns: canonical name -> accepted spellings, checked in order.
# Add further spellings here if your files name these columns differently.
COL_ALIASES = {
    "delivery month": [
        "delivery month",
        "delivery_month",
        "deliverymonth",
        "contract_month",
        "delivery",
    ],
    "open interest": [
        "open interest",
        "open_interest",
        "openinterest",
        "oi",
    ],
}

# ============================================================
# Implementation
# ============================================================

def ensure_dirs():
    """Create the CSV and Parquet output directories if they are missing."""
    for target in (OUTPUT_CSV_DIR, OUTPUT_PARQUET_DIR):
        # parents=True builds the whole chain; exist_ok=True makes reruns harmless.
        target.mkdir(parents=True, exist_ok=True)

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Lowercase and whitespace-strip every column label of ``df``.

    Labels are converted with ``str`` first, so non-string labels become
    strings. The labels are replaced in place and the same DataFrame object
    is returned for call chaining.
    """
    as_text = map(str, df.columns)
    df.columns = [label.strip().lower() for label in as_text]
    return df

def ensure_date_column(df: pd.DataFrame) -> pd.DataFrame:
    """Expose the date as a parsed ``DATE_COL`` column and sort rows by it.

    If the frame has a ``DatetimeIndex`` and no ``DATE_COL`` column, the index
    is moved into a column named ``DATE_COL``. The column is then parsed with
    ``pd.to_datetime`` (unparseable values become NaT), rows with a missing
    date are dropped, and the frame is sorted ascending by date.

    Args:
        df: Frame whose column labels have already been normalized.

    Returns:
        The date-sorted frame. If no ``DATE_COL`` column can be established,
        the frame is returned unchanged.
    """
    if isinstance(df.index, pd.DatetimeIndex) and (DATE_COL not in df.columns):
        # Name the index explicitly before resetting it: an unnamed index would
        # otherwise become a column called "index", and an index named e.g.
        # "Date" would keep that capitalised name, so the date handling below
        # (and the later column reordering) would silently skip it.
        df = df.rename_axis(index=DATE_COL).reset_index()
    if DATE_COL in df.columns:
        df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
        df = df.dropna(subset=[DATE_COL]).sort_values(DATE_COL)
    return df

def pick_or_create_optional_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Guarantee that every optional column in ``COL_ALIASES`` is present.

    For each canonical name, the first alias found among the frame's columns
    is renamed to the canonical name. If no alias is present, the canonical
    column is added and filled with NaN.
    """
    for canonical, aliases in COL_ALIASES.items():
        match = next((name for name in aliases if name in df.columns), None)
        if match is None:
            # None of the accepted spellings exists: add an all-NaN placeholder.
            df[canonical] = np.nan
            continue
        if match != canonical:
            df = df.rename(columns={match: canonical})
    return df

def add_daily_changes(df: pd.DataFrame) -> pd.DataFrame:
    """Append row-over-row close changes in points and as a fraction.

    The close column is coerced to numeric (unparseable values become NaN) and
    two columns are added to ``df`` in place:

    * ``daily_prices_change_pts``: close minus the previous row's close.
    * ``daily_prices_change_percent``: ``Series.pct_change`` of the close,
      i.e. a fraction (0.01 means +1%), not a value scaled by 100.

    Rows are compared in their current order, so the frame must already be
    sorted by date and contain a single symbol.

    Args:
        df: Price frame containing the ``CLOSE_COL`` column.

    Returns:
        The same frame with the two change columns added.

    Raises:
        ValueError: If the ``CLOSE_COL`` column is missing.
    """
    if CLOSE_COL not in df.columns:
        raise ValueError(f"Missing '{CLOSE_COL}' column. Available columns: {list(df.columns)}")

    df[CLOSE_COL] = pd.to_numeric(df[CLOSE_COL], errors="coerce")
    close = df[CLOSE_COL]
    df["daily_prices_change_pts"] = close.diff()
    # fill_method=None: do not forward-fill missing closes before computing the
    # change. The legacy default padded NaNs, which reported a 0 change on a
    # row whose close is missing while diff() reports NaN for the same row,
    # and it triggers a FutureWarning on pandas 2.1+.
    df["daily_prices_change_percent"] = close.pct_change(fill_method=None)
    return df

def reorder_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with the standard price columns first.

    Columns from the preferred list that exist in ``df`` come first, in the
    preferred order; all other columns follow in their original order.
    """
    preferred = (
        "date", "open", "high", "low", "close", "volume",
        "delivery month", "open interest",
        "daily_prices_change_pts", "daily_prices_change_percent",
    )
    leading = [name for name in preferred if name in df.columns]
    placed = set(leading)
    # Whatever was not placed up front keeps its existing relative order.
    trailing = [name for name in df.columns if name not in placed]
    ordered = leading + trailing
    return df[ordered]

def read_symbol_file(sym: str) -> pd.DataFrame:
    """Load one symbol's price file and normalize it.

    The parquet file is tried first when ``PREFER_PARQUET_INPUT`` is true,
    otherwise the CSV file is tried first; the other format is the fallback.
    The loaded frame gets normalized column labels and a parsed, sorted date
    column.

    Raises:
        FileNotFoundError: If neither a parquet nor a CSV file exists for ``sym``.
    """
    parquet_path = INPUT_PARQUET_DIR / f"{sym}.parquet"
    csv_path = INPUT_CSV_DIR / f"{sym}.csv"

    # Candidate files in order of preference, each paired with its reader.
    candidates = [(parquet_path, pd.read_parquet), (csv_path, pd.read_csv)]
    if not PREFER_PARQUET_INPUT:
        candidates.reverse()

    for path, reader in candidates:
        if path.exists():
            frame = reader(path)
            return ensure_date_column(normalize_columns(frame))

    raise FileNotFoundError(f"No input found for {sym} in {INPUT_PARQUET_DIR} or {INPUT_CSV_DIR}")

def write_symbol_outputs(sym: str, df: pd.DataFrame):
    """Write ``df`` as ``<sym>.csv`` and ``<sym>.parquet`` in the output folders.

    Both files are written without the DataFrame index; the parquet file uses
    snappy compression.
    """
    csv_target = OUTPUT_CSV_DIR.joinpath(f"{sym}.csv")
    parquet_target = OUTPUT_PARQUET_DIR.joinpath(f"{sym}.parquet")

    df.to_csv(csv_target, index=False)
    df.to_parquet(parquet_target, index=False, compression="snappy")

def run_per_symbol_dir():
    """Process every symbol found in the per-symbol parquet/CSV input folders.

    Symbols are the file stems of ``*.parquet`` and ``*.csv`` files in the two
    input folders. Each symbol is loaded, given the optional columns and the
    daily change columns, reordered, and written to the output folders.

    Raises:
        RuntimeError: If no per-symbol input file is found in either folder.
    """
    sources = ((INPUT_PARQUET_DIR, "*.parquet"), (INPUT_CSV_DIR, "*.csv"))
    symbols = {
        path.stem
        for folder, pattern in sources
        if folder.exists()
        for path in folder.glob(pattern)
    }

    if not symbols:
        raise RuntimeError(f"No per-symbol files found in {INPUT_PARQUET_DIR} or {INPUT_CSV_DIR}")

    print(f"Found {len(symbols)} symbols.")

    for symbol in sorted(symbols):
        frame = pick_or_create_optional_cols(read_symbol_file(symbol))
        frame = reorder_columns(add_daily_changes(frame))
        write_symbol_outputs(symbol, frame)
    print(f"Done. Wrote per-symbol outputs to: {OUTPUT_ROOT}")

def run_consolidated_file():
    """Split the consolidated file by symbol and write per-symbol change files.

    The file at ``CONSOLIDATED_PATH`` is read as parquet when its suffix is
    ``.parquet`` and as CSV otherwise. Rows are grouped by the stripped,
    upper-cased value of the ``symbol`` column; rows with a missing symbol are
    skipped. Each group is processed independently so the changes never span
    two symbols, then written under the normalized symbol name.

    Raises:
        FileNotFoundError: If ``CONSOLIDATED_PATH`` does not exist.
        ValueError: If the file has no ``symbol`` column.
    """
    if not CONSOLIDATED_PATH.exists():
        raise FileNotFoundError(f"Consolidated file not found: {CONSOLIDATED_PATH}")

    if CONSOLIDATED_PATH.suffix.lower() == ".parquet":
        big = pd.read_parquet(CONSOLIDATED_PATH)
    else:
        big = pd.read_csv(CONSOLIDATED_PATH)

    big = normalize_columns(big)
    big = ensure_date_column(big)

    if "symbol" not in big.columns:
        raise ValueError("Consolidated file must contain a 'symbol' column (written by your notebook).")

    # Group on the normalized symbol rather than normalizing after grouping.
    # Otherwise raw variants such as "es" and "ES " form separate groups that
    # map to the same output file, and the later group overwrites the earlier.
    # Missing symbols stay NaN so groupby keeps skipping those rows.
    raw_symbols = big["symbol"]
    symbol_key = raw_symbols.astype(str).str.strip().str.upper().where(raw_symbols.notna())

    # Process each symbol independently so diff/pct_change never bleeds across symbols.
    for sym, df in big.groupby(symbol_key.to_numpy(), sort=True):
        df = df.copy()
        df = pick_or_create_optional_cols(df)
        df = df.sort_values(DATE_COL)
        df = add_daily_changes(df)
        df = reorder_columns(df)
        write_symbol_outputs(sym, df)

    print(f"Done. Wrote per-symbol outputs to: {OUTPUT_ROOT}")

def main():
    """Create the output folders, then run the pipeline chosen by ``INPUT_MODE``.

    Raises:
        ValueError: If ``INPUT_MODE`` is not one of the two supported modes.
    """
    ensure_dirs()

    # Map each supported mode to the function that handles it.
    runners = {
        "per_symbol_dir": run_per_symbol_dir,
        "consolidated_file": run_consolidated_file,
    }
    runner = runners.get(INPUT_MODE)
    if runner is None:
        raise ValueError("INPUT_MODE must be 'per_symbol_dir' or 'consolidated_file'")
    runner()

if __name__ == "__main__":
    main()


# Captured output of the cell above:
# Found 105 symbols.
# Done. Wrote per-symbol outputs to: 03-futures_price_changes\norgate_continuous\daily_changes
