In [1]:
#!/usr/bin/env python3
import os
import re
import pandas as pd
from datetime import datetime

"""
Purpose:
    Trim each universe_with_jump90.parquet (per regression lookback) to only S&P 500
    constituents on each date, keeping only rows dated 1998-01-01 or later, and save
    the output file with the lookback in the filename.

Inputs (per lookback):
    ./11a-multiple_universe_with_jump90/lookback_XXD/universe_with_jump90.parquet

Membership:
    ./1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.parquet
    columns: first_join_date, last_exit_date
    ticker may be index or column.

Outputs (one file per lookback, filename includes lookback):
    ./12a-multiple-tradable_sp500_universe/12-tradable_sp500_universe_XXD.parquet

Validation:
    ./system_verification/12a-multiple-tradable_sp500_universe/validation_XXD_<timestamp>.csv
    ./system_verification/12a-multiple-tradable_sp500_universe/validation_summary_<timestamp>.csv
"""

# ============================================================
# CONFIG
# ============================================================

# --- inputs ---------------------------------------------------
# Root folder scanned for lookback_XXD sub-folders, and the file expected in each.
INPUT_ROOT      = "./11a-multiple_universe_with_jump90"
INPUT_FILENAME  = "universe_with_jump90.parquet"
# Per-ticker membership window (first_join_date / last_exit_date).
MEMBERSHIP_FILE = "./1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.parquet"

# --- outputs --------------------------------------------------
OUTPUT_DIR      = "./12a-multiple-tradable_sp500_universe"
VER_DIR         = "./system_verification/12a-multiple-tradable_sp500_universe"

# --- filter ---------------------------------------------------
# Rows dated before this timestamp are dropped from every universe.
DATE_CUTOFF = pd.Timestamp("1998-01-01")

# Both output folders are created up front; existing folders are left as they are.
for _out_dir in (OUTPUT_DIR, VER_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print("=== BUILDING TRADABLE SP500 UNIVERSE (post-1998) for EACH LOOKBACK ===")

# ============================================================
# HELPERS
# ============================================================

def discover_lookback_dirs(root: str):
    """Find lookback sub-directories of *root*, ordered by window length.

    A sub-directory qualifies when its name is ``lookback_<N>D`` with ``<N>``
    a run of digits (e.g. ``lookback_60D``). Plain files and directories with
    any other name are ignored. The scan is not recursive.

    Args:
        root: Directory to scan.

    Returns:
        List of ``(window_int, dir_name)`` tuples sorted by ascending
        ``window_int`` (ties broken by ``dir_name`` so the order is
        deterministic). Empty when *root* does not exist, is not a directory,
        or contains no matching sub-directory.
    """
    if not os.path.isdir(root):
        # Treat a missing root as "nothing discovered" rather than letting
        # os.scandir raise, so callers can report the expected layout themselves.
        return []

    pat = re.compile(r"^lookback_(\d+)D$")
    found = []
    with os.scandir(root) as entries:
        for entry in entries:
            match = pat.match(entry.name)
            # is_dir() follows symlinks, matching os.path.isdir semantics.
            if match and entry.is_dir():
                found.append((int(match.group(1)), entry.name))

    # Tuple ordering: numeric window first, directory name as tie-breaker.
    found.sort()
    return found  # list of (window_int, dir_name)

def safe_read_parquet(path: str):
    """Read a parquet file, returning ``None`` instead of raising on failure.

    Args:
        path: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        The loaded ``DataFrame``, or ``None`` if ``pd.read_parquet`` raised
        any ``Exception``. The reason for the failure is printed.
    """
    try:
        return pd.read_parquet(path)
    except Exception as exc:
        # Best-effort read: keep the None contract for callers, but report the
        # cause so a corrupt file or a missing parquet engine is not mistaken
        # for a file that simply is not there.
        print(f"⚠ Could not read parquet file {path}: {type(exc).__name__}: {exc}")
        return None

# ============================================================
# 1) LOAD MEMBERSHIP TABLE (ONCE)
# ============================================================

m = safe_read_parquet(MEMBERSHIP_FILE)
if m is None or m.empty:
    raise FileNotFoundError(f"Could not read membership file: {MEMBERSHIP_FILE}")

# Normalize to a 'ticker' column (membership often uses ticker as index).
# reset_index() names an unnamed index "index"; a named index keeps its name.
if "ticker" not in m.columns:
    index_label = "index" if m.index.name is None else m.index.name
    m = m.reset_index().rename(columns={index_label: "ticker"})

need_cols = {"ticker", "first_join_date", "last_exit_date"}
if not need_cols.issubset(m.columns):
    raise ValueError(f"Membership file missing columns: {need_cols - set(m.columns)}")

# Keep only the three columns used for the membership window, with real datetimes
# so they can be compared against the universe 'date' column.
m = m[["ticker", "first_join_date", "last_exit_date"]].copy()
m["first_join_date"] = pd.to_datetime(m["first_join_date"])
m["last_exit_date"]  = pd.to_datetime(m["last_exit_date"])

# The per-universe step left-merges this table on 'ticker'. A ticker that appears
# more than once here would repeat every matching universe row, so flag it loudly.
dup_count = int(m["ticker"].duplicated().sum())
if dup_count:
    print(
        f"⚠ Membership table has {dup_count:,} duplicate ticker rows; "
        "the per-universe merge will repeat universe rows for those tickers."
    )

print(f"Loaded membership table: {len(m):,} tickers")

# ============================================================
# 2) DISCOVER INPUT UNIVERSES (PER LOOKBACK)
# ============================================================

lookbacks = discover_lookback_dirs(INPUT_ROOT)

# Each job is (window_int, dir_label, parquet_path).
jobs = []
for window, dir_name in lookbacks:
    candidate = os.path.join(INPUT_ROOT, dir_name, INPUT_FILENAME)
    if not os.path.exists(candidate):
        print(f"⚠ Missing input universe for {dir_name}: {candidate}")
        continue
    jobs.append((window, dir_name, candidate))

if not lookbacks:
    # Optional fallback (older pipeline): a single universe file directly under
    # INPUT_ROOT. It is registered with the sentinel window -1 and label "root".
    legacy_file = os.path.join(INPUT_ROOT, INPUT_FILENAME)
    if os.path.exists(legacy_file):
        jobs.append((-1, "root", legacy_file))

# Nothing usable on disk: stop and describe the expected layout.
if len(jobs) == 0:
    raise FileNotFoundError(
        f"No input universes found. Expected {INPUT_ROOT}/lookback_XXD/{INPUT_FILENAME} "
        f"(or {INPUT_ROOT}/{INPUT_FILENAME})."
    )

print(f"Found {len(jobs)} universes to trim.")

# ============================================================
# 3) PROCESS EACH UNIVERSE
# ============================================================

# One timestamp for the whole run so every validation file of a run shares a suffix.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
summary_rows = []

for w, lb_dir, input_path in jobs:
    # Only the root-level fallback job carries a negative window (-1); every
    # discovered lookback directory, including a 0-day one, keeps its numeric label.
    lookback_label = f"{w}D" if w >= 0 else "root"

    print("\n----------------------------------------------------")
    print(f"Processing lookback: {lb_dir}")
    print(f"Input: {input_path}")
    print("----------------------------------------------------")

    df = safe_read_parquet(input_path)
    if df is None or df.empty:
        print(f"❌ Could not read or empty: {input_path}")
        continue

    if "date" not in df.columns or "ticker" not in df.columns:
        print(f"❌ Missing required columns ('date','ticker') in: {input_path}")
        continue

    df["date"] = pd.to_datetime(df["date"])

    # Input statistics are both taken BEFORE any filtering so that
    # rows_input and unique_tickers_input describe the same frame.
    rows_in = int(len(df))
    tickers_in = int(df["ticker"].nunique())

    # 1) date cutoff
    df = df[df["date"] >= DATE_CUTOFF]
    rows_after_cutoff = int(len(df))

    # 2) merge membership join/exit dates
    # Left merge: universe rows whose ticker is absent from the membership table
    # are kept here with NaT dates and are filtered out by the mask below.
    df = df.merge(m, on="ticker", how="left")

    # 3) apply membership window (vectorized)
    # in_sp500 = join exists AND date >= join AND (exit is NaT OR date <= exit)
    join = df["first_join_date"]
    exit_ = df["last_exit_date"]
    d = df["date"]

    df["in_sp500"] = join.notna() & (d >= join) & (exit_.isna() | (d <= exit_))

    # NOTE: first_join_date, last_exit_date and in_sp500 stay in the saved frame.
    df2 = df[df["in_sp500"]]
    rows_out = int(len(df2))

    # Sort: date ↑ then slope_adj ↓ (if exists), otherwise date ↑ then ticker ↑
    if "slope_adj" in df2.columns:
        df2 = df2.sort_values(["date", "slope_adj"], ascending=[True, False])
    else:
        df2 = df2.sort_values(["date", "ticker"], ascending=[True, True])

    # ============================================================
    # SAVE OUTPUT (filename includes lookback)
    # ============================================================

    out_name = f"12-tradable_sp500_universe_{lookback_label}.parquet"
    out_path = os.path.join(OUTPUT_DIR, out_name)
    df2.to_parquet(out_path, index=False)

    # Validation log per lookback (filename includes lookback)
    # Counted on the post-cutoff, post-merge frame: rows with no join date.
    missing_membership_rows = int(df["first_join_date"].isna().sum())

    val = pd.DataFrame([{
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "lookback_dir": lb_dir,
        "lookback_label": lookback_label,
        "input_file": input_path,
        "output_file": out_path,
        "rows_input": rows_in,
        "rows_after_1998_cutoff": rows_after_cutoff,
        "rows_output_in_sp500": rows_out,
        "unique_tickers_input": tickers_in,
        "unique_tickers_output": int(df2["ticker"].nunique()),
        "rows_missing_membership_info": missing_membership_rows,
    }])

    val_path = os.path.join(VER_DIR, f"validation_{lookback_label}_{ts}.csv")
    val.to_csv(val_path, index=False)

    print(f"✔ Saved tradable SP500 universe → {out_path}")
    print(f"✔ Saved validation log          → {val_path}")

    # The same single-row record is reused for the run-level summary.
    summary_rows.append(val.iloc[0].to_dict())

# ============================================================
# 4) SAVE SUMMARY LOG
# ============================================================

# One row per universe that was processed successfully in the loop above;
# the file shares the run timestamp used by the per-lookback validation logs.
summary_path = os.path.join(VER_DIR, f"validation_summary_{ts}.csv")
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(summary_path, index=False)

print(
    "\n=== COMPLETED ===",
    f"Saved summary validation log → {summary_path}",
    f"Processed datasets: {len(summary_df):,}",
    sep="\n",
)


=== BUILDING TRADABLE SP500 UNIVERSE (post-1998) for EACH LOOKBACK ===
Loaded membership table: 1,192 tickers
Found 5 universes to trim.

----------------------------------------------------
Processing lookback: lookback_60D
Input: ./11a-multiple_universe_with_jump90\lookback_60D\universe_with_jump90.parquet
----------------------------------------------------
✔ Saved tradable SP500 universe → ./12a-multiple-tradable_sp500_universe\12-tradable_sp500_universe_60D.parquet
✔ Saved validation log          → ./system_verification/12a-multiple-tradable_sp500_universe\validation_60D_20251231-100020.csv

----------------------------------------------------
Processing lookback: lookback_90D
Input: ./11a-multiple_universe_with_jump90\lookback_90D\universe_with_jump90.parquet
----------------------------------------------------
✔ Saved tradable SP500 universe → ./12a-multiple-tradable_sp500_universe\12-tradable_sp500_universe_90D.parquet
✔ Saved validation log          → ./system_verification/12a