In [1]:
#!/usr/bin/env python3
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime

"""
Purpose:
    For each MA100-enriched universe (per regression lookback), merge Jump90 signals
    and save a new parquet per lookback.

Inputs (per lookback):
    ./10a-multiple_100MA_enriched_universe/lookback_XXD/enriched_universe_ma100.parquet

Jump90 per ticker:
    ./6-90Day_jump_filter_adjusted_all_prices/{TICKER}.parquet

Outputs (per lookback):
    ./11a-multiple_universe_with_jump90/lookback_XXD/universe_with_jump90.parquet

Validation (per lookback + summary):
    ./system_verification/11a-multiple_universe_with_jump90/lookback_XXD/validation_<timestamp>.csv
    ./system_verification/11a-multiple_universe_with_jump90/validation_summary_<timestamp>.csv
"""

# ============================================================
# CONFIG
# ============================================================

# File names shared by every lookback folder.
MA_FILENAME  = "enriched_universe_ma100.parquet"   # written by the prior (MA100) step
OUT_FILENAME = "universe_with_jump90.parquet"

# Input locations.
MA_ENRICHED_ROOT = "./10a-multiple_100MA_enriched_universe"
JUMP90_DIR       = "./6-90Day_jump_filter_adjusted_all_prices"

# Output locations (merged universes and their validation logs).
OUTPUT_ROOT      = "./11a-multiple_universe_with_jump90"
VER_ROOT         = "./system_verification/11a-multiple_universe_with_jump90"

# Both output roots must exist before anything is written beneath them.
for _required_dir in (OUTPUT_ROOT, VER_ROOT):
    os.makedirs(_required_dir, exist_ok=True)

print("=== Building Enriched Universes (Add Jump90 Signals — No Filtering) per Lookback ===")

# ============================================================
# HELPERS
# ============================================================

def discover_lookback_dirs(root: str):
    """
    List the immediate subdirectories of `root` whose names look like
    lookback_<N>D (lookback_60D, lookback_90D, ...), ordered by ascending N.

    Plain files and directories with any other name are skipped.
    """
    name_rx = re.compile(r"^lookback_(\d+)D$")
    window_by_name = {}
    for entry in os.listdir(root):
        if not os.path.isdir(os.path.join(root, entry)):
            continue
        hit = name_rx.match(entry)
        if hit is None:
            continue
        window_by_name[entry] = int(hit.group(1))
    # sorted() is stable, so names with equal windows keep their listing order.
    return sorted(window_by_name, key=window_by_name.get)

def safe_read_parquet(path: str):
    """
    Read a parquet file on a best-effort basis.

    Returns the DataFrame produced by pd.read_parquet(path), or None if that
    call raises any Exception. The failure reason is printed so that a corrupt
    file or a missing parquet engine is not mistaken for "no data".
    """
    try:
        return pd.read_parquet(path)
    except Exception as exc:
        # Callers treat None as "skip this file", so report and carry on
        # instead of re-raising.
        print(f"⚠ Failed to read parquet {path}: {type(exc).__name__}: {exc}")
        return None

# ============================================================
# 1) LOAD JUMP90 FILES ONCE
# ============================================================

print("\nLoading Jump90 files...")

# Columns each Jump90 file must contain. The same list fixes which columns are
# kept and in what order, so it is defined once instead of on every iteration.
jump_cols = ["date", "pct_change", "abs_pct", "abs_rollmax_90", "no_big_jump_90"]
parquet_ext = ".parquet"

jump_map = {}         # ticker -> Jump90 frame restricted to jump_cols
bad_jump_files = []   # tickers whose file was unreadable or lacked a required column

for fname in os.listdir(JUMP90_DIR):
    if not fname.endswith(parquet_ext):
        continue

    # Strip only the trailing extension. str.replace would also remove any
    # earlier ".parquet" occurrence inside the file name.
    ticker = fname[: -len(parquet_ext)]
    fpath = os.path.join(JUMP90_DIR, fname)

    tmp = safe_read_parquet(fpath)
    if tmp is None:
        bad_jump_files.append(ticker)
        continue

    if not set(jump_cols).issubset(tmp.columns):
        bad_jump_files.append(ticker)
        continue

    tmp = tmp[jump_cols].copy()
    # Normalise to datetime so the later merge on "date" compares like with like.
    tmp["date"] = pd.to_datetime(tmp["date"])
    jump_map[ticker] = tmp

print(f"Loaded Jump90: {len(jump_map)} tickers")
print(f"Unreadable/invalid Jump90 files: {len(bad_jump_files)} tickers")

# ============================================================
# 2) DISCOVER MA-ENRICHED UNIVERSE FILES (PER LOOKBACK)
# ============================================================

lookback_dirs = discover_lookback_dirs(MA_ENRICHED_ROOT)

# Each job is a (lookback label, path to MA-enriched parquet) pair.
jobs = []
if not lookback_dirs:
    # Optional fallback: a single MA file placed directly under the root,
    # processed under the label "root".
    root_file = os.path.join(MA_ENRICHED_ROOT, MA_FILENAME)
    if os.path.exists(root_file):
        jobs.append(("root", root_file))
else:
    for lookback_name in lookback_dirs:
        candidate = os.path.join(MA_ENRICHED_ROOT, lookback_name, MA_FILENAME)
        if not os.path.exists(candidate):
            print(f"⚠ Missing MA-enriched file for {lookback_name}: {candidate}")
            continue
        jobs.append((lookback_name, candidate))

# Nothing to do is treated as an error rather than a silent no-op.
if len(jobs) == 0:
    raise FileNotFoundError(
        f"No MA-enriched universe files found. Expected "
        f"{MA_ENRICHED_ROOT}/lookback_XXD/{MA_FILENAME} (or {MA_ENRICHED_ROOT}/{MA_FILENAME})."
    )

print(f"\nFound {len(jobs)} MA-enriched universes to process.")

# ============================================================
# 3) PROCESS EACH MA-ENRICHED UNIVERSE AND MERGE JUMP90
# ============================================================

# One timestamp for the whole run so every validation file shares a suffix.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
summary_rows = []

# Jump90 signal columns attached to every universe row (NaN when unavailable).
jump_signal_cols = ["pct_change", "abs_pct", "abs_rollmax_90", "no_big_jump_90"]

# Final column order. Columns absent from the merged frame are skipped and
# columns not listed here are dropped from the output.
desired_cols = [
    "date",
    "open_adj", "high_adj", "low_adj", "close_adj", "volume",
    "ma100", "above_ma100",
    "pct_change", "abs_pct", "abs_rollmax_90", "no_big_jump_90",
    "slope_annual", "r2", "slope_adj",
    "ticker"
]

for lookback, ma_path in jobs:
    print("\n----------------------------------------------------")
    print(f"Processing lookback: {lookback}")
    print(f"Input MA file: {ma_path}")
    print("----------------------------------------------------")

    df = safe_read_parquet(ma_path)
    if df is None or df.empty:
        print(f"❌ Could not read or empty input: {ma_path}")
        continue

    if "date" not in df.columns or "ticker" not in df.columns:
        print(f"❌ Input missing required columns 'date'/'ticker': {ma_path}")
        continue

    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Merge Jump90 one ticker at a time: each ticker has its own Jump90 frame
    # keyed by date only.
    rows = []
    missing_jump_tickers = []

    # dropna=False keeps rows whose ticker is null. The groupby default would
    # silently drop them, which contradicts "No Filtering" and would leave
    # pd.concat with nothing to concatenate if every ticker were null.
    for ticker, sub in df.groupby("ticker", sort=False, dropna=False):
        jump = jump_map.get(ticker)
        if jump is None:
            # No Jump90 data for this ticker: keep its rows, signals all NaN.
            merged = sub.assign(**{col: np.nan for col in jump_signal_cols})
            missing_jump_tickers.append(ticker)
        else:
            # Left merge keeps every universe row; unmatched dates get NaN.
            merged = sub.merge(jump, on="date", how="left")

        rows.append(merged)

    df2 = pd.concat(rows, ignore_index=True)

    # Apply the canonical column order, keeping only columns that exist.
    existing_cols = [c for c in desired_cols if c in df2.columns]
    df2 = df2[existing_cols].copy()

    # sort
    df2 = df2.sort_values(["date", "ticker"]).reset_index(drop=True)

    # Output dirs per lookback; the "root" fallback job writes to the roots.
    out_dir = OUTPUT_ROOT if lookback == "root" else os.path.join(OUTPUT_ROOT, lookback)
    ver_dir = VER_ROOT if lookback == "root" else os.path.join(VER_ROOT, lookback)
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(ver_dir, exist_ok=True)

    out_path = os.path.join(out_dir, OUT_FILENAME)
    df2.to_parquet(out_path, index=False)

    # Validation per lookback. rows_input vs rows_after_merge exposes any row
    # inflation (e.g. duplicate dates in a Jump90 file); -1 marks a null count
    # that could not be computed because the column is absent.
    val = pd.DataFrame([{
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "lookback": lookback,
        "input_ma_file": ma_path,
        "rows_input": int(df.shape[0]),
        "rows_after_merge": int(df2.shape[0]),
        "unique_tickers": int(df["ticker"].nunique()),
        "missing_jump_tickers": int(len(set(missing_jump_tickers))),
        "null_pct_change": int(df2["pct_change"].isna().sum()) if "pct_change" in df2.columns else -1,
        "null_no_big_jump_90": int(df2["no_big_jump_90"].isna().sum()) if "no_big_jump_90" in df2.columns else -1,
        "output_file": out_path,
    }])

    val_path = os.path.join(ver_dir, f"validation_{ts}.csv")
    val.to_csv(val_path, index=False)

    print(f"✔ Saved → {out_path}")
    print(f"✔ Validation log → {val_path}")

    summary_rows.append(val.iloc[0].to_dict())

# ============================================================
# 4) SAVE OVERALL SUMMARY LOG
# ============================================================

# One row per successfully processed lookback, written next to the
# per-lookback validation folders and tagged with the run timestamp.
summary_path = os.path.join(VER_ROOT, f"validation_summary_{ts}.csv")
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(summary_path, index=False)

print(
    "\n=== COMPLETED ===",
    f"Saved summary validation log → {summary_path}",
    f"Processed datasets: {summary_df.shape[0]:,}",
    sep="\n",
)


=== Building Enriched Universes (Add Jump90 Signals — No Filtering) per Lookback ===

Loading Jump90 files...
Loaded Jump90: 1167 tickers
Unreadable/invalid Jump90 files: 0 tickers

Found 5 MA-enriched universes to process.

----------------------------------------------------
Processing lookback: lookback_60D
Input MA file: ./10a-multiple_100MA_enriched_universe\lookback_60D\enriched_universe_ma100.parquet
----------------------------------------------------
✔ Saved → ./11a-multiple_universe_with_jump90\lookback_60D\universe_with_jump90.parquet
✔ Validation log → ./system_verification/11a-multiple_universe_with_jump90\lookback_60D\validation_20251231-091148.csv

----------------------------------------------------
Processing lookback: lookback_90D
Input MA file: ./10a-multiple_100MA_enriched_universe\lookback_90D\enriched_universe_ma100.parquet
----------------------------------------------------
✔ Saved → ./11a-multiple_universe_with_jump90\lookback_90D\universe_with_jump90.parquet
✔