In [None]:
#!/usr/bin/env python3
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime

"""
Purpose:
    For each ranking dataset (one per regression lookback), add MA100 and above_ma100.

Inputs:
    Ranking datasets (per lookback):
        ./9a-multiple_regression_ranking_dataset/lookback_XXD/ranking_dataset.parquet

    MA100 per ticker:
        ./5-100D_MA_adjusted_all_prices/{TICKER}.parquet  with columns ['date','ma100']

Outputs (per lookback):
    ./10a-multiple_100MA_enriched_universe/lookback_XXD/enriched_universe_ma100.parquet
    ./system_verification/10a-multiple_100MA_enriched_universe/lookback_XXD/validation_<timestamp>.csv

Also writes:
    ./system_verification/10a-multiple_100MA_enriched_universe/validation_summary_<timestamp>.csv
"""

# ============================================================
# CONFIG
# ============================================================

# --- Inputs -------------------------------------------------
# Root holding one ranking dataset per lookback_XXD subdirectory.
RANKING_ROOT = "./9a-multiple_regression_ranking_dataset"
# Directory holding one {TICKER}.parquet MA100 file per ticker.
MA100_DIR = "./5-100D_MA_adjusted_all_prices"
RANKING_FILENAME = "ranking_dataset.parquet"

# --- Outputs ------------------------------------------------
# Enriched datasets and their validation logs mirror the lookback layout.
OUTPUT_ROOT = "./10a-multiple_100MA_enriched_universe"
VER_ROOT = "./system_verification/10a-multiple_100MA_enriched_universe"
OUT_FILENAME = "enriched_universe_ma100.parquet"

# Both output roots must exist before any per-lookback subdirectory is created.
for _required_dir in (OUTPUT_ROOT, VER_ROOT):
    os.makedirs(_required_dir, exist_ok=True)

print("=== Building Enriched Universes (Add MA100 + Above/Below MA100) per Lookback ===")

# ============================================================
# HELPERS
# ============================================================

def discover_lookback_dirs(root: str):
    """
    Return the names of the lookback subdirectories of `root`, ordered by window.

    A subdirectory qualifies when its name is exactly ``lookback_<digits>D``
    (e.g. lookback_60D, lookback_90D). Plain files with such a name are ignored.

    Args:
        root: Directory to scan (not scanned recursively).

    Returns:
        List of directory names (not full paths), sorted by the numeric window
        so that lookback_90D comes before lookback_120D. Names sharing the same
        numeric value (e.g. lookback_60D / lookback_060D) are ordered by name so
        the result does not depend on the OS directory listing order.
        An empty list is returned when `root` does not exist or is not a
        directory, so the caller can fall through to its own "nothing found"
        handling instead of failing on the directory listing.
    """
    if not os.path.isdir(root):
        return []

    pat = re.compile(r"^lookback_(\d+)D$")
    found = []
    for name in os.listdir(root):
        m = pat.match(name)
        # Check the cheap name pattern first; only matching names hit the filesystem.
        if m and os.path.isdir(os.path.join(root, name)):
            found.append((int(m.group(1)), name))

    # Tuple sort: numeric window first, directory name as a deterministic tie-break.
    found.sort()
    return [name for _, name in found]

def safe_read_parquet(path: str):
    """
    Read a parquet file, returning None instead of raising on failure.

    Args:
        path: Location of the parquet file.

    Returns:
        The DataFrame produced by pd.read_parquet, or None when the read
        raised any Exception. The failure reason is printed so that a missing
        engine, a corrupt file and a wrong path can be told apart in the run log.
    """
    try:
        return pd.read_parquet(path)
    except Exception as exc:
        # Deliberately best-effort: callers treat None as "skip this file",
        # but the cause is reported rather than silently discarded.
        print(f"⚠ Could not read parquet file {path}: {type(exc).__name__}: {exc}")
        return None

# ============================================================
# 1) LOAD MA100 FILES ONCE
# ============================================================

print("\nLoading MA100 files...")

# ticker -> DataFrame with columns ['date', 'ma100'], 'date' parsed to datetime.
ma100_map = {}
missing_ma100_files = []
# Tickers whose file could not be read, lacks the required columns,
# or has a 'date' column that cannot be parsed.
bad_ma100_files = []

# Sorted so the load order does not depend on the OS directory listing order.
for fname in sorted(os.listdir(MA100_DIR)):
    # Split off only the trailing extension: str.replace would also strip a
    # ".parquet" occurring elsewhere in the file name.
    ticker, ext = os.path.splitext(fname)
    if ext != ".parquet":
        continue

    fpath = os.path.join(MA100_DIR, fname)

    tmp = safe_read_parquet(fpath)
    if tmp is None:
        bad_ma100_files.append(ticker)
        continue

    if not {"date", "ma100"}.issubset(tmp.columns):
        bad_ma100_files.append(ticker)
        continue

    # Keep only the two columns needed for the merge; copy so the assignment
    # below does not write into a slice of the frame that was read.
    tmp = tmp[["date", "ma100"]].copy()
    try:
        tmp["date"] = pd.to_datetime(tmp["date"])
    except (ValueError, TypeError):
        # One file with unparseable dates should be recorded as bad,
        # not abort the load of every other ticker.
        bad_ma100_files.append(ticker)
        continue
    ma100_map[ticker] = tmp

print(f"Loaded MA100 for {len(ma100_map)} tickers.")
print(f"Unreadable/invalid MA100 files: {len(bad_ma100_files)} tickers.")

# ============================================================
# 2) DISCOVER RANKING DATASETS (PER LOOKBACK)
# ============================================================

lookback_dirs = discover_lookback_dirs(RANKING_ROOT)

# Each job is a (label, path) pair: the label is the lookback directory name,
# or "root" when a single ranking file sits directly under RANKING_ROOT.
ranking_jobs = []
if not lookback_dirs:
    # No lookback subdirectories at all: optionally fall back to one root-level file.
    root_file = os.path.join(RANKING_ROOT, RANKING_FILENAME)
    if os.path.exists(root_file):
        ranking_jobs.append(("root", root_file))
else:
    for lb in lookback_dirs:
        rfile = os.path.join(RANKING_ROOT, lb, RANKING_FILENAME)
        if not os.path.exists(rfile):
            print(f"⚠ Missing ranking file for {lb}: {rfile}")
            continue
        ranking_jobs.append((lb, rfile))

# Nothing to process is a hard error: there is no output this script could produce.
if len(ranking_jobs) == 0:
    raise FileNotFoundError(
        f"No ranking datasets found. Expected {RANKING_ROOT}/lookback_XXD/{RANKING_FILENAME} "
        f"(or {RANKING_ROOT}/{RANKING_FILENAME})."
    )

print(f"\nFound {len(ranking_jobs)} ranking datasets to process.")

# ============================================================
# 3) PROCESS EACH RANKING DATASET
# ============================================================

# One timestamp for the whole run, so every per-lookback validation file
# and the summary written afterwards share the same suffix.
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
summary_rows = []

for lookback, ranking_path in ranking_jobs:
    print("\n----------------------------------------------------")
    print(f"Processing lookback: {lookback}")
    print(f"Ranking file: {ranking_path}")
    print("----------------------------------------------------")

    df = safe_read_parquet(ranking_path)
    if df is None or df.empty:
        print(f"❌ Could not read or empty ranking dataset: {ranking_path}")
        continue

    # basic expectations
    if "date" not in df.columns or "ticker" not in df.columns:
        print(f"❌ ranking dataset missing required columns 'date'/'ticker': {ranking_path}")
        continue

    # close_adj is compared against ma100 further down; validate it here so a
    # dataset without it is skipped like the other malformed inputs instead of
    # raising KeyError after the merge work has already been done.
    if "close_adj" not in df.columns:
        print(f"❌ ranking dataset missing required column 'close_adj': {ranking_path}")
        continue

    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Attach MA100 ticker by ticker: each group is left-joined on 'date' against
    # that ticker's MA100 frame, so ranking rows without a matching date keep NaN.
    rows = []
    missing_tickers = []

    for ticker, sub in df.groupby("ticker", sort=False):
        if ticker in ma100_map:
            merged = sub.merge(ma100_map[ticker], on="date", how="left")
        else:
            # No MA100 file for this ticker: keep its rows with ma100 = NaN.
            merged = sub.copy()
            merged["ma100"] = np.nan
            missing_tickers.append(ticker)
        rows.append(merged)

    # groupby skips rows whose ticker is null, so a dataset with only null
    # tickers yields no groups and pd.concat would raise on the empty list.
    if not rows:
        print(f"❌ No rows with a usable ticker in ranking dataset: {ranking_path}")
        continue

    df2 = pd.concat(rows, ignore_index=True)

    # The row count should be unchanged by the enrichment. It grows when an
    # MA100 file has duplicate dates and shrinks when ticker is null; both are
    # recorded in the validation log, and reported here so they are not missed.
    if len(df2) != len(df):
        print(
            f"⚠ Row count changed during MA100 merge for {lookback}: "
            f"{len(df)} → {len(df2)} (check duplicate MA100 dates / null tickers)"
        )

    # add boolean
    # if ma100 is NaN, above_ma100 becomes False (because comparison yields False)
    df2["above_ma100"] = df2["close_adj"] > df2["ma100"]

    # sort (optional but consistent)
    df2 = df2.sort_values(["date", "ticker"]).reset_index(drop=True)

    # output dirs: the "root" fallback job writes directly into the output roots,
    # lookback jobs get their own subdirectory.
    out_dir = OUTPUT_ROOT if lookback == "root" else os.path.join(OUTPUT_ROOT, lookback)
    ver_dir = VER_ROOT if lookback == "root" else os.path.join(VER_ROOT, lookback)
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(ver_dir, exist_ok=True)

    out_path = os.path.join(out_dir, OUT_FILENAME)
    df2.to_parquet(out_path, index=False)

    # validation log per lookback (single-row frame; also reused for the summary)
    val = pd.DataFrame([{
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "lookback": lookback,
        "ranking_file": ranking_path,
        "rows_in_ranking": int(df.shape[0]),
        "rows_after_merge": int(df2.shape[0]),
        "unique_tickers": int(df["ticker"].nunique()),
        # groupby yields each ticker once, so this list has no duplicates.
        "missing_ma100_tickers": int(len(missing_tickers)),
        "null_ma100_values": int(df2["ma100"].isna().sum()),
        "true_above_ma100": int(df2["above_ma100"].sum()),
        # Includes rows where ma100 is NaN (see the comparison note above).
        "false_above_ma100": int((~df2["above_ma100"]).sum()),
        "output_file": out_path,
    }])

    val_path = os.path.join(ver_dir, f"validation_{ts}.csv")
    val.to_csv(val_path, index=False)

    print(f"✔ Saved enriched universe → {out_path}")
    print(f"✔ Saved validation log   → {val_path}")

    summary_rows.append(val.iloc[0].to_dict())

# ============================================================
# 4) SAVE OVERALL SUMMARY LOG
# ============================================================

# Collect the per-lookback validation rows into one run-level CSV that sits
# directly under the verification root and shares the run timestamp.
summary_path = os.path.join(VER_ROOT, f"validation_summary_{ts}.csv")
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv(summary_path, index=False)

print(
    "\n=== COMPLETED ===",
    f"Saved summary validation log → {summary_path}",
    f"Processed datasets: {summary_df.shape[0]:,}",
    sep="\n",
)


=== Building Enriched Universes (Add MA100 + Above/Below MA100) per Lookback ===

Loading MA100 files...
Loaded MA100 for 1167 tickers.
Unreadable/invalid MA100 files: 0 tickers.

Found 5 ranking datasets to process.

----------------------------------------------------
Processing lookback: lookback_60D
Ranking file: ./9a-multiple_regression_ranking_dataset\lookback_60D\ranking_dataset.parquet
----------------------------------------------------
✔ Saved enriched universe → ./10a-multiple_100MA_enriched_universe\lookback_60D\enriched_universe_ma100.parquet
✔ Saved validation log   → ./system_verification/10a-multiple_100MA_enriched_universe\lookback_60D\validation_20251231-090606.csv

----------------------------------------------------
Processing lookback: lookback_90D
Ranking file: ./9a-multiple_regression_ranking_dataset\lookback_90D\ranking_dataset.parquet
----------------------------------------------------
✔ Saved enriched universe → ./10a-multiple_100MA_enriched_universe\lookback