In [1]:
#!/usr/bin/env python3
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime

"""
Purpose:
    Build ranking datasets (OHLC + volume + slope_adj = slope_annual * r2)
    for multiple regression lookback periods.

Assumptions:
    - REGRESSION_ROOT contains subdirectories like: lookback_60D, lookback_90D, ...
    - Each subdirectory contains per-ticker parquet files with columns:
        date, slope_annual, r2   (others allowed)

Outputs (per lookback):
    OUTPUT_ROOT/lookback_XXD/ranking_dataset.parquet
    VER_ROOT/lookback_XXD/ranking_validation-<timestamp>.csv
"""

# ============================================================
# CONFIG
# ============================================================

# --- Inputs -------------------------------------------------
# Per-ticker adjusted OHLC parquet files (one file per ticker).
ADJ_PRICE_DIR = "./3-adjusted_All_Prices_OHLC"
# Root holding one lookback_<N>D subdirectory per regression window.
REGRESSION_ROOT = "./7a-multiple_regressions_adjusted_all_prices"

# --- Outputs ------------------------------------------------
# Ranking datasets are written here, one subdirectory per lookback.
OUTPUT_ROOT = "./9a-multiple_regression_ranking_dataset"
# Validation logs (CSV) are written here, one subdirectory per lookback.
VER_ROOT = "./system_verification/9a-multiple_regression_ranking_dataset"

# Columns a price file must provide to be usable; volume is optional.
REQUIRED_PX = {
    "date",
    "open_adj",
    "high_adj",
    "low_adj",
    "close_adj",
}

# Create both output roots up front; existing directories are left alone.
os.makedirs(OUTPUT_ROOT, exist_ok=True)
os.makedirs(VER_ROOT, exist_ok=True)

print("=== BUILDING RANKING DATASETS (OHLC + slope × r2) FOR EACH LOOKBACK ===")

# ============================================================
# HELPERS
# ============================================================

def discover_lookback_dirs(root: str):
    """
    List the lookback directories directly under `root`.

    Only entries that are directories and whose name is exactly
    'lookback_<digits>D' (for example 'lookback_90D') are kept.

    Returns:
        The matching directory names (not full paths), ordered by the
        integer window length embedded in the name, smallest first.
    """
    window_re = re.compile(r"^lookback_(\d+)D$")
    found = []
    for entry in os.listdir(root):
        hit = window_re.match(entry)
        # Skip anything that is not a correctly named directory.
        if hit is None or not os.path.isdir(os.path.join(root, entry)):
            continue
        found.append((int(hit.group(1)), entry))

    # Order by the numeric window only, so 90 sorts before 120.
    ordered = sorted(found, key=lambda pair: pair[0])
    return [entry for _window, entry in ordered]

def log_issue(validation_rows, ticker, lookback_dir, issue, detail=""):
    """
    Record one validation problem and echo it to stdout.

    A dict with the keys timestamp, lookback_dir, ticker, issue and detail
    is appended to `validation_rows` (mutated in place). The timestamp is
    the current local time in ISO format, truncated to seconds.
    Returns None.
    """
    stamp = datetime.now().isoformat(timespec="seconds")
    record = {
        "timestamp": stamp,
        "lookback_dir": lookback_dir,
        "ticker": ticker,
        "issue": issue,
        "detail": detail,
    }
    validation_rows.append(record)

    message = f"⚠ {lookback_dir} | {ticker} | {issue} | {detail}"
    print(message)

# ============================================================
# DISCOVER TICKERS
# ============================================================

# A ticker symbol is the price file name with its trailing ".parquet"
# removed. Only the suffix is stripped (not every occurrence of the
# substring), because later steps rebuild the path as f"{ticker}.parquet"
# and must arrive back at the same file.
tickers = sorted(
    fname[: -len(".parquet")]
    for fname in os.listdir(ADJ_PRICE_DIR)
    if fname.endswith(".parquet")
)
print(f"Found {len(tickers)} tickers with adjusted OHLC data.")

# ============================================================
# DISCOVER LOOKBACK DIRECTORIES
# ============================================================

lookback_dirs = discover_lookback_dirs(REGRESSION_ROOT)

# Fail fast: without at least one lookback folder there is nothing to build.
if not lookback_dirs:
    raise ValueError(
        f"No lookback_*D subdirectories found under {REGRESSION_ROOT}. "
        "Expected e.g. lookback_60D, lookback_90D, etc."
    )

# Echo the discovered folders, one per line, in processing order.
print(f"Found {len(lookback_dirs)} regression lookback folders:")
print("\n".join(f"  - {folder}" for folder in lookback_dirs))

# ============================================================
# OPTIONAL: CACHE PRICE DATA ONCE (faster; reasonable for SP500)
# ============================================================

print("\nLoading adjusted OHLC into memory (one-time cache)...")

# ticker -> DataFrame[date, open_adj, high_adj, low_adj, close_adj, volume],
# or None when the price file could not be read or lacks a required column.
# None entries are logged again per lookback by the main loop; the cause is
# reported here because this is the only place it is known.
px_cache = {}
for ticker in tickers:
    f_px = os.path.join(ADJ_PRICE_DIR, f"{ticker}.parquet")
    try:
        px = pd.read_parquet(f_px)

        missing_px = REQUIRED_PX - set(px.columns)
        if missing_px:
            print(f"⚠ price cache | {ticker} | missing_required_px_columns | {sorted(missing_px)}")
            px_cache[ticker] = None
            continue

        px["date"] = pd.to_datetime(px["date"])
        px = px.sort_values("date").reset_index(drop=True)

        # Volume is optional: add it as NaN so every cached frame has the
        # same schema downstream.
        if "volume" not in px.columns:
            px["volume"] = np.nan

        keep_cols = ["date", "open_adj", "high_adj", "low_adj", "close_adj", "volume"]
        px_cache[ticker] = px[keep_cols].copy()
    except Exception as exc:
        # Best effort: one unreadable file must not stop the run, but the
        # reason is reported instead of being silently discarded.
        print(f"⚠ price cache | {ticker} | cannot_read_price_file | {type(exc).__name__}: {exc}")
        px_cache[ticker] = None

print("Price cache ready.\n")

# ============================================================
# MAIN: BUILD ONE RANKING DATASET PER LOOKBACK
# ============================================================

# Column order of the validation CSV; same keys that log_issue() writes.
# Passing it explicitly keeps the header present even when no issue was logged.
_VALIDATION_COLUMNS = ["timestamp", "lookback_dir", "ticker", "issue", "detail"]

# Columns taken from each per-ticker regression file.
_REGRESSION_COLUMNS = ["date", "slope_annual", "r2"]


def _fill_regression_nan(df):
    """Set slope_annual, r2 and slope_adj to NaN on `df` and return it."""
    for col in ("slope_annual", "r2", "slope_adj"):
        df[col] = np.nan
    return df


def _attach_regression(df, ticker, lb_dir, regression_dir, validation_rows):
    """
    Left-join one ticker's regression output onto its price frame.

    Adds slope_annual, r2 and slope_adj (= slope_annual * r2) to `df`.
    If the regression file is missing, unreadable, or lacks a required
    column, the problem is recorded through log_issue() and the three
    columns are filled with NaN so the ticker still appears in the output.

    Returns the resulting DataFrame.
    """
    f_reg = os.path.join(regression_dir, f"{ticker}.parquet")

    if not os.path.exists(f_reg):
        log_issue(validation_rows, ticker, lb_dir, "missing_regression_file", f_reg)
        return _fill_regression_nan(df)

    try:
        rg = pd.read_parquet(f_reg)

        # Expect at least date + slope_annual + r2
        missing = set(_REGRESSION_COLUMNS) - set(rg.columns)
        if missing:
            log_issue(validation_rows, ticker, lb_dir, "regression_missing_columns", str(missing))
            return _fill_regression_nan(df)

        rg = rg[_REGRESSION_COLUMNS].copy()
        rg["date"] = pd.to_datetime(rg["date"])

        # Left join keeps every price row; dates without a regression
        # value end up with NaN in the regression columns.
        df = df.merge(rg, on="date", how="left")
        df["slope_adj"] = df["slope_annual"] * df["r2"]
        return df

    except Exception as e:
        # Best effort per ticker: record the failure and keep going.
        log_issue(validation_rows, ticker, lb_dir, "cannot_read_regression_file", str(e))
        return _fill_regression_nan(df)


for lb_dir in lookback_dirs:
    print("\n====================================================")
    print(f"Building ranking dataset for: {lb_dir}")
    print("====================================================")

    regression_dir = os.path.join(REGRESSION_ROOT, lb_dir)
    out_dir = os.path.join(OUTPUT_ROOT, lb_dir)
    ver_dir = os.path.join(VER_ROOT, lb_dir)
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(ver_dir, exist_ok=True)

    validation_rows = []
    rows = []

    for ticker in tickers:
        base_px = px_cache.get(ticker)

        if base_px is None:
            f_px = os.path.join(ADJ_PRICE_DIR, f"{ticker}.parquet")
            log_issue(validation_rows, ticker, lb_dir, "cannot_read_or_missing_required_px_columns", f_px)
            continue

        # Work on a copy so the cached price frame is reused unchanged
        # for the next lookback.
        df = _attach_regression(base_px.copy(), ticker, lb_dir, regression_dir, validation_rows)

        # Add ticker
        df["ticker"] = ticker
        rows.append(df)

    # ============================================================
    # SAVE VALIDATION LOG
    # ============================================================
    # Written before the "no rows" check: when every ticker failed, this
    # log is the only record of why.

    val_df = pd.DataFrame(validation_rows, columns=_VALIDATION_COLUMNS)
    val_path = os.path.join(ver_dir, f"ranking_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv")
    val_df.to_csv(val_path, index=False)

    if not rows:
        print(f"❌ No rows produced for {lb_dir}. Skipping save.")
        print(f"Saved validation log  → {val_path}")
        continue

    # ============================================================
    # CONCAT + SORT FINAL DATA
    # ============================================================

    ranking_df = pd.concat(rows, ignore_index=True)
    # Within each date, the highest slope_adj comes first.
    ranking_df = ranking_df.sort_values(["date", "slope_adj"], ascending=[True, False])

    # ============================================================
    # SAVE RANKING DATASET
    # ============================================================

    ranking_path = os.path.join(out_dir, "ranking_dataset.parquet")
    ranking_df.to_parquet(ranking_path, index=False)

    print(f"\n=== COMPLETED: {lb_dir} ===")
    print(f"Saved ranking dataset → {ranking_path}")
    print(f"Saved validation log  → {val_path}")
    print(f"Rows: {len(ranking_df):,}")

print("\n✅ All lookbacks processed.")


=== BUILDING RANKING DATASETS (OHLC + slope × r2) FOR EACH LOOKBACK ===
Found 1167 tickers with adjusted OHLC data.
Found 5 regression lookback folders:
  - lookback_60D
  - lookback_90D
  - lookback_120D
  - lookback_180D
  - lookback_252D

Loading adjusted OHLC into memory (one-time cache)...
Price cache ready.


Building ranking dataset for: lookback_60D
⚠ lookback_60D | BBI1 | missing_regression_file | ./7a-multiple_regressions_adjusted_all_prices\lookback_60D\BBI1.parquet
⚠ lookback_60D | ITT1 | missing_regression_file | ./7a-multiple_regressions_adjusted_all_prices\lookback_60D\ITT1.parquet
⚠ lookback_60D | Q | missing_regression_file | ./7a-multiple_regressions_adjusted_all_prices\lookback_60D\Q.parquet
⚠ lookback_60D | SOLS | missing_regression_file | ./7a-multiple_regressions_adjusted_all_prices\lookback_60D\SOLS.parquet

=== COMPLETED: lookback_60D ===
Saved ranking dataset → ./9a-multiple_regression_ranking_dataset\lookback_60D\ranking_dataset.parquet
Saved validation log  →