In [1]:
#!/usr/bin/env python3
import os
import pandas as pd
import numpy as np
from datetime import datetime
from numba import njit, prange

"""
Purpose:
    Compute rolling log-price regressions (slope + R²) for multiple lookback windows
    and save outputs bucketed into subdirectories per lookback.

Outputs:
    OUTPUT_ROOT/lookback_XXD/{TICKER}.parquet
    VER_ROOT/regression_multi_validation-<timestamp>.csv  (one combined report covering all lookbacks)
"""

# ============================================================
# CONFIG
# ============================================================

# Directory that is scanned for per-ticker *.parquet input files
INPUT_DIR = "./3-adjusted_All_Prices_OHLC"

# Root folders; results are bucketed into per-lookback subfolders beneath them
OUTPUT_ROOT = "./7a-multiple_regressions_adjusted_all_prices"
VER_ROOT = "./system_verification/7a-multiple_regressions_adjusted_all_prices"

for _root in (OUTPUT_ROOT, VER_ROOT):
    os.makedirs(_root, exist_ok=True)

# Lookback window lengths, in trading days
WINDOWS = [60, 90, 120, 180, 252]

TRADING_DAYS_PER_YEAR = 252  # annualization factor
REQUIRED = {"date", "close_adj"}  # columns every input file must provide

print(f"\nLoading adjusted OHLC files from: {INPUT_DIR}")
files = sorted(
    name for name in os.listdir(INPUT_DIR) if name.endswith(".parquet")
)
print(f"Found {len(files)} tickers.\n")

# ============================================================
# VALIDATION STORAGE
# ============================================================

# Accumulates one dict per validation issue; written out as a CSV at the end.
validation_rows = list()


def add_validation(ticker, window, issue, detail=""):
    """
    Record a validation issue and echo it to stdout.

    ticker : identifier of the affected ticker
    window : lookback length the issue relates to (stored as int);
             callers pass -1 for issues that are not window-specific
    issue  : short issue code
    detail : optional free-form description
    """
    validation_rows.append(
        dict(ticker=ticker, window=int(window), issue=issue, detail=detail)
    )
    print(f"⚠ VALIDATION: {ticker} | W={window} | {issue} | {detail}")

# ============================================================
# NUMBA REGRESSION ENGINE (window-parameterized)
# ============================================================

@njit
def fast_regression_window(y, x, x_mean, var_x, W):
    """
    Computes slope + R² for one window of log prices.

    Regression: log(price) ~ beta0 + beta1 * time_index
    beta1 is in units of log-price per day (≈ avg daily log-return).

    Parameters:
        y      : 1D array with the window's log prices (W values expected).
        x      : 1D array with the window's time index (W values expected).
        x_mean : mean of x.
        var_x  : variance of x; expected to use divisor W so that it matches
                 the divisor applied to the covariance below.
        W      : number of observations in the window.

    Returns:
        (beta1, r2). Both are NaN when y contains NaN/inf or when var_x is
        not strictly positive. r2 alone is NaN when y has no variation
        (ss_tot == 0).
    """
    # A time axis without spread (e.g. W == 1, or a NaN variance) has no
    # defined slope; bail out instead of dividing by zero further down.
    if not (var_x > 0.0):
        return np.nan, np.nan

    # reject windows containing missing / non-finite log prices
    for v in y:
        if np.isnan(v) or np.isinf(v):
            return np.nan, np.nan

    y_mean = np.mean(y)

    # covariance of x and y (divisor W)
    cov_xy = 0.0
    for i in range(W):
        cov_xy += (x[i] - x_mean) * (y[i] - y_mean)
    cov_xy /= W

    beta1 = cov_xy / var_x
    beta0 = y_mean - beta1 * x_mean

    # R² = 1 - SS_res / SS_tot
    ss_res = 0.0
    ss_tot = 0.0
    for i in range(W):
        y_hat = beta0 + beta1 * x[i]
        ss_res += (y[i] - y_hat) ** 2
        ss_tot += (y[i] - y_mean) ** 2

    # a perfectly flat window has no variance to explain
    if ss_tot > 0:
        r2 = 1.0 - ss_res / ss_tot
    else:
        r2 = np.nan
    return beta1, r2

@njit(parallel=True)
def run_rolling_regressions(log_prices, W, x, x_mean, var_x):
    """
    Rolling regression of log prices over a trailing window of W points.

    log_prices: 1D array of log-adjusted prices for this ticker.
    W, x, x_mean, var_x: forwarded unchanged to fast_regression_window.

    Alignment:
    - the result stored at index t comes from the window that ends at t
      (inclusive), i.e. log_prices[t-W+1 : t+1]
    - indices before W-1 have no full window and stay NaN.

    Returns (slopes_daily, r2_out), two arrays of the same length as
    log_prices.
    """
    n_obs = len(log_prices)
    slopes_daily = np.full(n_obs, np.nan)
    r2_out = np.full(n_obs, np.nan)

    # each iteration writes only its own output slot
    for end in prange(W - 1, n_obs):
        start = end - W + 1
        fit = fast_regression_window(
            log_prices[start : end + 1], x, x_mean, var_x, W
        )
        slopes_daily[end] = fit[0]
        r2_out[end] = fit[1]

    return slopes_daily, r2_out

# ============================================================
# MAIN LOGIC
# ============================================================

# Per-lookback setup does not depend on the ticker, so it is prepared once
# here instead of being repeated for every (ticker, window) pair.
window_setups = []
for W in WINDOWS:
    out_dir = os.path.join(OUTPUT_ROOT, f"lookback_{W}D")
    # Created to keep the existing directory layout; this loop itself does
    # not write any file into it.
    ver_dir = os.path.join(VER_ROOT, f"lookback_{W}D")
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(ver_dir, exist_ok=True)

    # x-axis for this window (0..W-1) and its moments
    x = np.arange(W, dtype=np.float64)
    x_mean = x.mean()
    var_x = np.mean((x - x_mean) ** 2)

    window_setups.append((W, out_dir, x, x_mean, var_x))

for file in files:
    # Strip only the extension; str.replace would also remove ".parquet"
    # occurring elsewhere in the file name.
    ticker = os.path.splitext(file)[0]
    in_path = os.path.join(INPUT_DIR, file)

    print(f"Processing {ticker}...")

    try:
        # -----------------------------
        # Load & validate (once)
        # -----------------------------
        df = pd.read_parquet(in_path)

        if not REQUIRED.issubset(df.columns):
            add_validation(ticker, -1, "missing_columns", str(REQUIRED - set(df.columns)))
            continue

        df["date"] = pd.to_datetime(df["date"])
        df = df.sort_values("date").reset_index(drop=True)

        # Non-positive prices are reported but not fatal: they are turned
        # into NaN below, which makes every window touching them NaN.
        nonpositive = df["close_adj"] <= 0
        if nonpositive.any():
            add_validation(
                ticker, -1, "nonpositive_close_adj",
                f"{nonpositive.sum()} rows"
            )

        # Prepare log prices once
        px = df["close_adj"].astype(float).values
        px = np.where(px > 0, px, np.nan)  # guard: log() is undefined for <= 0
        log_px = np.log(px)

        n = len(df)

        # -----------------------------
        # Compute each lookback window
        # -----------------------------
        for W, out_dir, x, x_mean, var_x in window_setups:
            out_path = os.path.join(out_dir, f"{ticker}.parquet")

            if n < W:
                last_date = df["date"].max().date()
                add_validation(
                    ticker, W, "insufficient_history_for_regression",
                    f"{n} rows, last={last_date}"
                )
                continue

            slopes_daily, r2 = run_rolling_regressions(log_px, W, x, x_mean, var_x)

            # annualize slope: slope_daily is avg daily log-return
            slopes_annual = np.expm1(slopes_daily * TRADING_DAYS_PER_YEAR)

            df_out = df.copy()
            df_out["window"]       = int(W)
            df_out["slope_daily"]  = slopes_daily
            df_out["slope_annual"] = slopes_annual
            df_out["r2"]           = r2

            df_out.to_parquet(out_path, index=False)

        print(f"✔ {ticker}: done ({len(WINDOWS)} lookbacks)\n")

    except Exception as e:
        # Per-ticker boundary: record the failure and move on to the next
        # file. window=-1 indicates "not window-specific".
        add_validation(ticker, -1, "processing_error", str(e))
        print(f"❌ ERROR for {ticker}: {e}\n")

# ============================================================
# SAVE VALIDATION REPORT (single combined file)
# ============================================================

# Collect every recorded issue into one table and persist it under VER_ROOT
validation_df = pd.DataFrame(validation_rows)

run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
val_file = os.path.join(VER_ROOT, f"regression_multi_validation-{run_stamp}.csv")
validation_df.to_csv(val_file, index=False)

# Console summary
rule = "===================================================="
print(f"\n{rule}")
print(" MULTI-LOOKBACK REGRESSION VALIDATION SUMMARY")
print(rule)
print("No validation issues detected." if validation_df.empty else validation_df)
print("\nValidation log saved to:")
print(val_file)
print(f"{rule}\n")



Loading adjusted OHLC files from: ./3-adjusted_All_Prices_OHLC
Found 1167 tickers.

Processing A...
✔ A: done (5 lookbacks)

Processing AAL...
✔ AAL: done (5 lookbacks)

Processing AAMRQ...
✔ AAMRQ: done (5 lookbacks)

Processing AAP...
✔ AAP: done (5 lookbacks)

Processing AAPL...
✔ AAPL: done (5 lookbacks)

Processing ABBV...
✔ ABBV: done (5 lookbacks)

Processing ABI1...
✔ ABI1: done (5 lookbacks)

Processing ABKFQ...
✔ ABKFQ: done (5 lookbacks)

Processing ABMD...
✔ ABMD: done (5 lookbacks)

Processing ABNB...
✔ ABNB: done (5 lookbacks)

Processing ABS...
✔ ABS: done (5 lookbacks)

Processing ABT...
✔ ABT: done (5 lookbacks)

Processing ACAS...
✔ ACAS: done (5 lookbacks)

Processing ACGL...
✔ ACGL: done (5 lookbacks)

Processing ACKHQ...
✔ ACKHQ: done (5 lookbacks)

Processing ACN...
✔ ACN: done (5 lookbacks)

Processing ACS...
✔ ACS: done (5 lookbacks)

Processing ACV1...
✔ ACV1: done (5 lookbacks)

Processing ADBE...
✔ ADBE: done (5 lookbacks)

Processing ADCT1...
✔ ADCT1: done 