In [None]:
"""
Runs 90-day rolling log-price regressions on adjusted OHLC parquet files, using Numba to compute
daily and annualized slopes plus R² for each window. Validates required columns and positive closes,
skips tickers with insufficient history, saves per-ticker regression outputs to
./7-90Day_exp_regression_adjusted_all_prices, and logs validation issues to a timestamped CSV in
./system_verification/7-90Day_exp_regression_adjusted_all_prices.
"""



import os
import pandas as pd
import numpy as np
from datetime import datetime
from numba import njit, prange

# ============================================================
# CONFIG
# ============================================================

# Pipeline stage directories (relative to the current working directory)
INPUT_DIR  = "./3-adjusted_All_Prices_OHLC"
OUTPUT_DIR = "./7-90Day_exp_regression_adjusted_all_prices"
VER_DIR    = "./system_verification/7-90Day_exp_regression_adjusted_all_prices"

# Regression parameters
WINDOW = 90                    # rows per rolling regression window
TRADING_DAYS_PER_YEAR = 252    # multiplier used when annualizing the daily slope

# Columns every input file must provide
REQUIRED = {"date", "close_adj"}

# Make sure both output locations exist before any ticker is processed
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(VER_DIR, exist_ok=True)

# Discover the input files; each *.parquet file name is treated as one ticker
print(f"\nLoading adjusted OHLC files from: {INPUT_DIR}")
files = sorted(name for name in os.listdir(INPUT_DIR) if name.endswith(".parquet"))
print(f"Found {len(files)} tickers.\n")

# ============================================================
# VALIDATION STORAGE
# ============================================================

# Accumulates one dict per validation finding across the whole run
validation_rows = []


def add_validation(ticker, issue, detail=""):
    """
    Record a validation finding and echo it to stdout.

    Parameters
    ----------
    ticker : identifier of the ticker the finding belongs to.
    issue  : short label for the kind of problem.
    detail : optional free-form extra information (defaults to "").

    Appends {"ticker", "issue", "detail"} to the module-level
    ``validation_rows`` list; returns nothing.
    """
    validation_rows.append(dict(ticker=ticker, issue=issue, detail=detail))
    print(f"⚠ VALIDATION: {ticker} | {issue} | {detail}")

# ============================================================
# Numba Regression Engine
# ============================================================

# Time index shared by every window: 0, 1, ..., WINDOW-1
x = np.arange(WINDOW, dtype=np.float64)
x_mean = np.mean(x)
# Population variance of the time index (mean of squared deviations, i.e. divided by WINDOW)
var_x = ((x - x_mean) ** 2).mean()

@njit
def fast_regression_window(y):
    """
    Least-squares fit of one window of log prices against the module-level
    time index ``x`` (0 .. WINDOW-1).

    Model: log(price) = beta0 + beta1 * time_index, so beta1 is the change
    in log price per step of the index.

    Parameters
    ----------
    y : 1D float array; the loops below read exactly WINDOW elements, so it
        must have length WINDOW.

    Returns
    -------
    (beta1, r2) : fitted slope and coefficient of determination.
        Both are NaN if any element of ``y`` is NaN or infinite; r2 alone is
        NaN when ``y`` has zero total variation around its mean.
    """
    # A single NaN/inf would contaminate every sum below, so bail out first.
    for k in range(len(y)):
        if not np.isfinite(y[k]):
            return np.nan, np.nan

    mean_y = np.mean(y)

    # Population covariance between the time index and the log prices
    cross = 0.0
    for k in range(WINDOW):
        cross += (x[k] - x_mean) * (y[k] - mean_y)
    cross /= WINDOW

    slope = cross / var_x
    intercept = mean_y - slope * x_mean

    # Residual and total sums of squares for R²
    resid_sq = 0.0
    total_sq = 0.0
    for k in range(WINDOW):
        fitted = intercept + slope * x[k]
        resid_sq += (y[k] - fitted) ** 2
        total_sq += (y[k] - mean_y) ** 2

    # Flat window (no variation): R² is undefined, so report NaN
    if total_sq > 0:
        fit_quality = 1.0 - resid_sq / total_sq
    else:
        fit_quality = np.nan

    return slope, fit_quality

@njit(parallel=True)
def run_rolling_regressions(log_prices):
    """
    Apply ``fast_regression_window`` to every trailing window of a series.

    Parameters
    ----------
    log_prices : 1D array of log-adjusted prices for one ticker.

    Returns
    -------
    (slopes_daily, r2_out) : two arrays with the same length as
        ``log_prices``.

    Alignment
    ---------
    - The value stored at index t comes from the window that ENDS at t
      (inclusive): log_prices[t-WINDOW+1 : t+1].
    - Indices below WINDOW-1 have no complete window and stay NaN.
    """
    n_obs = len(log_prices)
    slopes_daily = np.full(n_obs, np.nan)
    r2_out = np.full(n_obs, np.nan)

    # Each iteration writes only to its own index, so they can run in parallel.
    for end in prange(WINDOW - 1, n_obs):
        start = end - WINDOW + 1
        beta, fit = fast_regression_window(log_prices[start : end + 1])
        slopes_daily[end] = beta
        r2_out[end] = fit

    return slopes_daily, r2_out

# ============================================================
# MAIN LOGIC
# ============================================================

# For each input file: load, validate, compute rolling regressions, and write
# the input columns plus slope_daily / slope_annual / r2 to OUTPUT_DIR.
# Any exception for a ticker is logged as a validation row and the loop moves on.
for file in files:
    # Strip only the trailing extension; str.replace would also remove any
    # earlier ".parquet" occurrence inside the file name.
    ticker = os.path.splitext(file)[0]
    in_path = os.path.join(INPUT_DIR, file)
    out_path = os.path.join(OUTPUT_DIR, f"{ticker}.parquet")

    print(f"Processing {ticker}...")

    try:
        # -----------------------------
        # Load & validate
        # -----------------------------
        df = pd.read_parquet(in_path)

        missing = REQUIRED - set(df.columns)
        if missing:
            add_validation(ticker, "missing_columns", str(missing))
            continue

        # Windows are positional, so rows must be in chronological order
        df["date"] = pd.to_datetime(df["date"])
        df = df.sort_values("date").reset_index(drop=True)

        # Convert once; reused for both validation and the regression input
        close = df["close_adj"].astype(float)

        # Logged but not fatal: these rows become NaN below
        nonpositive = close <= 0
        if nonpositive.any():
            add_validation(
                ticker,
                "nonpositive_close_adj",
                f"{nonpositive.sum()} rows"
            )

        # NaN/inf closes pass the <= 0 check above but still make every
        # window that contains them return NaN, so report them as well.
        nonfinite = ~np.isfinite(close)
        if nonfinite.any():
            add_validation(
                ticker,
                "nonfinite_close_adj",
                f"{nonfinite.sum()} rows"
            )

        # Fewer rows than one window: nothing to compute, no output written
        if len(df) < WINDOW:
            last_date = df["date"].max().date()
            add_validation(
                ticker,
                "insufficient_history_for_regression",
                f"{len(df)} rows, last={last_date}"
            )
            continue

        # -----------------------------
        # Prepare data: log prices
        # -----------------------------
        px = close.to_numpy()
        # Guard against <=0 -> log invalid; NaN compares False and stays NaN
        px = np.where(px > 0, px, np.nan)
        log_px = np.log(px)

        # -----------------------------
        # Run Numba rolling regressions
        # -----------------------------
        slopes_daily, r2 = run_rolling_regressions(log_px)

        # Annualize: compound the daily log slope over a trading year and
        # convert to a simple return, exp(slope * 252) - 1
        slopes_annual = np.expm1(slopes_daily * TRADING_DAYS_PER_YEAR)

        # -----------------------------
        # Write output file
        # -----------------------------
        df_out = df.copy()
        df_out["slope_daily"]  = slopes_daily
        df_out["slope_annual"] = slopes_annual
        df_out["r2"]           = r2

        df_out.to_parquet(out_path, index=False)
        print(f"✔ {ticker}: saved regression → {out_path}\n")

    except Exception as e:
        # Best effort: record the failure and keep processing other tickers
        add_validation(ticker, "processing_error", str(e))
        print(f"❌ ERROR for {ticker}: {e}\n")

# ============================================================
# SAVE VALIDATION REPORT
# ============================================================

# Build the report with an explicit column list so the CSV keeps its
# "ticker,issue,detail" header even when no issues were recorded; without it
# an empty run produces a frame with no columns and a header-less file.
validation_df = pd.DataFrame(validation_rows, columns=["ticker", "issue", "detail"])

# Timestamped file name, so each run writes a new report file
val_file = os.path.join(
    VER_DIR,
    f"regression90_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)

validation_df.to_csv(val_file, index=False)

print("\n====================================================")
print(" 90-DAY REGRESSION VALIDATION SUMMARY")
print("====================================================")

if validation_df.empty:
    print("No validation issues detected.")
else:
    print(validation_df)

print("\nValidation log saved to:")
print(val_file)
print("====================================================\n")



Loading adjusted OHLC files from: ./3-adjusted_All_Prices_OHLC
Found 1167 tickers.

Processing A...
✔ A: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\A.parquet

Processing AAL...
✔ AAL: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\AAL.parquet

Processing AAMRQ...
✔ AAMRQ: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\AAMRQ.parquet

Processing AAP...
✔ AAP: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\AAP.parquet

Processing AAPL...
✔ AAPL: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\AAPL.parquet

Processing ABBV...
✔ ABBV: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\ABBV.parquet

Processing ABI1...
✔ ABI1: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\ABI1.parquet

Processing ABKFQ...
✔ ABKFQ: saved regression → ./7-90Day_exp_regression_adjusted_all_prices\ABKFQ.parquet

Processing ABMD...
✔ ABMD: saved regression → ./7-90Day_exp_regression_adjusted_all_

In [2]:
# Spot check: reload the ABBV file from the regression output directory.
# The path is derived from OUTPUT_DIR so it follows the config if that changes.
df_abbv = pd.read_parquet(os.path.join(OUTPUT_DIR, "ABBV.parquet"))

# Basic overview
print(f"Rows: {len(df_abbv)}, Columns: {len(df_abbv.columns)}")
print(f"\nColumn names: {list(df_abbv.columns)}")
print(f"\nData types:\n{df_abbv.dtypes}")
# WINDOW + 1 rows: extends just past index WINDOW - 1, the first row that has
# a complete regression window, so the NaN -> value transition is visible.
print(f"\nFirst few rows:\n{df_abbv.head(WINDOW + 1)}")
print(f"\nLast few rows:\n{df_abbv.tail()}")


Rows: 3269, Columns: 18

Column names: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj', 'closeunadj', 'lastupdated', 'adj_factor', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'slope_daily', 'slope_annual', 'r2']

Data types:
ticker                  object
date            datetime64[ns]
open                   float64
high                   float64
low                    float64
close                  float64
volume                 float64
closeadj               float64
closeunadj             float64
lastupdated     datetime64[ms]
adj_factor             float64
open_adj               float64
high_adj               float64
low_adj                float64
close_adj              float64
slope_daily            float64
slope_annual           float64
r2                     float64
dtype: object

First few rows:
   ticker       date   open   high    low   close      volume  closeadj  \
0    ABBV 2013-01-02  34.92  35.40  34.10  35.120  13768000.0    20.865   
1    ABB

In [3]:
# Write the ABBV regression frame to a CSV file in the current working directory.
# The path is defined once so the file written and the message cannot drift apart.
abbv_csv_path = "ABBV_regression.csv"
df_abbv.to_csv(abbv_csv_path, index=False)
print(f"✔ Exported df_abbv to {abbv_csv_path}")

✔ Exported df_abbv to ABBV_regression.csv
