In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================

# Folder holding one adjusted-OHLC parquet file per ticker (read only).
INPUT_DIR = "./3-adjusted_All_Prices_OHLC"
# Folder receiving one parquet file per ticker with the MA100 columns added.
OUTPUT_DIR = "./5-100D_MA_adjusted_all_prices"
# Folder receiving the timestamped validation report.
VER_DIR = "./system_verification/5-100D_MA_adjusted_all_prices"

# Both destination folders are created up front; existing folders are reused.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(VER_DIR, exist_ok=True)

print(f"\nLoading adjusted OHLC files from: {INPUT_DIR}")
# Keep only parquet files and process them in alphabetical order.
files = [name for name in os.listdir(INPUT_DIR) if name.endswith(".parquet")]
files.sort()
print(f"Found {len(files)} tickers.\n")

# Columns every input frame must provide before an MA100 can be computed.
REQUIRED = {"date", "close_adj"}

# ============================================================
# VALIDATION STORAGE
# ============================================================

# One dict per finding; collected across all tickers during the run.
validation_rows = []


def add_validation(ticker, issue, detail=""):
    """Record a validation finding and echo it to stdout.

    Args:
        ticker: Identifier of the instrument the finding belongs to.
        issue: Short machine-readable name of the problem.
        detail: Optional free-text description; defaults to an empty string.

    The finding is appended to the module-level ``validation_rows`` list as a
    dict with the keys ``ticker``, ``issue`` and ``detail``.
    """
    validation_rows.append(dict(ticker=ticker, issue=issue, detail=detail))
    print(f"⚠ VALIDATION: {ticker} | {issue} | {detail}")


# ============================================================
# MA100 CALCULATION
# ============================================================

def compute_ma100(df, ticker):
    """Validate one ticker's adjusted prices and add a 100-row moving average.

    Args:
        df: Frame that must contain at least the columns named in ``REQUIRED``
            (``date`` and ``close_adj``). It is not modified; all work is done
            on a copy.
        ticker: Identifier used when reporting validation findings.

    Returns:
        ``None`` when a required column is missing. Otherwise a new frame,
        ordered by ``date`` with a fresh 0..n-1 index, carrying two extra
        columns: ``ma100`` (rolling mean of ``close_adj`` over 100 rows, NaN
        until a full window exists) and ``valid_ma100`` (True where ``ma100``
        is not NaN).

    Every problem found along the way is reported through ``add_validation``;
    none of them prevents the frame from being returned.
    """

    # Ensure required columns exist
    if not REQUIRED.issubset(df.columns):
        missing = REQUIRED - set(df.columns)
        add_validation(ticker, "missing_required_columns", f"missing: {missing}")
        return None

    # Work on a copy with a clean 0..n-1 index. This keeps the caller's frame
    # untouched and guarantees that index labels equal row positions, which
    # the gap report below depends on (an input with a custom index used to
    # break the "previous row" lookup).
    df = df.reset_index(drop=True)

    # Date handling
    df["date"] = pd.to_datetime(df["date"])
    if not df["date"].is_monotonic_increasing:
        add_validation(ticker, "non_monotonic_dates")
        df = df.sort_values("date").reset_index(drop=True)

    # ============================================================
    # VALIDATION: LARGE GAPS IN TRADING CALENDAR
    # ============================================================

    gaps = df["date"].diff().dt.days
    if (gaps > 10).any():
        # The first diff is always NaN, so the widest gap sits at position >= 1
        # and the row before it always exists.
        pos = int(gaps.idxmax())
        gap_start = df["date"].iloc[pos - 1].date()
        gap_end = df["date"].iloc[pos].date()
        add_validation(
            ticker,
            "large_date_gap",
            f"max gap = {int(gaps.max())} days (from {gap_start} to {gap_end})"
        )

    # ============================================================
    # VALIDATION: NON-POSITIVE ADJUSTED CLOSES
    # ============================================================

    bad_close = df[df["close_adj"] <= 0]
    if not bad_close.empty:
        dates_str = ", ".join(str(d.date()) for d in bad_close["date"])
        add_validation(
            ticker,
            "nonpositive_close_adj",
            f"{len(bad_close)} rows → {dates_str}"
        )

    # ============================================================
    # COMPUTE 100-ROW MA (OR FLAG INSUFFICIENT HISTORY)
    # ============================================================

    if len(df) < 100:
        last_dt = df["date"].max().date()
        add_validation(
            ticker,
            "insufficient_history_for_ma100",
            f"only {len(df)} rows, last date {last_dt}"
        )
        # A full window never forms, so the average is NaN on every row.
        df["ma100"] = np.nan
        df["valid_ma100"] = False
    else:
        df["ma100"] = df["close_adj"].rolling(window=100, min_periods=100).mean()
        df["valid_ma100"] = df["ma100"].notna()

        # ========================================================
        # VALIDATION: MA100 NEGATIVE?
        # ========================================================

        bad_ma = df[df["ma100"] < 0]
        if not bad_ma.empty:
            dates_str = ", ".join(str(d.date()) for d in bad_ma["date"])
            add_validation(
                ticker,
                "negative_MA100_detected",
                f"dates: {dates_str}"
            )

    # ============================================================
    # VALIDATION: PRICE SPIKES (20× ABOVE MEDIAN)
    # ============================================================

    # Runs for short histories too: previously the early return for fewer than
    # 100 rows meant this check could never see the 51-99 row case that its
    # own length guard allows.
    if len(df) > 50:
        median_close = df["close_adj"].median()
        max_close = df["close_adj"].max()

        if median_close > 0 and max_close > median_close * 20:
            spike_date = df.loc[df["close_adj"].idxmax(), "date"].date()
            add_validation(
                ticker,
                "price_spike_detected",
                f"max close {max_close:.2f} on {spike_date}, median {median_close:.2f}"
            )

    return df


# ============================================================
# MAIN LOOP
# ============================================================

# Process every discovered parquet file independently: read it, compute the
# MA100 columns, and write the result under the same ticker name. A failure on
# one ticker is recorded as a validation finding and does not stop the run.
for file in files:
    # Strip only the trailing extension; str.replace would also delete any
    # ".parquet" that happens to occur earlier in the file name.
    ticker = os.path.splitext(file)[0]
    in_path = os.path.join(INPUT_DIR, file)
    out_path = os.path.join(OUTPUT_DIR, f"{ticker}.parquet")

    print(f"Processing {ticker}...")

    try:
        df = pd.read_parquet(in_path)
        df2 = compute_ma100(df, ticker)

        # compute_ma100 returns None when required columns are missing; in
        # that case nothing is written for this ticker.
        if df2 is not None:
            df2.to_parquet(out_path, index=False)
            print(f"✔ {ticker}: saved MA100 → {out_path}\n")
        else:
            print(f"❌ {ticker}: MA100 not computed.\n")

    except Exception as e:
        # Top-level boundary of the batch: log the error against the ticker
        # and carry on with the remaining files.
        add_validation(ticker, "processing_error", str(e))
        print(f"❌ ERROR for {ticker}: {e}\n")


# ============================================================
# SAVE VALIDATION REPORT
# ============================================================

# Name the columns explicitly so the report keeps its header row even when no
# finding was recorded; a frame built from an empty list has no columns and
# would be written as a header-less CSV that cannot be read back.
validation_df = pd.DataFrame(validation_rows, columns=["ticker", "issue", "detail"])

# The timestamp in the file name gives every run its own report file.
val_file = os.path.join(
    VER_DIR,
    f"ma100_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)

validation_df.to_csv(val_file, index=False)

print("\n====================================================")
print(" MA100 VALIDATION SUMMARY")
print("====================================================")

if validation_df.empty:
    print("No validation issues detected.")
else:
    print(validation_df)

print("\nValidation log saved to:")
print(val_file)
print("====================================================\n")



Loading adjusted OHLC files from: ./3-adjusted_All_Prices_OHLC
Found 1167 tickers.

Processing A...
✔ A: saved MA100 → ./5-100D_MA_adjusted_all_prices\A.parquet

Processing AAL...
✔ AAL: saved MA100 → ./5-100D_MA_adjusted_all_prices\AAL.parquet

Processing AAMRQ...
✔ AAMRQ: saved MA100 → ./5-100D_MA_adjusted_all_prices\AAMRQ.parquet

Processing AAP...
✔ AAP: saved MA100 → ./5-100D_MA_adjusted_all_prices\AAP.parquet

Processing AAPL...
⚠ VALIDATION: AAPL | price_spike_detected | max close 286.19 on 2025-12-02, median 12.19
✔ AAPL: saved MA100 → ./5-100D_MA_adjusted_all_prices\AAPL.parquet

Processing ABBV...
✔ ABBV: saved MA100 → ./5-100D_MA_adjusted_all_prices\ABBV.parquet

Processing ABI1...
✔ ABI1: saved MA100 → ./5-100D_MA_adjusted_all_prices\ABI1.parquet

Processing ABKFQ...
✔ ABKFQ: saved MA100 → ./5-100D_MA_adjusted_all_prices\ABKFQ.parquet

Processing ABMD...
⚠ VALIDATION: ABMD | price_spike_detected | max close 449.75 on 2018-09-28, median 18.06
✔ ABMD: saved MA100 → ./5-100D_