In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================

# Source of the adjusted OHLC parquet files, destination for the filtered
# output, and the folder that receives the validation log.
INPUT_DIR  = "./3-adjusted_All_Prices_OHLC"
OUTPUT_DIR = "./6-90Day_jump_filter_adjusted_all_prices"
VER_DIR    = "./system_verification/6-90Day_jump_filter_adjusted_all_prices"

# Make sure both destination folders exist before any file is written;
# exist_ok=True makes this safe to re-run.
for directory in (OUTPUT_DIR, VER_DIR):
    os.makedirs(directory, exist_ok=True)

print(f"\nLoading adjusted OHLC files from: {INPUT_DIR}")
# One parquet file per ticker; sorted so the processing order is deterministic.
files = sorted(
    filter(lambda name: name.endswith(".parquet"), os.listdir(INPUT_DIR))
)
print(f"Found {len(files)} tickers.\n")

# Columns every input frame must provide for the jump filter to be computed.
REQUIRED = {"date", "close_adj"}


# ============================================================
# VALIDATION STORAGE
# ============================================================

# Accumulates one dict per validation issue; turned into a report at the end.
validation_rows = []

def add_validation(ticker, issue, detail=""):
    """Record a validation issue for ``ticker`` and echo it to stdout.

    Parameters
    ----------
    ticker : identifier of the instrument the issue belongs to.
    issue  : short machine-readable issue name.
    detail : optional free-text detail (defaults to an empty string).

    Appends ``{"ticker", "issue", "detail"}`` to the module-level
    ``validation_rows`` list; returns nothing.
    """
    validation_rows.append(dict(ticker=ticker, issue=issue, detail=detail))
    print(f"⚠ VALIDATION: {ticker} | {issue} | {detail}")


# ============================================================
# FUNCTION: 90-DAY JUMP FILTER
# ============================================================

def compute_jump90(df, ticker, window=90, jump_threshold=0.15, spike_threshold=0.50):
    """Flag, per row, whether the trailing window is free of large daily moves.

    The input frame is never modified; all work is done on a copy, to which
    four columns are added:

    * ``pct_change``     - row-over-row fractional change of ``close_adj``
    * ``abs_pct``        - absolute value of ``pct_change``
    * ``abs_rollmax_90`` - rolling max of ``abs_pct`` over ``window`` rows
    * ``no_big_jump_90`` - True only when a full window of ``abs_pct`` values
      exists and its max is ``<= jump_threshold``

    The ``_90`` column names are kept even if ``window`` is overridden, so
    the output schema does not change.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed in ``REQUIRED``.
    ticker : str
        Label used when reporting validation issues.
    window : int, default 90
        Number of rows in the rolling window (also used as ``min_periods``).
    jump_threshold : float, default 0.15
        Largest absolute daily change allowed inside the window.
    spike_threshold : float, default 0.50
        Absolute daily change above which a ``huge_daily_spike`` validation
        entry is recorded (first occurrence only).

    Returns
    -------
    pandas.DataFrame or None
        The augmented copy, or ``None`` when required columns are missing.

    Side effects
    ------------
    Calls ``add_validation`` for missing columns, unsorted dates,
    non-positive ``close_adj`` values and the first daily spike.
    """

    # -------------------------------
    # Required columns?
    # -------------------------------
    if not REQUIRED.issubset(df.columns):
        missing = REQUIRED - set(df.columns)
        add_validation(ticker, "missing_required_columns", str(missing))
        return None

    # Work on a copy: the assignments below would otherwise overwrite the
    # caller's "date" column and add columns to the caller's frame.
    df = df.copy()

    # -------------------------------
    # Date handling
    # -------------------------------
    df["date"] = pd.to_datetime(df["date"])

    if not df["date"].is_monotonic_increasing:
        add_validation(ticker, "non_monotonic_dates")
        df = df.sort_values("date").reset_index(drop=True)

    # -------------------------------
    # Check for invalid close prices
    # -------------------------------
    nonpositive = df["close_adj"] <= 0
    if nonpositive.any():
        add_validation(ticker, "nonpositive_close_adj", f"{int(nonpositive.sum())} rows")

    # -------------------------------
    # Daily percent change
    # -------------------------------
    # NOTE(review): pct_change is called with its default fill behaviour;
    # pandas >= 2.1 deprecates the implicit forward-fill of NaN prices, so
    # results for frames containing NaN may differ between pandas versions.
    # Confirm the intended handling and pin `fill_method` explicitly.
    df["pct_change"] = df["close_adj"].pct_change()
    df["abs_pct"] = df["pct_change"].abs()

    # -------------------------------
    # Detect daily spike (first occurrence only is reported)
    # -------------------------------
    spike_mask = df["abs_pct"] > spike_threshold
    if spike_mask.any():
        # Positional lookup via the mask: stays scalar even if the index
        # carries duplicate labels.
        spike_date = df.loc[spike_mask, "date"].iloc[0].date()
        spike_val = df.loc[spike_mask, "abs_pct"].iloc[0]
        add_validation(
            ticker,
            "huge_daily_spike",
            f"{spike_val:.2f} on {spike_date}"
        )

    # -------------------------------
    # Rolling max over the window
    # -------------------------------
    # min_periods == window: any window that is short or contains a NaN
    # (e.g. the first row's pct_change) yields NaN.
    df["abs_rollmax_90"] = (
        df["abs_pct"]
        .rolling(window=window, min_periods=window)
        .max()
    )

    # -------------------------------
    # TRADING RULE:
    # must have a full window or the stock is NOT tradable
    # -------------------------------
    df["no_big_jump_90"] = (
        df["abs_rollmax_90"].notna()
        & (df["abs_rollmax_90"] <= jump_threshold)
    )

    return df


# ============================================================
# MAIN LOOP
# ============================================================

# Run the jump filter for every discovered parquet file. A failure on one
# ticker is logged as a validation issue and does not stop the loop.
for file in files:
    # Strip only the trailing extension; str.replace would also remove any
    # ".parquet" substring that happened to occur elsewhere in the file name.
    ticker = os.path.splitext(file)[0]
    in_path = os.path.join(INPUT_DIR, file)
    out_path = os.path.join(OUTPUT_DIR, f"{ticker}.parquet")

    print(f"Processing {ticker}...")

    try:
        df = pd.read_parquet(in_path)
        df2 = compute_jump90(df, ticker)

        # compute_jump90 returns None when required columns are missing;
        # in that case nothing is written for this ticker.
        if df2 is not None:
            df2.to_parquet(out_path, index=False)
            print(f"✔ {ticker}: saved jump filter → {out_path}\n")
        else:
            print(f"❌ {ticker}: jump filter not computed.\n")

    except Exception as e:
        # Deliberate best-effort: record the error in the validation report
        # and continue with the next ticker.
        add_validation(ticker, "processing_error", str(e))
        print(f"❌ ERROR for {ticker}: {e}\n")


# ============================================================
# SAVE VALIDATION REPORT
# ============================================================

# Pin the column set explicitly: with zero recorded issues a bare
# pd.DataFrame([]) has no columns and the CSV would be written without a
# header row. For non-empty input the column order is unchanged.
validation_df = pd.DataFrame(validation_rows, columns=["ticker", "issue", "detail"])

# Timestamped file name so successive runs never overwrite each other's log.
val_file = os.path.join(
    VER_DIR,
    f"jump90_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)

validation_df.to_csv(val_file, index=False)

print("\n====================================================")
print(" 90-DAY JUMP FILTER VALIDATION SUMMARY")
print("====================================================")

if validation_df.empty:
    print("No validation issues detected.")
else:
    print(validation_df)

print("\nValidation log saved to:")
print(val_file)
print("====================================================\n")



Loading adjusted OHLC files from: ./3-adjusted_All_Prices_OHLC
Found 1167 tickers.

Processing A...
✔ A: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\A.parquet

Processing AAL...
✔ AAL: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\AAL.parquet

Processing AAMRQ...
⚠ VALIDATION: AAMRQ | huge_daily_spike | 0.84 on 2011-11-29
✔ AAMRQ: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\AAMRQ.parquet

Processing AAP...
⚠ VALIDATION: AAP | huge_daily_spike | 0.57 on 2025-05-22
✔ AAP: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\AAP.parquet

Processing AAPL...
⚠ VALIDATION: AAPL | huge_daily_spike | 0.52 on 2000-09-29
✔ AAPL: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\AAPL.parquet

Processing ABBV...
✔ ABBV: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\ABBV.parquet

Processing ABI1...
✔ ABI1: saved jump filter → ./6-90Day_jump_filter_adjusted_all_prices\ABI1.parquet

Processing ABKFQ...
⚠ VALIDAT

In [2]:
# Spot-check a single ticker's output file written by the jump-filter step.
df_abbv = pd.read_parquet("6-90Day_jump_filter_adjusted_all_prices/ABBV.parquet")

# Basic overview: assemble every section first, then emit them in one write.
abbv_overview = [
    f"Rows: {len(df_abbv)}, Columns: {len(df_abbv.columns)}",
    f"\nColumn names: {list(df_abbv.columns)}",
    f"\nData types:\n{df_abbv.dtypes}",
    f"\nFirst few rows:\n{df_abbv.head()}",
    f"\nLast few rows:\n{df_abbv.tail()}",
    f"\nBasic statistics:\n{df_abbv[['close_adj', 'pct_change', 'abs_rollmax_90']].describe()}",
    f"\nDate range: {df_abbv['date'].min()} to {df_abbv['date'].max()}",
    f"\nno_big_jump_90 value counts:\n{df_abbv['no_big_jump_90'].value_counts()}",
]
print("\n".join(abbv_overview))

Rows: 3269, Columns: 19

Column names: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj', 'closeunadj', 'lastupdated', 'adj_factor', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'pct_change', 'abs_pct', 'abs_rollmax_90', 'no_big_jump_90']

Data types:
ticker                    object
date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
volume                   float64
closeadj                 float64
closeunadj               float64
lastupdated       datetime64[ms]
adj_factor               float64
open_adj                 float64
high_adj                 float64
low_adj                  float64
close_adj                float64
pct_change               float64
abs_pct                  float64
abs_rollmax_90           float64
no_big_jump_90              bool
dtype: object

First few rows:
  ticker       date   open   high    low   close      volume  clo