In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================

# Pipeline stage directories, relative to the notebook's working directory.
INPUT_DIR  = "./3-adjusted_All_Prices_OHLC"
OUTPUT_DIR = "./4-ATR20_adjusted_All_Prices"
VER_DIR    = "./system_verification/4-ATR20_adjusted_All_Prices"

# Adjusted OHLC columns that every input frame must carry.
REQUIRED = {"open_adj", "high_adj", "low_adj", "close_adj"}

# Make sure both output locations exist before any file is processed.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(VER_DIR, exist_ok=True)

print(f"\nLoading adjusted price files from: {INPUT_DIR}")
# One parquet file per ticker; sorted so the processing order is stable.
files = sorted(
    entry for entry in os.listdir(INPUT_DIR) if entry.endswith(".parquet")
)
print(f"Found {len(files)} tickers.\n")


# ============================================================
# VALIDATION STORAGE
# ============================================================

# Accumulates one dict per validation finding; turned into a report later.
validation_rows = []


def add_validation(ticker, issue, detail=""):
    """Record a validation finding for ``ticker`` and echo it to stdout.

    Appends a ``{"ticker", "issue", "detail"}`` dict to the module-level
    ``validation_rows`` list and prints a one-line warning.
    """
    validation_rows.append(dict(ticker=ticker, issue=issue, detail=detail))
    print(f"⚠ VALIDATION: {ticker} | {issue} | {detail}")


# ============================================================
# ATR20 COMPUTATION — SIMPLE + CORRECT
# ============================================================

def compute_atr20(df, ticker):
    """Add true-range and 20-day ATR columns computed from adjusted OHLC prices.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history for one ticker. Must contain a ``date`` column and every
        column named in ``REQUIRED``. The caller's frame is not modified.
    ticker : str
        Label used in validation messages.

    Returns
    -------
    pandas.DataFrame or None
        A new frame ordered by ``date`` with ``prev_close``, ``tr`` and
        ``atr20`` appended, or ``None`` when the ``date`` column or a required
        column is missing, or fewer than 20 rows are available. Every
        rejection and warning is recorded through ``add_validation``.
    """

    # --- Date validation ---
    if "date" not in df.columns:
        add_validation(ticker, "missing_date_column")
        return None

    # assign() builds a new frame, so the caller's ``date`` column is untouched.
    df = df.assign(date=pd.to_datetime(df["date"]))
    if not df["date"].is_monotonic_increasing:
        add_validation(ticker, "non_monotonic_dates")
        df = df.sort_values("date")

    df = df.reset_index(drop=True)

    # --- Required columns ---
    if not REQUIRED.issubset(df.columns):
        missing = REQUIRED - set(df.columns)
        add_validation(ticker, "missing_adjusted_columns", str(missing))
        return None

    if len(df) < 20:
        add_validation(ticker, "insufficient_history", f"{len(df)} rows (<20)")
        return None

    # ============================================================
    # TRUE RANGE (using adjusted prices)
    # ============================================================

    df["prev_close"] = df["close_adj"].shift(1)

    tr1 = df["high_adj"] - df["low_adj"]
    tr2 = (df["high_adj"] - df["prev_close"]).abs()
    tr3 = (df["low_adj"]  - df["prev_close"]).abs()

    true_range = np.maximum(tr1, np.maximum(tr2, tr3))

    # The first bar has no previous close, so tr2/tr3 are NaN there and
    # np.maximum would propagate that NaN. Fall back to high - low for that
    # bar only; otherwise a frame with exactly 20 rows passes the length
    # check above but never produces a single ATR value.
    true_range.iloc[0] = tr1.iloc[0]

    df["tr"] = true_range

    # TR must be non-negative (can only fail on the first bar, if high < low)
    if (df["tr"] < 0).any():
        add_validation(ticker, "negative_TR_detected")

    # ============================================================
    # ATR20 = SIMPLE 20-DAY ROLLING MEAN
    # ============================================================

    # min_periods=20 leaves the first 19 rows NaN; row index 19 is the first ATR.
    df["atr20"] = df["tr"].rolling(window=20, min_periods=20).mean()

    # Optional: detect absurd ATR spikes (max more than 20x the median ATR)
    if len(df) > 30:
        median_atr = df["atr20"].median()
        if median_atr > 0:
            max_atr = df["atr20"].max()
            if max_atr > median_atr * 20:
                add_validation(
                    ticker,
                    "ATR_spike_detected",
                    f"max ATR = {max_atr:.4f}, median = {median_atr:.4f}"
                )

    return df


# ============================================================
# MAIN LOOP
# ============================================================

# Process every input file independently: a failure on one ticker is logged
# to the validation report and does not stop the remaining tickers.
for file in files:
    # splitext drops only the trailing extension; str.replace would also
    # remove ".parquet" if it appeared earlier in the file name.
    ticker = os.path.splitext(file)[0]
    in_path = os.path.join(INPUT_DIR, file)
    out_path = os.path.join(OUTPUT_DIR, f"{ticker}.parquet")

    print(f"Processing {ticker}...")

    try:
        df = pd.read_parquet(in_path)
        df2 = compute_atr20(df, ticker)

        # compute_atr20 returns None when the input failed validation.
        if df2 is not None:
            df2.to_parquet(out_path, index=False)
            print(f"✔ {ticker}: saved ATR20 → {out_path}\n")
        else:
            print(f"❌ {ticker}: ATR20 not computed.\n")

    except Exception as e:
        # Deliberate best-effort: record the failure and continue the batch.
        add_validation(ticker, "processing_error", str(e))
        print(f"❌ ERROR for {ticker}: {e}\n")


# ============================================================
# SAVE VALIDATION REPORT
# ============================================================

# Explicit columns keep the CSV header present even when no issue was
# recorded; without them an empty list produces a frame with no columns.
validation_df = pd.DataFrame(validation_rows, columns=["ticker", "issue", "detail"])

# Timestamped file name so successive runs do not overwrite each other.
val_file = os.path.join(
    VER_DIR,
    f"atr20_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)

validation_df.to_csv(val_file, index=False)

print("\n====================================================")
print(" ATR20 VALIDATION SUMMARY")
print("====================================================")

if validation_df.empty:
    print("No validation issues detected.")
else:
    print(validation_df)

print("\nValidation log saved to:")
print(val_file)
print("====================================================\n")



Loading adjusted price files from: ./3-adjusted_All_Prices_OHLC
Found 1167 tickers.

Processing A...
✔ A: saved ATR20 → ./4-ATR20_adjusted_All_Prices\A.parquet

Processing AAL...
✔ AAL: saved ATR20 → ./4-ATR20_adjusted_All_Prices\AAL.parquet

Processing AAMRQ...
✔ AAMRQ: saved ATR20 → ./4-ATR20_adjusted_All_Prices\AAMRQ.parquet

Processing AAP...
✔ AAP: saved ATR20 → ./4-ATR20_adjusted_All_Prices\AAP.parquet

Processing AAPL...
⚠ VALIDATION: AAPL | ATR_spike_detected | max ATR = 11.3977, median = 0.2763
✔ AAPL: saved ATR20 → ./4-ATR20_adjusted_All_Prices\AAPL.parquet

Processing ABBV...
✔ ABBV: saved ATR20 → ./4-ATR20_adjusted_All_Prices\ABBV.parquet

Processing ABI1...
⚠ VALIDATION: ABI1 | ATR_spike_detected | max ATR = 15.1383, median = 0.7150
✔ ABI1: saved ATR20 → ./4-ATR20_adjusted_All_Prices\ABI1.parquet

Processing ABKFQ...
✔ ABKFQ: saved ATR20 → ./4-ATR20_adjusted_All_Prices\ABKFQ.parquet

Processing ABMD...
⚠ VALIDATION: ABMD | ATR_spike_detected | max ATR = 26.4475, median = 

In [2]:
import pandas as pd
# Reload one saved output file (AAPL) to eyeball the result of the ATR step.
df_ranking = pd.read_parquet("./4-ATR20_adjusted_All_Prices/AAPL.parquet")

print("Ranking Dataset Info:")
print("=" * 60)
print(f"Shape: {df_ranking.shape}")
print(f"\nColumns: {df_ranking.columns.tolist()}")
print(f"\nData types:\n{df_ranking.dtypes}")

# 22 rows reach past the 20-row rolling window used for atr20.
print("\nFirst few rows:", df_ranking.head(22), sep="\n")
print("\nLast few rows:", df_ranking.tail(10), sep="\n")
print("\nSummary statistics:", df_ranking.describe(), sep="\n")
print("\nNull values:", df_ranking.isnull().sum(), sep="\n")

Ranking Dataset Info:


Shape: (7043, 18)

Columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj', 'closeunadj', 'lastupdated', 'adj_factor', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'prev_close', 'tr', 'atr20']

Data types:
ticker                 object
date           datetime64[ns]
open                  float64
high                  float64
low                   float64
close                 float64
volume                float64
closeadj              float64
closeunadj            float64
lastupdated    datetime64[ms]
adj_factor            float64
open_adj              float64
high_adj              float64
low_adj               float64
close_adj             float64
prev_close            float64
tr                    float64
atr20                 float64
dtype: object

First few rows:
   ticker       date   open   high    low  close        volume  closeadj  \
0    AAPL 1997-12-31  0.117  0.122  0.116  0.117  4.063580e+08     0.098   
1    AAPL 1998-01-02  0.122  0.145  0.120 

In [3]:
# Relies on df_ranking loaded by the previous cell; writes it out as CSV
# alongside the parquet outputs (no index column).
csv_path = "./4-ATR20_adjusted_All_Prices/AAPL.csv"
df_ranking.to_csv(path_or_buf=csv_path, index=False)
print(f"✔ Exported to CSV → {csv_path}")

✔ Exported to CSV → ./4-ATR20_adjusted_All_Prices/AAPL.csv
