In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

# ============================================================
# CONFIGURATION
# ============================================================

# Source folder with one CSV per ticker, destination folder for the adjusted
# parquet files, and the folder that receives the validation report.
INPUT_DIR = "./2-all_prices/sharadar_sep_full"
OUTPUT_DIR = "./3-adjusted_All_Prices_OHLC"
VER_DIR = "./system_verification/3-adjusted_All_Prices_OHLC"

# Price columns every input file must provide before it can be adjusted.
REQUIRED = {"open", "high", "low", "close", "closeadj"}

# Create both output folders up front; folders that already exist are kept.
for out_dir in (OUTPUT_DIR, VER_DIR):
    os.makedirs(out_dir, exist_ok=True)

print(f"\nScanning directory: {INPUT_DIR}")
# Only *.csv entries are picked up; sorting makes the processing order deterministic.
files = sorted(name for name in os.listdir(INPUT_DIR) if name.endswith(".csv"))
print(f"Tickers found: {len(files)}\n")

# ============================================================
# VALIDATION STORAGE
# ============================================================

# Every finding reported through v() is collected here, one dict per finding
# with the keys "ticker", "issue" and "detail".
validation = []

def v(t, issue, detail=""):
    """Report one validation finding: echo it to stdout and record it.

    Parameters
    ----------
    t : ticker the finding belongs to.
    issue : short identifier of the problem.
    detail : optional extra information; defaults to an empty string.
    """
    print(f"⚠ VALIDATION: {t} | {issue} | {detail}")
    record = {"ticker": t, "issue": issue, "detail": detail}
    validation.append(record)


# ============================================================
# OPTIMIZED PRICE ADJUSTMENT
# ============================================================

def compute_adjusted(df, ticker):
    """Validate one ticker's price frame and add adjusted OHLC columns.

    The frame is modified in place and the same object is returned.  It is
    sorted by ``date``, rows whose date cannot be parsed are dropped, and the
    following columns are added:

    * ``adj_factor`` -- ``closeadj / close``; NaN where ``close`` is not
      strictly positive, so flagged rows never produce ``inf`` prices.
    * ``open_adj``, ``high_adj``, ``low_adj`` -- raw price times ``adj_factor``.
    * ``close_adj`` -- a copy of ``closeadj``.

    Data-quality findings are reported through ``v`` and do not stop
    processing; only a missing column raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Price rows for a single ticker.  Must contain ``date`` and every
        column listed in ``REQUIRED``.
    ticker : str
        Label used when reporting findings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place changes described above.

    Raises
    ------
    ValueError
        If ``date`` or any column in ``REQUIRED`` is missing.
    """

    # ------------ SCHEMA CHECK -----------------
    # Runs before any column is touched, so a missing "date" column is
    # reported as a missing column instead of surfacing as a bare KeyError.
    missing = (REQUIRED | {"date"}) - set(df.columns)
    if missing:
        v(ticker, "missing_columns", str(missing))
        raise ValueError(f"Missing columns: {missing}")

    # Unparseable dates become NaT; they are counted and dropped below.
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df.sort_values("date", inplace=True)
    df.reset_index(drop=True, inplace=True)

    # ------------ VALIDATION (vectorized) -----------------

    # Nonpositive prices (counts are cast to plain int for the report)
    bad_close = int((df["close"] <= 0).sum())
    if bad_close:
        v(ticker, "nonpositive_close", bad_close)

    bad_closeadj = int((df["closeadj"] <= 0).sum())
    if bad_closeadj:
        v(ticker, "invalid_closeadj", bad_closeadj)

    # Missing dates or NaTs
    bad_dates = int(df["date"].isna().sum())
    if bad_dates:
        v(ticker, "invalid_dates", bad_dates)
        df.dropna(subset=["date"], inplace=True)
        # Keep the returned frame on a contiguous 0..n-1 index.
        df.reset_index(drop=True, inplace=True)

    # Large gaps: the first diff is always NaN, so a frame with zero or one
    # row has no gaps to inspect and is skipped rather than reduced.
    gaps = df["date"].diff().dt.days.dropna()
    if not gaps.empty:
        max_gap = gaps.max()
        if max_gap > 7:
            v(ticker, "large_date_gap", int(max_gap))

    # ------------ ADJUSTMENT (fully vectorized) -------------

    close = df["close"].to_numpy(dtype="float64", na_value=np.nan)
    closeadj = df["closeadj"].to_numpy(dtype="float64", na_value=np.nan)

    # adj_factor: only divide where close is strictly positive; every other
    # row (zero, negative or missing close) keeps the NaN it was pre-filled with.
    adj_factor = np.full(close.shape, np.nan)
    np.divide(closeadj, close, out=adj_factor, where=close > 0)
    df["adj_factor"] = adj_factor

    # detect split-like jumps > 25%; undefined ratios (NaN / inf) are ignored
    # so the reported maximum is always a real number.
    with np.errstate(divide="ignore", invalid="ignore"):
        jumps = np.abs(np.diff(adj_factor) / adj_factor[:-1])
    jumps = jumps[np.isfinite(jumps)]
    if jumps.size and jumps.max() > 0.25:
        v(ticker, "adj_factor_jump", f"max jump {jumps.max():.3f}")

    # adjusted prices (vectorized)
    df["open_adj"]  = df["open"].to_numpy()  * adj_factor
    df["high_adj"]  = df["high"].to_numpy()  * adj_factor
    df["low_adj"]   = df["low"].to_numpy()   * adj_factor
    df["close_adj"] = df["closeadj"].to_numpy()

    return df


# ============================================================
# MAIN LOOP
# ============================================================

# Convert every input CSV into a parquet file with adjusted price columns.
# A failure on one ticker is recorded as a validation finding and the loop
# moves on, so a single bad file cannot abort the whole run.
for file in files:
    # Strip only the trailing extension; str.replace(".csv", "") would also
    # remove a ".csv" that happens to occur earlier in the file name.
    ticker = os.path.splitext(file)[0]
    in_path = os.path.join(INPUT_DIR, file)
    out_path = os.path.join(OUTPUT_DIR, f"{ticker}.parquet")

    print(f"Processing {ticker} …")

    try:
        df = pd.read_csv(in_path, engine="pyarrow")
        df2 = compute_adjusted(df, ticker)

        df2.to_parquet(out_path, index=False)
        print(f"✔ Saved → {out_path}\n")

    except Exception as e:
        # Deliberately broad: any read, adjust or write failure is logged
        # against the ticker and processing continues with the next file.
        v(ticker, "processing_error", str(e))
        print(f"❌ ERROR for {ticker}: {e}\n")


# ============================================================
# SAVE VALIDATION
# ============================================================

# Explicit columns keep the report schema stable: when no findings were
# recorded the CSV still gets its header row instead of being written from a
# frame that has no columns at all.
val_df = pd.DataFrame(validation, columns=["ticker", "issue", "detail"])

# The timestamp in the file name keeps reports from successive runs apart.
val_path = os.path.join(
    VER_DIR,
    f"adjusted_price_validation-{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv"
)
val_df.to_csv(val_path, index=False)

print("\n====================================================")
print(" VALIDATION SUMMARY")
print("====================================================")

if val_df.empty:
    print("No issues detected.")
else:
    print(val_df)

print("\nSaved validation →", val_path)
print("====================================================\n")



Scanning directory: ./2-all_prices/sharadar_sep_full
Tickers found: 1167

Processing A …
⚠ VALIDATION: A | adj_factor_jump | max jump 0.384
✔ Saved → ./3-adjusted_All_Prices_OHLC\A.parquet

Processing AAL …
✔ Saved → ./3-adjusted_All_Prices_OHLC\AAL.parquet

Processing AAMRQ …
⚠ VALIDATION: AAMRQ | adj_factor_jump | max jump 0.982
✔ Saved → ./3-adjusted_All_Prices_OHLC\AAMRQ.parquet

Processing AAP …
✔ Saved → ./3-adjusted_All_Prices_OHLC\AAP.parquet

Processing AAPL …
✔ Saved → ./3-adjusted_All_Prices_OHLC\AAPL.parquet

Processing ABBV …
✔ Saved → ./3-adjusted_All_Prices_OHLC\ABBV.parquet

Processing ABI1 …
✔ Saved → ./3-adjusted_All_Prices_OHLC\ABI1.parquet

Processing ABKFQ …
✔ Saved → ./3-adjusted_All_Prices_OHLC\ABKFQ.parquet

Processing ABMD …
✔ Saved → ./3-adjusted_All_Prices_OHLC\ABMD.parquet

Processing ABNB …
✔ Saved → ./3-adjusted_All_Prices_OHLC\ABNB.parquet

Processing ABS …
✔ Saved → ./3-adjusted_All_Prices_OHLC\ABS.parquet

Processing ABT …
⚠ VALIDATION: ABT | adj_facto

In [2]:
import pandas as pd
# Spot-check one converted ticker.  The parquet files in OUTPUT_DIR carry the
# ticker in upper case (e.g. "AAPL.parquet"), so the name is spelled
# "APA.parquet" to resolve on case-sensitive file systems as well.
# NOTE: the variable keeps its existing name `aapl_df` although it holds APA rows.
aapl_df = pd.read_parquet(os.path.join(OUTPUT_DIR, "APA.parquet"))
print(aapl_df.head())
print(f"\nShape: {aapl_df.shape}")
print(f"Columns: {list(aapl_df.columns)}")

  ticker       date    open    high     low   close     volume  closeadj  \
0    APA 1997-12-31  17.125  17.625  17.095  17.530  1603000.0    11.927   
1    APA 1998-01-02  17.595  17.595  17.030  17.030   621000.0    11.587   
2    APA 1998-01-05  17.280  17.280  16.530  16.565  1675000.0    11.271   
3    APA 1998-01-06  16.315  16.500  16.125  16.250  2998000.0    11.056   
4    APA 1998-01-07  16.440  16.625  15.940  16.095  1714000.0    10.951   

   closeunadj lastupdated  adj_factor   open_adj   high_adj    low_adj  \
0       35.06  2025-10-23    0.680376  11.651448  11.991636  11.631036   
1       34.06  2025-10-23    0.680388  11.971419  11.971419  11.587000   
2       33.13  2025-10-23    0.680411  11.757494  11.757494  11.247186   
3       32.50  2025-10-23    0.680369  11.100224  11.226092  10.970954   
4       32.19  2025-10-23    0.680398  11.185737  11.311611  10.845538   

   close_adj  
0     11.927  
1     11.587  
2     11.271  
3     11.056  
4     10.951  

Shape: 

In [3]:
# Filter APA data for the specified date range and export to CSV
start_date = "2020-01-01"
end_date = "2020-06-03"

# The parquet files in OUTPUT_DIR carry the ticker in upper case, so the name
# is spelled "APA.parquet" to resolve on case-sensitive file systems as well.
# NOTE: the variable keeps its existing name `aapl_df` although it holds APA rows.
aapl_df = pd.read_parquet(os.path.join(OUTPUT_DIR, "APA.parquet"))
aapl_df["date"] = pd.to_datetime(aapl_df["date"])

# Filter by date range (both ends inclusive)
filtered_df = aapl_df[(aapl_df["date"] >= start_date) & (aapl_df["date"] <= end_date)]

# Export to CSV; the file name is built from the window above so it cannot
# drift out of sync when the dates are edited.
csv_output_path = os.path.join(OUTPUT_DIR, f"apa_{start_date}_to_{end_date}.csv")
filtered_df.to_csv(csv_output_path, index=False)

print(f"✔ Exported {len(filtered_df)} rows to {csv_output_path}")
print(f"\nDate range: {filtered_df['date'].min()} to {filtered_df['date'].max()}")
print(f"\nFirst few rows:")
print(filtered_df.head())

✔ Exported 106 rows to ./3-adjusted_All_Prices_OHLC\apa_2020-01-01_to_2020-06-03.csv

Date range: 2020-01-02 00:00:00 to 2020-06-03 00:00:00

First few rows:
     ticker       date   open   high     low  close      volume  closeadj  \
5536    APA 2020-01-02  25.70  25.93  25.130  25.36   3395000.0    21.887   
5537    APA 2020-01-03  25.71  26.14  25.530  25.69   5941000.0    22.172   
5538    APA 2020-01-06  25.94  25.98  25.165  25.64   4470000.0    22.129   
5539    APA 2020-01-07  29.93  32.58  29.500  32.51  37027000.0    28.058   
5540    APA 2020-01-08  32.51  33.21  32.040  32.73  13506000.0    28.248   

      closeunadj lastupdated  adj_factor   open_adj   high_adj    low_adj  \
5536       25.36  2025-10-23    0.863052  22.180438  22.378940  21.688498   
5537       25.69  2025-10-23    0.863060  22.189261  22.560377  22.033910   
5538       25.64  2025-10-23    0.863066  22.387920  22.422442  21.719044   
5539       32.51  2025-10-23    0.863058  25.831312  28.118414  25.4601

In [4]:
from pathlib import Path
import pandas as pd

# --- settings ---
# BASE_DIR has to be a pathlib.Path (not a str): the paths below are built
# with the "/" operator and the folder is searched with .glob().
BASE_DIR = Path("./3-adjusted_All_Prices_OHLC")
TICKER = "NVDA"
start_date, end_date = pd.Timestamp("2003-05-01"), pd.Timestamp("2003-09-03")

parquet_path = BASE_DIR.joinpath(f"{TICKER}.parquet")
print("Reading parquet:", parquet_path)

# --- stop early when the file is absent, listing what the folder contains ---
if not parquet_path.exists():
    print("❌ File not found:", parquet_path)
    print("\nHere are similar files in the folder (first 50):")
    files = sorted(BASE_DIR.glob("*.parquet"))
    for f in files[:50]:
        print("  ", f.name)
    raise FileNotFoundError(parquet_path)

df = pd.read_parquet(parquet_path)

# --- find date column robustly ---
# The first candidate present in the frame wins; the else branch only runs
# when the loop finished without finding any of them.
date_col = None
for c in ("date", "Date", "datetime", "timestamp"):
    if c in df.columns:
        date_col = c
        break
else:
    raise KeyError(f"No date-like column found. Columns are: {list(df.columns)}")

# Values that cannot be parsed become NaT and those rows are removed.
df = (
    df
    .assign(**{date_col: pd.to_datetime(df[date_col], errors="coerce")})
    .dropna(subset=[date_col])
)

print("Columns:", list(df.columns))
dates = df[date_col]
print("File date range:", dates.min(), "to", dates.max())

# Filter by date range (both ends inclusive); copy so the export frame is
# independent of df.
in_window = dates.between(start_date, end_date, inclusive="both")
filtered = df.loc[in_window].copy()

csv_output_path = BASE_DIR / f"{TICKER}_{start_date.date()}_to_{end_date.date()}.csv"
filtered.to_csv(csv_output_path, index=False)

print(f"✔ Exported {len(filtered):,} rows to {csv_output_path}")

if len(filtered) == 0:
    print("⚠️ 0 rows after filtering. (Either no data in that range, or date column isn’t what we think.)")
else:
    print("Filtered date range:", filtered[date_col].min(), "to", filtered[date_col].max())
    print(filtered.head())


Reading parquet: 3-adjusted_All_Prices_OHLC\NVDA.parquet
Columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj', 'closeunadj', 'lastupdated', 'adj_factor', 'open_adj', 'high_adj', 'low_adj', 'close_adj']
File date range: 1999-01-22 00:00:00 to 2025-12-30 00:00:00
✔ Exported 87 rows to 3-adjusted_All_Prices_OHLC\NVDA_2003-05-01_to_2003-09-03.csv
Filtered date range: 2003-05-01 00:00:00 to 2003-09-03 00:00:00
     ticker       date   open   high    low  close        volume  closeadj  \
1073   NVDA 2003-05-01  0.119  0.121  0.118  0.120  7.333680e+08     0.110   
1074   NVDA 2003-05-02  0.122  0.133  0.121  0.133  1.691088e+09     0.122   
1075   NVDA 2003-05-05  0.134  0.142  0.131  0.136  1.576164e+09     0.124   
1076   NVDA 2003-05-06  0.135  0.139  0.133  0.136  1.143864e+09     0.125   
1077   NVDA 2003-05-07  0.134  0.137  0.131  0.134  8.973720e+08     0.123   

      closeunadj lastupdated  adj_factor  open_adj  high_adj   low_adj  \
1073       14.42  20