In [None]:
"""
===============================================================================
DOWNLOAD FULL SHARADAR/SEP PRICE HISTORY FOR ALL S&P 500 TICKERS regardless of membership status
===============================================================================
Adds DIAGNOSTICS:
 - Less than 100 days of price data after Jan 1, 1998
 - No price data at all
 - Last price date older than 30 days (data ends early — possible delisting)
 - Very late start (first price after 2015 — possible recent IPO)
 - Short full history (fewer than 5 years of trading days in total)
 - Large gaps in trading history (>10 calendar days)
 

Downloads full SHARADAR/SEP price history for every ticker that has ever appeared in the S&P 500,
saving per-ticker CSVs in ./2-all_prices/sharadar_sep_full. Loads the membership list from the
daily matrix, fetches data in chunks with API key configured via NASDAQ_DATA_LINK_API_KEY, and
prints diagnostics for missing data, short histories after 1998, stale prices (>30 days), very
late starts, very early terminations, and large gaps (>10 days) between trading days.

===============================================================================
"""

import os
import pandas as pd
import numpy as np
import nasdaqdatalink
import time
from datetime import datetime, timedelta

# ---------------------------------------------------------
# API KEY (taken from the NASDAQ_DATA_LINK_API_KEY environment variable)
# ---------------------------------------------------------
# Only override the client's key when the variable is actually set. Assigning
# None unconditionally would wipe any key the nasdaqdatalink package may have
# already loaded on import, and later requests would go out unauthenticated.
nasdaqdatalink_key = os.getenv("NASDAQ_DATA_LINK_API_KEY")
if nasdaqdatalink_key:
    nasdaqdatalink.ApiConfig.api_key = nasdaqdatalink_key
elif not getattr(nasdaqdatalink.ApiConfig, "api_key", None):
    # Surface the problem here rather than as an opaque API error mid-download.
    print(
        "⚠ NASDAQ_DATA_LINK_API_KEY is not set and no API key is configured — "
        "SHARADAR/SEP requests are likely to be rejected."
    )


# ============================================================
# 1. CONFIG
# ============================================================
# Input: parquet file holding the S&P 500 membership matrix.
MEMBERSHIP_FILE = "./1-sp500_membership_daily_matrix/sp500_membership_full.parquet"

# Output: one CSV per ticker is written into this directory.
OUTPUT_DIR = "./2-all_prices/sharadar_sep_full"

# Number of tickers sent in a single SHARADAR/SEP request.
CHUNK_SIZE = 40   # safe for API limits

# Reference dates used by the diagnostics further down.
START_DATE = pd.Timestamp(year=1998, month=1, day=1)
TODAY = pd.Timestamp.now().normalize()   # midnight of the current local day

# Make sure the output directory exists; a no-op when it is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================================================
# 2. LOAD MEMBERSHIP
# ============================================================
# The ticker universe is taken from the matrix index; the column labels are
# reported as the covered range (presumably dates — see the printed min/max).
print("Loading membership matrix…")
membership = pd.read_parquet(MEMBERSHIP_FILE)

# Ticker universe in ascending order.
all_tickers = membership.index.tolist()
all_tickers.sort()

coverage_start = membership.columns.min()
coverage_end = membership.columns.max()

print(f"✔ Loaded {len(all_tickers)} unique historical S&P 500 tickers")
print(f"Membership covers: {coverage_start} → {coverage_end}")


# ============================================================
# 3. DOWNLOAD SHARADAR/SEP DATA
# ============================================================
# Tickers are requested CHUNK_SIZE at a time and the per-chunk frames are
# concatenated into `sep`. Each request is retried a few times with a growing
# pause, so that a single transient API/network failure late in the run does
# not discard every chunk that was already downloaded. The last failure is
# re-raised unchanged, so persistent errors (bad key, no subscription) still
# stop the run.
MAX_ATTEMPTS = 3             # tries per chunk before giving up
RETRY_PAUSE_SECONDS = 5      # base pause; multiplied by the attempt number
REQUEST_PAUSE_SECONDS = 1.2  # pause between successful requests

print("\nDownloading SHARADAR/SEP…")
all_parts = []

for i in range(0, len(all_tickers), CHUNK_SIZE):
    chunk = all_tickers[i:i+CHUNK_SIZE]
    print(f"→ Chunk {i//CHUNK_SIZE + 1}: {len(chunk)} tickers…")

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            df = nasdaqdatalink.get_table(
                "SHARADAR/SEP",
                ticker=chunk,
                paginate=True
            )
            break
        except Exception as exc:
            if attempt == MAX_ATTEMPTS:
                raise
            wait = RETRY_PAUSE_SECONDS * attempt
            print(f"  ! attempt {attempt}/{MAX_ATTEMPTS} failed ({exc!r}); retrying in {wait}s…")
            time.sleep(wait)

    all_parts.append(df)

    time.sleep(REQUEST_PAUSE_SECONDS)   # rate limit safety

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# raise the same exception type with a message that names the real cause.
if not all_parts:
    raise ValueError("No tickers to download — the membership ticker list is empty.")

sep = pd.concat(all_parts, ignore_index=True)
sep["date"] = pd.to_datetime(sep["date"])

print(f"\n✔ Downloaded {len(sep):,} rows")
print(f"✔ Unique tickers received: {sep['ticker'].nunique()}")


# ============================================================
# 4. DIAGNOSTIC PRINT UTIL
# ============================================================
def print_flag(ticker, message):
    """Print one diagnostic warning line for ``ticker``.

    The line is written to standard output in the form
    ``⚠ <ticker>: <message>``.
    """
    warning_line = "⚠ {}: {}".format(ticker, message)
    print(warning_line)


# ============================================================
# 5. SAVE FILES + RUN DIAGNOSTICS
# ============================================================
# For every ticker present in `sep`: write its rows (sorted by date) to
# <OUTPUT_DIR>/<ticker>.csv, then print a flag for each diagnostic that fires.
# Afterwards, flag every requested ticker that returned no rows at all.
print("\nSaving ticker-level CSV files + running diagnostics…\n")

# Diagnostic thresholds.
MIN_DAYS_AFTER_START = 100        # minimum rows on/after START_DATE
RECENT_IPO_AFTER_YEAR = 2015      # first price later than this year => "recent IPO"
STALE_AFTER_DAYS = 30             # calendar days since the last price
MIN_FULL_HISTORY_DAYS = 250 * 5   # roughly five years of trading days
MAX_GAP_DAYS = 10                 # calendar days between consecutive rows

tickers = sep["ticker"].unique()

# Split the frame once with groupby instead of scanning all rows with a
# boolean mask for every ticker. sort=False keeps the tickers in order of
# first appearance, i.e. the same order as `tickers`.
for tk, df_tk in sep.groupby("ticker", sort=False):

    df_tk = df_tk.sort_values("date")

    # Save file
    out_file = os.path.join(OUTPUT_DIR, f"{tk}.csv")
    df_tk.to_csv(out_file, index=False)

    print(f"Saved: {out_file}")

    # ========================================================
    # DIAGNOSTICS START HERE
    # ========================================================

    df98 = df_tk[df_tk["date"] >= START_DATE]
    n_days = len(df98)

    # 1. Too few rows on/after START_DATE
    if n_days < MIN_DAYS_AFTER_START:
        last_day = df98["date"].max() if n_days > 0 else None
        print_flag(
            tk,
            f"LESS THAN {MIN_DAYS_AFTER_START} DAYS of price data after {START_DATE.year} — {n_days} days (last date = {last_day})"
        )

    # 2. Data starts very late (possible recent IPO)
    first_price = df_tk["date"].min()
    if first_price.year > RECENT_IPO_AFTER_YEAR:
        print_flag(tk, f"Recent IPO — first price date is {first_price.date()}")

    # 3. Data ends very early (possible delisting)
    last_price = df_tk["date"].max()
    if last_price < TODAY - pd.Timedelta(days=STALE_AFTER_DAYS):
        print_flag(
            tk,
            f"Data ends at {last_price.date()} — older than {STALE_AFTER_DAYS} days (likely delisted or missing data)"
        )

    # 4. Suspiciously short full history (e.g., <5 years total)
    full_days = len(df_tk)
    if full_days < MIN_FULL_HISTORY_DAYS:
        print_flag(
            tk,
            f"Short full history (<5 years) — only {full_days} trading days total."
        )

    # 5. Large calendar-day gaps between consecutive rows.
    #    df_tk is already sorted by date, so diff() compares neighbours.
    gaps = df_tk["date"].diff().dt.days
    if (gaps > MAX_GAP_DAYS).any():
        max_gap = gaps.max()
        print_flag(tk, f"LARGE GAP in data — max gap = {int(max_gap)} days")

# 6. Requested tickers with no price data at all. They never appear in `sep`,
#    so the loop above cannot see them; compare against the requested universe.
received = set(tickers)
no_data = [requested for requested in all_tickers if requested not in received]
for requested in no_data:
    print_flag(requested, "NO PRICE DATA RECEIVED")
print(f"\n{len(no_data)} of {len(all_tickers)} requested tickers returned no price data")


# Closing banner: what was written and where.
banner_rule = "==============================================================================="
print("\n" + banner_rule)
for banner_text in (
    " DONE — Full SHARADAR/SEP OHLCV + TOTAL-RETURN CLOSE (closeadj) saved per ticker",
    " Diagnostics printed to terminal.",
):
    print(banner_text)
print(" Location:", OUTPUT_DIR)
print(banner_rule)


Loading membership matrix…
✔ Loaded 1187 unique historical S&P 500 tickers
Membership covers: 1957-03-04 00:00:00 → 2025-12-05 00:00:00

Downloading SHARADAR/SEP…
→ Chunk 1: 40 tickers…
→ Chunk 2: 40 tickers…
→ Chunk 3: 40 tickers…
→ Chunk 4: 40 tickers…
→ Chunk 5: 40 tickers…
→ Chunk 6: 40 tickers…
→ Chunk 7: 40 tickers…
→ Chunk 8: 40 tickers…
→ Chunk 9: 40 tickers…
→ Chunk 10: 40 tickers…
→ Chunk 11: 40 tickers…
→ Chunk 12: 40 tickers…
→ Chunk 13: 40 tickers…
→ Chunk 14: 40 tickers…
→ Chunk 15: 40 tickers…
→ Chunk 16: 40 tickers…
→ Chunk 17: 40 tickers…
→ Chunk 18: 40 tickers…
→ Chunk 19: 40 tickers…
→ Chunk 20: 40 tickers…
→ Chunk 21: 40 tickers…
→ Chunk 22: 40 tickers…
→ Chunk 23: 40 tickers…
→ Chunk 24: 40 tickers…
→ Chunk 25: 40 tickers…
→ Chunk 26: 40 tickers…
→ Chunk 27: 40 tickers…
→ Chunk 28: 40 tickers…
→ Chunk 29: 40 tickers…
→ Chunk 30: 27 tickers…

✔ Downloaded 5,336,685 rows
✔ Unique tickers received: 1162

Saving ticker-level CSV files + running diagnostics…

Saved: ./2

In [3]:
import pandas as pd
# Read the price-matrix parquet file and show it as the cell's output
# (the bare `df` on the last line triggers the notebook's rich display).
PRICE_MATRIX_FILE = "./sp500_prices/sp500_price_matrix_raw.parquet"
df = pd.read_parquet(PRICE_MATRIX_FILE)
df

date,1997-12-31,1998-01-02,1998-01-05,1998-01-06,1998-01-07,1998-01-08,1998-01-09,1998-01-12,1998-01-13,1998-01-14,...,2025-11-11,2025-11-12,2025-11-13,2025-11-14,2025-11-17,2025-11-18,2025-11-19,2025-11-20,2025-11-21,2025-11-24
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,,,,,,,,,,,...,149.42,151.52,146.89,146.82,144.52,143.84,144.40,145.06,151.25,153.60
AAL,,,,,,,,,,,...,12.99,13.41,13.07,12.76,12.34,12.36,12.33,12.24,12.87,13.10
AAMRQ,64.250,63.812,64.312,64.719,66.375,65.438,62.500,64.781,64.812,64.625,...,,,,,,,,,,
AAP,,,,,,,,,,,...,51.06,51.02,50.22,50.03,48.93,49.76,50.04,48.37,51.70,50.93
AAPL,0.117,0.145,0.142,0.169,0.156,0.163,0.163,0.163,0.174,0.176,...,275.25,273.47,272.95,272.41,267.46,267.44,268.56,266.25,271.49,275.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YUM,7.265,7.282,7.062,6.765,6.750,6.485,6.500,6.532,6.532,6.532,...,150.75,149.37,149.73,149.02,148.03,148.82,148.32,149.21,152.98,151.08
ZBH,,,,,,,,,,,...,88.50,90.03,90.57,89.91,88.64,89.08,89.45,89.07,92.18,93.52
ZBRA,13.222,13.222,13.084,13.111,13.027,13.000,11.836,11.804,12.280,11.613,...,255.17,255.69,241.57,240.56,232.71,230.90,230.93,228.40,240.95,242.45
ZION,45.380,45.500,46.380,45.630,44.880,41.500,39.560,40.500,40.750,42.250,...,52.67,52.45,50.91,51.02,48.92,49.34,50.43,50.44,52.12,51.89
