In [None]:

"""
Builds a daily S&P 500 membership matrix from SHARADAR/SP500 events via Nasdaq Data Link.
Loads the API key from NASDAQ_DATA_LINK_API_KEY, downloads constituent add/remove events,
simulates daily membership across all business days in the event range, and saves the full
membership matrix to Parquet at ./1-sp500_membership_daily_matrix/sp500_membership_full.parquet.
Also computes membership metadata (first/last/exit/current status), reports recent additions
and removals, and writes a timestamped diagnostics CSV to system_verification/1-SP500MEMBERSHIPBUILDER.
"""
import os
from datetime import datetime
import pandas as pd
import nasdaqdatalink
from IPython.display import display, HTML
from dotenv import load_dotenv



# Module-level placeholder; nothing in the visible code reads or reassigns it.
# NOTE(review): presumably meant to be populated by an external runner — confirm or remove.
RUN_TIMESTAMP = None
# ============================================================
# PRETTY COLORS
# ============================================================
class C:
    """ANSI SGR escape sequences for colorizing console output.

    Prefix a message with one of the style codes and append ``ENDC`` to
    restore the terminal's default rendering.
    """

    # Bright foreground colors.
    HEADER = "\033[95m"   # magenta
    OKBLUE = "\033[94m"   # blue
    OKCYAN = "\033[96m"   # cyan
    OKGREEN = "\033[92m"  # green
    WARNING = "\033[93m"  # yellow
    FAIL = "\033[91m"     # red

    # Text attributes.
    BOLD = "\033[1m"
    ENDC = "\033[0m"      # reset all attributes


# ============================================================
# CONFIG
# ============================================================
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
nasdaqdatalink_key = os.getenv("NASDAQ_DATA_LINK_API_KEY")

if nasdaqdatalink_key:
    nasdaqdatalink.ApiConfig.api_key = nasdaqdatalink_key
else:
    # Do not overwrite the client's configuration with None: that would discard
    # any key the library may have obtained on its own and only surfaces later
    # as an opaque authentication error during the download.
    print(
        "WARNING: NASDAQ_DATA_LINK_API_KEY is not set (environment or .env); "
        "the SHARADAR/SP500 download will likely fail to authenticate."
    )

# Output directory for the membership matrix.
BASE_DIR = "./1-sp500_membership_daily_matrix"
os.makedirs(BASE_DIR, exist_ok=True)

FULL_PARQUET_PATH = os.path.join(BASE_DIR, "sp500_membership_full.parquet")

# Output directory for the timestamped diagnostics CSV.
VERIFICATION_DIR = "system_verification/1-SP500MEMBERSHIPBUILDER"
os.makedirs(VERIFICATION_DIR, exist_ok=True)

# Look-back horizon, in calendar days, for the "recent additions/removals" report.
RECENT_WINDOW = 101


# ============================================================
# 1. LOAD SHARADAR SP500 EVENTS
# ============================================================
print("Downloading SHARADAR/SP500 constituent events…")

# paginate=True asks the client to follow result pages so the whole table is returned.
sp500_events = nasdaqdatalink.get_table(
    'SHARADAR/SP500',
    paginate=True
)

# Fail early with a clear message: an empty table would otherwise produce NaT
# date bounds and an unrelated-looking error further down.
if sp500_events.empty:
    raise RuntimeError(
        "SHARADAR/SP500 returned no rows; check the API key and dataset access."
    )

sp500_events["date"] = pd.to_datetime(sp500_events["date"])
# Stable sort: events sharing a date keep the order in which they were returned,
# so same-day add/remove pairs are not shuffled by the sort itself.
sp500_events = sp500_events.sort_values("date", kind="mergesort")

# Date bounds of the event history; they delimit the business-day calendar.
min_date = sp500_events["date"].min()
max_date = sp500_events["date"].max()

print(f"Events date range: {min_date.date()} → {max_date.date()}")
print(f"Total SP500 events: {len(sp500_events):,}")


# ============================================================
# 2. BUSINESS-DAY CALENDAR
# ============================================================
# The calendar deliberately stops at the last event date (max_date) rather than
# at today's date. Business days here are Monday–Friday.
all_dates = pd.date_range(min_date, max_date, freq=pd.offsets.BDay())

print(f"Business days: {len(all_dates)}")


# ============================================================
# 3. SIMULATE DAILY MEMBERSHIP
# ============================================================
# Keep only the columns the simulation needs.
sp500_events = sp500_events[["ticker", "date", "action"]]
# Raw events grouped by their calendar date (kept for interactive inspection).
events_by_date = sp500_events.groupby("date")

# Map every event to the position of the first business day on or after its
# date. Looking events up by exact date would silently drop any event dated on
# a weekend, because such dates never appear in the Monday–Friday calendar.
event_days = pd.DatetimeIndex(sp500_events["date"])
event_slots = all_dates.searchsorted(event_days)

off_calendar = int((~event_days.isin(all_dates)).sum())
if off_calendar:
    print(
        f"Note: {off_calendar} event(s) dated off the business-day calendar "
        "are applied on the next business day."
    )

# Bucket events per calendar position. Events are iterated in the frame's row
# order (ascending by date, as sorted when loaded), so events rolled forward
# from a weekend are applied before that business day's own events.
# A slot equal to len(all_dates) lies past the calendar end and is never visited.
events_by_slot = {}
for ticker, action, slot in zip(
    sp500_events["ticker"], sp500_events["action"], event_slots
):
    events_by_slot.setdefault(int(slot), []).append((ticker, action))

current_members = set()
members_by_date = {}

print("Simulating membership timeline…")

for slot, current_date in enumerate(all_dates):
    for ticker, action in events_by_slot.get(slot, ()):
        # Only "removed" takes a ticker out; every other action value is
        # treated as the ticker being a member from this day on.
        if action == "removed":
            current_members.discard(ticker)
        else:
            current_members.add(ticker)

    # Snapshot the set: it keeps being mutated on later days.
    members_by_date[current_date] = current_members.copy()


# ============================================================
# 4. BUILD MEMBERSHIP MATRIX
# ============================================================
# Rows: every ticker that appears in any event, in sorted order.
# Columns: the business-day calendar. Cells start out False.
all_tickers = sorted(sp500_events["ticker"].unique())
membership_df = pd.DataFrame(False, index=all_tickers, columns=all_dates)

# Flag each day's member set in that day's column.
for day in members_by_date:
    tickers_in = list(members_by_date[day])
    membership_df.loc[tickers_in, day] = True

# Persist the full ticker x date matrix.
membership_df.to_parquet(FULL_PARQUET_PATH)
print(f"Saved full membership → {FULL_PARQUET_PATH}")


# ============================================================
# 5. DIAGNOSTICS
# ============================================================
print(C.HEADER + "\n========== RUNNING DIAGNOSTICS ==========\n" + C.ENDC)

# Guard against a matrix without any date column instead of failing with a
# bare IndexError on columns[-1].
if membership_df.shape[1] == 0:
    raise ValueError("Membership matrix has no date columns; nothing to diagnose.")

latest_date = membership_df.columns[-1]
# With a single-day matrix there is no previous day to compare against.
prev_date = membership_df.columns[-2] if membership_df.shape[1] > 1 else None

print(C.OKBLUE + f"Membership ends at: {latest_date.date()}" + C.ENDC)


# ------------------------------------------------------------
# Additions/removals on last day
# ------------------------------------------------------------
if prev_date is None:
    # No earlier column: treat every ticker as a non-member beforehand.
    was_member = pd.Series(False, index=membership_df.index)
else:
    was_member = membership_df[prev_date]
is_member  = membership_df[latest_date]

# Tickers whose membership flag flipped between the last two calendar days.
added_today = membership_df.index[(is_member) & (~was_member)]
removed_today = membership_df.index[(~is_member) & (was_member)]

print(C.OKGREEN + f"\nAdditions on last day: {len(added_today)}" + C.ENDC)
print(list(added_today))

print(C.FAIL + f"Removals on last day: {len(removed_today)}" + C.ENDC)
print(list(removed_today))


# ------------------------------------------------------------
# Membership metadata: first_in, last_in, exit_date
# ------------------------------------------------------------
# Per-ticker summary keyed by ticker:
#   first_in     - earliest calendar day flagged True
#   last_in      - latest calendar day flagged True
#   exit_date    - same as last_in when the ticker is not a member on
#                  latest_date, otherwise None
#   currently_in - membership flag on latest_date
meta = {}

for tk, row in membership_df.iterrows():
    in_days = row[row].index  # calendar days on which this ticker is a member

    if in_days.empty:
        # Ticker never flagged as a member on any calendar day.
        record = {
            "first_in": None,
            "last_in": None,
            "exit_date": None,
            "currently_in": False,
        }
    else:
        still_member = row[latest_date]
        final_day = in_days.max()
        record = {
            "first_in": in_days.min(),
            "last_in": final_day,
            "exit_date": final_day if not still_member else None,
            "currently_in": still_member,
        }

    meta[tk] = record


# ------------------------------------------------------------
# Recent additions/removals (< RECENT_WINDOW)
# ------------------------------------------------------------
# The window is measured in calendar days (Timedelta.days) back from
# latest_date and is strict (< RECENT_WINDOW), matching the printed labels.
# "Addition" uses first_in, i.e. the first day the ticker was ever a member.
print(C.OKCYAN + f"\nRecent additions (<{RECENT_WINDOW} days):" + C.ENDC)
recent_adds = [
    tk for tk, m in meta.items()
    if m["first_in"] is not None
    and 0 <= (latest_date - m["first_in"]).days < RECENT_WINDOW
]
print(recent_adds)

# "Removal" uses exit_date, which is only set for tickers that are not members
# on latest_date.
print(C.OKCYAN + f"\nRecent removals (<{RECENT_WINDOW} days):" + C.ENDC)
recent_rems = [
    tk for tk, m in meta.items()
    if m["exit_date"] is not None
    and 0 <= (latest_date - m["exit_date"]).days < RECENT_WINDOW
]
print(recent_rems)


# ------------------------------------------------------------
# Save diagnostics
# ------------------------------------------------------------
# One row per ticker: the ticker symbol followed by its metadata fields.
diag_df = pd.DataFrame(
    [dict(ticker=symbol, **fields) for symbol, fields in meta.items()]
)

# A wall-clock timestamp in the file name keeps each run's output separate.
timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
diag_path = os.path.join(VERIFICATION_DIR, f"membership_diagnostics-{timestamp}.csv")
diag_df.to_csv(diag_path, index=False)

print(C.OKGREEN + f"\nSaved diagnostics → {diag_path}" + C.ENDC)
print(C.HEADER + "\n========== DONE ==========\n" + C.ENDC)


Downloading SHARADAR/SP500 constituent events…
Events date range: 1957-03-04 → 2025-12-30
Total SP500 events: 58,138
Business days: 17957
Simulating membership timeline…
Saved full membership → ./1-sp500_membership_daily_matrix\sp500_membership_full.parquet
[95m
[0m
[94mMembership ends at: 2025-12-30[0m
[92m
Additions on last day: 0[0m
[]
[91mRemovals on last day: 0[0m
[]
[96m
Recent additions (<101 days):[0m
['APP', 'ARES', 'CRH', 'CVNA', 'EME', 'FIX', 'HOOD', 'Q', 'SNDK', 'SOLS']
[96m
Recent removals (<101 days):[0m
['EMN', 'IPG', 'K', 'KMX', 'LKQ', 'MHK', 'SOLS']
[92m
Saved diagnostics → system_verification/1-SP500MEMBERSHIPBUILDER\membership_diagnostics-20251231-081705.csv[0m
[95m
[0m


In [35]:
import pandas as pd

# Reload the saved matrix from disk (rows are tickers, columns are dates)
# and display it as the cell result.
matrix_path = "./1-sp500_membership_daily_matrix/sp500_membership_full.parquet"
df = pd.read_parquet(matrix_path)
df


Unnamed: 0,1957-03-04,1957-03-05,1957-03-06,1957-03-07,1957-03-08,1957-03-11,1957-03-12,1957-03-13,1957-03-14,1957-03-15,...,2025-12-10,2025-12-11,2025-12-12,2025-12-15,2025-12-16,2025-12-17,2025-12-18,2025-12-19,2025-12-22,2025-12-23
A,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
AAL,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AAMRQ,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
AAP,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
AAPL,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YUM,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
ZBH,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
ZBRA,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
ZION,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:
# Top-left corner of the matrix: first 10 tickers by first 10 dates.
df.head(10).iloc[:, :10]

Unnamed: 0,1957-03-04,1957-03-05,1957-03-06,1957-03-07,1957-03-08,1957-03-11,1957-03-12,1957-03-13,1957-03-14,1957-03-15
A,False,False,False,False,False,False,False,False,False,False
AAL,False,False,False,False,False,False,False,False,False,False
AAMRQ,True,True,True,True,True,True,True,True,True,True
AAP,False,False,False,False,False,False,False,False,False,False
AAPL,False,False,False,False,False,False,False,False,False,False
ABBV,False,False,False,False,False,False,False,False,False,False
ABI1,False,False,False,False,False,False,False,False,False,False
ABKFQ,False,False,False,False,False,False,False,False,False,False
ABMD,False,False,False,False,False,False,False,False,False,False
ABNB,False,False,False,False,False,False,False,False,False,False


In [37]:
# Constituents on a single day: tickers whose flag is True in that date's column.
date = pd.Timestamp("2010-06-30")

members = df.index[df[date].to_numpy()].tolist()
len(members), members[:20]


(501,
 ['A',
  'AAPL',
  'ABT',
  'ADBE',
  'ADI',
  'ADM',
  'ADP',
  'ADSK',
  'AEE',
  'AEP',
  'AES',
  'AET',
  'AFL',
  'AGN',
  'AGN1',
  'AIG',
  'AIV',
  'AIZ',
  'AKAM',
  'AKS'])

In [38]:
# Membership history of one ticker: its row of the matrix, indexed by date.
ticker = "AAPL"
aapl_history = df.loc[ticker]
# First 20 dates on which the ticker is flagged as a member.
aapl_history.index[aapl_history.to_numpy()][:20]


DatetimeIndex(['1982-11-30', '1982-12-01', '1982-12-02', '1982-12-03',
               '1982-12-06', '1982-12-07', '1982-12-08', '1982-12-09',
               '1982-12-10', '1982-12-13', '1982-12-14', '1982-12-15',
               '1982-12-16', '1982-12-17', '1982-12-20', '1982-12-21',
               '1982-12-22', '1982-12-23', '1982-12-24', '1982-12-27'],
              dtype='datetime64[ns]', freq=None)