In [None]:
"""
Consumes the daily S&P 500 membership matrix Parquet, computes first join and last exit dates
per ticker, flags today’s additions/removals and recent changes within RECENT_WINDOW, and saves
the join/exit table to Parquet and CSV alongside a diagnostics CSV of detected events.
"""
import pandas as pd
import os
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================
# Folder that holds both the input matrix and the join/exit outputs.
_MATRIX_DIR = "./1-sp500_membership_daily_matrix"
# Common path stem of the two join/exit output files (extension added below).
_JOIN_EXIT_STEM = f"{_MATRIX_DIR}/sp500_membership_join_exit_date"

# Daily membership matrix read by this script.
INPUT_PATH = f"{_MATRIX_DIR}/sp500_membership_full.parquet"

# Join/exit table, written once per format.
OUTPUT_PARQUET = f"{_JOIN_EXIT_STEM}.parquet"
OUTPUT_CSV = f"{_JOIN_EXIT_STEM}.csv"

# Diagnostics output directory; created up front so the final CSV write
# cannot fail on a missing folder.
DIAG_DIR = "./system_verification/1a-SP500_Join__Exit_Date"
os.makedirs(DIAG_DIR, exist_ok=True)

# Look-back window, in days, used when flagging recent additions/removals.
RECENT_WINDOW = 100

# ============================================================
# LOAD MEMBERSHIP MATRIX
# ============================================================
print(f"\nLoading membership matrix → {INPUT_PATH}")
membership = pd.read_parquet(INPUT_PATH)

# Parse the column labels as dates so they sort chronologically and support
# the date arithmetic done further down; then order rows and columns.
membership.columns = pd.to_datetime(membership.columns)
membership = membership.sort_index().sort_index(axis=1)

# Missing cells must count as "not a member". NaN is truthy, so a bare
# astype(bool) would silently turn gaps in the matrix into membership days;
# fill them with 0 before casting.
membership_bool = membership.fillna(0).astype(bool)

print("Shape:", membership.shape)
print("Rows = tickers, Columns = daily membership flags\n")

# ============================================================
# BASIC DATE VARIABLES
# ============================================================
# The "today vs. previous day" comparison needs two date columns. Fail with a
# clear message instead of the opaque IndexError that columns[-2] would raise.
if len(membership_bool.columns) < 2:
    raise ValueError(
        "Membership matrix must contain at least two date columns "
        f"(found {len(membership_bool.columns)})."
    )

# Columns are sorted ascending, so the last two labels are the most recent
# date and the one immediately before it in the matrix.
last_date = membership_bool.columns[-1]
prev_date = membership_bool.columns[-2]

today = last_date  # last date present in matrix

print(f"Last date in matrix : {last_date.date()}")
print(f"Previous date       : {prev_date.date()}\n")

# ============================================================
# DIAGNOSTIC STORAGE
# ============================================================
# One dict per detected event, in the order the events were recorded.
diagnostics = []


def add_diag(ticker, event_type, **extra):
    """Record one diagnostic event in the module-level ``diagnostics`` list.

    The stored dict always starts with ``ticker`` and ``event_type``; any
    additional keyword arguments follow in the order they were passed.
    """
    diagnostics.append({"ticker": ticker, "event_type": event_type, **extra})

# ============================================================
# 1. NEW ADDITIONS TODAY
# ============================================================
# Membership flags on the two most recent dates in the matrix.
was_member, is_member = membership_bool[prev_date], membership_bool[last_date]

# Added: member now but not on the previous date. Removed: the reverse.
added_today_mask = ~was_member & is_member
removed_today_mask = was_member & ~is_member

added_today = membership_bool.index[added_today_mask]
removed_today = membership_bool.index[removed_today_mask]

# (banner, tickers, message when empty, diagnostic event type) per report.
_today_reports = (
    (
        "========== NEW ADDITIONS TODAY ==========",
        added_today,
        "No new additions today.",
        "added_today",
    ),
    (
        "\n========== NEW REMOVALS TODAY ==========",
        removed_today,
        "No new removals today.",
        "removed_today",
    ),
)

for _banner, _changed, _none_msg, _event in _today_reports:
    print(_banner)
    if _changed.empty:
        print(_none_msg)
        continue
    # List every changed ticker and log it against the matrix's last date.
    for tk in _changed:
        print(" •", tk)
        add_diag(tk, _event, date=str(today.date()))

# ============================================================
# 2. FIRST JOIN DATE & LAST EXIT DATE FOR EVERY TICKER
# ============================================================
print("\nComputing first join and last exit dates...")

# ticker -> first date flagged as a member (None if never a member)
first_join_dates = {}
# ticker -> last date flagged as a member, but only for tickers that are
# NOT members on the matrix's last date (None if still active or never in)
last_exit_dates = {}

for ticker in membership_bool.index:
    flags = membership_bool.loc[ticker]
    member_days = flags[flags].index  # dates on which the flag is True

    # Never a member: nothing to record for either date.
    if member_days.empty:
        first_join_dates[ticker] = None
        last_exit_dates[ticker] = None
        continue

    first_join_dates[ticker] = member_days.min()
    # A ticker that is still a member on the last date has no exit date.
    last_exit_dates[ticker] = None if flags[last_date] else member_days.max()

# ============================================================
# 3. RECENT ADDITIONS & REMOVALS (LAST N DAYS)
# ============================================================
# NOTE(review): the banners say "<RECENT_WINDOW" but the window test below is
# inclusive (<=); confirm which one is intended.
print(f"\n========== RECENT ADDITIONS (<{RECENT_WINDOW} days) ==========")

recent_additions, recent_removals = [], []


def _is_recent(elapsed_days):
    """Return True when *elapsed_days* lies in [0, RECENT_WINDOW]."""
    return 0 <= elapsed_days <= RECENT_WINDOW


for ticker in membership_bool.index:
    first_in = first_join_dates[ticker]
    last_exit = last_exit_dates[ticker]

    # Tickers that were never in the index have no dates to evaluate.
    if first_in is not None:
        # Recent additions: judged by the FIRST join date only.
        days_since_first = (today - first_in).days
        if _is_recent(days_since_first):
            recent_additions.append(ticker)
            add_diag(
                ticker,
                "recent_addition",
                first_in_sp500=str(first_in.date()),
                days_since_added=days_since_first,
            )

        # Recent removals: only tickers that have an exit date recorded.
        if last_exit is not None:
            days_since_exit = (today - last_exit).days
            if _is_recent(days_since_exit):
                recent_removals.append(ticker)
                add_diag(
                    ticker,
                    "recent_removal",
                    last_exit_sp500=str(last_exit.date()),
                    days_since_removed=days_since_exit,
                )

# Report both lists.
if not recent_additions:
    print("No recent additions.")
else:
    print("Recent additions:", recent_additions)

print(f"\n========== RECENT REMOVALS (<{RECENT_WINDOW} days) ==========")
if not recent_removals:
    print("No recent removals.")
else:
    print("Recent removals:", recent_removals)

# ============================================================
# BUILD OUTPUT DF: FIRST JOIN + LAST EXIT
# ============================================================
# One row per ticker, indexed by ticker, with its first join date and last
# exit date. Both dicts are read in the key order of first_join_dates.
ticker_order = list(first_join_dates)
join_exit_columns = {
    "ticker": ticker_order,
    "first_join_date": [first_join_dates[name] for name in ticker_order],
    "last_exit_date": [last_exit_dates[name] for name in ticker_order],
}
join_exit_df = pd.DataFrame(join_exit_columns).set_index("ticker")

# ============================================================
# SAVE JOIN / EXIT DATE FILES
# ============================================================
# Write the join/exit table once per format: Parquet first, then CSV.
_join_exit_writers = (
    (OUTPUT_PARQUET, join_exit_df.to_parquet),
    (OUTPUT_CSV, join_exit_df.to_csv),
)
for _path, _write in _join_exit_writers:
    _write(_path)

print("\nSaved join/exit date files:")
for _path, _ in _join_exit_writers:
    print(" →", _path)

# ============================================================
# SAVE DIAGNOSTICS TO CSV
# ============================================================
# Build the diagnostics table. When no event was recorded, pd.DataFrame([])
# has no columns and the CSV would be written without a header row, so fall
# back to an empty frame that still carries the two keys every event has.
if diagnostics:
    diag_df = pd.DataFrame(diagnostics)
else:
    diag_df = pd.DataFrame(columns=["ticker", "event_type"])

# Timestamped file name so every run keeps its own diagnostics file.
run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
diag_file = os.path.join(
    DIAG_DIR,
    f"sp500_membership_diagnostics-{run_stamp}.csv"
)

diag_df.to_csv(diag_file, index=False)

print("\nSaved diagnostics →", diag_file)
print("\n=== DONE ===")



Loading membership matrix → ./1-sp500_membership_daily_matrix/sp500_membership_full.parquet
Shape: (1192, 17957)
Rows = tickers, Columns = daily membership flags

Last date in matrix : 2025-12-30
Previous date       : 2025-12-29

No new additions today.

No new removals today.

Computing first join and last exit dates...

Recent additions: ['APP', 'ARES', 'CRH', 'CVNA', 'EME', 'FIX', 'HOOD', 'Q', 'SNDK', 'SOLS']

Recent removals: ['EMN', 'IPG', 'K', 'KMX', 'LKQ', 'MHK', 'SOLS']

Saved join/exit date files:
 → ./1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.parquet
 → ./1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.csv

Saved diagnostics → ./system_verification/1a-SP500_Join__Exit_Date\sp500_membership_diagnostics-20251231-081736.csv

=== DONE ===
