In [2]:
#!/usr/bin/env python

import os
import pandas as pd
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================

# Parquet files read by this step.
INPUT_FILE = "./11-universe_with_jump90/11-universe_with_jump90.parquet"
MEMBERSHIP_FILE = "./1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.parquet"

# Directories this step writes to: the filtered universe and its validation logs.
OUTPUT_DIR = "./12-tradable_sp500_universe"
VER_DIR = "./system_verification/12-tradable_sp500_universe"

# Create both destination folders up front; existing folders are left untouched.
for directory in (OUTPUT_DIR, VER_DIR):
    os.makedirs(directory, exist_ok=True)

print("=== BUILDING TRADABLE SP500 UNIVERSE (post-1998) ===")

# ============================================================
# 1. LOAD INPUT UNIVERSE
# ============================================================

# Read the input parquet and coerce the "date" column to datetime in one chain.
df = pd.read_parquet(INPUT_FILE).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

n_loaded = df.shape[0]
print(f"Loaded enriched universe: {n_loaded:,} rows")

# ============================================================
# 2. LOAD MEMBERSHIP (index = ticker)
# ============================================================

m = pd.read_parquet(MEMBERSHIP_FILE)

# Coerce both membership date columns to datetime.
for date_col in ("first_join_date", "last_exit_date"):
    m[date_col] = pd.to_datetime(m[date_col])

# The table is keyed by its index (tickers), so no re-indexing is done here.
print("Loaded membership table with tickers in index.")
print("Columns:", m.columns.tolist())

# Plain dict lookups keyed by the index: ticker -> join date / exit date.
JOIN, EXIT = (
    m[date_col].to_dict() for date_col in ("first_join_date", "last_exit_date")
)

# ============================================================
# 3. DATE CUTOFF
# ============================================================

# Drop every row dated before 1998-01-01 (the cutoff day itself is kept).
on_or_after_cutoff = df["date"] >= pd.Timestamp("1998-01-01")
df = df.loc[on_or_after_cutoff]
print(f"Rows after 1998 cutoff: {df.shape[0]:,}")

# ============================================================
# 4. APPLY SP500 MEMBERSHIP WINDOW
# ============================================================

def in_sp500(ticker, date):
    """Return True when ``ticker`` falls inside its membership window on ``date``.

    The window is read from the module-level ``JOIN`` and ``EXIT`` dicts
    (ticker -> first join date / last exit date). Both window ends are
    inclusive.

    Parameters
    ----------
    ticker : hashable
        Key looked up in ``JOIN`` and ``EXIT``.
    date : pd.Timestamp
        The date to test.

    Returns
    -------
    bool
        ``False`` when the ticker has no join date (unknown ticker or a
        missing value), when ``date`` is before the join date, or when
        ``date`` is after a recorded exit date. A missing exit date is
        treated as an open-ended window.
    """
    join_date = JOIN.get(ticker)
    # Named ``exit_date`` so the builtin ``exit`` is not shadowed.
    exit_date = EXIT.get(ticker)

    # No usable join date: the ticker is never treated as a member.
    if join_date is None or pd.isna(join_date):
        return False

    # Written as ``not (date >= ...)`` so a NaT ``date`` is rejected as well
    # (every comparison against NaT evaluates to False).
    if not (date >= join_date):
        return False

    # Missing exit date means the window is still open.
    if pd.isna(exit_date):
        return True

    return bool(date <= exit_date)

print("Applying membership logic...")

# Vectorized form of the per-row in_sp500() rule: look up each row's join and
# exit date once via Series.map instead of calling a Python function per row.
# Tickers absent from JOIN / EXIT map to a missing value; pd.to_datetime keeps
# both lookups datetime-typed so the comparisons below stay well defined.
join_dates = pd.to_datetime(df["ticker"].map(JOIN))
exit_dates = pd.to_datetime(df["ticker"].map(EXIT))

# Comparisons against NaT are False, so rows without a join date are excluded.
# A missing exit date means the window is open-ended. Both ends are inclusive.
df["in_sp500"] = (df["date"] >= join_dates) & (
    exit_dates.isna() | (df["date"] <= exit_dates)
)

# The flag column is already boolean, so it is used directly as the mask.
df2 = df[df["in_sp500"]].copy()

print(f"Rows after membership filtering: {df2.shape[0]:,}")

# ============================================================
# 5. SORT (date ↑ then slope ↓)
# ============================================================

df2 = df2.sort_values(["date", "slope_adj"], ascending=[True, False])

# ============================================================
# 6. SAVE OUTPUT
# ============================================================

out_path = os.path.join(OUTPUT_DIR, "12-tradable_sp500_universe.parquet")
df2.to_parquet(out_path, index=False)

# Read the clock once so the timestamp stored in the log and the timestamp in
# the log's filename always refer to the same instant.
run_time = datetime.now()

# Validation metadata
val = pd.DataFrame({
    "timestamp": [run_time.isoformat(timespec="seconds")],
    # NOTE: `df` was already reduced by the date cutoff above, so this is the
    # post-cutoff row count, not the row count of the file as loaded.
    "rows_input": [df.shape[0]],
    "rows_output": [df2.shape[0]],
})

val_path = os.path.join(
    VER_DIR,
    f"12-tradable_sp500_universe_validation-{run_time.strftime('%Y%m%d-%H%M%S')}.csv"
)

val.to_csv(val_path, index=False)

print("\n=== COMPLETED ===")
print(f"Saved tradable SP500 universe → {out_path}")
print(f"Saved validation log          → {val_path}")


=== BUILDING TRADABLE SP500 UNIVERSE (post-1998) ===
Loaded enriched universe: 5,367,530 rows
Loaded membership table with tickers in index.
Columns: ['first_join_date', 'last_exit_date']
Rows after 1998 cutoff: 5,366,657
Applying membership logic...
Rows after membership filtering: 3,591,967

=== COMPLETED ===
Saved tradable SP500 universe → ./12-tradable_sp500_universe\12-tradable_sp500_universe.parquet
Saved validation log          → ./system_verification/12-tradable_sp500_universe\12-tradable_sp500_universe_validation-20260105-110744.csv


In [3]:
# Reload the saved universe from disk and print a quick structural overview.
df_ranking = pd.read_parquet("./12-tradable_sp500_universe/12-tradable_sp500_universe.parquet")

print("Ranking Dataset Info:")
print("="*60)
print(f"Shape: {df_ranking.shape}")
print(f"\nColumns: {df_ranking.columns.tolist()}")
print(f"\nData types:\n{df_ranking.dtypes}")

# Remaining sections share one shape: a heading line, then a printed table.
# Tables are built lazily so each one is computed right before it is printed.
report_sections = (
    ("\nFirst few rows:", lambda: df_ranking.head(10)),
    ("\nLast few rows:", lambda: df_ranking.tail(10)),
    ("\nSummary statistics:", lambda: df_ranking.describe()),
    ("\nNull values:", lambda: df_ranking.isnull().sum()),
)
for heading, build_table in report_sections:
    print(heading)
    print(build_table())

Ranking Dataset Info:
Shape: (3591967, 17)

Columns: ['date', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'volume', 'ma100', 'above_ma100', 'pct_change', 'abs_pct', 'abs_rollmax_90', 'no_big_jump_90', 'slope_annual', 'r2', 'slope_adj', 'ticker', 'in_sp500']

Data types:
date              datetime64[ns]
open_adj                 float64
high_adj                 float64
low_adj                  float64
close_adj                float64
volume                   float64
ma100                    float64
above_ma100                 bool
pct_change               float64
abs_pct                  float64
abs_rollmax_90           float64
no_big_jump_90              bool
slope_annual             float64
r2                       float64
slope_adj                float64
ticker                    object
in_sp500                    bool
dtype: object

First few rows:
        date   open_adj   high_adj    low_adj  close_adj       volume  ma100  \
0 1998-01-02  32.439683  32.439683  31.966822     32.

In [4]:
# Dump every row of one trading day to CSV for manual inspection.
target_date = pd.Timestamp("2001-01-17")
on_target_date = df_ranking["date"] == target_date
subset = df_ranking.loc[on_target_date]

out_path = os.path.join(VER_DIR, f"ranking_{target_date.date()}.csv")
subset.to_csv(out_path, index=False)

print(f"Saved {len(subset):,} rows for {target_date.date()} → {out_path}")

Saved 505 rows for 2001-01-17 → ./system_verification/12-tradable_sp500_universe\ranking_2001-01-17.csv


In [5]:
# Ticker whose rows are written out; edit this value to export a different one.
ticker_to_export = "AAPL"

# Destination file inside the verification folder.
stock_out_path = os.path.join(VER_DIR, f"{ticker_to_export}_full_history.csv")

# Keep only the chosen ticker's rows and write them to CSV.
is_selected_ticker = df_ranking["ticker"] == ticker_to_export
stock_data = df_ranking.loc[is_selected_ticker]
stock_data.to_csv(stock_out_path, index=False)

print(f"Exported {len(stock_data):,} rows for {ticker_to_export} → {stock_out_path}")

Exported 7,044 rows for AAPL → ./system_verification/12-tradable_sp500_universe\AAPL_full_history.csv


In [6]:
import pandas as pd
# Re-read the membership table as stored on disk and show its columns and
# first rows. Note this rebinds `m` to the freshly loaded frame.
membership_path = "./1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.parquet"
m = pd.read_parquet(membership_path)
for preview in (m.columns, m.head()):
    print(preview)


Index(['first_join_date', 'last_exit_date'], dtype='object')
       first_join_date last_exit_date
ticker                               
A           2000-06-05            NaT
AAL         2015-03-17     2024-09-20
AAMRQ       1957-03-04     2003-03-13
AAP         2015-07-07     2023-08-24
AAPL        1982-11-30            NaT


In [7]:
import pandas as pd
# Show the ten latest distinct execution dates recorded in the trades file.
trades = pd.read_parquet("./13-trading_output_regression_insp500_spyfilter_cap15/13-trades_regression_insp500_spyfilter_cap15.parquet")
distinct_exec_dates = trades["exec_date"].drop_duplicates().sort_values()
print("Last exec dates in trades file:")
print(distinct_exec_dates.tail(10))

Last exec dates in trades file:
2924   2025-10-23
2926   2025-10-30
2928   2025-11-06
2931   2025-11-13
2932   2025-11-20
2937   2025-11-28
2943   2025-12-04
2946   2025-12-11
2951   2025-12-18
2953   2025-12-26
Name: exec_date, dtype: datetime64[ns]


In [8]:
# Show the ten latest distinct signal dates recorded in the rankings file.
rankings = pd.read_parquet("./13-trading_output_regression_insp500_spyfilter_cap15/13-weekly_rankings_pre_filter_cap15.parquet")
distinct_signal_dates = rankings["signal_date"].drop_duplicates().sort_values()
print("Last signal dates in rankings file:")
print(distinct_signal_dates.tail(10))

Last signal dates in rankings file:
35845   2025-10-29
35871   2025-11-05
35897   2025-11-12
35923   2025-11-19
35949   2025-11-26
35975   2025-12-03
36001   2025-12-10
36027   2025-12-17
36053   2025-12-24
36079   2025-12-31
Name: signal_date, dtype: datetime64[ns]


In [9]:
import pandas as pd
# Reload the saved universe (this rebinds `df`) and collect its distinct dates.
df = pd.read_parquet("./12-tradable_sp500_universe/12-tradable_sp500_universe.parquet")
dates = sorted(df["date"].unique())

# List the dates present from 2025-12-20 onward (the Christmas period).
window_start = pd.Timestamp("2025-12-20")
dec_dates = list(filter(lambda d: pd.Timestamp(d) >= window_start, dates))
print(dec_dates)

[Timestamp('2025-12-22 00:00:00'), Timestamp('2025-12-23 00:00:00'), Timestamp('2025-12-24 00:00:00'), Timestamp('2025-12-26 00:00:00'), Timestamp('2025-12-29 00:00:00'), Timestamp('2025-12-30 00:00:00'), Timestamp('2025-12-31 00:00:00'), Timestamp('2026-01-02 00:00:00')]


In [10]:
import pandas as pd
# Reload the saved universe (this rebinds `df`) with "date" coerced to datetime.
df = pd.read_parquet("./12-tradable_sp500_universe/12-tradable_sp500_universe.parquet").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

# One sub-frame per date, plus the ordered list of those dates.
df_by_date = dict(iter(df.groupby("date")))
dates = sorted(df_by_date)

# Pair every date with its successor; the final date is paired with None.
next_date_map = dict(zip(dates, dates[1:] + [None]))

for label, day in (("Dec 24 ->", "2025-12-24"), ("Dec 31 ->", "2025-12-31")):
    print(label, next_date_map.get(pd.Timestamp(day)))

Dec 24 -> 2025-12-26 00:00:00
Dec 31 -> 2026-01-02 00:00:00


In [11]:
import pandas as pd
# Reload the saved universe (this rebinds `df`) with "date" coerced to datetime.
df = pd.read_parquet("./12-tradable_sp500_universe/12-tradable_sp500_universe.parquet").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)
dates = sorted(df["date"].unique())

# Show the tail of the calendar, then test whether it ends on 2026-01-02.
print("Last 5 dates:", dates[-5:])
expected_last_date = pd.Timestamp("2026-01-02")
print("Is Jan 2 the last date?", dates[-1] == expected_last_date)

Last 5 dates: [Timestamp('2025-12-26 00:00:00'), Timestamp('2025-12-29 00:00:00'), Timestamp('2025-12-30 00:00:00'), Timestamp('2025-12-31 00:00:00'), Timestamp('2026-01-02 00:00:00')]
Is Jan 2 the last date? True


In [12]:
import pandas as pd
# Show which execution date the 2025-12-31 signal rows were given.
rankings = pd.read_parquet("./13-trading_output_regression_insp500_spyfilter_cap15/13-weekly_rankings_pre_filter_cap15.parquet")
is_dec31_signal = rankings["signal_date"] == "2025-12-31"
dec31 = rankings.loc[is_dec31_signal]
preview_columns = ["signal_date", "exec_date", "ticker"]
print(dec31[preview_columns].head())

      signal_date  exec_date ticker
36079  2025-12-31 2026-01-02   SNDK
36080  2025-12-31 2026-01-02    WBD
36081  2025-12-31 2026-01-02    WDC
36082  2025-12-31 2026-01-02     MU
36083  2025-12-31 2026-01-02    ALB


In [13]:
import pandas as pd
# Count the 2025-12-31 signal rows and compare target vs. current positions.
rankings = pd.read_parquet("./13-trading_output_regression_insp500_spyfilter_cap15/13-weekly_rankings_pre_filter_cap15.parquet")
is_dec31_signal = rankings["signal_date"] == "2025-12-31"
dec31 = rankings.loc[is_dec31_signal]
print("Number of ranked stocks:", len(dec31))

weight_columns = ["ticker", "target_weight", "current_weight", "target_shares", "current_shares"]
print("\nWeight changes:")
print(dec31[weight_columns].head(20))

Number of ranked stocks: 26

Weight changes:
      ticker  target_weight  current_weight  target_shares  current_shares
36079   SNDK       0.010024        0.000000            880               0
36080    WBD       0.119884        0.119606          86689           86488
36081    WDC       0.019814        0.028387           2397            3434
36082     MU       0.013408        0.000000            979               0
36083    ALB       0.032618        0.000000           4806               0
36084    TER       0.030232        0.000000           3255               0
36085   AMAT       0.026057        0.000000           2113               0
36086    LLY       0.007581        0.000000            147               0
36087   LRCX       0.030655        0.000000           3732               0
36088    STX       0.013743        0.000000           1040               0
36089    CAH       0.064648        0.060713           6556            6157
36090   JBHT       0.047429        0.000000           5