In [1]:
#!/usr/bin/env python

import os
import pandas as pd
import numpy as np
from datetime import datetime

# Contains price OHLC + volume + slope_adj (slope × r2) for all tickers and dates.  Organized by date and ranked by slope_adj.
# ============================================================
# CONFIG
# ============================================================

# Input folders: each holds one "<ticker>.parquet" file per ticker.
ADJ_PRICE_DIR = "./3-adjusted_All_Prices_OHLC"
REGRESSION_DIR = "./7-90Day_exp_regression_adjusted_all_prices"

# Output folders: the ranking dataset itself and the validation/verification artifacts.
OUTPUT_DIR = "./9-ranking_dataset"
VER_DIR = "./system_verification/9-ranking_dataset"

# Create both output folders up front; exist_ok makes re-running the cell harmless.
for _out_dir in (OUTPUT_DIR, VER_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print("=== BUILDING RANKING DATASET (OHLC + slope × r2) ===")

# ============================================================
# VALIDATION LOGGING
# ============================================================

# Accumulates one dict per reported problem; written out as a CSV at the end of the run.
validation_rows = []


def log_issue(ticker, issue, detail=""):
    """Record a validation problem for ``ticker`` and echo it to stdout.

    Parameters
    ----------
    ticker : str
        Ticker symbol the problem relates to.
    issue : str
        Short machine-friendly label for the kind of problem.
    detail : str, optional
        Free-form extra context (for example the offending file path).

    Side effects
    ------------
    Appends a dict with keys ``timestamp``, ``ticker``, ``issue`` and
    ``detail`` to the module-level ``validation_rows`` list and prints a
    one-line warning.
    """
    # Local wall-clock time, truncated to whole seconds.
    stamp = datetime.now().isoformat(timespec="seconds")
    record = dict(timestamp=stamp, ticker=ticker, issue=issue, detail=detail)
    validation_rows.append(record)
    print(f"⚠ {ticker} | {issue} | {detail}")

# ============================================================
# DISCOVER TICKERS
# ============================================================

# A ticker is the file name with the trailing ".parquet" removed.  Slice the
# suffix off rather than using str.replace, which would also strip the same
# text if it appeared earlier in the name (e.g. "X.parquet.parquet" -> "X").
PARQUET_SUFFIX = ".parquet"

tickers = sorted(
    name[: -len(PARQUET_SUFFIX)]
    for name in os.listdir(ADJ_PRICE_DIR)
    if name.endswith(PARQUET_SUFFIX)
)

print(f"Found {len(tickers)} tickers with adjusted OHLC data.")

# ============================================================
# MAIN LOOP — MERGE OHLC + REGRESSION
# ============================================================

# One DataFrame per successfully processed ticker; concatenated after the loop.
rows = []

# Columns every adjusted-price file must provide (all are selected below).
PRICE_COLUMNS = ["date", "open_adj", "high_adj", "low_adj", "close_adj", "volume"]
# Columns pulled from each regression file.
REGRESSION_COLUMNS = ["date", "slope_annual", "r2"]

for ticker in tickers:
    print(f"\nProcessing: {ticker}")

    # --------------------------------------------------------
    # Load adjusted OHLC
    # --------------------------------------------------------
    f_px = os.path.join(ADJ_PRICE_DIR, f"{ticker}.parquet")

    try:
        px = pd.read_parquet(f_px)
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # propagate, and keep the error text so the log is actionable.
        log_issue(
            ticker,
            "cannot_read_adjusted_price_file",
            f"{f_px} | {type(exc).__name__}: {exc}",
        )
        continue

    # Validate BEFORE touching any column: "date" and "volume" are required
    # too, and a file lacking them should be logged and skipped rather than
    # abort the whole run with a KeyError.
    missing_cols = [col for col in PRICE_COLUMNS if col not in px.columns]
    if missing_cols:
        log_issue(
            ticker,
            "missing_adjusted_ohlc_columns",
            f"{f_px} | missing: {missing_cols}",
        )
        continue

    # Keep only adjusted OHLC + volume + date, with date as datetime.
    df = px[PRICE_COLUMNS].copy()
    df["date"] = pd.to_datetime(df["date"])

    # --------------------------------------------------------
    # Load regression file (slope + r2)
    # --------------------------------------------------------
    f_reg = os.path.join(REGRESSION_DIR, f"{ticker}.parquet")
    rg = None  # stays None when no usable regression data is available

    if not os.path.exists(f_reg):
        log_issue(ticker, "missing_regression_file", f_reg)
    else:
        try:
            # Read just the needed columns: less I/O, and the result is a
            # fresh frame, so assigning to it cannot trigger
            # SettingWithCopyWarning the way a sliced frame can.
            rg = pd.read_parquet(f_reg, columns=REGRESSION_COLUMNS)
            rg["date"] = pd.to_datetime(rg["date"])
        except Exception as exc:
            # Unreadable file or missing column: log it and fall back to NaN
            # regression values for this ticker instead of aborting the run.
            rg = None
            log_issue(
                ticker,
                "cannot_read_regression_file",
                f"{f_reg} | {type(exc).__name__}: {exc}",
            )

    if rg is not None:
        # A repeated date on the right side of a left merge would duplicate
        # the matching OHLC rows, so keep one regression row per date.
        n_dupes = int(rg["date"].duplicated().sum())
        if n_dupes:
            log_issue(
                ticker,
                "duplicate_regression_dates",
                f"{f_reg} | dropped {n_dupes} duplicate rows (kept last)",
            )
            rg = rg.drop_duplicates(subset="date", keep="last")

    if rg is None:
        df["slope_annual"] = np.nan
        df["r2"] = np.nan
    else:
        df = df.merge(rg, on="date", how="left")

    # slope_adj = annualised slope weighted by fit quality; NaN whenever
    # either input is NaN (no regression data for that date/ticker).
    df["slope_adj"] = df["slope_annual"] * df["r2"]

    # --------------------------------------------------------
    # Add ticker
    # --------------------------------------------------------
    df["ticker"] = ticker

    rows.append(df)

# ============================================================
# CONCAT FINAL DATA
# ============================================================

# Stack the per-ticker frames, then order by date (oldest first) and, within
# each date, by slope_adj from highest to lowest.
ranking_df = (
    pd.concat(rows, ignore_index=True)
    .sort_values(by=["date", "slope_adj"], ascending=[True, False])
)

# ============================================================
# SAVE OUTPUTS
# ============================================================

# Write the ranking dataset (row index is not stored in the file).
ranking_path = os.path.join(OUTPUT_DIR, "ranking_dataset.parquet")
ranking_df.to_parquet(ranking_path, index=False)

# Pin the validation-log schema explicitly: when no issues were logged the
# list is empty, and without `columns=` the CSV would be written with no
# header row at all.  With issues present the columns match the dict keys
# produced by log_issue, so the output is unchanged.
val_df = pd.DataFrame(
    validation_rows,
    columns=["timestamp", "ticker", "issue", "detail"],
)
# Timestamped file name so each run keeps its own validation log.
run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
val_path = os.path.join(VER_DIR, f"ranking_validation-{run_stamp}.csv")
val_df.to_csv(val_path, index=False)

print("\n=== COMPLETED ===")
print(f"Saved ranking dataset → {ranking_path}")
print(f"Saved validation log → {val_path}")
print(f"Rows: {len(ranking_df):,}")


=== BUILDING RANKING DATASET (OHLC + slope × r2) ===
Found 1167 tickers with adjusted OHLC data.

Processing: A

Processing: AAL

Processing: AAMRQ

Processing: AAP

Processing: AAPL

Processing: ABBV

Processing: ABI1

Processing: ABKFQ

Processing: ABMD

Processing: ABNB

Processing: ABS

Processing: ABT

Processing: ACAS

Processing: ACGL

Processing: ACKHQ

Processing: ACN

Processing: ACS

Processing: ACV1

Processing: ADBE

Processing: ADCT1

Processing: ADI

Processing: ADM

Processing: ADP

Processing: ADSK

Processing: ADT1

Processing: AEE

Processing: AEP

Processing: AES

Processing: AET

Processing: AFL

Processing: AFS.A

Processing: AGC1

Processing: AGN

Processing: AGN1

Processing: AHM1

Processing: AIG

Processing: AIT

Processing: AIV

Processing: AIZ

Processing: AJG

Processing: AKAM

Processing: AKS

Processing: AL1

Processing: ALB

Processing: ALGN

Processing: ALK

Processing: ALL

Processing: ALLE

Processing: ALTR1

Processing: ALXN

Processing: AM1

Process

In [2]:


# Load the membership join/exit date table and print a structural overview of it.
df_join_date = pd.read_parquet("1-sp500_membership_daily_matrix/sp500_membership_join_exit_date.parquet")

print("Join Exit File Info:")
print("=" * 60)
print(f"Shape: {df_join_date.shape}")

column_names = df_join_date.columns.tolist()
print(f"\nColumns: {column_names}")
print(f"\nData types:\n{df_join_date.dtypes}")

# Each call prints the heading, a newline, then the corresponding view.
print("\nFirst few rows:", df_join_date.head(93), sep="\n")
print("\nLast few rows:", df_join_date.tail(10), sep="\n")
print("\nSummary statistics:", df_join_date.describe(), sep="\n")
print("\nNull values:", df_join_date.isnull().sum(), sep="\n")

Join Exit File Info:
Shape: (1192, 2)

Columns: ['first_join_date', 'last_exit_date']

Data types:
first_join_date    datetime64[ns]
last_exit_date     datetime64[ns]
dtype: object

First few rows:
       first_join_date last_exit_date
ticker                               
A           2000-06-05            NaT
AAL         2015-03-17     2024-09-20
AAMRQ       1957-03-04     2003-03-13
AAP         2015-07-07     2023-08-24
AAPL        1982-11-30            NaT
...                ...            ...
ARG         2009-09-09     2016-05-19
AS1         1957-03-04     1998-11-20
ASC1        1957-03-04     1999-06-23
ASH         1989-08-03     2008-11-13
ASN         2004-12-20     2007-10-04

[93 rows x 2 columns]

Last few rows:
       first_join_date last_exit_date
ticker                               
XTO         2004-12-28     2010-06-24
XYL         2011-11-01            NaT
XYZ         2025-07-23            NaT
YHOO        1999-12-08     2017-06-16
YNR         2000-01-05     2000-10-02
YUM

In [3]:
import pandas as pd

# Spot-check a single regression file (ABBV): shape, schema, sample rows,
# summary statistics and per-column null counts.

df_abbv_reg = pd.read_parquet("7-90Day_exp_regression_adjusted_all_prices/ABBV.parquet")

print("ABBV Regression File Info:")
print("=" * 60)
print(f"Shape: {df_abbv_reg.shape}")

abbv_columns = df_abbv_reg.columns.tolist()
print(f"\nColumns: {abbv_columns}")
print(f"\nData types:\n{df_abbv_reg.dtypes}")

# Each call prints the heading, a newline, then the corresponding view.
print("\nFirst few rows:", df_abbv_reg.head(93), sep="\n")
print("\nLast few rows:", df_abbv_reg.tail(10), sep="\n")
print("\nSummary statistics:", df_abbv_reg.describe(), sep="\n")
print("\nNull values:", df_abbv_reg.isnull().sum(), sep="\n")

ABBV Regression File Info:
Shape: (3269, 18)

Columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj', 'closeunadj', 'lastupdated', 'adj_factor', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'slope_daily', 'slope_annual', 'r2']

Data types:
ticker                  object
date            datetime64[ns]
open                   float64
high                   float64
low                    float64
close                  float64
volume                 float64
closeadj               float64
closeunadj             float64
lastupdated     datetime64[ms]
adj_factor             float64
open_adj               float64
high_adj               float64
low_adj                float64
close_adj              float64
slope_daily            float64
slope_annual           float64
r2                     float64
dtype: object

First few rows:
   ticker       date   open   high    low   close      volume  closeadj  \
0    ABBV 2013-01-02  34.92  35.40  34.10  35.120  13768000.0    20

In [4]:
# Read the saved ranking dataset back from disk and print an overview of it:
# shape, schema, sample rows, summary statistics and per-column null counts.
df_ranking = pd.read_parquet("9-ranking_dataset/ranking_dataset.parquet")

print("Ranking Dataset Info:")
print("=" * 60)
print(f"Shape: {df_ranking.shape}")

ranking_columns = df_ranking.columns.tolist()
print(f"\nColumns: {ranking_columns}")
print(f"\nData types:\n{df_ranking.dtypes}")

# Each call prints the heading, a newline, then the corresponding view.
print("\nFirst few rows:", df_ranking.head(93), sep="\n")
print("\nLast few rows:", df_ranking.tail(10), sep="\n")
print("\nSummary statistics:", df_ranking.describe(), sep="\n")
print("\nNull values:", df_ranking.isnull().sum(), sep="\n")

Ranking Dataset Info:
Shape: (5366253, 10)

Columns: ['date', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'volume', 'slope_annual', 'r2', 'slope_adj', 'ticker']

Data types:
date            datetime64[ns]
open_adj               float64
high_adj               float64
low_adj                float64
close_adj              float64
volume                 float64
slope_annual           float64
r2                     float64
slope_adj              float64
ticker                  object
dtype: object

First few rows:
         date   open_adj   high_adj    low_adj  close_adj       volume  \
0  1997-12-31  31.776559  32.770191  31.666043     32.423    3793386.6   
1  1997-12-31   0.098000   0.102188   0.097162      0.098  406358000.0   
2  1997-12-31  16.272963  16.637861  15.908998     16.580    2817385.2   
3  1997-12-31  30.000000  30.750000  29.917000     30.667     256200.0   
4  1997-12-31   8.000000   8.250000   7.750000      8.190     121400.0   
..        ...        ...        ...  

In [5]:
# GitHub Copilot

# Export every df_ranking row dated 2001-01-17 to a CSV in the verification folder.
target_date = pd.Timestamp("2001-01-17")
is_target_day = df_ranking["date"] == target_date
subset = df_ranking[is_target_day]

# The plain date (no time component) is used in both the file name and the message.
date_label = target_date.date()
out_path = os.path.join(VER_DIR, f"ranking_{date_label}.csv")
subset.to_csv(out_path, index=False)

print(f"Saved {len(subset):,} rows for {date_label} → {out_path}")

Saved 835 rows for 2001-01-17 → ./system_verification/9-ranking_dataset\ranking_2001-01-17.csv
