In [1]:
#!/usr/bin/env python

import os
import pandas as pd
import numpy as np
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================

# Inputs: the MA-enriched universe file and the folder holding one jump
# parquet file per ticker.
MA_FILE     = "./10-100MA_enriched_universe/10-100MA_enriched_universe.parquet"
JUMP90_DIR  = "./6-90Day_jump_filter_adjusted_all_prices"

# Outputs: the enriched dataset and the verification artefacts written by
# the later sections of this notebook.
OUTPUT_DIR  = "./11-universe_with_jump90"
VER_DIR     = "./system_verification/11-universe_with_jump90"

# Make sure both output folders exist; folders already present are left as-is.
for _out_dir in (OUTPUT_DIR, VER_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print("=== Building Enriched Universe (Add Jump90 Signals — No Filtering) ===")


# ============================================================
# 1. LOAD MA-ENRICHED UNIVERSE
# ============================================================

def _load_universe(path):
    """Read the universe parquet at `path` and parse its "date" column as datetimes."""
    universe = pd.read_parquet(path)
    universe["date"] = pd.to_datetime(universe["date"])
    return universe


df = _load_universe(MA_FILE)

# Report how many rows were loaded and which columns are available.
print(f"Loaded MA-enriched universe: {df.shape[0]:,} rows")
print("Columns:", df.columns.tolist())


# ============================================================
# 2. LOAD 90-DAY JUMP FILES
# ============================================================

print("\nLoading Jump90 files...")

# Columns kept from every jump file; anything else in the file is discarded.
JUMP_COLUMNS = ["date", "pct_change", "abs_pct", "abs_rollmax_90", "no_big_jump_90"]
PARQUET_SUFFIX = ".parquet"

jump_map = {}             # ticker -> DataFrame restricted to JUMP_COLUMNS
missing_jump_files = []   # tickers whose file failed to load (for any reason)
jump_load_errors = {}     # ticker -> "ExceptionType: message" for each failure

# Sort the listing so the load order and the failure report are reproducible;
# os.listdir() returns entries in arbitrary order.
for fname in sorted(os.listdir(JUMP90_DIR)):
    if not fname.endswith(PARQUET_SUFFIX):
        continue

    # Strip only the trailing extension. str.replace() would also remove a
    # ".parquet" that occurs elsewhere in the file name.
    ticker = fname[: -len(PARQUET_SUFFIX)]

    try:
        jump_df = pd.read_parquet(os.path.join(JUMP90_DIR, fname))
        jump_df["date"] = pd.to_datetime(jump_df["date"])
        # Raises KeyError when an expected column is absent; handled below.
        jump_map[ticker] = jump_df[JUMP_COLUMNS]
    except Exception as exc:
        # The load stays best-effort (one bad file must not stop the run),
        # but the reason is kept so a corrupt file or a schema change is not
        # mistaken for a file that simply does not exist.
        missing_jump_files.append(ticker)
        jump_load_errors[ticker] = f"{type(exc).__name__}: {exc}"


print(f"Loaded Jump90: {len(jump_map)} tickers")
print(f"Missing jump files: {len(missing_jump_files)}")

# List every failure with its cause so it can be investigated.
for failed_ticker, reason in jump_load_errors.items():
    print(f"  could not load {failed_ticker}: {reason}")


# ============================================================
# 3. MERGE JUMP SIGNALS — DO NOT FILTER ANYTHING
# ============================================================

print("\nMerging Jump90 signals into universe...")

# Signal columns contributed by the jump files (everything except the join key).
jump_signal_cols = ["pct_change", "abs_pct", "abs_rollmax_90", "no_big_jump_90"]

rows = []

# dropna=False: by default groupby() silently discards rows whose key is NaN,
# which would violate the "do not filter anything" contract of this step.
for ticker, sub in df.groupby("ticker", sort=False, dropna=False):

    if ticker in jump_map:
        # Left join keeps every universe row; dates absent from the jump file
        # get NaN in the signal columns.
        merged = sub.merge(jump_map[ticker], on="date", how="left")
    else:
        # No jump data for this ticker: keep its rows with empty signals.
        merged = sub.copy()
        for col in jump_signal_cols:
            merged[col] = np.nan

    rows.append(merged)

if rows:
    df2 = pd.concat(rows, ignore_index=True)
else:
    # Empty universe: pd.concat() raises on an empty list, so build the
    # (empty) result with the expected signal columns directly.
    df2 = df.copy()
    for col in jump_signal_cols:
        df2[col] = np.nan

print(f"After merge: {df2.shape[0]:,} rows")

# A left join can only change the row count when a jump file carries the same
# date more than once; surface that instead of letting it pass unnoticed.
if df2.shape[0] != df.shape[0]:
    print(
        f"WARNING: row count changed during merge "
        f"({df.shape[0]:,} -> {df2.shape[0]:,}); "
        "check the jump files for duplicate dates"
    )


# ============================================================
# 4. CLEAN COLUMN ORDER (for readability)
# ============================================================

# Preferred left-to-right layout of the output columns.
desired_cols = [
    "date",
    "open_adj", "high_adj", "low_adj", "close_adj", "volume",
    "ma100", "above_ma100",
    "pct_change", "abs_pct", "abs_rollmax_90", "no_big_jump_90",
    "slope_annual", "r2", "slope_adj",
    "ticker"
]

# Preferred columns that are actually present, in the preferred order.
existing_cols = [c for c in desired_cols if c in df2.columns]

# This step is a reorder for readability, so columns that are not in the
# preferred list are appended at the end rather than silently dropped.
extra_cols = [c for c in df2.columns if c not in desired_cols]
if extra_cols:
    print("Keeping columns not in the preferred order list:", extra_cols)

df2 = df2[existing_cols + extra_cols]


# ============================================================
# 5. SORT + SAVE
# ============================================================

# Order rows by date first, then by ticker within each date.
sort_keys = ["date", "ticker"]
df2 = df2.sort_values(by=sort_keys)

# Write the enriched universe as a single parquet file, without the index.
out_path = os.path.join(OUTPUT_DIR, "11-universe_with_jump90.parquet")
df2.to_parquet(out_path, index=False)

print("\nSaved →", out_path)


# ============================================================
# 6. VALIDATION LOG
# ============================================================

# Read the clock once so the timestamp stored in the log row and the one
# embedded in the file name always refer to the same instant.
run_time = datetime.now()

# One-row summary of this run: row counts before and after the merge, the
# number of jump files that failed to load, and how many rows ended up
# without jump signals.
val = pd.DataFrame({
    "timestamp": [run_time.isoformat(timespec="seconds")],
    "rows_input": [df.shape[0]],
    "rows_after_merge": [df2.shape[0]],
    "missing_jump_files": [len(missing_jump_files)],
    "null_pct_change": [df2["pct_change"].isna().sum()],
    "null_no_big_jump_90": [df2["no_big_jump_90"].isna().sum()],
})

# Each run writes its own timestamped CSV, so earlier logs are not overwritten
# unless two runs start within the same second.
val_path = os.path.join(
    VER_DIR,
    f"11-jump90_enrichment_validation-{run_time.strftime('%Y%m%d-%H%M%S')}.csv"
)
val.to_csv(val_path, index=False)

print("Validation log →", val_path)
print("\n=== COMPLETED ===")


=== Building Enriched Universe (Add Jump90 Signals — No Filtering) ===
Loaded MA-enriched universe: 5,366,253 rows
Columns: ['date', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'volume', 'slope_annual', 'r2', 'slope_adj', 'ticker', 'ma100', 'above_ma100']

Loading Jump90 files...
Loaded Jump90: 1167 tickers
Missing jump files: 0

Merging Jump90 signals into universe...
After merge: 5,366,253 rows

Saved → ./11-universe_with_jump90\11-universe_with_jump90.parquet
Validation log → ./system_verification/11-universe_with_jump90\11-jump90_enrichment_validation-20251231-090824.csv

=== COMPLETED ===


In [2]:
# Read the saved dataset back from disk and print a plain-text sanity report.
df_ranking = pd.read_parquet("./11-universe_with_jump90/11-universe_with_jump90.parquet")

# Report header.
print("Ranking Dataset Info:")
print("="*60)

# Structure: dimensions, column names and dtypes.
print(f"Shape: {df_ranking.shape}")
print(f"\nColumns: {df_ranking.columns.tolist()}")
print(f"\nData types:\n{df_ranking.dtypes}")

# Content: each caption is printed immediately before the table it describes.
report_sections = (
    (f"\nFirst few rows:", df_ranking.head(93)),
    (f"\nLast few rows:", df_ranking.tail(10)),
    (f"\nSummary statistics:", df_ranking.describe()),
    (f"\nNull values:", df_ranking.isnull().sum()),
)
for caption, table in report_sections:
    print(caption)
    print(table)

Ranking Dataset Info:
Shape: (5366253, 16)

Columns: ['date', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'volume', 'ma100', 'above_ma100', 'pct_change', 'abs_pct', 'abs_rollmax_90', 'no_big_jump_90', 'slope_annual', 'r2', 'slope_adj', 'ticker']

Data types:
date              datetime64[ns]
open_adj                 float64
high_adj                 float64
low_adj                  float64
close_adj                float64
volume                   float64
ma100                    float64
above_ma100                 bool
pct_change               float64
abs_pct                  float64
abs_rollmax_90           float64
no_big_jump_90              bool
slope_annual             float64
r2                       float64
slope_adj                float64
ticker                    object
dtype: object

First few rows:
         date   open_adj   high_adj    low_adj  close_adj       volume  ma100  \
0  1997-12-31  31.776559  32.770191  31.666043     32.423    3793386.6    NaN   
1  1997-12-31   

In [3]:
# Export every row for one chosen date to a CSV in the verification folder.
target_date = pd.Timestamp("2001-01-17")

# Boolean mask selecting the rows whose date equals the chosen one.
is_target_date = df_ranking["date"] == target_date
subset = df_ranking[is_target_date]

# The file name carries the date so exports for different dates do not collide.
out_path = os.path.join(VER_DIR, f"ranking_{target_date.date()}.csv")
subset.to_csv(out_path, index=False)

print(f"Saved {len(subset):,} rows for {target_date.date()} → {out_path}")

Saved 835 rows for 2001-01-17 → ./system_verification/11-universe_with_jump90\ranking_2001-01-17.csv
