In [1]:
#!/usr/bin/env python

import os
import pandas as pd
import numpy as np
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================

RANKING_FILE = "./9-ranking_dataset/ranking_dataset.parquet"
MA100_DIR    = "./5-100D_MA_adjusted_all_prices"

OUTPUT_DIR   = "./10-100MA_enriched_universe"
VER_DIR      = "./system_verification/10-100MA_enriched_universe"


# ============================================================
# HELPERS
# ============================================================

def load_ma100_frames(ma100_dir):
    """Read every ``<ticker>.parquet`` file found in ``ma100_dir``.

    Parameters
    ----------
    ma100_dir : str
        Directory holding one parquet file per ticker.

    Returns
    -------
    tuple(dict, list)
        ``{ticker: DataFrame[date, ma100]}`` for every file that loaded, and
        the list of tickers whose file could not be read or did not contain
        the ``date`` / ``ma100`` columns.
    """
    suffix = ".parquet"
    frames = {}
    failed = []

    for fname in sorted(os.listdir(ma100_dir)):
        if not fname.endswith(suffix):
            continue

        # Strip only the trailing extension (str.replace would remove every
        # occurrence of ".parquet" inside the name).
        ticker = fname[: -len(suffix)]

        try:
            frame = pd.read_parquet(os.path.join(ma100_dir, fname))
            frame["date"] = pd.to_datetime(frame["date"])
            # Only keep useful columns
            frames[ticker] = frame[["date", "ma100"]]
        except Exception as exc:
            # Best effort: one bad file must not abort the run, but say why
            # it was skipped instead of swallowing the error.
            failed.append(ticker)
            print(f"  ! could not load MA100 for {ticker}: {exc!r}")

    return frames, failed


def attach_ma100(ranking, ma100_map):
    """Left-join each ticker's MA100 series onto the ranking rows.

    Parameters
    ----------
    ranking : DataFrame
        Must contain ``ticker`` and ``date`` columns.
    ma100_map : dict
        ``{ticker: DataFrame[date, ma100]}`` as returned by
        ``load_ma100_frames``.

    Returns
    -------
    DataFrame
        The ranking rows, grouped by ticker in order of first appearance,
        with an added ``ma100`` column. Rows without a matching MA100 value
        get NaN. No ranking row is dropped.
    """
    if ranking.empty:
        # pd.concat([]) raises on an empty list; return the (empty) input
        # with the extra column instead.
        return ranking.assign(ma100=np.nan)

    pieces = []
    # dropna=False: groupby discards rows whose key is null by default,
    # which would silently filter a step that promises "no filtering".
    for ticker, sub in ranking.groupby("ticker", sort=False, dropna=False):
        ma_frame = ma100_map.get(ticker)
        if ma_frame is None:
            pieces.append(sub.assign(ma100=np.nan))
        else:
            pieces.append(sub.merge(ma_frame, on="date", how="left"))

    return pd.concat(pieces, ignore_index=True)


def tickers_lacking_ma100(ranking, ma100_map):
    """Return the sorted ranking tickers that have no entry in ``ma100_map``."""
    present = ranking["ticker"].dropna().unique()
    return sorted((t for t in present if t not in ma100_map), key=str)


def build_validation_log(ranking, enriched, missing_ma100, uncovered_tickers, run_time):
    """Build the one-row validation summary for a run.

    Parameters
    ----------
    ranking : DataFrame
        Ranking dataset before the merge.
    enriched : DataFrame
        Result of the merge; must contain ``ma100`` and ``above_ma100``.
    missing_ma100 : list
        Tickers whose MA100 file failed to load.
    uncovered_tickers : list
        Ranking tickers with no loaded MA100 series at all.
    run_time : datetime
        Timestamp recorded in the log.

    Returns
    -------
    DataFrame
        A single row. Note that ``false_above_ma100`` includes rows whose
        ``ma100`` is null, because a comparison against NaN is False.
    """
    return pd.DataFrame({
        "timestamp": [run_time.isoformat(timespec="seconds")],
        "rows_in_ranking": [ranking.shape[0]],
        "rows_after_merge": [enriched.shape[0]],
        "missing_ma100_files": [len(missing_ma100)],
        "null_ma100_values": [enriched["ma100"].isna().sum()],
        "true_above_ma100": [enriched["above_ma100"].sum()],
        "false_above_ma100": [(~enriched["above_ma100"]).sum()],
        "tickers_without_ma100": [len(uncovered_tickers)],
    })


# ============================================================
# PIPELINE
# ============================================================
# Guarded so the helpers above can be imported (e.g. by a test) without
# touching the filesystem. In a notebook kernel, or when the file is run as a
# script, __name__ is "__main__" and everything below runs at top level, so
# df, df2, val, ... remain ordinary globals.

if __name__ == "__main__":

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(VER_DIR, exist_ok=True)

    print("=== Building Enriched Universe (Add MA100 + Above/Below MA100) ===")

    # --------------------------------------------------------
    # 1. LOAD RANKING DATASET
    # --------------------------------------------------------

    df = pd.read_parquet(RANKING_FILE)
    df["date"] = pd.to_datetime(df["date"])

    print(f"Loaded ranking dataset: {df.shape[0]:,} rows")
    print("Columns:", df.columns.tolist())

    # Expected columns:
    # ['date','open_adj','high_adj','low_adj','close_adj','volume',
    #  'slope_annual','r2','slope_adj','ticker']

    # --------------------------------------------------------
    # 2. LOAD MA100 FILES
    # --------------------------------------------------------

    print("\nLoading MA100 files...")

    ma100_map, missing_ma100 = load_ma100_frames(MA100_DIR)

    print(f"Loaded MA100 for {len(ma100_map)} tickers.")
    print(f"Missing MA100 for {len(missing_ma100)} tickers.")

    # Tickers that appear in the ranking data but have no usable MA100 file;
    # the "missing" count above only covers files that failed to load.
    tickers_without_ma100 = tickers_lacking_ma100(df, ma100_map)
    print(f"Ranking tickers with no MA100 series: {len(tickers_without_ma100)}")

    # --------------------------------------------------------
    # 3. MERGE MA100 (NO FILTERING)
    # --------------------------------------------------------

    print("\nMerging MA100 into ranking dataset (no filtering)...")

    df2 = attach_ma100(df, ma100_map)

    print(f"After merge: {df2.shape[0]:,} rows")

    if df2.shape[0] != df.shape[0]:
        # A left join can only change the row count when the right side has
        # repeated keys.
        print(
            f"  ! row count changed during merge ({df.shape[0]:,} -> {df2.shape[0]:,}); "
            "check the MA100 files for duplicate dates"
        )

    # --------------------------------------------------------
    # 4. ADD above_ma100 BOOLEAN COLUMN
    # --------------------------------------------------------

    print("\nAdding above_ma100 column...")

    # Comparison against NaN is False, so rows without an MA100 value are
    # flagged False here; use the ma100 column itself to tell them apart.
    df2["above_ma100"] = df2["close_adj"] > df2["ma100"]

    # --------------------------------------------------------
    # 5. SORT BY DATE (optional)
    # --------------------------------------------------------

    df2 = df2.sort_values(["date", "ticker"])

    # --------------------------------------------------------
    # 6. SAVE OUTPUT
    # --------------------------------------------------------

    out_path = os.path.join(OUTPUT_DIR, "10-100MA_enriched_universe.parquet")
    df2.to_parquet(out_path, index=False)

    print("\nSaved enriched universe →", out_path)

    # --------------------------------------------------------
    # 7. VALIDATION LOG
    # --------------------------------------------------------

    # Take the clock once so the timestamp inside the log and the one in the
    # file name always agree.
    run_time = datetime.now()

    val = build_validation_log(df, df2, missing_ma100, tickers_without_ma100, run_time)

    val_path = os.path.join(
        VER_DIR,
        f"10-100MA_enriched_universe_validation-{run_time.strftime('%Y%m%d-%H%M%S')}.csv"
    )

    val.to_csv(val_path, index=False)

    print("Saved validation log →", val_path)
    print("\n=== COMPLETED ===")


=== Building Enriched Universe (Add MA100 + Above/Below MA100) ===
Loaded ranking dataset: 5,366,253 rows
Columns: ['date', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'volume', 'slope_annual', 'r2', 'slope_adj', 'ticker']

Loading MA100 files...
Loaded MA100 for 1167 tickers.
Missing MA100 for 0 tickers.

Merging MA100 into ranking dataset (no filtering)...
After merge: 5,366,253 rows

Adding above_ma100 column...

Saved enriched universe → ./10-100MA_enriched_universe\10-100MA_enriched_universe.parquet
Saved validation log → ./system_verification/10-100MA_enriched_universe\10-100MA_enriched_universe_validation-20251231-090121.csv

=== COMPLETED ===


In [2]:
# Reload the MA100-enriched universe from disk and print a quick profile:
# shape, schema, head/tail samples, summary statistics and null counts.
df_ranking = pd.read_parquet("./10-100MA_enriched_universe/10-100MA_enriched_universe.parquet")

print("=" * 60)
print("Shape: {}".format(df_ranking.shape))
print("\nColumns: {}".format(df_ranking.columns.tolist()))
print("\nData types:\n{}".format(df_ranking.dtypes))

# Each entry is (heading, view builder); the builder runs only when its
# section is printed, so headings and tables alternate in the output.
profile_sections = (
    ("First few rows", lambda frame: frame.head(93)),
    ("Last few rows", lambda frame: frame.tail(10)),
    ("Summary statistics", lambda frame: frame.describe()),
    ("Null values", lambda frame: frame.isnull().sum()),
)

for heading, build_view in profile_sections:
    print("\n" + heading + ":")
    print(build_view(df_ranking))

Shape: (5366253, 12)

Columns: ['date', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'volume', 'slope_annual', 'r2', 'slope_adj', 'ticker', 'ma100', 'above_ma100']

Data types:
date            datetime64[ns]
open_adj               float64
high_adj               float64
low_adj                float64
close_adj              float64
volume                 float64
slope_annual           float64
r2                     float64
slope_adj              float64
ticker                  object
ma100                  float64
above_ma100               bool
dtype: object

First few rows:
         date   open_adj   high_adj    low_adj  close_adj       volume  \
0  1997-12-31  31.776559  32.770191  31.666043     32.423    3793386.6   
1  1997-12-31   0.098000   0.102188   0.097162      0.098  406358000.0   
2  1997-12-31  16.272963  16.637861  15.908998     16.580    2817385.2   
3  1997-12-31  30.000000  30.750000  29.917000     30.667     256200.0   
4  1997-12-31   8.000000   8.250000   7.750000 

In [3]:
# Load and inspect one ticker (AAPL) from the 90-day jump-filter dataset.
# NOTE(review): this reuses the name `df_ranking`, replacing the frame loaded
# by the previous inspection cell; the name is kept so that any later cell
# relying on it keeps working — consider a dedicated name.
df_ranking = pd.read_parquet("./6-90Day_jump_filter_adjusted_all_prices/AAPL.parquet")

# Header fixed: this file is the jump-filter output, not the ranking dataset.
print("Jump-Filter Dataset Info (AAPL):")
print("="*60)
print(f"Shape: {df_ranking.shape}")
print(f"\nColumns: {df_ranking.columns.tolist()}")
print(f"\nData types:\n{df_ranking.dtypes}")
print(f"\nFirst few rows:")
# 93 rows: presumably enough to see past the 90-row rolling window warm-up
# (abs_rollmax_90 / no_big_jump_90) — TODO confirm.
print(df_ranking.head(93))
print(f"\nLast few rows:")
print(df_ranking.tail(10))
print(f"\nSummary statistics:")
print(df_ranking.describe())
print(f"\nNull values:")
print(df_ranking.isnull().sum())

Ranking Dataset Info:
Shape: (7043, 19)

Columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj', 'closeunadj', 'lastupdated', 'adj_factor', 'open_adj', 'high_adj', 'low_adj', 'close_adj', 'pct_change', 'abs_pct', 'abs_rollmax_90', 'no_big_jump_90']

Data types:
ticker                    object
date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
volume                   float64
closeadj                 float64
closeunadj               float64
lastupdated       datetime64[ms]
adj_factor               float64
open_adj                 float64
high_adj                 float64
low_adj                  float64
close_adj                float64
pct_change               float64
abs_pct                  float64
abs_rollmax_90           float64
no_big_jump_90              bool
dtype: object

First few rows:
   ticker       date   open   high    low  close      