In [1]:
# Notebook: 02_build_aapl_ood_base.ipynb

import pandas as pd, numpy as np, json
from pathlib import Path

# Project folders, resolved relative to the current working directory
ROOT = Path("..")
TRAIND = ROOT / "train"  # holds the train-split stats (macro_zscore_stats.json)
OODD = ROOT / "test"     # destination for the AAPL OOD outputs

# Make sure the root and the OOD output directory exist
ROOT.mkdir(parents=True, exist_ok=True)
OODD.mkdir(parents=True, exist_ok=True)

# Input: the AAPL base CSV (no macro features yet), produced earlier in Step 1
BASE_AAPL_CSV = OODD / "base_AAPL_2024.csv"

# Helper: business-day index
def bdays(start, end):
    """Return the business-day DatetimeIndex running from `start` to `end`.

    Both bounds are forwarded unchanged to ``pd.bdate_range``.
    """
    business_days = pd.bdate_range(start=start, end=end)
    return business_days



In [2]:
# Read the AAPL base table (parsing "Date" as datetimes) and order it chronologically
base = pd.read_csv(BASE_AAPL_CSV, parse_dates=["Date"])
base = base.sort_values("Date")

def parse_emb(s):
    """Decode a JSON-encoded embedding string into a float32 numpy array."""
    values = json.loads(s)
    return np.array(values, dtype=np.float32)
# Decode the JSON column into arrays, then discard the raw JSON strings
base["text_embed"] = base["prev_day_embedding_json"].apply(parse_emb)
base = base.drop(columns=["prev_day_embedding_json"])

# Drop any rows with zero-norm embedding (usually the first)
def vnorm(a):
    """Return the norm of `a` (via ``np.linalg.norm``) as a built-in float."""
    magnitude = np.linalg.norm(a)
    return float(magnitude)
# Flag embeddings whose norm is exactly zero and filter those rows out
zero_mask = base["text_embed"].apply(vnorm) == 0.0
if zero_mask.any():
    print("Dropping zero-embedding rows:", int(zero_mask.sum()))
    base = base[~zero_mask].copy()

# Report what is left after filtering
print("AAPL base rows:", len(base))
print(
    "Date range:",
    base["Date"].min().date(),
    "→",
    base["Date"].max().date(),
)


AAPL base rows: 229
Date range: 2024-01-09 → 2024-12-11


In [3]:
# OOD window = the span of dates present in the AAPL base table (time-of-day stripped)
OOD_START = base["Date"].min().normalize()
OOD_END   = base["Date"].max().normalize()

# Months of macro history to fetch before OOD_START.
# The CPI YoY change (pct_change(12) on monthly data) needs 12 prior observations before
# it yields a value, and the publication-lag shifts consume further rows on top of that,
# so 24 months leaves a cushion for both.
MACRO_BUFFER_MONTHS = 24
macro_fetch_start = (OOD_START - pd.DateOffset(months=MACRO_BUFFER_MONTHS)).normalize()
macro_fetch_end   = OOD_END

print("Fetch macro:", macro_fetch_start.date(), "→", macro_fetch_end.date())


Fetch macro: 2022-01-09 → 2024-12-11


In [4]:
import os

from fredapi import Fred

# Read the FRED API key from the environment rather than embedding it in the notebook.
# NOTE(review): a key was previously hardcoded here; if this notebook was ever shared or
# committed, that key should be treated as leaked and rotated.
_fred_api_key = os.environ.get("FRED_API_KEY")
if not _fred_api_key:
    raise RuntimeError("Set the FRED_API_KEY environment variable before running this cell.")
fred = Fred(api_key=_fred_api_key)


def _fetch_fred_frame(series_id, column):
    """Download one FRED series over the macro fetch window as a DataFrame.

    Parameters
    ----------
    series_id : str
        FRED series identifier passed to ``fred.get_series``.
    column : str
        Name given to the value column.

    Returns
    -------
    pandas.DataFrame
        Frame with a "Date" column (the series index) and a `column` value column.
    """
    series = fred.get_series(
        series_id,
        observation_start=macro_fetch_start,
        observation_end=macro_fetch_end,
    )
    return series.to_frame(column).reset_index().rename(columns={"index": "Date"})


# CPI (monthly) → YoY %
cpi = _fetch_fred_frame("CPIAUCSL", "cpi")
cpi["cpi_yoy"] = cpi["cpi"].pct_change(12) * 100

# Unemployment (monthly, %)
unrate = _fetch_fred_frame("UNRATE", "unrate")

# 10Y-2Y spread (daily, pp)
t10y2y = _fetch_fred_frame("T10Y2Y", "t10y2y")

# Real GDP (quarterly) → QoQ %
gdp = _fetch_fred_frame("GDPC1", "gdp")
gdp["gdp_qoq"] = gdp["gdp"].pct_change() * 100

print("Fetched:", len(cpi), len(unrate), len(t10y2y), len(gdp))


Fetched: 36 36 763 12


In [5]:
# Merge sparse macro series on their observation dates (outer join keeps every date)
macro = (cpi[["Date","cpi_yoy"]]
         .merge(unrate[["Date","unrate"]], on="Date", how="outer")
         .merge(t10y2y[["Date","t10y2y"]], on="Date", how="outer")
         .merge(gdp[["Date","gdp_qoq"]], on="Date", how="outer")
        ).sort_values("Date")

# Business-day reindex on wide window + ffill.
# Forward-fill on the UNION of observation dates and business days, and only then restrict
# to business days. Reindexing straight onto business days would silently discard any
# observation stamped on a weekend (e.g. a month that starts on a Saturday), so the
# previous period's value would be carried forward in its place.
# NOTE(review): if the training notebook builds its macro frame with the plain
# reindex-then-ffill order, apply the same fix there so train and OOD features match.
wide_idx = bdays(macro_fetch_start, macro_fetch_end)
macro_obs = macro.set_index("Date")
macro_w = (macro_obs
           .reindex(macro_obs.index.union(wide_idx))
           .ffill()
           .reindex(wide_idx)
           .rename_axis("Date"))

# Publication lags, expressed in business-day rows of macro_w.
# NOTE(review): these shifts are counted from each observation's date stamp. If FRED stamps
# monthly/quarterly observations at the START of the reference period (TODO confirm), the
# actual release can come later than these offsets, which would leak future information.
# Verify the lags against the release calendar and keep them identical to the training setup.
macro_w["cpi_yoy_lagged"] = macro_w["cpi_yoy"].shift(10)  # CPI +10 business days
macro_w["unrate_lagged"]  = macro_w["unrate"].shift(5)    # UNRATE +5 business days
macro_w["t10y2y_lagged"]  = macro_w["t10y2y"]             # no lag
macro_w["gdp_qoq_lagged"] = macro_w["gdp_qoq"].shift(30)  # GDP +30 business days

# Fill any remaining gaps from the last known value, then expose Date as a column again
macro_w = macro_w.ffill().reset_index()

# Trim to OOD
macro_ood = macro_w[(macro_w["Date"] >= OOD_START) & (macro_w["Date"] <= OOD_END)].copy()
print("Macro OOD rows:", len(macro_ood))


Macro OOD rows: 242


In [6]:
# Load TRAIN z-score stats (μ, σ); the context manager closes the file handle promptly
with open(TRAIND / "macro_zscore_stats.json") as stats_file:
    stats = json.load(stats_file)
mu, sd = stats["mu"], stats["sd"]
lag_cols = ["cpi_yoy_lagged","unrate_lagged","t10y2y_lagged","gdp_qoq_lagged"]

# Apply train μ/σ to OOD (a zero σ is replaced by 1.0 to avoid dividing by zero)
for c in lag_cols:
    macro_ood[c + "_z"] = (macro_ood[c] - mu[c]) / (sd[c] if sd[c] != 0 else 1.0)

# Forward-fill z-scores to cover quarterly gaps
for zc in [c+"_z" for c in lag_cols]:
    macro_ood[zc] = macro_ood[zc].ffill()

# Keep only the date and the standardized columns, and report any remaining NaNs
macro_daily = macro_ood[["Date"] + [c+"_z" for c in lag_cols]]
print("NaNs in z-cols:\n", macro_daily.isna().sum())


NaNs in z-cols:
 Date                0
cpi_yoy_lagged_z    0
unrate_lagged_z     0
t10y2y_lagged_z     0
gdp_qoq_lagged_z    0
dtype: int64


In [7]:
# Left-join the macro z-scores onto the AAPL rows (every base row is kept), then sort by date
ood = pd.merge(base, macro_daily, how="left", on="Date")
ood = ood.sort_values("Date")
# Turn each embedding array into a plain Python list before writing
ood["text_embed"] = ood["text_embed"].apply(lambda emb: emb.tolist())

# Write the combined table next to the other OOD artifacts
out_path = OODD / "x_test_ood_base.parquet"
ood.to_parquet(out_path, index=False)

# Summary of what was written, plus two quick sanity checks
print("Saved:", out_path)
print("Rows:", len(ood))
print("Columns:", list(ood.columns))
print(
    "\nSanity:\n  Dates sorted:", ood["Date"].is_monotonic_increasing,
    "\n  Movement unique:", sorted(ood["Movement"].unique()),
)


Saved: ../test/x_test_ood_base.parquet
Rows: 229
Columns: ['Date', 'Movement', 'Open', 'Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1', 'Volatility_lag1', 'sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1', 'text_embed', 'cpi_yoy_lagged_z', 'unrate_lagged_z', 't10y2y_lagged_z', 'gdp_qoq_lagged_z']

Sanity:
  Dates sorted: True 
  Movement unique: [0, 1]
