In [4]:
# 1) Imports & folder paths
import pandas as pd, numpy as np, json
from pathlib import Path

# All paths are relative to the working directory (run from /macro_retrieval/XOM)
ROOT = Path(".")
TRAIND = ROOT / "train"  # contains macro_zscore_stats.json
OODD = ROOT / "test"     # save OOD outputs here

# Create any missing working folder; existing ones are left alone.
for _folder in (ROOT, TRAIND, OODD):
    _folder.mkdir(parents=True, exist_ok=True)

# Base CSV (no macros yet)
BASE_XOM_CSV = ROOT / "base_XOM_2024.csv"
print(
    "Paths set.\n",
    "ROOT:", ROOT.resolve(),
    "\nTRAIN:", TRAIND.resolve(),
    "\nTEST:", OODD.resolve(),
)


Paths set.
 ROOT: /Users/sarthakkhanna/Desktop/thesis/paper_new/macro_augmented_ood/macro_retrieval/XOM 
TRAIN: /Users/sarthakkhanna/Desktop/thesis/paper_new/macro_augmented_ood/macro_retrieval/XOM/train 
TEST: /Users/sarthakkhanna/Desktop/thesis/paper_new/macro_augmented_ood/macro_retrieval/XOM/test


In [5]:
# 2) Helper: business-day index
def bdays(start, end):
    """Build the business-day DatetimeIndex spanning `start` through `end`.

    Thin wrapper around `pd.bdate_range`; both bounds are passed through
    unchanged, so anything that function accepts as a date works here.
    """
    business_days = pd.bdate_range(start=start, end=end)
    return business_days


In [6]:
# 3) Read the base XOM table (Date parsed as datetime) and order it chronologically
base_unsorted = pd.read_csv(BASE_XOM_CSV, parse_dates=["Date"])
base = base_unsorted.sort_values("Date")
print("Loaded base_XOM_2024.csv — rows:", len(base))


Loaded base_XOM_2024.csv — rows: 242


In [7]:
# 4) Parse JSON embedding → numpy array and drop zero-norm rows
def parse_emb(s):
    """Decode a JSON-encoded embedding string into a float32 NumPy array."""
    decoded = json.loads(s)
    return np.array(decoded, dtype=np.float32)

# Decode the JSON column into arrays, then discard the raw strings.
# The guard keeps this cell re-runnable: on a second run the raw column is
# already gone, and the unguarded version would raise KeyError.
if "prev_day_embedding_json" in base.columns:
    base["text_embed"] = base["prev_day_embedding_json"].apply(parse_emb)
    base = base.drop(columns=["prev_day_embedding_json"])

def vnorm(a):
    """Return `np.linalg.norm(a)` (default norm) converted to a built-in float."""
    magnitude = np.linalg.norm(a)
    return float(magnitude)

# Flag rows whose embedding has exactly zero norm and remove them.
embed_norms = base["text_embed"].apply(vnorm)
zero_mask = embed_norms.eq(0.0)
n_zero = int(zero_mask.sum())
if n_zero > 0:
    print("Dropping zero-embedding rows:", n_zero)
    base = base.loc[~zero_mask].copy()

# Summary of what is left after filtering.
first_day = base["Date"].min()
last_day = base["Date"].max()
print("XOM base rows:", len(base))
print("Date range:", first_day.date(), "→", last_day.date())


XOM base rows: 242
Date range: 2024-01-11 → 2024-12-30


In [8]:
# 5) OOD window = span of the base data; macro fetch window starts 24 months earlier
base_dates = base["Date"]
OOD_START = base_dates.min().normalize()
OOD_END = base_dates.max().normalize()

# Buffer of history ahead of the OOD window for the macro download.
history_buffer = pd.DateOffset(months=24)
macro_fetch_start = (OOD_START - history_buffer).normalize()
macro_fetch_end = OOD_END

print("OOD:", OOD_START.date(), "→", OOD_END.date())
print("Fetch macro:", macro_fetch_start.date(), "→", macro_fetch_end.date())


OOD: 2024-01-11 → 2024-12-30
Fetch macro: 2022-01-11 → 2024-12-30


In [11]:
# 6) Fetch macro time series from FRED
import os

from fredapi import Fred

# SECURITY: the API key must not be stored in the notebook. It is read from
# the FRED_API_KEY environment variable instead. Any key that was previously
# committed in this cell should be treated as leaked and rotated.
_fred_api_key = os.environ.get("FRED_API_KEY")
if not _fred_api_key:
    raise RuntimeError("FRED_API_KEY is not set; export it before running this cell.")
fred = Fred(api_key=_fred_api_key)


def _fetch_fred_frame(series_id, column):
    """Download one FRED series over the macro fetch window as a DataFrame.

    Parameters
    ----------
    series_id : str
        FRED series identifier passed to `fred.get_series`.
    column : str
        Name given to the value column.

    Returns
    -------
    pandas.DataFrame
        Two columns: "Date" (the series index) and `column` (the values).
    """
    series = fred.get_series(
        series_id,
        observation_start=macro_fetch_start,
        observation_end=macro_fetch_end,
    )
    # Name the index explicitly so reset_index always yields a "Date" column,
    # whether or not the returned index already carries a name.
    return series.rename_axis("Date").to_frame(column).reset_index()


# CPI (monthly) → YoY %
cpi = _fetch_fred_frame("CPIAUCSL", "cpi")
cpi["cpi_yoy"] = cpi["cpi"].pct_change(12) * 100  # first 12 rows are NaN by construction

# Unemployment (monthly, %)
unrate = _fetch_fred_frame("UNRATE", "unrate")

# 10Y-2Y spread (daily, pp)
t10y2y = _fetch_fred_frame("T10Y2Y", "t10y2y")

# Real GDP (quarterly) → QoQ %
gdp = _fetch_fred_frame("GDPC1", "gdp")
gdp["gdp_qoq"] = gdp["gdp"].pct_change() * 100  # first row is NaN by construction

print("Fetched counts — CPI:", len(cpi), "UNRATE:", len(unrate), "T10Y2Y:", len(t10y2y), "GDP:", len(gdp))


Fetched counts — CPI: 36 UNRATE: 36 T10Y2Y: 775 GDP: 12


In [12]:
# 7) Merge macro series, reindex to business days, apply publication lags
macro = (
    cpi[["Date","cpi_yoy"]]
      .merge(unrate[["Date","unrate"]], on="Date", how="outer")
      .merge(t10y2y[["Date","t10y2y"]], on="Date", how="outer")
      .merge(gdp[["Date","gdp_qoq"]], on="Date", how="outer")
).sort_values("Date")

wide_idx = bdays(macro_fetch_start, macro_fetch_end)

# Low-frequency observations can be dated on a weekend (e.g. a month or
# quarter whose first day is a Saturday/Sunday). Reindexing straight onto
# business days would silently discard those rows, leaving the previous
# period's value in place. Forward-fill on the union of observation dates and
# business days first, then keep only the business days, so every observation
# is carried onto the next business day.
macro_by_date = macro.set_index("Date")
macro_w = (
    macro_by_date
      .reindex(macro_by_date.index.union(wide_idx))
      .ffill()
      .reindex(wide_idx)
      .rename_axis("Date")
)

# Publication lags (causal), expressed in business-day rows.
# NOTE(review): the shifts are counted from the observation date stored in the
# series; confirm they cover the real release delay of each indicator. The
# values are kept unchanged so the features stay consistent with the train
# statistics used for z-scoring.
macro_w["cpi_yoy_lagged"] = macro_w["cpi_yoy"].shift(10)   # CPI +10d
macro_w["unrate_lagged"]  = macro_w["unrate"].shift(5)     # UNRATE +5d
macro_w["t10y2y_lagged"]  = macro_w["t10y2y"]              # 0d
macro_w["gdp_qoq_lagged"] = macro_w["gdp_qoq"].shift(30)   # GDP +30d

# Fill gaps inside the lagged columns; leading NaNs (before the first
# available value) are intentionally left as NaN.
macro_w = macro_w.ffill().reset_index()

# Trim to OOD window
in_ood_window = (macro_w["Date"] >= OOD_START) & (macro_w["Date"] <= OOD_END)
macro_ood = macro_w[in_ood_window].copy()
print("Macro OOD rows:", len(macro_ood))


Macro OOD rows: 253


In [14]:
# 8) Z-score using train μ, σ
# Read the train-set statistics with a context manager so the file handle is
# closed deterministically (a bare open() inside json.load leaks it).
with open(TRAIND / "macro_zscore_stats.json") as stats_file:
    stats = json.load(stats_file)
mu, sd = stats["mu"], stats["sd"]
lag_cols = ["cpi_yoy_lagged","unrate_lagged","t10y2y_lagged","gdp_qoq_lagged"]
z_cols = [c + "_z" for c in lag_cols]

for c, zc in zip(lag_cols, z_cols):
    # A zero train σ would divide by zero; fall back to 1.0 so the feature is only centred.
    scale = sd[c] if sd[c] != 0 else 1.0
    macro_ood[zc] = (macro_ood[c] - mu[c]) / scale

# Cover quarterly gaps
macro_ood[z_cols] = macro_ood[z_cols].ffill()

macro_daily = macro_ood[["Date"] + z_cols]
print("NaNs in z-cols:\n", macro_daily.isna().sum())


NaNs in z-cols:
 Date                0
cpi_yoy_lagged_z    0
unrate_lagged_z     0
t10y2y_lagged_z     0
gdp_qoq_lagged_z    0
dtype: int64


In [17]:
# 9) Attach the macro z-scores to every base row (left join on Date) and persist
merged = base.merge(macro_daily, on="Date", how="left")
ood = merged.sort_values("Date")
# Convert each embedding array to a plain Python list before writing.
ood["text_embed"] = ood["text_embed"].map(lambda emb: emb.tolist())

out_path = OODD / "xom_test_ood_base.parquet"
ood.to_parquet(out_path, index=False)

print("Saved:", out_path)
print("Rows:", len(ood))
print("Columns:", list(ood.columns))


Saved: test/xom_test_ood_base.parquet
Rows: 242
Columns: ['Date', 'Movement', 'Open', 'Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1', 'Volatility_lag1', 'sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1', 'text_embed', 'cpi_yoy_lagged_z', 'unrate_lagged_z', 't10y2y_lagged_z', 'gdp_qoq_lagged_z']


In [18]:
# 10) Quick sanity checks: chronological order and the set of label values
dates_sorted = ood["Date"].is_monotonic_increasing
movement_values = sorted(ood["Movement"].unique())
print("Dates sorted:", dates_sorted, "\nMovement unique:", movement_values)


Dates sorted: True 
Movement unique: [np.int64(0), np.int64(1)]
