# Relevant Imports

# Calculate the Realized Return

In [93]:
from pathlib import Path
import numpy as np, pandas as pd

ROOT   = Path("./")  # /ood_validation/macro_retrieval
TRAIND = ROOT / "train"

inp = TRAIND / "sp500_features.parquet"
out = TRAIND / "sp500_features_with_ret.parquet"

df = pd.read_parquet(inp).sort_values("Date").reset_index(drop=True)

# Preferred: compute same-day log return from Close/Open
if "Close" in df.columns:
    df["Daily_Return"] = np.log(df["Close"] / df["Open"]).replace([np.inf, -np.inf], np.nan)
else:
    # Fallback: shift lagged returns forward
    if "Daily_Return_lag1" not in df.columns:
        raise KeyError("Neither Close nor Daily_Return_lag1 found in train data.")
    df["Daily_Return"] = df["Daily_Return_lag1"].shift(-1)

# Drop trailing NaNs (from shift edge)
nan_tail = int(df["Daily_Return"].isna().sum())
if nan_tail:
    df = df.iloc[:-nan_tail].copy()

df.to_parquet(out, index=False)
print("TRAIN with realized return saved:", out, "| rows:", len(df))
print(df[["Date","Daily_Return"]].head())


TRAIN with realized return saved: train/sp500_features_with_ret.parquet | rows: 2797
        Date  Daily_Return
0 2007-08-07      0.476339
1 2007-08-10     -1.208174
2 2007-08-13     -0.102920
3 2007-08-14     -1.393884
4 2007-08-15     -1.083829


In [94]:
from pathlib import Path
import numpy as np, pandas as pd

ROOT  = Path("./")  # /ood_validation/macro_retrieval
TESTD = ROOT / "test"

# Use whichever file you’ll evaluate (base or with retrieval)
inp = TESTD / "x_test_ood.parquet"
if not inp.exists():
    inp = TESTD / "x_test_ood_base.parquet"

out = inp.with_name(inp.stem + "_with_ret.parquet")

df = pd.read_parquet(inp).sort_values("Date").reset_index(drop=True)

if "Close" in df.columns:
    df["Daily_Return"] = np.log(df["Close"] / df["Open"]).replace([np.inf, -np.inf], np.nan)
else:
    if "Daily_Return_lag1" not in df.columns:
        raise KeyError("Neither Close nor Daily_Return_lag1 found in OOD data.")
    df["Daily_Return"] = df["Daily_Return_lag1"].shift(-1)

nan_tail = int(df["Daily_Return"].isna().sum())
if nan_tail:
    df = df.iloc[:-nan_tail].copy()

df.to_parquet(out, index=False)
print("OOD with realized return saved:", out, "| rows:", len(df))
print(df[["Date","Daily_Return"]].head())


OOD with realized return saved: test/x_test_ood_base_with_ret.parquet | rows: 227
        Date  Daily_Return
0 2024-01-09     -0.196457
1 2024-01-10      0.844844
2 2024-01-11      0.313409
3 2024-01-12     -2.372397
4 2024-01-16     -1.184751


# Experiments

# Numeircal Only

In [95]:
from pathlib import Path
import pandas as pd

ROOT   = Path("./")  # notebook at /ood_validation/macro_retrieval
TRAIND = ROOT / "train"
TESTD  = ROOT / "test"

# Prefer files with realized-return; fallback to originals
def pick_train_path():
    p1 = TRAIND / "sp500_features_with_ret.parquet"
    p0 = TRAIND / "sp500_features.parquet"
    return p1 if p1.exists() else p0

def pick_ood_path():
    # prefer OOD with retrieval; else base; both prefer *_with_ret if available
    cands = [
        TESTD / "x_test_ood_with_ret.parquet",
        TESTD / "x_test_ood.parquet",
        TESTD / "x_test_ood_base_with_ret.parquet",
        TESTD / "x_test_ood_base.parquet",
    ]
    for p in cands:
        if p.exists(): return p
    raise FileNotFoundError("No OOD parquet found in /test.")

TRAIN_PARQUET = pick_train_path()
OOD_PARQUET   = pick_ood_path()

print("Using TRAIN:", TRAIN_PARQUET)
print("Using OOD  :", OOD_PARQUET)

train = pd.read_parquet(TRAIN_PARQUET).sort_values("Date").reset_index(drop=True)
ood   = pd.read_parquet(OOD_PARQUET).sort_values("Date").reset_index(drop=True)


Using TRAIN: train/sp500_features_with_ret.parquet
Using OOD  : test/x_test_ood_base_with_ret.parquet


In [96]:
import numpy as np

def trading_metrics(y_true, y_pred, realized_ret):
    if realized_ret is None or len(realized_ret) == 0:
        return {"win_rate": None, "profit_factor": None, "sharpe_252": None}
    pos = np.where(np.asarray(y_pred) > 0, 1.0, -1.0)
    strat_ret = pos * np.asarray(realized_ret)
    win_rate = (strat_ret > 0).mean()
    gross_profit = strat_ret[strat_ret > 0].sum()
    gross_loss   = -strat_ret[strat_ret < 0].sum()
    profit_factor = (gross_profit / gross_loss) if gross_loss > 0 else np.inf
    mu, sd = strat_ret.mean(), strat_ret.std(ddof=1)
    sharpe_252 = (mu / sd) * np.sqrt(252) if sd > 0 else np.nan
    return {"win_rate": float(win_rate), "profit_factor": float(profit_factor), "sharpe_252": float(sharpe_252)}

def realized_return_column(df):
    # Prefer explicitly-added realized return
    if "Daily_Return" in df.columns and pd.api.types.is_numeric_dtype(df["Daily_Return"]):
        return "Daily_Return"
    # Fallbacks (rare)
    for c in ["ret", "RET", "strategy_return", "realized_return"]:
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]):
            return c
    return None


In [97]:
def pick_numeric_only_columns(df):
    drop_exact = {
        "Date","Movement","text_embed","z_retr",
        "cpi_yoy_lagged_z","unrate_lagged_z","t10y2y_lagged_z","gdp_qoq_lagged_z",
        "Daily_Return"  # never use realized return as a feature
    }
    keep = [c for c in df.columns if c not in drop_exact and pd.api.types.is_numeric_dtype(df[c])]
    return keep

X_cols = pick_numeric_only_columns(train)
print("Numeric-only features:", len(X_cols))


Numeric-only features: 8


In [98]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score
)
import numpy as np
import pandas as pd

# Features/labels
X_cols = pick_numeric_only_columns(train)
y_col  = "Movement"

print("Numeric-only features:", len(X_cols))
print("Train rows:", len(train), "Train date range:",
      train["Date"].min().date(), "→", train["Date"].max().date())

# CV setup
tscv = TimeSeriesSplit(n_splits=5)
cv_rows = []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(train), 1):
    tr_df = train.iloc[tr_idx].copy()
    va_df = train.iloc[va_idx].copy()

    X_tr, y_tr = tr_df[X_cols].to_numpy(), tr_df[y_col].to_numpy()
    X_va, y_va = va_df[X_cols].to_numpy(), va_df[y_col].to_numpy()

    # Pipeline: scale numerics → LR
    pipe = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", LogisticRegression(max_iter=500, solver="liblinear", n_jobs=1, random_state=42))
    ])
    pipe.fit(X_tr, y_tr)

    proba_va = pipe.predict_proba(X_va)[:, 1]
    yhat_va  = (proba_va >= 0.5).astype(int)

    # Classification metrics
    acc  = accuracy_score(y_va, yhat_va)
    prec = precision_score(y_va, yhat_va, zero_division=0)
    rec  = recall_score(y_va, yhat_va, zero_division=0)
    f1   = f1_score(y_va, yhat_va, zero_division=0)
    mcc  = matthews_corrcoef(y_va, yhat_va)
    try:
        auroc = roc_auc_score(y_va, proba_va)
    except ValueError:
        auroc = np.nan

    # Trading metrics (if realized return exists)
    rr_col = realized_return_column(va_df)
    tm = trading_metrics(y_va, yhat_va, va_df[rr_col].to_numpy() if rr_col else None)

    cv_rows.append({
        "fold": fold, "n_val": len(va_df),
        "val_start": va_df["Date"].min().date(),
        "val_end": va_df["Date"].max().date(),
        "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "MCC": mcc, "AUROC": auroc,
        "WinRate": tm["win_rate"], "ProfitFactor": tm["profit_factor"], "Sharpe_252": tm["sharpe_252"],
        "ret_col": rr_col
    })

cv_df = pd.DataFrame(cv_rows)
print("\n=== CV results (per fold) ===")
display(cv_df)

print("\n=== CV means ===")
display(cv_df[["Accuracy","Precision","Recall","F1","MCC","AUROC","WinRate","ProfitFactor","Sharpe_252"]]
        .mean(numeric_only=True))


Numeric-only features: 8
Train rows: 2797 Train date range: 2007-08-07 → 2023-07-12

=== CV results (per fold) ===


Unnamed: 0,fold,n_val,val_start,val_end,Accuracy,Precision,Recall,F1,MCC,AUROC,WinRate,ProfitFactor,Sharpe_252,ret_col
0,1,466,2012-09-07,2015-12-03,0.680258,0.905109,0.476923,0.624685,0.451097,0.845836,0.620172,2.561426,5.287349,Daily_Return
1,2,466,2015-12-04,2017-10-17,0.579399,0.574932,0.840637,0.682848,0.140222,0.647234,0.512876,1.144653,0.728899,Daily_Return
2,3,466,2017-10-18,2019-10-03,0.639485,0.619048,1.0,0.764706,0.283174,0.872649,0.508584,1.103695,0.538566,Daily_Return
3,4,466,2019-10-04,2021-08-26,0.684549,0.659722,1.0,0.794979,0.352031,0.866996,0.600858,2.541406,4.179627,Daily_Return
4,5,466,2021-08-27,2023-07-12,0.51073,0.502183,1.0,0.668605,0.130473,0.882867,0.461373,0.938593,-0.383703,Daily_Return



=== CV means ===


Accuracy        0.618884
Precision       0.652199
Recall          0.863512
F1              0.707165
MCC             0.271399
AUROC           0.823117
WinRate         0.540773
ProfitFactor    1.657955
Sharpe_252      2.070147
dtype: float64

In [99]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score
)
import numpy as np

# Train full model on TRAIN
pipe_full = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear", n_jobs=1, random_state=42))
])
pipe_full.fit(train[X_cols].to_numpy(), train["Movement"].to_numpy())

# Predict on OOD
print("OOD rows:", len(ood), "OOD date range:", ood["Date"].min().date(), "→", ood["Date"].max().date())
X_test = ood[X_cols].to_numpy()
y_test = ood["Movement"].to_numpy()

proba_test = pipe_full.predict_proba(X_test)[:, 1]
yhat_test  = (proba_test >= 0.5).astype(int)

# Classification metrics
acc  = accuracy_score(y_test, yhat_test)
prec = precision_score(y_test, yhat_test, zero_division=0)
rec  = recall_score(y_test, yhat_test, zero_division=0)
f1   = f1_score(y_test, yhat_test, zero_division=0)
mcc  = matthews_corrcoef(y_test, yhat_test)
try:
    auroc = roc_auc_score(y_test, proba_test)
except ValueError:
    auroc = np.nan

# Trading metrics on OOD
rr_col_test = realized_return_column(ood)
tm = trading_metrics(y_test, yhat_test, ood[rr_col_test].to_numpy() if rr_col_test else None)

print("\n=== OOD (AAPL 2024) — Numerical-only LR ===")
print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | MCC: {mcc:.4f} | AUROC: {auroc:.4f}")
if rr_col_test:
    print(f"WinRate: {tm['win_rate']:.4f} | ProfitFactor: {tm['profit_factor']:.4f} | Sharpe_252: {tm['sharpe_252']:.4f} | ReturnCol: {rr_col_test}")
else:
    print("No realized-return column found in OOD; trading metrics skipped.")


OOD rows: 227 OOD date range: 2024-01-09 → 2024-12-09

=== OOD (AAPL 2024) — Numerical-only LR ===
Accuracy: 0.4758 | Precision: 0.4507 | Recall: 0.9796 | F1: 0.6174 | MCC: 0.1495 | AUROC: 0.5715
WinRate: 0.4097 | ProfitFactor: 0.7584 | Sharpe_252: -1.5792 | ReturnCol: Daily_Return


# Text Only

In [100]:
def pick_text_only_columns(df):
    # text_embed is stored as list/array → expand into numpy
    if "text_embed" not in df.columns:
        raise KeyError("text_embed column not found in dataframe")
    return "text_embed"

X_col_text = pick_text_only_columns(train)
print("Using text-only features from column:", X_col_text)


Using text-only features from column: text_embed


In [101]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score
)

tscv = TimeSeriesSplit(n_splits=5)
cv_rows = []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(train), 1):
    tr_df, va_df = train.iloc[tr_idx], train.iloc[va_idx]

    # Expand embeddings to numpy
    X_tr = np.vstack(tr_df[X_col_text].to_numpy())
    X_va = np.vstack(va_df[X_col_text].to_numpy())
    y_tr, y_va = tr_df["Movement"].to_numpy(), va_df["Movement"].to_numpy()

    # Logistic Regression (no scaling needed for embeddings)
    clf = LogisticRegression(max_iter=500, solver="liblinear", n_jobs=1, random_state=42)
    clf.fit(X_tr, y_tr)

    proba_va = clf.predict_proba(X_va)[:, 1]
    yhat_va  = (proba_va >= 0.5).astype(int)

    # Classification metrics
    acc  = accuracy_score(y_va, yhat_va)
    prec = precision_score(y_va, yhat_va, zero_division=0)
    rec  = recall_score(y_va, yhat_va, zero_division=0)
    f1   = f1_score(y_va, yhat_va, zero_division=0)
    mcc  = matthews_corrcoef(y_va, yhat_va)
    try:
        auroc = roc_auc_score(y_va, proba_va)
    except ValueError:
        auroc = np.nan

    # Trading metrics
    rr_col = realized_return_column(va_df)
    tm = trading_metrics(y_va, yhat_va, va_df[rr_col].to_numpy() if rr_col else None)

    cv_rows.append({
        "fold": fold, "n_val": len(va_df),
        "val_start": va_df["Date"].min().date(), "val_end": va_df["Date"].max().date(),
        "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "MCC": mcc, "AUROC": auroc,
        "WinRate": tm["win_rate"], "ProfitFactor": tm["profit_factor"], "Sharpe_252": tm["sharpe_252"],
        "ret_col": rr_col
    })

cv_df = pd.DataFrame(cv_rows)
print("\n=== Text-only CV results (per fold) ===")
display(cv_df)

print("\n=== Text-only CV means ===")
display(cv_df[["Accuracy","Precision","Recall","F1","MCC","AUROC","WinRate","ProfitFactor","Sharpe_252"]]
        .mean(numeric_only=True))



=== Text-only CV results (per fold) ===


Unnamed: 0,fold,n_val,val_start,val_end,Accuracy,Precision,Recall,F1,MCC,AUROC,WinRate,ProfitFactor,Sharpe_252,ret_col
0,1,466,2012-09-07,2015-12-03,0.521459,0.60452,0.411538,0.489703,0.073401,0.518913,0.512876,1.177658,0.934971,Daily_Return
1,2,466,2015-12-04,2017-10-17,0.540773,0.539957,0.996016,0.70028,0.033149,0.509182,0.444206,0.84844,-0.886536,Daily_Return
2,3,466,2017-10-18,2019-10-03,0.583691,0.588764,0.959707,0.729805,0.027355,0.470649,0.478541,0.864955,-0.791702,Daily_Return
3,4,466,2019-10-04,2021-08-26,0.603004,0.608225,0.985965,0.752343,-0.074153,0.523834,0.540773,1.02512,0.115186,Daily_Return
4,5,466,2021-08-27,2023-07-12,0.493562,0.493562,1.0,0.66092,0.0,0.491728,0.45279,0.842458,-1.037679,Daily_Return



=== Text-only CV means ===


Accuracy        0.548498
Precision       0.567006
Recall          0.870645
F1              0.666610
MCC             0.011951
AUROC           0.502861
WinRate         0.485837
ProfitFactor    0.951726
Sharpe_252     -0.333152
dtype: float64

In [102]:
# Full-train model
X_tr_full = np.vstack(train[X_col_text].to_numpy())
y_tr_full = train["Movement"].to_numpy()

clf_full = LogisticRegression(max_iter=1000, solver="liblinear", n_jobs=1, random_state=42)
clf_full.fit(X_tr_full, y_tr_full)

# OOD evaluation
X_test = np.vstack(ood[X_col_text].to_numpy())
y_test = ood["Movement"].to_numpy()

proba_test = clf_full.predict_proba(X_test)[:, 1]
yhat_test  = (proba_test >= 0.5).astype(int)

# Metrics
acc  = accuracy_score(y_test, yhat_test)
prec = precision_score(y_test, yhat_test, zero_division=0)
rec  = recall_score(y_test, yhat_test, zero_division=0)
f1   = f1_score(y_test, yhat_test, zero_division=0)
mcc  = matthews_corrcoef(y_test, yhat_test)
try:
    auroc = roc_auc_score(y_test, proba_test)
except ValueError:
    auroc = np.nan

rr_col_test = realized_return_column(ood)
tm = trading_metrics(y_test, yhat_test, ood[rr_col_test].to_numpy() if rr_col_test else None)

print("\n=== OOD (AAPL 2024) — Text-only LR ===")
print("Rows:", len(ood), "Date range:", ood["Date"].min().date(), "→", ood["Date"].max().date())
print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | MCC: {mcc:.4f} | AUROC: {auroc:.4f}")
if rr_col_test:
    print(f"WinRate: {tm['win_rate']:.4f} | ProfitFactor: {tm['profit_factor']:.4f} | Sharpe_252: {tm['sharpe_252']:.4f} | ReturnCol: {rr_col_test}")
else:
    print("No realized-return column found in OOD; trading metrics skipped.")



=== OOD (AAPL 2024) — Text-only LR ===
Rows: 227 Date range: 2024-01-09 → 2024-12-09
Accuracy: 0.4317 | Precision: 0.4317 | Recall: 1.0000 | F1: 0.6031 | MCC: 0.0000 | AUROC: 0.4205
WinRate: 0.4009 | ProfitFactor: 0.6686 | Sharpe_252: -2.2949 | ReturnCol: Daily_Return


# 3. Multimodal

In [103]:
# Numeric (incl. macro_z) + text_embed, exclude labels/leakage columns
def pick_numeric_incl_macro(df):
    drop = {
        "Date","Movement","z_retr","Daily_Return",  # label/leakage
        # keep macro_z this time (unlike the numeric-only baseline)
    }
    keep = []
    for c in df.columns:
        if c in drop: 
            continue
        if c == "text_embed":
            continue  # handled separately
        if pd.api.types.is_numeric_dtype(df[c]):
            keep.append(c)
    return keep

num_cols_mm = pick_numeric_incl_macro(train)
text_col    = "text_embed"

print("Multimodal numeric+macro feature count:", len(num_cols_mm))
print("First 10 numeric/macro cols:", num_cols_mm[:10])
print("Using text column:", text_col)


Multimodal numeric+macro feature count: 12
First 10 numeric/macro cols: ['Open', 'Close_lag1', 'High_lag1', 'Volume_lag1', 'Daily_Return_lag1', 'Volatility_lag1', 'sentiment_volatility_lag1', 'aggregate_sentiment_score_lag1', 'cpi_yoy_lagged_z', 'unrate_lagged_z']
Using text column: text_embed


In [104]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score
)
import numpy as np
import pandas as pd

tscv = TimeSeriesSplit(n_splits=5)
cv_rows = []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(train), 1):
    tr_df, va_df = train.iloc[tr_idx], train.iloc[va_idx]

    # Split by modality
    Xnum_tr = tr_df[num_cols_mm].to_numpy(dtype=float)
    Xnum_va = va_df[num_cols_mm].to_numpy(dtype=float)
    Xtxt_tr = np.vstack(tr_df[text_col].to_numpy()).astype("float32")
    Xtxt_va = np.vstack(va_df[text_col].to_numpy()).astype("float32")

    # Scale numerics (+macro_z) only; leave embeddings as-is
    scaler = StandardScaler(with_mean=True, with_std=True)
    Xnum_tr_s = scaler.fit_transform(Xnum_tr)
    Xnum_va_s = scaler.transform(Xnum_va)

    # Concatenate: [scaled numerics+macro | text embedding]
    X_tr = np.hstack([Xnum_tr_s, Xtxt_tr]).astype("float32")
    X_va = np.hstack([Xnum_va_s, Xtxt_va]).astype("float32")
    y_tr = tr_df["Movement"].to_numpy()
    y_va = va_df["Movement"].to_numpy()

    # LR classifier
    clf = LogisticRegression(max_iter=1000, solver="liblinear", n_jobs=1, random_state=42)
    clf.fit(X_tr, y_tr)

    proba_va = clf.predict_proba(X_va)[:, 1]
    yhat_va  = (proba_va >= 0.5).astype(int)

    # Classification metrics
    acc  = accuracy_score(y_va, yhat_va)
    prec = precision_score(y_va, yhat_va, zero_division=0)
    rec  = recall_score(y_va, yhat_va, zero_division=0)
    f1   = f1_score(y_va, yhat_va, zero_division=0)
    mcc  = matthews_corrcoef(y_va, yhat_va)
    try:
        auroc = roc_auc_score(y_va, proba_va)
    except ValueError:
        auroc = np.nan

    # Trading metrics
    rr_col = realized_return_column(va_df)
    tm = trading_metrics(y_va, yhat_va, va_df[rr_col].to_numpy() if rr_col else None)

    cv_rows.append({
        "fold": fold, "n_val": len(va_df),
        "val_start": va_df["Date"].min().date(), "val_end": va_df["Date"].max().date(),
        "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "MCC": mcc, "AUROC": auroc,
        "WinRate": tm["win_rate"], "ProfitFactor": tm["profit_factor"], "Sharpe_252": tm["sharpe_252"],
        "ret_col": rr_col
    })

cv_df_mm = pd.DataFrame(cv_rows)
print("\n=== Multimodal (No-Ret) CV results (per fold) ===")
display(cv_df_mm)

print("\n=== Multimodal (No-Ret) CV means ===")
display(cv_df_mm[["Accuracy","Precision","Recall","F1","MCC","AUROC","WinRate","ProfitFactor","Sharpe_252"]]
        .mean(numeric_only=True))



=== Multimodal (No-Ret) CV results (per fold) ===


Unnamed: 0,fold,n_val,val_start,val_end,Accuracy,Precision,Recall,F1,MCC,AUROC,WinRate,ProfitFactor,Sharpe_252,ret_col
0,1,466,2012-09-07,2015-12-03,0.641631,0.614251,0.961538,0.749625,0.297802,0.752987,0.577253,1.802142,3.347154,Daily_Return
1,2,466,2015-12-04,2017-10-17,0.551502,0.547945,0.956175,0.696662,0.073933,0.613972,0.459227,0.904769,-0.540038,Daily_Return
2,3,466,2017-10-18,2019-10-03,0.667382,0.648241,0.945055,0.769001,0.306498,0.783294,0.515021,1.28291,1.358183,Daily_Return
3,4,466,2019-10-04,2021-08-26,0.665236,0.661654,0.926316,0.77193,0.250679,0.678143,0.564378,1.946086,3.036088,Daily_Return
4,5,466,2021-08-27,2023-07-12,0.624464,0.568922,0.986957,0.721781,0.367838,0.796794,0.536481,1.491961,2.418346,Daily_Return



=== Multimodal (No-Ret) CV means ===


Accuracy        0.630043
Precision       0.608203
Recall          0.955208
F1              0.741800
MCC             0.259350
AUROC           0.725038
WinRate         0.530472
ProfitFactor    1.485574
Sharpe_252      1.923947
dtype: float64

In [105]:
# Build full-train matrices
Xnum_full = train[num_cols_mm].to_numpy(dtype=float)
Xtxt_full = np.vstack(train[text_col].to_numpy()).astype("float32")
scaler_full = StandardScaler(with_mean=True, with_std=True)
Xnum_full_s = scaler_full.fit_transform(Xnum_full)
X_full = np.hstack([Xnum_full_s, Xtxt_full]).astype("float32")
y_full = train["Movement"].to_numpy()

clf_full = LogisticRegression(max_iter=2000, solver="liblinear", n_jobs=1, random_state=42)
clf_full.fit(X_full, y_full)

# OOD matrices
Xnum_test = ood[num_cols_mm].to_numpy(dtype=float)
Xtxt_test = np.vstack(ood[text_col].to_numpy()).astype("float32")
Xnum_test_s = scaler_full.transform(Xnum_test)
X_test = np.hstack([Xnum_test_s, Xtxt_test]).astype("float32")
y_test = ood["Movement"].to_numpy()

proba_test = clf_full.predict_proba(X_test)[:, 1]
yhat_test  = (proba_test >= 0.5).astype(int)

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, roc_auc_score
)
acc  = accuracy_score(y_test, yhat_test)
prec = precision_score(y_test, yhat_test, zero_division=0)
rec  = recall_score(y_test, yhat_test, zero_division=0)
f1   = f1_score(y_test, yhat_test, zero_division=0)
mcc  = matthews_corrcoef(y_test, yhat_test)
try:
    auroc = roc_auc_score(y_test, proba_test)
except ValueError:
    auroc = np.nan

rr_col_test = realized_return_column(ood)
tm = trading_metrics(y_test, yhat_test, ood[rr_col_test].to_numpy() if rr_col_test else None)

print("\n=== OOD (AAPL 2024) — Multimodal (No-Ret) LR ===")
print("Rows:", len(ood), "Date range:", ood["Date"].min().date(), "→", ood["Date"].max().date())
print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | MCC: {mcc:.4f} | AUROC: {auroc:.4f}")
if rr_col_test:
    print(f"WinRate: {tm['win_rate']:.4f} | ProfitFactor: {tm['profit_factor']:.4f} | Sharpe_252: {tm['sharpe_252']:.4f} | ReturnCol: {rr_col_test}")
else:
    print("No realized-return column found in OOD; trading metrics skipped.")



=== OOD (AAPL 2024) — Multimodal (No-Ret) LR ===
Rows: 227 Date range: 2024-01-09 → 2024-12-09
Accuracy: 0.4714 | Precision: 0.4337 | Recall: 0.7347 | F1: 0.5455 | MCC: 0.0067 | AUROC: 0.4981
WinRate: 0.4405 | ProfitFactor: 0.9988 | Sharpe_252: -0.0071 | ReturnCol: Daily_Return


# Retrieval Code

# Text Retrieval

In [106]:
import os, time
os.environ.setdefault("OMP_NUM_THREADS","1"); os.environ.setdefault("MKL_NUM_THREADS","1")

import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, roc_auc_score)

ROOT   = Path("./")            # notebook at /ood_validation/macro_retrieval
TRAIND = ROOT / "train"
TESTD  = ROOT / "test"

# prefer files with realized returns
TRAIN_PARQUET = (TRAIND/"sp500_features_with_ret.parquet") if (TRAIND/"sp500_features_with_ret.parquet").exists() else (TRAIND/"sp500_features.parquet")
OOD_PARQUET_CAND = [
    TESTD/"x_test_ood_alpha0_with_ret.parquet",
    TESTD/"x_test_ood_with_ret.parquet",
    TESTD/"x_test_ood_alpha0.parquet",
    TESTD/"x_test_ood.parquet",
    TESTD/"x_test_ood_base_with_ret.parquet",
    TESTD/"x_test_ood_base.parquet",
]
for _p in OOD_PARQUET_CAND:
    if _p.exists():
        OOD_PARQUET = _p; break
else:
    raise FileNotFoundError("No OOD parquet found in /test.")

print("TRAIN:", TRAIN_PARQUET)
print("OOD  :", OOD_PARQUET)

train = pd.read_parquet(TRAIN_PARQUET).sort_values("Date").reset_index(drop=True)
ood   = pd.read_parquet(OOD_PARQUET).sort_values("Date").reset_index(drop=True)

def realized_return_column(df):
    if "Daily_Return" in df.columns and pd.api.types.is_numeric_dtype(df["Daily_Return"]): return "Daily_Return"
    for c in ["ret","RET","strategy_return","realized_return"]:
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c]): return c
    return None

def trading_metrics(y_true, y_pred, realized_ret):
    if realized_ret is None or len(realized_ret)==0:
        return {"win_rate": None, "profit_factor": None, "sharpe_252": None}
    pos = np.where(np.asarray(y_pred)>0, 1.0, -1.0)
    strat_ret = pos * np.asarray(realized_ret)
    win_rate = float((strat_ret>0).mean())
    gp = strat_ret[strat_ret>0].sum(); gl = -strat_ret[strat_ret<0].sum()
    pf = float(gp/gl) if gl>0 else np.inf
    mu, sd = strat_ret.mean(), strat_ret.std(ddof=1)
    sharpe = float((mu/sd)*np.sqrt(252)) if sd>0 else np.nan
    return {"win_rate":win_rate,"profit_factor":pf,"sharpe_252":sharpe}

# --- feature selectors ---
def pick_text_col(df):
    if "text_embed" not in df.columns: raise KeyError("text_embed missing")
    return "text_embed"

def pick_numeric_no_macro(df):
    drop = {"Date","Movement","text_embed","z_retr",
            "cpi_yoy_lagged_z","unrate_lagged_z","t10y2y_lagged_z","gdp_qoq_lagged_z",
            "Daily_Return"}
    keep = [c for c in df.columns if c not in drop and pd.api.types.is_numeric_dtype(df[c])]
    return keep

text_col = pick_text_col(train)
num_cols = pick_numeric_no_macro(train)
print("Numerics (no macro_z) count:", len(num_cols), "| text col:", text_col)


TRAIN: train/sp500_features_with_ret.parquet
OOD  : test/x_test_ood_base_with_ret.parquet
Numerics (no macro_z) count: 8 | text col: text_embed


In [107]:
import faiss

def to_unit_rows(M):
    M = M.astype("float32")
    nrm = np.linalg.norm(M, axis=1, keepdims=True) + 1e-9
    return M / nrm

def build_faiss_index(vecs, use_gpu=True):
    # IndexFlatIP over L2-normalized vectors => cosine similarity
    d = vecs.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    if use_gpu and faiss.get_num_gpus()>0:
        res = faiss.StandardGpuResources()
        return faiss.index_cpu_to_gpu(res, 0, cpu_index), True
    return cpu_index, False

def compute_zretr(query_text, ref_text, ref_dates, query_dates, K=5, batch=512, use_gpu=True):
    """
    α=0 -> queries = normalized text only. ref_text are normalized text of ref set.
    Causal mask: ref_date < query_date. Aggregate neighbor TEXT into z_retr (mean).
    """
    q = to_unit_rows(query_text)
    r = to_unit_rows(ref_text)

    index, gpu = build_faiss_index(r, use_gpu=use_gpu)
    index.add(r)

    z = np.zeros((q.shape[0], r.shape[1]), dtype="float32")
    effk = []
    for s in range(0, q.shape[0], batch):
        e = min(s+batch, q.shape[0])
        D, I = index.search(q[s:e], K*10)  # oversample for mask
        for i in range(s, e):
            cand = I[i-s]
            mask = (ref_dates[cand] < query_dates[i])
            pick = np.where(mask)[0][:K]
            if pick.size==0:  # fallback
                pick = np.arange(min(K, len(cand)))
            ids = cand[pick]
            z[i] = r[ids].mean(axis=0)  # mean of TEXT neighbors
            effk.append(len(ids))
    return z, np.array(effk, dtype=int)


In [109]:
tscv = TimeSeriesSplit(n_splits=5)
cv_rows = []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(train), 1):
    tr_df, va_df = train.iloc[tr_idx].copy(), train.iloc[va_idx].copy()

    # --- Build reference (fold-train) once for this fold ---
    ref_text  = np.vstack(tr_df[text_col].to_numpy()).astype("float32")
    ref_dates = tr_df["Date"].to_numpy(dtype="datetime64[ns]")

    # --- z_retr for TRAIN (causal within fold-train) ---
    z_tr, effk_tr = compute_zretr(
        query_text = ref_text,
        ref_text   = ref_text,
        ref_dates  = ref_dates,
        query_dates= ref_dates,
        K=5, batch=512, use_gpu=True
    )

    # --- z_retr for VAL (query fold-val against fold-train reference) ---
    qry_text  = np.vstack(va_df[text_col].to_numpy()).astype("float32")
    qry_dates = va_df["Date"].to_numpy(dtype="datetime64[ns]")
    z_va, effk_va = compute_zretr(
        query_text = qry_text,
        ref_text   = ref_text,
        ref_dates  = ref_dates,
        query_dates= qry_dates,
        K=5, batch=512, use_gpu=True
    )

    # --- Numerics (no macro) scaling ---
    Xnum_tr = tr_df[num_cols].to_numpy(dtype=float)
    Xnum_va = va_df[num_cols].to_numpy(dtype=float)
    scaler = StandardScaler(with_mean=True, with_std=True)
    Xnum_tr_s = scaler.fit_transform(Xnum_tr)
    Xnum_va_s = scaler.transform(Xnum_va)

    # --- Text ---
    Xtxt_tr = ref_text
    Xtxt_va = qry_text

    # --- Final design matrices: [scaled numerics | text | z_retr] ---
    X_tr = np.hstack([Xnum_tr_s, Xtxt_tr, z_tr]).astype("float32")
    X_va = np.hstack([Xnum_va_s, Xtxt_va, z_va]).astype("float32")
    y_tr = tr_df["Movement"].to_numpy()
    y_va = va_df["Movement"].to_numpy()

    # --- Classifier ---
    clf = LogisticRegression(max_iter=1000, solver="liblinear", n_jobs=1, random_state=42)
    clf.fit(X_tr, y_tr)

    proba_va = clf.predict_proba(X_va)[:, 1]
    yhat_va  = (proba_va >= 0.5).astype(int)

    # --- Metrics ---
    acc  = accuracy_score(y_va, yhat_va)
    prec = precision_score(y_va, yhat_va, zero_division=0)
    rec  = recall_score(y_va, yhat_va, zero_division=0)
    f1   = f1_score(y_va, yhat_va, zero_division=0)
    mcc  = matthews_corrcoef(y_va, yhat_va)
    try:
        auroc = roc_auc_score(y_va, proba_va)
    except ValueError:
        auroc = np.nan
    rr_col = realized_return_column(va_df)
    tm = trading_metrics(y_va, yhat_va, va_df[rr_col].to_numpy() if rr_col else None)

    cv_rows.append({
        "fold": fold, "n_val": len(va_df),
        "val_start": va_df["Date"].min().date(), "val_end": va_df["Date"].max().date(),
        "effK_tr_min": int(effk_tr.min()), "effK_tr_med": float(np.median(effk_tr)), "effK_tr_max": int(effk_tr.max()),
        "effK_va_min": int(effk_va.min()), "effK_va_med": float(np.median(effk_va)), "effK_va_max": int(effk_va.max()),
        "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "MCC": mcc, "AUROC": auroc,
        "WinRate": tm["win_rate"], "ProfitFactor": tm["profit_factor"], "Sharpe_252": tm["sharpe_252"]
    })

cv_textret = pd.DataFrame(cv_rows)
print("\n=== Text-Ret (α=0) CV results (per fold) ===")
display(cv_textret)

print("\n=== Text-Ret (α=0) CV means ===")
display(cv_textret[["Accuracy","Precision","Recall","F1","MCC","AUROC","WinRate","ProfitFactor","Sharpe_252"]]
        .mean(numeric_only=True))



=== Text-Ret (α=0) CV results (per fold) ===


Unnamed: 0,fold,n_val,val_start,val_end,effK_tr_min,effK_tr_med,effK_tr_max,effK_va_min,effK_va_med,effK_va_max,Accuracy,Precision,Recall,F1,MCC,AUROC,WinRate,ProfitFactor,Sharpe_252
0,1,466,2012-09-07,2015-12-03,1,5.0,5,5,5.0,5,0.645923,0.818792,0.469231,0.596577,0.3601,0.762173,0.628755,2.342203,4.800363
1,2,466,2015-12-04,2017-10-17,1,5.0,5,5,5.0,5,0.538627,0.540541,0.956175,0.690647,0.017248,0.62068,0.454936,0.820596,-1.066191
2,3,466,2017-10-18,2019-10-03,1,5.0,5,5,5.0,5,0.641631,0.625592,0.967033,0.759712,0.24995,0.775456,0.515021,1.263594,1.275705
3,4,466,2019-10-04,2021-08-26,1,5.0,5,5,5.0,5,0.67382,0.652874,0.996491,0.788889,0.317312,0.829078,0.590129,2.27016,3.703691
4,5,466,2021-08-27,2023-07-12,1,5.0,5,5,5.0,5,0.517167,0.505495,1.0,0.671533,0.153497,0.841839,0.459227,0.92838,-0.449937



=== Text-Ret (α=0) CV means ===


Accuracy        0.603433
Precision       0.628659
Recall          0.877786
F1              0.701472
MCC             0.219621
AUROC           0.765845
WinRate         0.529614
ProfitFactor    1.524987
Sharpe_252      1.652726
dtype: float64

In [110]:
# --- Full-train z_retr (causal within full train) ---
ref_text_full  = np.vstack(train[text_col].to_numpy()).astype("float32")
ref_dates_full = train["Date"].to_numpy(dtype="datetime64[ns]")

z_tr_full, effk_full = compute_zretr(
    query_text = ref_text_full,
    ref_text   = ref_text_full,
    ref_dates  = ref_dates_full,
    query_dates= ref_dates_full,
    K=5, batch=1024, use_gpu=True
)
print("Full-train effK stats:", effk_full.min(), np.median(effk_full), effk_full.max())

# --- Build full-train matrices ---
Xnum_full = train[num_cols].to_numpy(dtype=float)
scaler_full = StandardScaler(with_mean=True, with_std=True).fit(Xnum_full)
Xnum_full_s = scaler_full.transform(Xnum_full)
Xtxt_full   = ref_text_full

X_full = np.hstack([Xnum_full_s, Xtxt_full, z_tr_full]).astype("float32")
y_full = train["Movement"].to_numpy()

clf_full = LogisticRegression(max_iter=2000, solver="liblinear", n_jobs=1, random_state=42).fit(X_full, y_full)

# --- OOD: compute z_retr α=0 against FULL TRAIN (causal to OOD dates) ---
qry_text_ood  = np.vstack(ood[text_col].to_numpy()).astype("float32")
qry_dates_ood = ood["Date"].to_numpy(dtype="datetime64[ns]")

z_ood, effk_ood = compute_zretr(
    query_text = qry_text_ood,
    ref_text   = ref_text_full,
    ref_dates  = ref_dates_full,
    query_dates= qry_dates_ood,
    K=5, batch=1024, use_gpu=True
)
print("OOD effK stats:", effk_ood.min(), np.median(effk_ood), effk_ood.max())

# --- OOD design matrix ---
Xnum_test = ood[num_cols].to_numpy(dtype=float)
Xtxt_test = qry_text_ood
X_test = np.hstack([scaler_full.transform(Xnum_test), Xtxt_test, z_ood]).astype("float32")
y_test = ood["Movement"].to_numpy()

proba_test = clf_full.predict_proba(X_test)[:, 1]
yhat_test  = (proba_test >= 0.5).astype(int)

# --- Metrics ---
acc  = accuracy_score(y_test, yhat_test)
prec = precision_score(y_test, yhat_test, zero_division=0)
rec  = recall_score(y_test, yhat_test, zero_division=0)
f1   = f1_score(y_test, yhat_test, zero_division=0)
mcc  = matthews_corrcoef(y_test, yhat_test)
try:
    auroc = roc_auc_score(y_test, proba_test)
except ValueError:
    auroc = np.nan

rr_col_test = realized_return_column(ood)
tm = trading_metrics(y_test, yhat_test, ood[rr_col_test].to_numpy() if rr_col_test else None)

print("\n=== OOD (AAPL 2024) — Text-Ret (α=0) LR ===")
print("Rows:", len(ood), "Date range:", ood["Date"].min().date(), "→", ood["Date"].max().date())
print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | MCC: {mcc:.4f} | AUROC: {auroc:.4f}")
if rr_col_test:
    print(f"WinRate: {tm['win_rate']:.4f} | ProfitFactor: {tm['profit_factor']:.4f} | Sharpe_252: {tm['sharpe_252']:.4f} | ReturnCol: {rr_col_test}")
else:
    print("No realized-return column found in OOD; trading metrics skipped.")


Full-train effK stats: 1 5.0 5
OOD effK stats: 5 5.0 5

=== OOD (AAPL 2024) — Text-Ret (α=0) LR ===
Rows: 227 Date range: 2024-01-09 → 2024-12-09
Accuracy: 0.4273 | Precision: 0.4259 | Recall: 0.9388 | F1: 0.5860 | MCC: -0.0518 | AUROC: 0.5247
WinRate: 0.4053 | ProfitFactor: 0.7517 | Sharpe_252: -1.6300 | ReturnCol: Daily_Return


# Macro Retrieval

In [111]:
# Config
ALPHA = 0.5        # try {0.25, 0.5, 1.0}
K = 5              # try {3,5,10}
BATCH = 512
USE_GPU = True

MACRO_COLS = ["cpi_yoy_lagged_z","unrate_lagged_z","t10y2y_lagged_z","gdp_qoq_lagged_z"]
text_col  = "text_embed"

def to_unit_rows(M):
    M = M.astype("float32")
    return M / (np.linalg.norm(M, axis=1, keepdims=True) + 1e-9)

def make_joint(text_mat, macro_mat, alpha):
    # [text ; α * macro] then L2-normalize → dim = 384 + 4
    joint = np.concatenate([text_mat.astype("float32"), (alpha * macro_mat.astype("float32"))], axis=1)
    return to_unit_rows(joint)

import faiss
def build_flat_ip(d, use_gpu=True):
    idx = faiss.IndexFlatIP(d)
    if use_gpu and faiss.get_num_gpus() > 0:
        res = faiss.StandardGpuResources()
        return faiss.index_cpu_to_gpu(res, 0, idx), True
    return idx, False

def z_retr_from_joint(q_joint, r_joint, r_text, r_dates, q_dates, K=5, batch=512, use_gpu=True):
    # r_joint is the FAISS index vectors (normalized joint); r_text aggregated for z_retr
    d = r_joint.shape[1]
    index, gpu = build_flat_ip(d, use_gpu)
    index.add(r_joint)

    Z = np.zeros((q_joint.shape[0], r_text.shape[1]), dtype="float32")
    effK = []

    for s in range(0, q_joint.shape[0], batch):
        e = min(s+batch, q_joint.shape[0])
        D, I = index.search(q_joint[s:e], K*10)  # oversample then causal-mask
        for i in range(s, e):
            cand = I[i-s]
            mask = (r_dates[cand] < q_dates[i])
            pick = np.where(mask)[0][:K]
            if pick.size == 0:
                pick = np.arange(min(K, len(cand)))
            ids = cand[pick]
            Z[i] = r_text[ids].mean(axis=0)
            effK.append(len(ids))
    return Z, np.array(effK, dtype=int)


In [112]:
# Use numerics + macro_z
def pick_numeric_incl_macro(df):
    drop = {"Date","Movement","z_retr","Daily_Return","text_embed"}
    keep = [c for c in df.columns if c not in drop and pd.api.types.is_numeric_dtype(df[c])]
    return keep

num_cols = pick_numeric_incl_macro(train)
print("Numerics+macro count:", len(num_cols))

tscv = TimeSeriesSplit(n_splits=5)
rows = []

for fold, (tr_idx, va_idx) in enumerate(tscv.split(train), 1):
    tr_df, va_df = train.iloc[tr_idx].copy(), train.iloc[va_idx].copy()

    # Reference (fold-train)
    R_text  = np.vstack(tr_df[text_col].to_numpy()).astype("float32")
    R_macro = tr_df[MACRO_COLS].to_numpy().astype("float32")
    R_joint = make_joint(R_text, R_macro, ALPHA)
    R_dates = tr_df["Date"].to_numpy(dtype="datetime64[ns]")

    # TRAIN z_retr (causal within fold-train)
    Ztr, effK_tr = z_retr_from_joint(
        q_joint = R_joint, r_joint = R_joint, r_text = R_text,
        r_dates = R_dates, q_dates = R_dates,
        K=K, batch=BATCH, use_gpu=USE_GPU
    )

    # VAL z_retr (query val against fold-train)
    Q_text  = np.vstack(va_df[text_col].to_numpy()).astype("float32")
    Q_macro = va_df[MACRO_COLS].to_numpy().astype("float32")
    Q_joint = make_joint(Q_text, Q_macro, ALPHA)
    Q_dates = va_df["Date"].to_numpy(dtype="datetime64[ns]")

    Zva, effK_va = z_retr_from_joint(
        q_joint = Q_joint, r_joint = R_joint, r_text = R_text,
        r_dates = R_dates, q_dates = Q_dates,
        K=K, batch=BATCH, use_gpu=USE_GPU
    )

    # Build design matrices: [scaled numerics+macro | text | z_retr]
    Xnum_tr = tr_df[num_cols].to_numpy(dtype=float)
    Xnum_va = va_df[num_cols].to_numpy(dtype=float)
    scaler  = StandardScaler(with_mean=True, with_std=True)
    Xnum_tr_s = scaler.fit_transform(Xnum_tr)
    Xnum_va_s = scaler.transform(Xnum_va)

    Xtxt_tr = R_text
    Xtxt_va = Q_text

    X_tr = np.hstack([Xnum_tr_s, Xtxt_tr, Ztr]).astype("float32")
    X_va = np.hstack([Xnum_va_s, Xtxt_va, Zva]).astype("float32")
    y_tr = tr_df["Movement"].to_numpy()
    y_va = va_df["Movement"].to_numpy()

    clf = LogisticRegression(max_iter=2000, solver="liblinear", n_jobs=1, random_state=42)
    clf.fit(X_tr, y_tr)

    proba_va = clf.predict_proba(X_va)[:, 1]
    yhat_va  = (proba_va >= 0.5).astype(int)

    # Metrics
    acc  = accuracy_score(y_va, yhat_va)
    prec = precision_score(y_va, yhat_va, zero_division=0)
    rec  = recall_score(y_va, yhat_va, zero_division=0)
    f1   = f1_score(y_va, yhat_va, zero_division=0)
    mcc  = matthews_corrcoef(y_va, yhat_va)
    try:
        auroc = roc_auc_score(y_va, proba_va)
    except ValueError:
        auroc = np.nan
    rr_col = realized_return_column(va_df)
    tm = trading_metrics(y_va, yhat_va, va_df[rr_col].to_numpy() if rr_col else None)

    rows.append({
        "fold": fold, "n_val": len(va_df),
        "effK_tr_min": int(effK_tr.min()), "effK_tr_med": float(np.median(effK_tr)), "effK_tr_max": int(effK_tr.max()),
        "effK_va_min": int(effK_va.min()), "effK_va_med": float(np.median(effK_va)), "effK_va_max": int(effK_va.max()),
        "Accuracy": acc, "Precision": prec, "Recall": rec, "F1": f1, "MCC": mcc, "AUROC": auroc,
        "WinRate": tm["win_rate"], "ProfitFactor": tm["profit_factor"], "Sharpe_252": tm["sharpe_252"],
        "val_start": va_df["Date"].min().date(), "val_end": va_df["Date"].max().date()
    })

cv_macroret = pd.DataFrame(rows)
print("\n=== Macro-Ret (α={}) CV results (per fold) ===".format(ALPHA))
display(cv_macroret)

print("\n=== Macro-Ret (α={}) CV means ===".format(ALPHA))
display(cv_macroret[["Accuracy","Precision","Recall","F1","MCC","AUROC","WinRate","ProfitFactor","Sharpe_252"]]
        .mean(numeric_only=True))


Numerics+macro count: 12

=== Macro-Ret (α=0.5) CV results (per fold) ===


Unnamed: 0,fold,n_val,effK_tr_min,effK_tr_med,effK_tr_max,effK_va_min,effK_va_med,effK_va_max,Accuracy,Precision,Recall,F1,MCC,AUROC,WinRate,ProfitFactor,Sharpe_252,val_start,val_end
0,1,466,1,5.0,5,5,5.0,5,0.637339,0.608592,0.980769,0.751105,0.304523,0.751998,0.585837,1.802533,3.34837,2012-09-07,2015-12-03
1,2,466,1,5.0,5,5,5.0,5,0.549356,0.546697,0.956175,0.695652,0.06528,0.602594,0.457082,0.899747,-0.570059,2015-12-04,2017-10-17
2,3,466,1,5.0,5,5,5.0,5,0.67382,0.653944,0.941392,0.771772,0.320814,0.769098,0.530043,1.289516,1.386096,2017-10-18,2019-10-03
3,4,466,1,5.0,5,5,5.0,5,0.669528,0.663342,0.933333,0.77551,0.263743,0.673704,0.56867,1.966396,3.081678,2019-10-04,2021-08-26
4,5,466,1,5.0,5,5,5.0,5,0.587983,0.546341,0.973913,0.7,0.285645,0.784433,0.508584,1.320267,1.680882,2021-08-27,2023-07-12



=== Macro-Ret (α=0.5) CV means ===


Accuracy        0.623605
Precision       0.603783
Recall          0.957117
F1              0.738808
MCC             0.248001
AUROC           0.716365
WinRate         0.530043
ProfitFactor    1.455692
Sharpe_252      1.785393
dtype: float64

In [113]:
# Full-train reference
R_text_full  = np.vstack(train[text_col].to_numpy()).astype("float32")
R_macro_full = train[MACRO_COLS].to_numpy().astype("float32")
R_joint_full = make_joint(R_text_full, R_macro_full, ALPHA)
R_dates_full = train["Date"].to_numpy(dtype="datetime64[ns]")

# Full-train z_retr (causal)
Ztr_full, effK_full = z_retr_from_joint(
    q_joint = R_joint_full, r_joint = R_joint_full, r_text = R_text_full,
    r_dates = R_dates_full, q_dates = R_dates_full,
    K=K, batch=1024, use_gpu=USE_GPU
)
print("Full-train effK stats:", effK_full.min(), np.median(effK_full), effK_full.max())

# Train matrix
Xnum_full  = train[num_cols].to_numpy(dtype=float)
scaler_full = StandardScaler(with_mean=True, with_std=True).fit(Xnum_full)
Xnum_full_s = scaler_full.transform(Xnum_full)
X_full = np.hstack([Xnum_full_s, R_text_full, Ztr_full]).astype("float32")
y_full = train["Movement"].to_numpy()

clf_full = LogisticRegression(max_iter=3000, solver="liblinear", n_jobs=1, random_state=42).fit(X_full, y_full)

# OOD z_retr with joint queries (macro-aware)
Q_text_ood  = np.vstack(ood[text_col].to_numpy()).astype("float32")
Q_macro_ood = ood[MACRO_COLS].to_numpy().astype("float32")
Q_joint_ood = make_joint(Q_text_ood, Q_macro_ood, ALPHA)
Q_dates_ood = ood["Date"].to_numpy(dtype="datetime64[ns]")

Z_ood, effK_ood = z_retr_from_joint(
    q_joint = Q_joint_ood, r_joint = R_joint_full, r_text = R_text_full,
    r_dates = R_dates_full, q_dates = Q_dates_ood,
    K=K, batch=1024, use_gpu=USE_GPU
)
print("OOD effK stats:", effK_ood.min(), np.median(effK_ood), effK_ood.max())

# OOD matrix
Xnum_test  = ood[num_cols].to_numpy(dtype=float)
Xnum_test_s = scaler_full.transform(Xnum_test)
X_test = np.hstack([Xnum_test_s, Q_text_ood, Z_ood]).astype("float32")
y_test = ood["Movement"].to_numpy()

proba_test = clf_full.predict_proba(X_test)[:, 1]
yhat_test  = (proba_test >= 0.5).astype(int)

acc  = accuracy_score(y_test, yhat_test)
prec = precision_score(y_test, yhat_test, zero_division=0)
rec  = recall_score(y_test, yhat_test, zero_division=0)
f1   = f1_score(y_test, yhat_test, zero_division=0)
mcc  = matthews_corrcoef(y_test, yhat_test)
try:
    auroc = roc_auc_score(y_test, proba_test)
except ValueError:
    auroc = np.nan

rr_col_test = realized_return_column(ood)
tm = trading_metrics(y_test, yhat_test, ood[rr_col_test].to_numpy() if rr_col_test else None)

print("\n=== OOD (AAPL 2024) — Macro-Ret (α={}) LR ===".format(ALPHA))
print("Rows:", len(ood), "Date range:", ood["Date"].min().date(), "→", ood["Date"].max().date())
print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | MCC: {mcc:.4f} | AUROC: {auroc:.4f}")
if rr_col_test:
    print(f"WinRate: {tm['win_rate']:.4f} | ProfitFactor: {tm['profit_factor']:.4f} | Sharpe_252: {tm['sharpe_252']:.4f} | ReturnCol: {rr_col_test}")
else:
    print("No realized-return column found in OOD; trading metrics skipped.")


Full-train effK stats: 1 5.0 5
OOD effK stats: 5 5.0 5

=== OOD (AAPL 2024) — Macro-Ret (α=0.5) LR ===
Rows: 227 Date range: 2024-01-09 → 2024-12-09
Accuracy: 0.4537 | Precision: 0.4097 | Recall: 0.6020 | F1: 0.4876 | MCC: -0.0585 | AUROC: 0.4994
WinRate: 0.4581 | ProfitFactor: 1.1043 | Sharpe_252: 0.5671 | ReturnCol: Daily_Return


# Alpha Sweep (α ∈ {0.25, 0.5, 1.0}) × (K ∈ {3,5,10})

In [114]:
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, matthews_corrcoef, roc_auc_score)

ALPHAS = [0.25, 0.5, 1.0]
KS     = [3, 5, 10]
BATCH  = 512
USE_GPU = True

MACRO_COLS = ["cpi_yoy_lagged_z","unrate_lagged_z","t10y2y_lagged_z","gdp_qoq_lagged_z"]
text_col   = "text_embed"

def pick_numeric_incl_macro(df):
    drop = {"Date","Movement","z_retr","Daily_Return","text_embed"}
    return [c for c in df.columns if c not in drop and pd.api.types.is_numeric_dtype(df[c])]

num_cols = pick_numeric_incl_macro(train)

# Pre-materialize arrays once
R_TEXT_FULL  = np.vstack(train[text_col].to_numpy()).astype("float32")
R_MACRO_FULL = train[MACRO_COLS].to_numpy().astype("float32")
R_DATES_FULL = train["Date"].to_numpy(dtype="datetime64[ns]")

Q_TEXT_OOD   = np.vstack(ood[text_col].to_numpy()).astype("float32")
Q_MACRO_OOD  = ood[MACRO_COLS].to_numpy().astype("float32")
Q_DATES_OOD  = ood["Date"].to_numpy(dtype="datetime64[ns]")

def run_macroret(alpha, K):
    # ---- CV ----
    tscv = TimeSeriesSplit(n_splits=5)
    cv_metrics = []

    for fold, (tr_idx, va_idx) in enumerate(tscv.split(train), 1):
        tr_df, va_df = train.iloc[tr_idx].copy(), train.iloc[va_idx].copy()

        # Fold reference (train)
        R_text  = np.vstack(tr_df[text_col].to_numpy()).astype("float32")
        R_macro = tr_df[MACRO_COLS].to_numpy().astype("float32")
        R_joint = make_joint(R_text, R_macro, alpha)
        R_dates = tr_df["Date"].to_numpy(dtype="datetime64[ns]")

        # z_retr for train (causal within train)
        Ztr, _ = z_retr_from_joint(R_joint, R_joint, R_text, R_dates, R_dates, K=K, batch=BATCH, use_gpu=USE_GPU)

        # z_retr for val (query val vs train)
        Q_text = np.vstack(va_df[text_col].to_numpy()).astype("float32")
        Q_macro = va_df[MACRO_COLS].to_numpy().astype("float32")
        Q_joint = make_joint(Q_text, Q_macro, alpha)
        Q_dates = va_df["Date"].to_numpy(dtype="datetime64[ns]")
        Zva, _ = z_retr_from_joint(Q_joint, R_joint, R_text, R_dates, Q_dates, K=K, batch=BATCH, use_gpu=USE_GPU)

        # Design matrices: [scaled numerics+macro | text | z_retr]
        Xnum_tr = tr_df[num_cols].to_numpy(dtype=float)
        Xnum_va = va_df[num_cols].to_numpy(dtype=float)
        scaler  = StandardScaler(with_mean=True, with_std=True).fit(Xnum_tr)
        X_tr = np.hstack([scaler.transform(Xnum_tr),
                          R_text,
                          Ztr]).astype("float32")
        X_va = np.hstack([scaler.transform(Xnum_va),
                          Q_text,
                          Zva]).astype("float32")

        y_tr = tr_df["Movement"].to_numpy()
        y_va = va_df["Movement"].to_numpy()

        clf = LogisticRegression(max_iter=3000, solver="liblinear", n_jobs=1, random_state=42).fit(X_tr, y_tr)
        proba_va = clf.predict_proba(X_va)[:, 1]
        yhat_va  = (proba_va >= 0.5).astype(int)

        # Metrics
        acc  = accuracy_score(y_va, yhat_va)
        prec = precision_score(y_va, yhat_va, zero_division=0)
        rec  = recall_score(y_va, yhat_va, zero_division=0)
        f1   = f1_score(y_va, yhat_va, zero_division=0)
        mcc  = matthews_corrcoef(y_va, yhat_va)
        try: auroc = roc_auc_score(y_va, proba_va)
        except ValueError: auroc = np.nan
        rr_col = realized_return_column(va_df)
        tm = trading_metrics(y_va, yhat_va, va_df[rr_col].to_numpy() if rr_col else None)

        cv_metrics.append([acc,prec,rec,f1,mcc,auroc,tm["win_rate"],tm["profit_factor"],tm["sharpe_252"]])

    cv_mean = np.nanmean(np.array(cv_metrics, dtype=float), axis=0)
    (cv_acc,cv_prec,cv_rec,cv_f1,cv_mcc,cv_auroc,cv_wr,cv_pf,cv_sharpe) = cv_mean

    # ---- Full-train → OOD ----
    R_joint_full = make_joint(R_TEXT_FULL, R_MACRO_FULL, alpha)
    Ztr_full, _  = z_retr_from_joint(R_joint_full, R_joint_full, R_TEXT_FULL, R_DATES_FULL, R_DATES_FULL,
                                     K=K, batch=1024, use_gpu=USE_GPU)

    Xnum_full   = train[num_cols].to_numpy(dtype=float)
    scaler_full = StandardScaler(with_mean=True, with_std=True).fit(Xnum_full)
    X_full = np.hstack([scaler_full.transform(Xnum_full),
                        R_TEXT_FULL,
                        Ztr_full]).astype("float32")
    y_full = train["Movement"].to_numpy()

    clf_full = LogisticRegression(max_iter=3000, solver="liblinear", n_jobs=1, random_state=42).fit(X_full, y_full)

    # OOD z_retr (query OOD vs full train)
    Q_joint_ood = make_joint(Q_TEXT_OOD, Q_MACRO_OOD, alpha)
    Z_ood, _    = z_retr_from_joint(Q_joint_ood, R_joint_full, R_TEXT_FULL, R_DATES_FULL, Q_DATES_OOD,
                                    K=K, batch=1024, use_gpu=USE_GPU)

    Xnum_test = ood[num_cols].to_numpy(dtype=float)
    X_test = np.hstack([scaler_full.transform(Xnum_test),
                        Q_TEXT_OOD,
                        Z_ood]).astype("float32")
    y_test = ood["Movement"].to_numpy()

    proba_test = clf_full.predict_proba(X_test)[:, 1]
    yhat_test  = (proba_test >= 0.5).astype(int)

    acc  = accuracy_score(y_test, yhat_test)
    prec = precision_score(y_test, yhat_test, zero_division=0)
    rec  = recall_score(y_test, yhat_test, zero_division=0)
    f1   = f1_score(y_test, yhat_test, zero_division=0)
    mcc  = matthews_corrcoef(y_test, yhat_test)
    try: auroc = roc_auc_score(y_test, proba_test)
    except ValueError: auroc = np.nan
    rr_col_test = realized_return_column(ood)
    tm = trading_metrics(y_test, yhat_test, ood[rr_col_test].to_numpy() if rr_col_test else None)

    return {
        "alpha": alpha, "K": K,
        # CV
        "CV_Accuracy": cv_acc, "CV_Precision": cv_prec, "CV_Recall": cv_rec, "CV_F1": cv_f1,
        "CV_MCC": cv_mcc, "CV_AUROC": cv_auroc, "CV_WinRate": cv_wr,
        "CV_ProfitFactor": cv_pf, "CV_Sharpe_252": cv_sharpe,
        # OOD
        "OOD_Accuracy": acc, "OOD_Precision": prec, "OOD_Recall": rec, "OOD_F1": f1,
        "OOD_MCC": mcc, "OOD_AUROC": auroc,
        "OOD_WinRate": tm["win_rate"], "OOD_ProfitFactor": tm["profit_factor"], "OOD_Sharpe_252": tm["sharpe_252"]
    }

# ---- Run sweep ----
results = []
for a in ALPHAS:
    for k in KS:
        print(f"\n>>> Running Macro-Ret sweep: alpha={a}, K={k}")
        res = run_macroret(alpha=a, K=k)
        results.append(res)

sweep_df = pd.DataFrame(results)
print("\n=== Sweep summary (sorted by OOD Sharpe) ===")
display(sweep_df.sort_values("OOD_Sharpe_252", ascending=False))

print("\nTop-3 by OOD Profit Factor:")
display(sweep_df.sort_values("OOD_ProfitFactor", ascending=False).head(3)[
    ["alpha","K","OOD_ProfitFactor","OOD_Sharpe_252","OOD_WinRate","OOD_AUROC"]
])



>>> Running Macro-Ret sweep: alpha=0.25, K=3

>>> Running Macro-Ret sweep: alpha=0.25, K=5

>>> Running Macro-Ret sweep: alpha=0.25, K=10

>>> Running Macro-Ret sweep: alpha=0.5, K=3

>>> Running Macro-Ret sweep: alpha=0.5, K=5

>>> Running Macro-Ret sweep: alpha=0.5, K=10

>>> Running Macro-Ret sweep: alpha=1.0, K=3

>>> Running Macro-Ret sweep: alpha=1.0, K=5

>>> Running Macro-Ret sweep: alpha=1.0, K=10

=== Sweep summary (sorted by OOD Sharpe) ===


Unnamed: 0,alpha,K,CV_Accuracy,CV_Precision,CV_Recall,CV_F1,CV_MCC,CV_AUROC,CV_WinRate,CV_ProfitFactor,CV_Sharpe_252,OOD_Accuracy,OOD_Precision,OOD_Recall,OOD_F1,OOD_MCC,OOD_AUROC,OOD_WinRate,OOD_ProfitFactor,OOD_Sharpe_252
4,0.5,5,0.623605,0.603783,0.957117,0.738808,0.248001,0.716365,0.530043,1.455692,1.785393,0.453744,0.409722,0.602041,0.487603,-0.058493,0.499367,0.45815,1.104282,0.567124
2,0.25,10,0.62618,0.60429,0.959702,0.740596,0.249039,0.718346,0.536052,1.488445,1.937222,0.449339,0.415094,0.673469,0.513619,-0.05132,0.496124,0.453744,1.093864,0.512946
5,0.5,10,0.624034,0.604152,0.957408,0.739148,0.246426,0.716644,0.528755,1.45999,1.829276,0.449339,0.415094,0.673469,0.513619,-0.05132,0.493672,0.453744,1.093864,0.512946
6,1.0,3,0.627897,0.605758,0.958278,0.741109,0.257208,0.713381,0.53691,1.50419,1.944361,0.462555,0.423077,0.673469,0.519685,-0.02586,0.49913,0.45815,1.065882,0.364815
7,1.0,5,0.623176,0.603068,0.957818,0.73868,0.246295,0.714433,0.53133,1.485478,1.9086,0.453744,0.419753,0.693878,0.523077,-0.038136,0.498813,0.449339,1.05786,0.32162
1,0.25,5,0.625751,0.603034,0.961876,0.740496,0.254257,0.716197,0.535622,1.477151,1.896359,0.46696,0.431138,0.734694,0.543396,-0.001955,0.500475,0.444934,1.035376,0.198787
3,0.5,3,0.630472,0.606693,0.959265,0.742455,0.264952,0.716912,0.532618,1.478795,1.920737,0.484581,0.417391,0.489796,0.450704,-0.029309,0.496361,0.480176,1.028649,0.161516
0,0.25,3,0.627897,0.605536,0.954832,0.740272,0.252135,0.713856,0.535193,1.479797,1.879385,0.475771,0.436364,0.734694,0.547529,0.0153,0.490745,0.444934,1.026318,0.148547
8,1.0,10,0.619313,0.602059,0.951243,0.735364,0.240531,0.711387,0.525751,1.412177,1.668149,0.46696,0.430303,0.72449,0.539924,-0.00466,0.495175,0.436123,0.985151,-0.085547



Top-3 by OOD Profit Factor:


Unnamed: 0,alpha,K,OOD_ProfitFactor,OOD_Sharpe_252,OOD_WinRate,OOD_AUROC
4,0.5,5,1.104282,0.567124,0.45815,0.499367
2,0.25,10,1.093864,0.512946,0.453744,0.496124
5,0.5,10,1.093864,0.512946,0.453744,0.493672
