# In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from src.utils.splits import RollingSplit, rolling_time_splits

# Load the day-level FinBERT embedding table, parse the trading date and put
# the rows in chronological order so the row-based shift/rolling operations
# below run along time.
emb = (
    pd.read_parquet("../data/features/finbert_day_embeddings.parquet")
    .assign(trading_date=lambda frame: pd.to_datetime(frame["trading_date"]))
    .sort_values("trading_date")
)

# Rebuild the day-level market features from the stored next-day return:
# - ret_t : the previous row's `return_t_plus_1` (presumably the return
#           realised on the current day -- assumes one row per consecutive
#           trading day; TODO confirm)
# - vol_5 : standard deviation of ret_t over a 5-row rolling window
# - mom_5 : mean of ret_t over the same 5-row rolling window
# Keyword arguments to `assign` are evaluated in order, so vol_5 and mom_5
# can read the freshly created ret_t column.
emb = emb.assign(
    ret_t=lambda frame: frame["return_t_plus_1"].shift(1),
    vol_5=lambda frame: frame["ret_t"].rolling(5).std(),
    mom_5=lambda frame: frame["ret_t"].rolling(5).mean(),
)

# The shift and the rolling windows leave NaNs in the leading rows; drop every
# row that has a missing value in any column.
emb = emb.dropna().copy()

# Split the columns into two feature blocks.
#
# Market block: the three engineered return features.
market_cols = ["ret_t", "vol_5", "mom_5"]

# Embedding block: columns whose label is an integer or an all-digit string
# (the same numeric labels can come back from parquet as either type).
embedding_cols = [
    col
    for col in emb.columns
    if isinstance(col, (int, np.integer)) or str(col).isdigit()
]

# Fallback when no numerically-labelled column exists: treat every column that
# is not a known date/label/return/market column as an embedding dimension.
if not embedding_cols:
    non_embedding = {"trading_date", "label", "return_t_plus_1", *market_cols}
    embedding_cols = [col for col in emb.columns if col not in non_embedding]

print("Embedding cols:", len(embedding_cols))
print("Market cols:", market_cols)

# Rolling-window evaluation of a logistic regression on the fused feature set
# (raw FinBERT embedding columns + standardised market features).
#
# Window sizes are passed to the project-local RollingSplit config; the
# parameter names indicate a ~4-year training window, a 1-year test window and
# a 90-day step between consecutive folds.
cfg = RollingSplit(train_window_days=365 * 4, test_window_days=365, step_days=90)

# Both estimators live at module level and are re-fitted on every fold, so
# after the loop they hold the parameters of the last evaluated fold.
scaler = StandardScaler()
clf = LogisticRegression(max_iter=2000, class_weight="balanced")

# Fixed column schema for the per-fold metrics table. Passing it to the
# DataFrame constructor keeps the columns present even when every fold is
# skipped, so downstream column selection does not raise KeyError.
result_columns = [
    "train_start", "train_end", "test_start", "test_end",
    "n_train", "n_test",
    "auc", "accuracy", "precision", "recall",
]

results = []

for train_start, train_end, test_start, test_end in rolling_time_splits(emb, "trading_date", cfg):
    # Half-open date intervals [start, end) for both windows.
    train_mask = (emb["trading_date"] >= train_start) & (emb["trading_date"] < train_end)
    test_mask = (emb["trading_date"] >= test_start) & (emb["trading_date"] < test_end)

    train_df = emb.loc[train_mask]
    test_df = emb.loc[test_mask]

    # Skip folds that are too small to fit or to evaluate meaningfully.
    if len(train_df) < 200 or len(test_df) < 50:
        continue

    y_train = train_df["label"].values
    y_test = test_df["label"].values

    # A classifier cannot be fitted on a single-class target; skip such folds
    # instead of letting `clf.fit` abort the whole run.
    if np.unique(y_train).size < 2:
        continue

    X_emb_train = train_df[embedding_cols].values
    X_emb_test = test_df[embedding_cols].values

    # Standardise the market features with statistics from the training window
    # only, then apply the same transform to the test window (no look-ahead).
    # The embedding columns are passed through unscaled.
    X_mkt_train = scaler.fit_transform(train_df[market_cols].values)
    X_mkt_test = scaler.transform(test_df[market_cols].values)

    X_train = np.hstack([X_emb_train, X_mkt_train])
    X_test = np.hstack([X_emb_test, X_mkt_test])

    clf.fit(X_train, y_train)
    # Second probability column; with labels encoded as 0/1 this is the
    # probability of class 1 (assumes binary 0/1 labels -- TODO confirm).
    p = clf.predict_proba(X_test)[:, 1]
    y_pred = (p >= 0.5).astype(int)

    # ROC AUC is undefined when the test window holds a single class (older
    # scikit-learn versions raise ValueError in that case); record NaN so the
    # remaining metrics of the fold are still kept.
    if np.unique(y_test).size > 1:
        auc = roc_auc_score(y_test, p)
    else:
        auc = float("nan")

    results.append({
        "train_start": str(train_start.date()),
        "train_end": str(train_end.date()),
        "test_start": str(test_start.date()),
        "test_end": str(test_end.date()),
        "n_train": len(train_df),
        "n_test": len(test_df),
        "auc": auc,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
    })

# One row per evaluated fold.
res_df = pd.DataFrame(results, columns=result_columns)
res_df

import os

# Persist the per-fold metrics table as CSV. The target directory is derived
# from the output path so the two cannot drift apart.
out_path = "../reports/finbert_fused_metrics.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)

res_df.to_csv(out_path, index=False)
print("Saved:", out_path)

# Summary statistics across folds. When no fold was evaluated the table is
# empty (and may have no metric columns at all), so selecting them would raise
# KeyError; report that explicitly instead.
if res_df.empty:
    print("No folds were evaluated; skipping summary statistics.")
else:
    print(res_df[["auc", "accuracy", "precision", "recall"]].describe())
