# In [None]:
import pandas as pd
import numpy as np

from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from src.utils.splits import RollingSplit, rolling_time_splits

# Load the headline-level model table, parse trading_date into datetimes and
# order the rows chronologically.
df = pd.read_parquet("../data/processed/model_table_clean.parquet").copy()
df = df.assign(trading_date=pd.to_datetime(df["trading_date"])).sort_values("trading_date")


def _join_headlines(headlines):
    """Concatenate one day's headlines into a single space-separated string."""
    return " ".join(headlines.tolist())


# Day-level text document: one row per trading_date holding the joined
# headlines and the first return_t_plus_1 value found for that date
# (presumably identical on every row of a day -- TODO confirm).
_per_day = df.groupby("trading_date")
day_text = _per_day.agg(
    doc=("clean_headline", _join_headlines),
    return_t_plus_1=("return_t_plus_1", "first"),
).reset_index()

# Binary target: 1 when return_t_plus_1 is strictly positive, otherwise 0.
day_text["label"] = day_text["return_t_plus_1"].gt(0).astype(int)

# Day-level market features (same as notebook 05, built consistently)
daily = day_text.sort_values("trading_date").copy()

# ret_t is the previous row's return_t_plus_1.
# NOTE(review): this equals "the return realised on day t" only if consecutive
# rows are consecutive trading days -- confirm there are no date gaps.
_lagged_return = daily["return_t_plus_1"].shift(1)
_five_row_window = _lagged_return.rolling(5)

daily["ret_t"] = _lagged_return
daily["vol_5"] = _five_row_window.std()    # 5-row rolling standard deviation
daily["mom_5"] = _five_row_window.mean()   # 5-row rolling mean

# Drop every row containing a missing value (this includes the warm-up rows
# where the shift/rolling windows are not yet filled).
daily = daily.dropna().copy()

X_market_cols = ["ret_t", "vol_5", "mom_5"]
daily.head()

# Rolling-window evaluation of the fused model: TF-IDF features from each
# day's headline document are concatenated with standardised market features
# and fed to a class-balanced logistic regression. One metrics row is recorded
# per evaluated fold.
# NOTE(review): step_days (90) is smaller than test_window_days (365); if the
# split helper advances by step_days, consecutive test windows overlap and the
# per-fold metrics are not independent -- confirm against rolling_time_splits.
cfg = RollingSplit(train_window_days=365*4, test_window_days=365, step_days=90)

# The estimators are created once and re-fitted inside every fold.
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)
scaler = StandardScaler()
clf = LogisticRegression(max_iter=2000, class_weight="balanced")

# Fixing the column list keeps the schema of res_df stable even when no fold
# qualifies, so downstream column selection does not fail with a KeyError.
result_columns = [
    "train_start", "train_end", "test_start", "test_end",
    "n_train", "n_test",
    "auc", "accuracy", "precision", "recall",
]
results = []

for train_start, train_end, test_start, test_end in rolling_time_splits(daily, "trading_date", cfg):
    # Half-open windows: start inclusive, end exclusive.
    train_mask = (daily["trading_date"] >= train_start) & (daily["trading_date"] < train_end)
    test_mask  = (daily["trading_date"] >= test_start)  & (daily["trading_date"] < test_end)

    train_df = daily.loc[train_mask]
    test_df  = daily.loc[test_mask]

    # Skip folds that are too small to train or evaluate on.
    if len(train_df) < 200 or len(test_df) < 50:
        continue

    y_train = train_df["label"].values
    y_test  = test_df["label"].values

    # A classifier cannot be fitted on a single class; skip such a fold
    # instead of letting clf.fit raise and abort the whole evaluation.
    if np.unique(y_train).size < 2:
        print("Skipping fold with single-class training labels:", train_start, train_end)
        continue

    # Fit TF-IDF on train documents only (prevents leakage)
    X_text_train = tfidf.fit_transform(train_df["doc"].values)
    X_text_test  = tfidf.transform(test_df["doc"].values)

    # Scale market features (fit scaler on train only)
    X_mkt_train = scaler.fit_transform(train_df[X_market_cols].values)
    X_mkt_test  = scaler.transform(test_df[X_market_cols].values)

    # Combine sparse text + dense market features. Request CSR explicitly:
    # hstack defaults to COO, which would need converting before row-wise use.
    X_train = hstack([X_text_train, X_mkt_train], format="csr")
    X_test  = hstack([X_text_test, X_mkt_test], format="csr")

    clf.fit(X_train, y_train)
    p = clf.predict_proba(X_test)[:, 1]   # probability of the positive class
    y_pred = (p >= 0.5).astype(int)

    # ROC AUC is undefined when the test window holds only one class;
    # record NaN for that fold rather than raising.
    if np.unique(y_test).size < 2:
        auc = np.nan
    else:
        auc = roc_auc_score(y_test, p)

    results.append({
        "train_start": str(train_start.date()),
        "train_end": str(train_end.date()),
        "test_start": str(test_start.date()),
        "test_end": str(test_end.date()),
        "n_train": len(train_df),
        "n_test": len(test_df),
        "auc": auc,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
    })

res_df = pd.DataFrame(results, columns=result_columns)
res_df

# Persist the per-fold metrics as CSV and print summary statistics.
import os

out_path = "../reports/fused_tfidf_market_metrics.csv"
# Derive the directory from out_path so the two can never drift apart.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

res_df.to_csv(out_path, index=False)
print("Saved:", out_path)

# When no fold was evaluated res_df may have no rows (and possibly no
# columns), in which case selecting the metric columns would raise KeyError.
if res_df.empty:
    print("No folds were evaluated; no summary statistics to report.")
else:
    print(res_df[["auc", "accuracy", "precision", "recall"]].describe())
