# In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline

from src.utils.splits import RollingSplit, rolling_time_splits

# Load the headline-level model table and order it chronologically.
df = pd.read_parquet("../data/processed/model_table_clean.parquet").copy()
df["trading_date"] = pd.to_datetime(df["trading_date"])
# A stable sort keeps same-day headlines in their stored order, so the
# concatenated document (and any bigram spanning two headlines) is reproducible.
df = df.sort_values("trading_date", kind="stable")

# IMPORTANT:
# Treating each headline as an independent sample artificially inflates the sample size.
# A more honest framing is day-level prediction: aggregate all headlines for a day
# into one document and predict a single label per trading day.
day_df = (
    df.groupby("trading_date")
      .agg(
          # Null headlines are dropped first; str.join raises TypeError on non-strings.
          doc=("clean_headline", lambda x: " ".join(x.dropna().astype(str))),
          # NOTE(review): assumes return_t_plus_1 is constant within a trading day,
          # so taking the first value loses nothing — confirm upstream.
          return_t_plus_1=("return_t_plus_1", "first"),
      )
      .reset_index()
)

# A day with no known return has no label. Without this drop, `NaN > 0`
# evaluates to False and the day would silently be labeled 0.
day_df = day_df.dropna(subset=["return_t_plus_1"]).reset_index(drop=True)

# Binary target: 1 when return_t_plus_1 is strictly positive, else 0.
day_df["label"] = (day_df["return_t_plus_1"] > 0).astype(int)
day_df.head()

# Rolling-window settings passed to rolling_time_splits, expressed in days.
cfg = RollingSplit(
    train_window_days=365 * 4,
    test_window_days=365,
    step_days=90,
)

# One metrics dict is appended here per evaluated fold.
results = []

# Text-only baseline: unigram + bigram TF-IDF features feeding a
# class-balanced logistic regression.
pipeline_steps = [
    ("tfidf", TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=2)),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced")),
]
model = Pipeline(steps=pipeline_steps)

# Evaluate the pipeline on every (train, test) window pair produced by
# rolling_time_splits, refitting from scratch on each training window and
# appending one metrics row per evaluated fold to `results`.
for train_start, train_end, test_start, test_end in rolling_time_splits(day_df, "trading_date", cfg):
    # Both windows are half-open: start inclusive, end exclusive.
    train_mask = (day_df["trading_date"] >= train_start) & (day_df["trading_date"] < train_end)
    test_mask = (day_df["trading_date"] >= test_start) & (day_df["trading_date"] < test_end)

    train_df = day_df.loc[train_mask]
    test_df = day_df.loc[test_mask]

    # Skip folds with too few days to fit or score.
    if len(train_df) < 200 or len(test_df) < 50:
        continue

    X_train = train_df["doc"].to_numpy()
    y_train = train_df["label"].to_numpy()
    X_test = test_df["doc"].to_numpy()
    y_test = test_df["label"].to_numpy()

    # A classifier cannot be fit on a single class; skip the fold explicitly
    # instead of letting fit() abort the whole evaluation.
    if np.unique(y_train).size < 2:
        print(
            "Skipping fold",
            str(train_start.date()),
            "->",
            str(test_end.date()),
            ": training labels contain a single class",
        )
        continue

    model.fit(X_train, y_train)
    # Both classes are present in y_train, so column 1 is P(label == 1).
    p = model.predict_proba(X_test)[:, 1]
    y_pred = (p >= 0.5).astype(int)

    # ROC AUC is undefined when the test window holds a single class; record
    # NaN so the remaining metrics for this fold are still kept.
    if np.unique(y_test).size > 1:
        auc = roc_auc_score(y_test, p)
    else:
        auc = float("nan")

    results.append({
        "train_start": str(train_start.date()),
        "train_end": str(train_end.date()),
        "test_start": str(test_start.date()),
        "test_end": str(test_end.date()),
        "n_train": len(train_df),
        "n_test": len(test_df),
        "auc": auc,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
    })

# Collect the per-fold metrics into a table. The columns are declared
# explicitly so the frame keeps its schema even when every fold was skipped
# and `results` is empty; otherwise selecting the metric columns afterwards
# raises KeyError on a column-less frame.
result_columns = [
    "train_start",
    "train_end",
    "test_start",
    "test_end",
    "n_train",
    "n_test",
    "auc",
    "accuracy",
    "precision",
    "recall",
]
res_df = pd.DataFrame(results, columns=result_columns)
res_df

import os

# Persist the per-fold metrics and print summary statistics for the four scores.
report_dir = "../reports"
out_path = "../reports/text_only_tfidf_metrics.csv"
metric_columns = ["auc", "accuracy", "precision", "recall"]

# Create the report directory on first run; a no-op when it already exists.
os.makedirs(report_dir, exist_ok=True)
res_df.to_csv(out_path, index=False)
print("Saved:", out_path)

metric_summary = res_df[metric_columns].describe()
print(metric_summary)
