In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from src.utils.splits import RollingSplit, rolling_time_splits

# Load the cleaned model table, parse the trading date into a datetime
# column, and order the rows chronologically.
df = (
    pd.read_parquet("../data/processed/model_table_clean.parquet")
    .assign(trading_date=lambda frame: pd.to_datetime(frame["trading_date"]))
    .sort_values("trading_date")
)

# Day-level market features.
#
# The source table holds many headlines per trading day, while the market
# columns are day-level and therefore repeated on every headline row. We
# collapse to one row per trading_date first, so repeated labels cannot
# inflate the evaluation metrics.
#
# Derived columns:
#   ret_t  - previous row's return_t_plus_1, used as a proxy for the return
#            realised on day t (no close price is available in this table).
#            NOTE(review): this assumes consecutive rows are adjacent trading
#            days; confirm there are no missing dates in the source table.
#   vol_5  - standard deviation of ret_t over the last 5 rows (volatility).
#   mom_5  - mean of ret_t over the last 5 rows (momentum proxy).
#   label  - 1 when return_t_plus_1 is strictly positive, else 0.
#
# Rows without enough history for the rolling windows (or with any other
# missing value) are dropped at the end.
daily = (
    df.loc[:, ["trading_date", "return_t_plus_1"]]
    .drop_duplicates(subset=["trading_date"])
    .sort_values("trading_date")
    .assign(
        ret_t=lambda d: d["return_t_plus_1"].shift(1),
        vol_5=lambda d: d["ret_t"].rolling(5).std(),
        mom_5=lambda d: d["ret_t"].rolling(5).mean(),
        label=lambda d: (d["return_t_plus_1"] > 0).astype(int),
    )
    .dropna()
    .copy()
)
daily.head()

# Market-only baseline classifier: standardise the features, then fit a
# logistic regression. class_weight="balanced" reweights the classes so an
# imbalance between up and down days does not dominate the fit.
model = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(class_weight="balanced", max_iter=2000)),
    ]
)

# Feature columns fed to the model, and the per-fold metric accumulator.
X_cols = ["ret_t", "vol_5", "mom_5"]
results = []

# Rolling split configuration.
# NOTE(review): the parameter names suggest day counts (4-year train window,
# 1-year test window, 90-day step); confirm the exact semantics against
# src.utils.splits.RollingSplit.
cfg = RollingSplit(
    train_window_days=4 * 365,
    test_window_days=365,
    step_days=90,
)

# Walk-forward evaluation of the market-only model.
#
# For every (train, test) window produced by rolling_time_splits the pipeline
# is refit on the training rows and scored on the test rows. One dict of
# metrics per evaluated fold is appended to `results`.

# Fixed output schema: guarantees res_df has these columns even when every
# fold is skipped, so downstream column selection does not raise KeyError.
metric_columns = [
    "train_start",
    "train_end",
    "test_start",
    "test_end",
    "n_train",
    "n_test",
    "auc",
    "accuracy",
    "precision",
    "recall",
]

for train_start, train_end, test_start, test_end in rolling_time_splits(daily, "trading_date", cfg):
    # Windows are half-open: start inclusive, end exclusive.
    train_mask = (daily["trading_date"] >= train_start) & (daily["trading_date"] < train_end)
    test_mask = (daily["trading_date"] >= test_start) & (daily["trading_date"] < test_end)

    train_df = daily.loc[train_mask]
    test_df = daily.loc[test_mask]

    # Skip if window is too small
    if len(train_df) < 200 or len(test_df) < 50:
        continue

    X_train = train_df[X_cols].to_numpy()
    y_train = train_df["label"].to_numpy()
    X_test = test_df[X_cols].to_numpy()
    y_test = test_df["label"].to_numpy()

    # LogisticRegression cannot be fit on a single class; skip such a fold
    # instead of letting one degenerate window abort the whole evaluation.
    if np.unique(y_train).size < 2:
        continue

    model.fit(X_train, y_train)
    p = model.predict_proba(X_test)[:, 1]
    y_pred = (p >= 0.5).astype(int)

    # ROC AUC is undefined when the test window contains only one class
    # (scikit-learn raises or warns depending on version), so record NaN.
    auc = roc_auc_score(y_test, p) if np.unique(y_test).size > 1 else np.nan

    fold = {
        "train_start": str(train_start.date()),
        "train_end": str(train_end.date()),
        "test_start": str(test_start.date()),
        "test_end": str(test_end.date()),
        "n_train": len(train_df),
        "n_test": len(test_df),
        "auc": auc,
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
    }
    results.append(fold)

res_df = pd.DataFrame(results, columns=metric_columns)
res_df

import os

# Write the per-fold metrics to CSV, creating the output directory if it
# does not exist yet.
out_path = "../reports/market_only_metrics.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
res_df.to_csv(out_path, index=False)
print("Saved:", out_path)

# Summary statistics of the headline metrics across folds.
summary_cols = ["auc", "accuracy", "precision", "recall"]
print(res_df.loc[:, summary_cols].describe())
