In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# Train/test boundary: eval_file trains on rows dated strictly before CUT
# and tests on rows dated on or after it.
CUT = pd.Timestamp("2025-10-15")

# Columns that are never technical features: identifiers, labels/targets,
# raw OHLCV values and the sentiment columns (sentiment is opt-in through
# ``extra_feature_cols``).
_NON_FEATURE_COLS = frozenset([
    "date", "ticker", "y", "y_h5", "future_return_5d",
    "close", "open", "high", "low", "volume",
    "sent_score", "p_pos", "p_neu", "p_neg", "event_weight",
    "sent_x_weight", "p_pos_x_weight", "p_neu_x_weight", "p_neg_x_weight",
])

# Substrings (matched case-insensitively) that mark a plausible technical column.
_TECH_KEYWORDS = ("sma", "mom", "rsi", "vol", "ret_1d")


def _select_feature_columns(columns, extra_feature_cols, label_col):
    """Return the ordered, de-duplicated feature column names for one dataset."""
    # The label is always excluded, whatever its name, so it can never leak
    # into the feature matrix through the "keep everything" fallback below.
    excluded = _NON_FEATURE_COLS | {label_col}
    tech_cols = [c for c in columns if c not in excluded]

    # Prefer columns that look like technical indicators; if none match,
    # fall back to every remaining non-excluded column.
    tech_keep = [
        c for c in tech_cols
        if any(key in c.lower() for key in _TECH_KEYWORDS)
    ] or tech_cols

    # Requested extras that are absent from the file are silently skipped.
    extras = [
        c for c in (extra_feature_cols or [])
        if c in columns and c != label_col
    ]

    # dict.fromkeys drops duplicates while preserving first-seen order.
    return list(dict.fromkeys(tech_keep + extras))


def eval_file(path, extra_feature_cols=None, label_col="y_h5"):
    """Fit a logistic-regression baseline on one merged CSV and score it.

    The file is split chronologically on the module-level ``CUT`` date:
    rows before ``CUT`` are used for training, rows on or after it for
    testing. Features are standardised with statistics from the training
    rows only.

    Args:
        path: CSV file to load; it must contain a ``date`` column.
        extra_feature_cols: Optional column names to add on top of the
            technical features. Names missing from the file are ignored.
        label_col: Name of the target column. It is never used as a
            feature. NOTE(review): the metrics assume a binary 0/1 label.

    Returns:
        A dict with ``rows_train``, ``rows_test``, ``pos_train`` and
        ``pos_test`` (mean of the label in each split), ``AUC`` and ``ACC``.
        ``AUC`` is NaN when the test split contains a single class.

    Raises:
        ValueError: If no feature column can be selected, or if either side
            of the chronological split is empty.
    """
    df = pd.read_csv(path, parse_dates=["date"]).sort_values(["ticker", "date"])

    feat_cols = _select_feature_columns(list(df.columns), extra_feature_cols, label_col)
    if not feat_cols:
        raise ValueError(f"No usable feature columns found in {path}.")

    # Drop rows with a missing feature or label before splitting.
    df = df.dropna(subset=feat_cols + [label_col]).copy()

    train = df[df["date"] < CUT]
    test = df[df["date"] >= CUT]
    if train.empty or test.empty:
        raise ValueError(f"Empty split for {path}. Check CUT date.")

    Xtr = train[feat_cols].values
    ytr = train[label_col].values
    Xte = test[feat_cols].values
    yte = test[label_col].values

    # Standardise using training statistics only, so nothing from the test
    # period influences the scaling.
    scaler = StandardScaler()
    Xtr = scaler.fit_transform(Xtr)
    Xte = scaler.transform(Xte)

    # Regularised logistic regression (the penalty avoids singular fits).
    clf = LogisticRegression(
        C=1.0,              # can be tuned (0.1, 1, 10)
        solver="lbfgs",
        max_iter=2000,
        class_weight=None,  # or 'balanced' if the classes are very imbalanced
    )
    clf.fit(Xtr, ytr)
    proba = clf.predict_proba(Xte)[:, 1]
    pred = (proba >= 0.5).astype(int)

    # ROC AUC is undefined when the test labels hold a single class; report
    # NaN rather than aborting the whole comparison.
    if len(np.unique(yte)) > 1:
        auc = roc_auc_score(yte, proba)
    else:
        auc = float("nan")

    return {
        "rows_train": len(train),
        "rows_test": len(test),
        "pos_train": float(ytr.mean()),
        "pos_test": float(yte.mean()),
        "AUC": auc,
        "ACC": accuracy_score(yte, pred),
    }

# Feature-set variants under comparison: (display name, merged CSV, extra
# sentiment columns passed to eval_file on top of the technical features).
_variants = [
    ("T-only", "data/processed/merge_T_only_h5.csv", []),
    ("T+Score", "data/processed/merge_TS_score_h5.csv",
     ["sent_score", "event_weight", "sent_x_weight"]),
    ("T+Triplet", "data/processed/merge_TS_triplet_h5.csv",
     ["p_pos", "p_neu", "p_neg", "event_weight",
      "p_pos_x_weight", "p_neu_x_weight", "p_neg_x_weight"]),
]

# Evaluate each variant in order, collecting (name, metrics) pairs.
rows = []
for _label, _csv_path, _extras in _variants:
    _metrics = eval_file(_csv_path, extra_feature_cols=_extras)
    rows.append((_label, _metrics))

# Side-by-side summary table, left as a bare trailing expression.
_metric_keys = ["rows_train", "rows_test", "pos_train", "pos_test", "AUC", "ACC"]
pd.DataFrame(
    [(label, *(metrics[key] for key in _metric_keys)) for label, metrics in rows],
    columns=["dataset", "train_rows", "test_rows", "train_pos", "test_pos", "AUC", "ACC"],
)
