In [None]:
import numpy as np
import pandas as pd

import shap
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Load the per-day embedding table, order it chronologically, and derive the
# market features from the lagged return in one pipeline.
#   ret_t : previous row's `return_t_plus_1` (shifted down by one row)
#   vol_5 : 5-row rolling standard deviation of `ret_t`
#   mom_5 : 5-row rolling mean of `ret_t`
# NOTE(review): the shift/rolling logic presumably assumes one row per trading
# date -- confirm against the parquet schema.
emb = (
    pd.read_parquet("../data/features/finbert_day_embeddings.parquet")
    .assign(trading_date=lambda df: pd.to_datetime(df["trading_date"]))
    .sort_values("trading_date")
    .assign(
        ret_t=lambda df: df["return_t_plus_1"].shift(1),
        vol_5=lambda df: df["ret_t"].rolling(5).std(),
        mom_5=lambda df: df["ret_t"].rolling(5).mean(),
    )
    # Drops every row with a missing value in any column, which includes the
    # warm-up rows where the shifted/rolling features are still NaN.
    .dropna()
    .copy()
)

# Market-derived feature columns that sit alongside the embeddings in `emb`.
market_cols = ["ret_t", "vol_5", "mom_5"]

# Select the columns whose labels are integer-like (ints or digit-only
# strings) -- presumably the FinBERT embedding dimensions.
embedding_cols = [c for c in emb.columns if isinstance(c, (int, np.integer)) or str(c).isdigit()]
if not embedding_cols:
    # Fallback: no integer-like labels were found, so treat every column that
    # is not a known date/target/market column as an embedding dimension.
    # The exclusion set is derived from `market_cols` so the two stay in sync.
    non_embedding_cols = {"trading_date", "label", "return_t_plus_1", *market_cols}
    embedding_cols = [c for c in emb.columns if c not in non_embedding_cols]

# Chronological holdout: rows dated before the 80th-percentile trading date
# form the training set; rows on or after it form the test set.
cutoff = emb["trading_date"].quantile(0.8)

train_df = emb.loc[emb["trading_date"].lt(cutoff)].copy()
test_df = emb.loc[emb["trading_date"].ge(cutoff)].copy()

# Embedding features are used as-is (no scaling).
X_train_emb, X_test_emb = (part[embedding_cols].values for part in (train_df, test_df))

# Market features are standardised with statistics estimated on the training
# rows only, then the same transform is applied to both splits.
scaler = StandardScaler().fit(train_df[market_cols].values)
X_train_mkt, X_test_mkt = (
    scaler.transform(part[market_cols].values) for part in (train_df, test_df)
)

# Fused design matrices: embedding columns first, market columns last.
X_train = np.concatenate((X_train_emb, X_train_mkt), axis=1)
X_test = np.concatenate((X_test_emb, X_test_mkt), axis=1)

y_train, y_test = train_df["label"].values, test_df["label"].values

clf = LogisticRegression(max_iter=2000, class_weight="balanced").fit(X_train, y_train)
print("Trained final model for interpretability demo.")

# The classifier is linear, so SHAP's LinearExplainer is used, with the
# training matrix as background data.
explainer = shap.LinearExplainer(clf, X_train, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_test)

# Feature labels follow the column order of the fused matrix: positional
# names for the embedding dimensions, then the market feature names.
feature_names = [*(f"emb_{i}" for i in range(X_train_emb.shape[1])), *market_cols]

# Global importance = mean absolute SHAP value per feature over the test rows,
# ranked from most to least important.
mean_abs = np.abs(shap_values).mean(axis=0)
imp = (
    pd.DataFrame({"feature": feature_names, "mean_abs_shap": mean_abs})
    .sort_values("mean_abs_shap", ascending=False)
)

imp.head(20)

from sklearn.metrics import roc_auc_score

def fit_eval(Xtr, ytr, Xte, yte, *, max_iter=2000, class_weight="balanced"):
    """Fit a logistic regression on one split and score it on another.

    Args:
        Xtr: Training feature matrix.
        ytr: Training labels.
        Xte: Test feature matrix.
        yte: Test labels.
        max_iter: Iteration cap forwarded to ``LogisticRegression``.
        class_weight: Class weighting forwarded to ``LogisticRegression``.

    Returns:
        The ROC AUC of the predicted probabilities on the test split.
    """
    model = LogisticRegression(max_iter=max_iter, class_weight=class_weight)
    model.fit(Xtr, ytr)
    # Column 1 of predict_proba holds the probability of the second class in
    # ``model.classes_``; this is the score handed to roc_auc_score.
    pos_proba = model.predict_proba(Xte)[:, 1]
    return roc_auc_score(yte, pos_proba)

# Ablation: score the same classifier on three feature sets.
auc_emb = fit_eval(X_train_emb, y_train, X_test_emb, y_test)  # 1) embeddings only
auc_mkt = fit_eval(X_train_mkt, y_train, X_test_mkt, y_test)  # 2) market only
auc_fused = fit_eval(X_train, y_train, X_test, y_test)        # 3) fused

# One row per variant, best AUC first.
abl = pd.DataFrame(
    {
        "model": [
            "FinBERT embeddings only",
            "Market features only",
            "Fused (FinBERT + market)",
        ],
        "auc": [auc_emb, auc_mkt, auc_fused],
    }
).sort_values("auc", ascending=False)

abl
