## Feature to score transformation

In [None]:
import itertools
import warnings
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from pandas.tseries.offsets import BDay
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

# ---------------------------------------------------------------------------
# Paths and hyperparameters
# ---------------------------------------------------------------------------
# Root directory that every input/output path below is built from.
MYPATH = "./"
# Event-level feature tables (loaded with pd.read_parquet further down).
FE_8K = f"{MYPATH}/OUTPUT/features_event_8k.parquet"
FE_10KQ = f"{MYPATH}/OUTPUT/features_event_10kq.parquet"
# Price table; the code below relies on its "ticker", "date" and "px" columns.
PRICES_3COL = f"{MYPATH}/price_data/prices_3col_adjusted_2001_2025.parquet"
# Destination of the final wide alpha table (a "date" column plus one column per ticker).
ALPHA_WIDE_OUT = f"{MYPATH}/OUTPUT/alpha_scores_event_daily_wide.parquet"

# Horizons, counted in price rows per ticker: used both as the label horizon
# (make_forward_labels) and as the signal holding window (build_daily_signal).
FORWARD_10KQ = 63
FORWARD_8K = 10
# Longest of the two horizons.
# NOTE(review): not referenced anywhere else in the visible code; the CV loop
# passes FORWARD_10KQ as its embargo instead.
EMBARGO = max(FORWARD_10KQ, FORWARD_8K)
# Seed handed to LightGBM and to the row sampling done for the SHAP plots.
SEED = 3
# Name of the label column produced by make_forward_labels.
TRAIN_LABEL = "fwd_excess_tstat"
# First date of the out-of-sample (test) period.
TEST_START = pd.Timestamp("2020-01-01")
# Weight applied to the 8-K z-score when it is added to the 10-K/Q z-score.
ADJ_8K_WEIGHT = 0.5
# A ticker is kept only if it has strictly more than this many "10k" / "10q"
# rows in the 10-K/Q feature table.
MIN10k, MIN10q = 23, 73

# ---------------------------------------------------------------------------
# Helpers: labels, cross-section z-score/rank, purged CV, train, daily signal
# ---------------------------------------------------------------------------
def make_forward_labels(px_df: pd.DataFrame, horizon: int) -> pd.DataFrame:
    """Build forward cumulative excess log-return labels per ticker and date.

    For every (ticker, date) row the label is the sum of the ticker's daily
    excess log returns over the NEXT ``horizon`` price rows, i.e. rows
    t+1 .. t+horizon. The "market" return subtracted each day is the
    equal-weighted mean log return of all tickers present in ``px_df`` on
    that date.

    Args:
        px_df: Long price table with columns "ticker", "date" and "px".
            Rows with a missing value in any of these are dropped.
        horizon: Number of forward price rows summed into the label.

    Returns:
        DataFrame with columns "ticker", "date" and "fwd_excess_tstat".
        The last ``horizon`` rows of each ticker are NaN because their
        forward window is incomplete. Despite the column name, the value is
        a plain sum of excess log returns, not a t-statistic.
    """
    df = px_df.dropna(subset=["ticker", "date", "px"]).sort_values(["ticker", "date"]).copy()
    df["ret_1d"] = df.groupby("ticker")["px"].pct_change()
    df["log_r1"] = np.log1p(df["ret_1d"])
    mkt = df.groupby("date")["log_r1"].mean().rename("mkt_log_r1")
    df = df.merge(mkt, on="date", how="left")
    df["excess_log_r1"] = df["log_r1"] - df["mkt_log_r1"]

    out = []
    for tkr, g in df.groupby("ticker", sort=False):
        g = g.reset_index(drop=True)
        # The trailing sum at row s covers rows s-horizon+1 .. s. Shifting it
        # back by `horizon` rows places the sum of rows t+1 .. t+horizon on
        # row t, so the label only contains returns realised after date t.
        # (Shifting the returns by one row *before* rolling, as was done
        # previously, yields a window that is almost entirely in the past.)
        trailing = g["excess_log_r1"].rolling(horizon, min_periods=horizon).sum()
        fwd = trailing.shift(-horizon)
        out.append(pd.DataFrame({"ticker": tkr, "date": g["date"], "fwd_excess_tstat": fwd}))

    if not out:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=["ticker", "date", "fwd_excess_tstat"])
    return pd.concat(out, ignore_index=True)

def zscore_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Standardise each row of ``df`` to zero mean and unit (population) std.

    Infinite entries are treated as missing. Rows whose standard deviation
    is exactly zero come back entirely NaN instead of dividing by zero.
    """
    finite = df.replace([np.inf, -np.inf], np.nan)
    row_mean = finite.mean(axis=1)
    row_std = finite.std(axis=1, ddof=0)
    # A zero spread would produce +/-inf; turn it into NaN instead.
    row_std = row_std.replace(0, np.nan)
    centered = finite.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)

def rank_to_alpha_100(df: pd.DataFrame) -> pd.DataFrame:
    """Convert each row to rank-based scores spanning 200 points, then demean.

    Within a row, values are ranked (ties share the average rank, NaNs stay
    NaN), mapped linearly so the lowest rank is -100 and the highest +100,
    and finally the row mean is subtracted. Rows where every ranked value is
    tied have no spread and come back as NaN.
    """
    ranks = df.rank(axis=1, method="average", na_option="keep")
    lowest = ranks.min(axis=1)
    spread = (ranks.max(axis=1) - lowest).replace(0, np.nan)
    # Position of each rank within the row, in [0, 1].
    unit = ranks.sub(lowest, axis=0).div(spread, axis=0)
    scaled = -100 + unit * 200
    row_mean = scaled.mean(axis=1)
    return scaled.sub(row_mean, axis=0)

def purged_embargo_folds(dates, n_splits=3, embargo_days=63):
    """Yield (train_idx, val_idx) positional index pairs for blocked time-series CV.

    The sorted unique calendar days are cut into ``n_splits`` contiguous
    blocks. Each block in turn is the validation set; training rows are all
    rows whose day lies outside the validation block widened by
    ``embargo_days`` business days on both sides. Folds with an empty
    training or validation side are skipped.

    Args:
        dates: pandas Series of dates (the ``.dt`` accessor is used).
        n_splits: Number of contiguous validation blocks.
        embargo_days: Business days purged before and after each block.

    Yields:
        Tuples of integer position arrays ``(train_idx, val_idx)``.
    """
    day = pd.to_datetime(dates).dt.normalize().values.astype("datetime64[D]")
    unique_days = np.unique(day)  # np.unique already returns sorted values

    for block in np.array_split(unique_days, n_splits):
        if block.size == 0:
            continue

        first_day, last_day = block[0], block[-1]
        purge_from = np.busday_offset(first_day, -embargo_days, roll="backward")
        purge_to = np.busday_offset(last_day, +embargo_days, roll="forward")

        in_val = np.isin(day, block)
        in_purge = (day >= purge_from) & (day <= purge_to)

        train_idx = np.flatnonzero(~(in_val | in_purge))
        val_idx = np.flatnonzero(in_val)
        if train_idx.size > 0 and val_idx.size > 0:
            yield train_idx, val_idx

def train_and_fit(df: pd.DataFrame, label_col: str, params: dict, test_start: pd.Timestamp,
                  forward_horizon_days: int, seed: int = 42):
    """Fit a LightGBM regressor on pre-test events and predict the test events.

    Rows without a label are dropped. Training data ends
    ``forward_horizon_days`` business days before ``test_start``; the test
    set is every row dated on or after ``test_start``. Within the training
    data, rows dated at or after the 80% date quantile are held out for
    early stopping (patience 200 rounds) and the rest are used for fitting.
    No gap is left between the fitting slice and the early-stopping slice.

    Args:
        df: Event table with "ticker", "date", ``label_col`` and feature
            columns; every other column is used as a feature.
        label_col: Name of the regression target column.
        params: Keyword arguments forwarded to ``LGBMRegressor``; must not
            contain ``random_state``, which is supplied from ``seed``.
        test_start: First date of the test period.
        forward_horizon_days: Label horizon used to back off the training
            cutoff; a falsy value means no back-off.
        seed: Random seed for the model.

    Returns:
        Tuple ``(predictions, model, X_test)`` where ``predictions`` has
        columns "date", "ticker" and "yhat".
    """
    labelled = df.dropna(subset=[label_col]).copy()

    # Stop training early enough that no training label reaches into the test period.
    train_end = test_start - BDay(forward_horizon_days or 0)
    pre_test = labelled[labelled["date"] < train_end]
    test = labelled[labelled["date"] >= test_start]

    # Chronological 80/20 split of the pre-test rows.
    holdout_start = pre_test["date"].quantile(0.8)
    fit_part = pre_test[pre_test["date"] < holdout_start]
    holdout = pre_test[pre_test["date"] >= holdout_start]

    non_features = {"ticker", "date", label_col}
    feats = [col for col in fit_part.columns if col not in non_features]

    X_fit, y_fit = fit_part[feats], fit_part[label_col]
    X_hold, y_hold = holdout[feats], holdout[label_col]
    Xte = test[feats]

    model = LGBMRegressor(random_state=seed, **params)
    model.fit(
        X_fit,
        y_fit,
        eval_set=[(X_hold, y_hold)],
        eval_metric="rmse",
        callbacks=[early_stopping(200), log_evaluation(50)],
    )

    out = test[["date", "ticker"]].copy()
    out["yhat"] = model.predict(Xte, num_iteration=model.best_iteration_)
    return out, model, Xte

def build_daily_signal(
    preds: pd.DataFrame,
    label_col: str,
    forward_horizon: Optional[int],
    px: pd.DataFrame,
    test_start: pd.Timestamp,
) -> pd.DataFrame:
    """Spread event-level predictions onto each ticker's daily price calendar.

    For every ticker in ``px`` a daily series is created over that ticker's
    price dates on or after ``test_start``. Each event value is written from
    the first price date at or after the event date, for at most
    ``forward_horizon`` rows, and is cut short when the ticker's next event
    begins. Days not covered by any event stay NaN.

    Args:
        preds: Event predictions with columns "ticker", "date" and
            ``label_col``. Multiple rows for the same (ticker, date) are
            averaged.
        label_col: Name of the prediction column in ``preds``.
        forward_horizon: Maximum number of price rows a value is held;
            ``None`` holds it until the next event or the end of the data.
        px: Long price table with "ticker" and "date" (datetime) columns.
        test_start: First date included in the output.

    Returns:
        Wide DataFrame indexed by "date" with one column per ticker, in the
        order tickers first appear in ``px``. Tickers with no price rows on
        or after ``test_start`` are omitted.
    """
    p = preds.copy()
    p["date"] = pd.to_datetime(p["date"])
    p = p.groupby(["ticker", "date"], as_index=False)[label_col].mean()
    events_by_ticker = {
        tkr: g.sort_values("date") for tkr, g in p.groupby("ticker", sort=False)
    }

    # Split the price calendar once instead of re-scanning `px` for every ticker.
    px_test = px.loc[px["date"] >= test_start, ["ticker", "date"]]
    dates_by_ticker = {tkr: g["date"] for tkr, g in px_test.groupby("ticker", sort=False)}

    series = {}
    for tkr in px["ticker"].unique():
        dates = dates_by_ticker.get(tkr)
        if dates is None or dates.empty:
            continue
        # searchsorted is only meaningful on a sorted index; do not rely on
        # the caller having sorted `px`.
        idx = pd.DatetimeIndex(dates.values).sort_values()
        n = len(idx)
        values = np.full(n, np.nan)

        ev = events_by_ticker.get(tkr)
        if ev is not None:
            # First price row at or after each event date.
            starts = [int(idx.searchsorted(pd.Timestamp(d), side="left")) for d in ev["date"]]
            for i, (s, v) in enumerate(zip(starts, ev[label_col])):
                if s >= n:
                    continue  # event falls after the last available price date
                e = n - 1 if forward_horizon is None else min(s + forward_horizon - 1, n - 1)
                if i + 1 < len(starts):
                    # A newer event takes over from its own start row.
                    e = min(e, starts[i + 1] - 1)
                if e >= s:
                    values[s:e + 1] = float(v)
        series[tkr] = pd.Series(values, index=idx)

    df = pd.DataFrame(series)
    df.index.name = "date"
    return df

def ev_join_truth(fe_df: pd.DataFrame, preds: pd.DataFrame, yhat_col: str, label_col: str) -> pd.DataFrame:
    """Attach realised labels to test-period predictions.

    Takes the rows of ``fe_df`` dated on or after ``TEST_START``, renames
    ``label_col`` to "label", and inner-joins them onto ``preds`` by
    ("ticker", "date"). ``yhat_col`` and "label" are cast to float.
    """
    in_test = fe_df["date"] >= TEST_START
    truth = fe_df.loc[in_test, ["ticker", "date", label_col]]
    truth = truth.rename(columns={label_col: "label"})
    joined = preds.merge(truth, on=["ticker", "date"], how="inner")
    return joined.astype({yhat_col: float, "label": float})

def daily_ic(df_eval: pd.DataFrame, yhat_col: str):
    """Summarise the per-date Spearman rank correlation of prediction vs label.

    Args:
        df_eval: Frame with "date", "label" and ``yhat_col`` columns.
        yhat_col: Name of the prediction column.

    Returns:
        Tuple ``(mean, std, mean / std)`` of the per-date correlations, using
        the population standard deviation. The ratio is NaN when the
        standard deviation is not strictly positive.
    """
    def _rank_corr(day_rows):
        return spearmanr(day_rows[yhat_col], day_rows["label"]).correlation

    ic = df_eval.groupby("date").apply(_rank_corr)
    mu = ic.mean()
    sd = ic.std(ddof=0)
    if sd > 0:
        ratio = mu / sd
    else:
        ratio = np.nan
    return mu, sd, ratio

# ---------------------------------------------------------------------------
# Load feature tables and price data; filter universe (min 10-K/10-Q counts)
# ---------------------------------------------------------------------------
fe8 = pd.read_parquet(FE_8K)
fe10 = pd.read_parquet(FE_10KQ)
# Ticker count before filtering (kept for reference; not used below).
n0 = fe10["ticker"].nunique()
# Per-ticker count of rows by file_type; absent types are filled with 0.
cnt = fe10.groupby("ticker")["file_type"].value_counts().unstack(fill_value=0)
# Keep tickers with strictly more than MIN10k "10k" rows AND MIN10q "10q" rows.
keep = cnt[(cnt.get("10k",0) > MIN10k) & (cnt.get("10q",0) > MIN10q)].index
# .copy() so the in-place edits in the loop below act on independent frames.
fe10 = fe10[fe10["ticker"].isin(keep)].copy()
fe8  = fe8[fe8["ticker"].isin(keep)].copy()

# Normalise both feature tables in place: datetime dates, chronological order,
# file_type replaced by a 0/1 "is_10k" flag, and bool columns cast to int8.
for d in (fe8, fe10):
    d["date"] = pd.to_datetime(d["date"])
    d.sort_values(["date","ticker"], inplace=True)
    if "file_type" in d.columns:
        d["is_10k"] = (d["file_type"] == "10k").astype("int8")
        d.drop(columns=["file_type"], inplace=True)
    b = d.select_dtypes(include=["bool"]).columns
    if len(b): d[b] = d[b].astype("int8")

# Prices restricted to the kept universe, sorted per ticker by date.
px = pd.read_parquet(PRICES_3COL)
px["date"] = pd.to_datetime(px["date"])
px = px.dropna(subset=["ticker","date"]).sort_values(["ticker","date"])
px = px[px["ticker"].isin(keep)].copy()

# One label table per horizon. The merge is an exact (ticker, date) match, so
# events dated on a day with no price row, or with an incomplete label window,
# get a NaN label and are dropped.
t8  = make_forward_labels(px, FORWARD_8K)
t10 = make_forward_labels(px, FORWARD_10KQ)
fe8  = fe8.merge(t8,  on=["ticker","date"], how="left").dropna(subset=[TRAIN_LABEL])
fe10 = fe10.merge(t10, on=["ticker","date"], how="left").dropna(subset=[TRAIN_LABEL])

# Settings shared by every grid candidate: a generous tree budget that early
# stopping trims, plus a fixed seed.
base = dict(n_estimators=4000, random_state=SEED)
grid = [
    dict(zip(
        ["num_leaves","learning_rate","max_depth","subsample","colsample_bytree"],
        combo
    ))
    for combo in itertools.product([31,63],[0.01,0.05,0.1],[-1,10],[0.8,1.0],[0.8,1.0])
]

# Tune only on rows whose label window closes before the test period, using
# the same cutoff train_and_fit applies. Building folds over all of fe10 would
# let test-period events influence hyperparameter selection.
cv_cutoff = TEST_START - BDay(FORWARD_10KQ)
cv_df = fe10[fe10["date"] < cv_cutoff]
X = cv_df.drop(columns=["ticker","date",TRAIN_LABEL]).values
y = cv_df[TRAIN_LABEL].values

best, best_rmse = None, np.inf
for p in grid:
    rmses = []
    # Fold indices are positional, matching the .values arrays above.
    for tr_idx, va_idx in purged_embargo_folds(cv_df["date"], n_splits=3, embargo_days=FORWARD_10KQ):
        m = LGBMRegressor(**base, **p)
        m.fit(X[tr_idx], y[tr_idx], eval_set=[(X[va_idx], y[va_idx])], eval_metric="rmse",
              callbacks=[early_stopping(200), log_evaluation(0)])
        pred = m.predict(X[va_idx], num_iteration=m.best_iteration_)
        rmses.append(np.sqrt(mean_squared_error(y[va_idx], pred)))
    if not rmses:
        continue  # no usable fold for this candidate
    rmse = float(np.mean(rmses))
    if rmse < best_rmse:
        best_rmse, best = rmse, p

if best is None:
    raise RuntimeError("Grid search produced no valid CV fold; cannot select LightGBM parameters.")

# Carry the tree budget used during tuning into the final fit. Passing only
# `best` would silently fall back to LightGBM's default n_estimators.
# random_state is left out because train_and_fit supplies it from its `seed`.
LGB_PARAMS = {"n_estimators": base["n_estimators"], **best}
print("Best params:", LGB_PARAMS)

# Fit one model per event type and predict the test-period events.
p10, m10, X10 = train_and_fit(fe10, TRAIN_LABEL, LGB_PARAMS, TEST_START, FORWARD_10KQ, seed=SEED)
p10.rename(columns={"yhat":"sig10"}, inplace=True)
p8, m8, X8 = train_and_fit(fe8, TRAIN_LABEL, LGB_PARAMS, TEST_START, FORWARD_8K, seed=SEED)
p8.rename(columns={"yhat":"sig8"}, inplace=True)

# Hold each event prediction on the daily grid for its own horizon.
M10 = build_daily_signal(p10, "sig10", FORWARD_10KQ, px, TEST_START)
M8  = build_daily_signal(p8,  "sig8",  FORWARD_8K,   px, TEST_START)

Z10, Z8 = zscore_rows(M10), zscore_rows(M8)
# Plain `+` yields NaN wherever either signal is inactive, which would limit
# the alpha to days on which a ticker has BOTH an open 10-K/Q window and an
# open 8-K window. With fill_value=0 an inactive signal contributes nothing,
# and cells missing in both inputs remain NaN.
SIG = Z10.add(ADJ_8K_WEIGHT * Z8, fill_value=0.0).astype(float)
ALPHA = rank_to_alpha_100(SIG)
# Name the index explicitly so reset_index always emits a "date" column.
ALPHA_TEST = ALPHA.loc[ALPHA.index >= TEST_START].rename_axis("date").reset_index()

ALPHA_TEST.to_parquet(ALPHA_WIDE_OUT, index=False)

# Event-level evaluation on the test period: join predictions with realised labels.
e10 = ev_join_truth(fe10, p10, "sig10", TRAIN_LABEL)
e8  = ev_join_truth(fe8,  p8,  "sig8",  TRAIN_LABEL)

for name, df, col in [("10-K/Q", e10, "sig10"), ("8-K", e8, "sig8")]:
    # Per-date rank IC summary; daily_ic accepts the prediction column name directly.
    mu, sd, ir = daily_ic(df, col)
    # Share of events whose predicted sign matches the realised sign.
    acc = (np.sign(df["label"]) == np.sign(df[col])).mean()
    # The separator was a mis-encoded em dash ("â€”"); restored to the intended character.
    print(f"{name} — IC mean={mu:.4f}, IR={ir:.2f}, sign acc={100*acc:.2f}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3579
[LightGBM] [Info] Number of data points in the train set: 9791, number of used features: 17
[LightGBM] [Info] Start training from score 0.000263
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[221]	valid_0's rmse: 0.169678	valid_0's l2: 0.0287906
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3579
[LightGBM] [Info] Number of data points in the train set: 9534, number of used features: 17
[LightGBM] [Info] Start training from score 0.000567
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[9]	valid_0's rmse: 0.108449	valid_0's l2: 0.0117612
[LightGBM] [Info

## SHAP Analysis of LGBM

In [None]:
import matplotlib.pyplot as plt
import shap

# ---------------------------------------------------------------------------
# SHAP interpretability: summary bar, beeswarm, and dependence plots
# ---------------------------------------------------------------------------
def shap_plots_for_model(model, X_test: pd.DataFrame, tag: str, max_display: int = 20, dep_top_k: int = 6, sample_n: int = 2000):
    """Save SHAP summary and dependence plots for a fitted tree model.

    Writes PNG files into ``<MYPATH>/OUTPUT``:
    ``shap_<tag>_summary_bar.png``, ``shap_<tag>_summary_beeswarm.png`` and
    one ``shap_<tag>_dependence_<feature>.png`` for each of the
    ``dep_top_k`` features with the largest mean absolute SHAP value.

    Args:
        model: Fitted model accepted by ``shap.TreeExplainer``.
        X_test: Feature frame to explain; column names label the plots.
        tag: Identifier embedded in the output file names.
        max_display: Maximum number of features shown in the summary plots.
        dep_top_k: Number of top features that get a dependence plot.
        sample_n: If ``X_test`` has more rows than this, a random sample of
            this size (seeded with ``SEED``) is explained instead.

    Returns:
        None.
    """
    explainer = shap.TreeExplainer(model)

    if len(X_test) > sample_n:
        X_plot = X_test.sample(sample_n, random_state=SEED)
    else:
        X_plot = X_test

    shap_values = explainer.shap_values(X_plot, check_additivity=False)

    outdir = Path(MYPATH) / "OUTPUT"
    outdir.mkdir(parents=True, exist_ok=True)

    # Global importance: mean |SHAP| per feature.
    plt.figure()
    shap.summary_plot(shap_values, X_plot, plot_type="bar", max_display=max_display, show=False)
    bar_path = outdir / f"shap_{tag}_summary_bar.png"
    plt.tight_layout()
    plt.savefig(bar_path, dpi=150)
    plt.close()

    # Beeswarm: distribution of per-row SHAP values for each feature.
    plt.figure()
    shap.summary_plot(shap_values, X_plot, max_display=max_display, show=False)
    bees_path = outdir / f"shap_{tag}_summary_beeswarm.png"
    plt.tight_layout()
    plt.savefig(bees_path, dpi=150)
    plt.close()

    mean_abs = np.abs(shap_values).mean(axis=0)
    order = np.argsort(-mean_abs)
    top_idx = order[:min(dep_top_k, X_plot.shape[1])]
    top_feats = X_plot.columns[top_idx]

    for feat in top_feats:
        # No plt.figure() here: shap.dependence_plot opens its own figure when
        # no `ax` is passed, so a pre-created figure stayed empty and was never
        # closed (plt.close() only closes the current, shap-made figure). That
        # left one blank "<Figure ... with 0 Axes>" per feature in the notebook.
        shap.dependence_plot(
            feat, shap_values, X_plot, interaction_index="auto", show=False
        )
        dep_path = outdir / f"shap_{tag}_dependence_{feat}.png"
        plt.tight_layout()
        plt.savefig(dep_path, dpi=150)
        plt.close()

# Write SHAP summary and dependence plots for both fitted models, using the
# test-period feature matrices returned by train_and_fit.
shap_plots_for_model(m10, X10, tag="10kq", max_display=20, dep_top_k=6)
shap_plots_for_model(m8,  X8,  tag="8k",   max_display=20, dep_top_k=6)


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>