In [None]:
import pandas as pd
from pathlib import Path
import requests, zipfile

# Release asset that holds the pre-processed per-ticker CSV folders.
DATA_URL = "https://github.com/rohanbadami/DS340W/releases/download/dataset/processed.zip"

# Everything is stored under ./data relative to the notebook's working directory.
ROOT = Path(".").resolve()
DATA_ROOT = ROOT / "data"
DATA_ROOT.mkdir(exist_ok=True)

ZIP_PATH = DATA_ROOT / "processed.zip"

if not ZIP_PATH.exists():
    print("Downloading dataset...")
    # Stream into a sibling ".part" file and rename only after the whole body
    # has been written: an interrupted download must never leave a truncated
    # processed.zip behind, because the exists() check above would accept it
    # on the next run and the extraction below would then fail.
    tmp_path = ZIP_PATH.with_name(ZIP_PATH.name + ".part")
    try:
        # (connect timeout, read timeout) in seconds; without a timeout a
        # stalled connection would hang the cell forever.
        with requests.get(DATA_URL, stream=True, timeout=(10, 120)) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        tmp_path.replace(ZIP_PATH)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # the partial file.
        tmp_path.unlink(missing_ok=True)
    print("Download complete.")
else:
    print("Dataset already downloaded.")

print("Extracting dataset...")
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(DATA_ROOT)
print("Extraction complete.")

# this is where the per-ticker folders live: processed/<TICKER>/
DATA_DIR = DATA_ROOT / "processed"
print("Processed data directory:", DATA_DIR)

# pick a mixed set across sectors and macro.
TICKERS = ["AMAT", "AMT", "DVN", "EA", "EXPE", "KO", "PEP", "TXN", "USO", "XLF"]

# define the required price columns.
REQUIRED_PRICE_COLS = ["date", "open", "high", "low", "close", "adj_close", "volume", "ticker"]

# define the required text columns.
REQUIRED_TEXT_COLS  = ["date", "summary", "ticker"]

def _find_file(ticker: str, kind: str) -> Path:
    """Return the path ``DATA_DIR/<ticker>/<kind>.csv``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    candidate = DATA_DIR.joinpath(ticker, f"{kind}.csv")
    if not candidate.exists():
        raise FileNotFoundError(f"Could not find {kind}.csv for {ticker} at {candidate}")
    return candidate

def _normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [
        c.strip()
         .lower()
         .replace("_", " ")
         .replace("-", " ")
        for c in df.columns
    ]
    return df

def _rename_canonical_prices(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the column labels of a price frame and map known aliases
    onto the canonical names (``date``, ``open``, ``high``, ``low``,
    ``close``, ``adj_close``, ``volume``, ``record_id``).

    Labels without an entry in the alias table are kept in their normalized
    form. A new frame is returned.
    """
    aliases = {
        # date spellings
        "date time": "date",
        "datetime": "date",
        "date": "date",
        # plain OHLC
        "open": "open",
        "high": "high",
        "low": "low",
        "close": "close",
        # adjusted close spellings
        "adj close": "adj_close",
        "adjusted close": "adj_close",
        "adjclose": "adj_close",
        "adj_close": "adj_close",
        "volume": "volume",
        # record identifier spellings
        "record id": "record_id",
        "record_id": "record_id",
    }
    normalized = _normalize_cols(df)
    return normalized.rename(columns=lambda label: aliases.get(label, label))

def _rename_canonical_text(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the column labels of a text frame and map known aliases
    onto the canonical names ``date`` and ``summary``.

    Labels without an entry in the alias table are kept in their normalized
    form. A new frame is returned.
    """
    aliases = {
        # date spellings
        "date time": "date",
        "datetime": "date",
        "date": "date",
        # columns that carry the article body
        "summary": "summary",
        "text": "summary",
        "article": "summary",
    }
    normalized = _normalize_cols(df)
    return normalized.rename(columns=lambda label: aliases.get(label, label))

def load_and_standardize_ticker(ticker: str):
    """Load one ticker's price and text CSVs and reduce them to the required columns.

    Steps: locate ``time_series.csv`` and ``text.csv`` for the ticker, read
    them, canonicalize the column names, stamp the ``ticker`` column, verify
    that every required column is present, parse ``date`` (unparseable values
    become NaT), and keep only ``REQUIRED_PRICE_COLS`` / ``REQUIRED_TEXT_COLS``.

    Returns:
        ``(prices, text, price_path, text_path)``.

    Raises:
        FileNotFoundError: if either CSV is missing (from ``_find_file``).
        ValueError: if a required column is missing after renaming; the
            price frame is checked first.
    """
    price_path = _find_file(ticker, "time_series")
    text_path  = _find_file(ticker, "text")

    prices = _rename_canonical_prices(pd.read_csv(price_path))
    text   = _rename_canonical_text(pd.read_csv(text_path))

    prices["ticker"] = ticker
    text["ticker"]   = ticker

    # Validate BEFORE indexing the "date" column: otherwise a file without a
    # recognizable date column dies with a bare KeyError and the informative
    # message below (which lists the available columns) is never reached.
    missing_price = sorted(set(REQUIRED_PRICE_COLS) - set(prices.columns))
    missing_text  = sorted(set(REQUIRED_TEXT_COLS)  - set(text.columns))
    if missing_price:
        raise ValueError(
            f"[{ticker}] Missing required price cols: {missing_price}. "
            f"Available: {sorted(prices.columns)}"
        )
    if missing_text:
        raise ValueError(
            f"[{ticker}] Missing required text cols: {missing_text}. "
            f"Available: {sorted(text.columns)}"
        )

    prices["date"] = pd.to_datetime(prices["date"], errors="coerce")
    text["date"]   = pd.to_datetime(text["date"],   errors="coerce")

    prices = prices[REQUIRED_PRICE_COLS].copy()
    text   = text[REQUIRED_TEXT_COLS].copy()

    return prices, text, price_path, text_path

# per-ticker containers: standardized prices, standardized text, source paths.
prices_by_ticker = {}
text_by_ticker = {}
paths = {}

for t in TICKERS:
    p, tx, p_path, tx_path = load_and_standardize_ticker(t)
    prices_by_ticker[t], text_by_ticker[t] = p, tx
    paths[t] = dict(time_series=str(p_path), text=str(tx_path))
    print(f"{t}: prices={len(p):,} rows, text={len(tx):,} rows")

# preview the first rows of one ticker (prices first, then text).
for preview in (prices_by_ticker["AMAT"], text_by_ticker["AMAT"]):
    display(preview.head())


In [None]:
import pandas as pd

def normalize_day(df: pd.DataFrame, date_col="date") -> pd.DataFrame:
    """Return a copy of ``df`` whose ``date_col`` holds midnight timestamps
    without timezone information.

    Values are parsed with ``errors="coerce"`` (unparseable entries become
    NaT) and floored to the day. The timezone is then removed; a TypeError or
    AttributeError from that last step is ignored.
    """
    out = df.copy()
    parsed = pd.to_datetime(out[date_col], errors="coerce")
    out[date_col] = parsed
    out[date_col] = out[date_col].dt.floor("D")
    try:
        naive = out[date_col].dt.tz_localize(None)
    except (TypeError, AttributeError):
        # best effort: keep the floored values as they are.
        return out
    out[date_col] = naive
    return out

for t in TICKERS:
    # day-level dates, chronological order, fresh 0..n-1 index.
    p = normalize_day(prices_by_ticker[t], "date").sort_values("date").reset_index(drop=True)
    x = normalize_day(text_by_ticker[t], "date").sort_values("date").reset_index(drop=True)

    # one price row per (ticker, day); one text row per (ticker, day, summary).
    p = p.drop_duplicates(subset=["ticker", "date"])
    x = x.drop_duplicates(subset=["ticker", "date", "summary"])

    prices_by_ticker[t] = p
    text_by_ticker[t] = x

    # report the covered date span and row counts.
    print(
        f"{t}: prices dates [{p['date'].min().date()} \u2192 {p['date'].max().date()}], "
        f"text dates [{x['date'].min().date()} \u2192 {x['date'].max().date()}], "
        f"price_rows={len(p):,}, text_rows={len(x):,}"
    )

for preview in (prices_by_ticker["AMAT"], text_by_ticker["AMAT"]):
    display(preview.head())


In [None]:
import pandas as pd
import numpy as np

# price fields that must be numeric.
PRICE_NUMERIC_COLS = ["open", "high", "low", "close", "adj_close", "volume"]

def clean_prices_one_ticker(df: pd.DataFrame, ticker: str) -> tuple[pd.DataFrame, dict]:
    """Clean one ticker's price frame and describe what was removed.

    The numeric price fields are coerced with ``errors="coerce"``; rows
    without ``adj_close`` and rows without ``date`` or ``ticker`` are
    dropped; rows are sorted by (ticker, date); for repeated (ticker, date)
    pairs only the last row is kept.

    Returns:
        ``(cleaned_frame, report)`` where ``report`` holds row counts for each
        step, the min/max date, and counts of suspicious rows (negative
        volume; high below low, or negative close/open). Suspicious rows are
        counted, not removed.

    Raises:
        ValueError: if ``date``, ``ticker`` or a numeric price column is absent.
    """
    required = ["date", "ticker", *PRICE_NUMERIC_COLS]
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise ValueError(f"[{ticker}] Missing required columns in prices: {missing}. Found: {sorted(df.columns)}")

    work = df.copy()
    for name in PRICE_NUMERIC_COLS:
        work[name] = pd.to_numeric(work[name], errors="coerce")

    # step 1: rows must carry an adjusted close.
    n_start = len(work)
    work = work.dropna(subset=["adj_close"]).copy()
    n_with_adj = len(work)

    # step 2: rows must carry both key fields.
    work = work.dropna(subset=["date", "ticker"]).copy()
    n_with_keys = len(work)

    # step 3: order rows, then discard all but the last row of each key.
    work = work.sort_values(["ticker", "date"]).reset_index(drop=True)
    superseded = work.duplicated(subset=["ticker", "date"], keep="last")
    n_superseded = int(superseded.sum())
    if n_superseded > 0:
        work = work[~superseded].copy()

    # sanity counters (informational only).
    negative_volume = work["volume"] < 0
    implausible = (work["high"] < work["low"]) | (work["close"] < 0) | (work["open"] < 0)

    report = {
        "ticker": ticker,
        "rows_before": n_start,
        "dropped_missing_adj_close": int(n_start - n_with_adj),
        "dropped_missing_date_or_ticker": int(n_with_adj - n_with_keys),
        "deduped_rows": n_superseded,
        "rows_after": len(work),
        "min_date": work["date"].min(),
        "max_date": work["date"].max(),
        "neg_volume_rows": int(negative_volume.sum()),
        "bad_ohlc_rows": int(implausible.sum()),
    }
    return work, report

# clean every ticker's prices in place and keep one report row per ticker.
clean_reports = []
for t in TICKERS:
    cleaned, rep = clean_prices_one_ticker(prices_by_ticker[t], t)
    prices_by_ticker[t] = cleaned
    clean_reports.append(rep)

# one table with the effect of cleaning on each ticker.
clean_reports_df = pd.DataFrame(clean_reports)
display(clean_reports_df)

# first rows of one cleaned ticker.
amat_prices_preview = prices_by_ticker["AMAT"].head()
display(amat_prices_preview)




In [None]:
import pandas as pd

def clean_text_one_ticker(df: pd.DataFrame, ticker: str) -> tuple[pd.DataFrame, dict]:
    """Clean one ticker's text frame and describe what was removed.

    ``summary`` is cast to pandas' string dtype; rows whose summary is
    missing or blank after stripping whitespace are dropped; exact repeats of
    (ticker, date, summary) are dropped; rows are sorted by (ticker, date).

    Returns:
        ``(cleaned_frame, report)`` where ``report`` holds the row counts for
        each step and the min/max date of the remaining rows.

    Raises:
        ValueError: if ``date``, ``summary`` or ``ticker`` is absent.
    """
    required = ("date", "summary", "ticker")
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise ValueError(f"[{ticker}] Missing required columns in text: {missing}. Found: {sorted(df.columns)}")

    n_start = len(df)
    work = df.copy()

    # step 1: keep only rows with non-blank text.
    work["summary"] = work["summary"].astype("string")
    work = work.dropna(subset=["summary"]).copy()
    not_blank = work["summary"].str.strip().ne("")
    work = work[not_blank].copy()
    n_with_text = len(work)

    # step 2: collapse exact repeats.
    work = work.drop_duplicates(subset=["ticker", "date", "summary"]).copy()
    n_unique = len(work)

    # step 3: deterministic order for later merges.
    work = work.sort_values(["ticker", "date"]).reset_index(drop=True)

    report = {
        "ticker": ticker,
        "rows_before": n_start,
        "dropped_missing_or_empty_summary": int(n_start - n_with_text),
        "deduped_exact_rows": int(n_with_text - n_unique),
        "rows_after": len(work),
        "min_date": work["date"].min(),
        "max_date": work["date"].max(),
    }
    return work, report

# clean every ticker's text in place and keep one report row per ticker.
text_reports = []
for t in TICKERS:
    cleaned, rep = clean_text_one_ticker(text_by_ticker[t], t)
    text_by_ticker[t] = cleaned
    text_reports.append(rep)

# one table with the effect of cleaning on each ticker.
text_reports_df = pd.DataFrame(text_reports)
display(text_reports_df)

# first rows of one cleaned ticker.
amat_text_preview = text_by_ticker["AMAT"].head()
display(amat_text_preview)


In [None]:
# creating trading-day calendar (per ticker)

import pandas as pd

# one index of sorted, unique price dates per ticker.
trading_calendar = {
    t: pd.Index(prices_by_ticker[t]["date"].sort_values().unique(), name="date")
    for t in TICKERS
}

# size and span of each calendar.
calendar_reports = [
    {
        "ticker": t,
        "n_trading_days": int(len(cal)),
        "min_date": cal.min(),
        "max_date": cal.max(),
    }
    for t in TICKERS
    for cal in [trading_calendar[t]]
]

calendar_reports_df = pd.DataFrame(calendar_reports)
display(calendar_reports_df)

# example to show first/last 5 dates for AMAT
t = "AMAT"
for label, days in (("first", trading_calendar[t][:5]), ("last ", trading_calendar[t][-5:])):
    print(f"{t} {label} 5 trading days: {days.tolist()}")



In [None]:
# confirming prev step output, calendar report + spot-check vs price row 

import pandas as pd

# rebuild the report, this time next to the number of price rows so the two
# counts can be compared per ticker.
calendar_reports = [
    {
        "ticker": t,
        "price_rows": int(len(prices_by_ticker[t])),
        "n_trading_days": int(len(trading_calendar[t])),
        "min_date": trading_calendar[t].min(),
        "max_date": trading_calendar[t].max(),
        "matches_price_rows": int(len(prices_by_ticker[t])) == int(len(trading_calendar[t])),
    }
    for t in TICKERS
]

calendar_reports_df = (
    pd.DataFrame(calendar_reports)
    .sort_values("ticker")
    .reset_index(drop=True)
)
display(calendar_reports_df)

separator = "-" * 60
for t in TICKERS:
    cal = trading_calendar[t]
    print(f"{t} first 5 trading days: {cal[:5].tolist()}")
    print(f"{t} last  5 trading days: {cal[-5:].tolist()}")
    print(separator)
    print("-" * 60)



In [None]:
import numpy as np
import pandas as pd

# aligned text frames, keyed by ticker.
aligned_text_by_ticker = {}
# one alignment report dict per ticker.
align_reports = []

def align_text_to_calendar(text_df: pd.DataFrame, cal: pd.Index, ticker: str):
    """Attach to every article the first trading day on or after its date.

    Article dates are parsed (``errors="coerce"``) and floored to the day.
    Rows with an invalid date, and rows dated after the last calendar day,
    are dropped. The result gets an ``aligned_date`` column and a fresh index.

    Returns:
        ``(aligned_frame, report)`` where ``report`` counts the dropped rows,
        the rows that stayed on their own day, the rows moved forward, and
        gives the first/last calendar date.

    Raises:
        ValueError: if the calendar holds no dates.
    """
    frame = text_df.copy()
    frame["date"] = pd.to_datetime(frame["date"], errors="coerce").dt.floor("D")

    sessions = np.array(pd.to_datetime(cal).sort_values().unique(), dtype="datetime64[ns]")
    if len(sessions) == 0:
        raise ValueError(f"[{ticker}] Trading calendar is empty.")

    # discard rows whose date could not be parsed.
    n_input = len(frame)
    frame = frame.dropna(subset=["date"]).copy()
    n_dated = len(frame)

    # side="left": an article published on a trading day maps to that same
    # day; otherwise it maps to the next later trading day.
    stamps = frame["date"].values.astype("datetime64[ns]")
    slot = np.searchsorted(sessions, stamps, side="left").astype(np.int64)

    # slot == len(sessions) means "later than every trading day".
    reachable = slot < len(sessions)
    target = np.full(shape=len(slot), fill_value=np.datetime64("NaT", "ns"), dtype="datetime64[ns]")
    target[reachable] = sessions[slot[reachable]]
    frame["aligned_date"] = pd.to_datetime(target)

    # discard rows that found no trading day.
    frame = frame.dropna(subset=["aligned_date"]).copy()
    n_aligned = len(frame)

    aligned_values = frame["aligned_date"].values.astype("datetime64[ns]")
    original_values = frame["date"].values.astype("datetime64[ns]")
    unchanged = int((aligned_values == original_values).sum())

    report = {
        "ticker": ticker,
        "text_rows_before": int(n_input),
        "dropped_invalid_date": int(n_input - n_dated),
        "dropped_after_last_trading_day": int(n_dated - n_aligned),
        "text_rows_after": int(n_aligned),
        "same_day": unchanged,
        "moved_forward": int(n_aligned - unchanged),
        "calendar_min": pd.to_datetime(sessions[0]).date(),
        "calendar_max": pd.to_datetime(sessions[-1]).date(),
    }
    return frame.reset_index(drop=True), report

# map every ticker's articles onto that ticker's own trading calendar.
for t in TICKERS:
    aligned, rep = align_text_to_calendar(text_by_ticker[t], trading_calendar[t], t)
    aligned_text_by_ticker[t] = aligned
    align_reports.append(rep)

# alignment statistics, one row per ticker.
align_reports_df = (
    pd.DataFrame(align_reports)
    .sort_values("ticker")
    .reset_index(drop=True)
)
display(align_reports_df)

# up to five articles per ticker whose date was pushed to a later trading day.
example_cols = ["date", "aligned_date", "ticker", "summary"]
for t in TICKERS:
    frame = aligned_text_by_ticker[t]
    was_moved = frame["aligned_date"] != frame["date"]
    moved_examples = frame.loc[was_moved, example_cols].head(5)
    print(f"\n{t}: moved-forward examples (if any)")
    display(moved_examples)


In [None]:
%pip -q install -U transformers torch



In [None]:
import pandas as pd

# Stack the aligned per-ticker frames into one article table.
all_articles = pd.concat(
    [aligned_text_by_ticker[t].assign(ticker=t) for t in TICKERS],
    ignore_index=True,
)

# We'll use aligned_date as the "effective" date for modeling.
# The aligned frames still carry the original publication "date", so it has to
# be dropped BEFORE the rename. Renaming alone yields two columns named "date",
# and any later "keep the first duplicate column" clean-up would silently keep
# the publication date instead of the aligned one.
all_articles = (
    all_articles
    .drop(columns=["date"])
    .rename(columns={"aligned_date": "date"})
    .loc[:, ["date", "ticker", "summary"]]
    .copy()
)
all_articles.head()




In [None]:
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Must be set before the model runs. NOTE(review): presumably lets ops that the
# MPS backend lacks fall back to CPU - confirm against the PyTorch docs.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Hugging Face model id of the sentiment classifier.
MODEL_ID = "ProsusAI/finbert"

# Device preference: Apple MPS, then CUDA, then CPU.
_device_name = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
device = torch.device(_device_name)

print("Using device:", device)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
model = model.to(device)
model.eval()

# class index -> label as stored in the model config, and the inverse lookup
# keyed by the lower-cased label (e.g., positive/negative/neutral).
id2label = model.config.id2label
label2id = {name.lower(): idx for idx, name in id2label.items()}

def finbert_score(texts, batch_size=32, max_length=256):
    """Score texts with the loaded FinBERT model.

    Args:
        texts: iterable of strings; it is materialized into a list.
        batch_size: number of texts per forward pass; must be >= 1.
        max_length: token limit passed to the tokenizer (longer inputs are
            truncated).

    Returns:
        ``(pos, neg, neu, score)``: four 1-D arrays with one entry per text.
        ``pos``/``neg``/``neu`` are the softmax probabilities of the labels
        "positive"/"negative"/"neutral" (all-NaN if the model config has no
        such label) and ``score`` is ``pos - neg``. For empty input four
        empty arrays are returned.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # np.concatenate([]) raises, so answer the empty case explicitly.
        return tuple(np.empty(0, dtype=np.float32) for _ in range(4))

    # Label positions do not change between batches; look them up once.
    pos_idx = label2id.get("positive")
    neg_idx = label2id.get("negative")
    neu_idx = label2id.get("neutral")

    def _column(probs, idx):
        # Probability column of one label, or NaNs when the label is unknown.
        if idx is None:
            return np.full(probs.shape[0], np.nan)
        return probs[:, idx]

    all_pos, all_neg, all_neu = [], [], []

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            enc = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            logits = model(**enc).logits
            probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()

            all_pos.append(_column(probs, pos_idx))
            all_neg.append(_column(probs, neg_idx))
            all_neu.append(_column(probs, neu_idx))

    pos = np.concatenate(all_pos)
    neg = np.concatenate(all_neg)
    neu = np.concatenate(all_neu)
    score = pos - neg
    return pos, neg, neu, score

# score every summary once and store the four outputs as columns.
texts = all_articles["summary"].astype(str).tolist()
pos, neg, neu, score = finbert_score(texts, batch_size=32, max_length=256)

for column, values in zip(
    ("sent_pos", "sent_neg", "sent_neu", "sent_score"),
    (pos, neg, neu, score),
):
    all_articles[column] = values

all_articles.head()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# keep only the first of any columns that share a name.
repeated_name = all_articles.columns.duplicated()
all_articles = all_articles.loc[:, ~repeated_name].copy()

# the three class probabilities of each article should add up to ~1.
prob_sum = all_articles[["sent_pos", "sent_neg", "sent_neu"]].sum(axis=1)
print("prob_sum stats:\n", prob_sum.describe())
distance_from_one = (prob_sum - 1.0).abs()
print("rows far from 1.0:", int(distance_from_one.gt(1e-3).sum()))

# confidence = probability mass that is NOT on "neutral" (useful for weighting).
all_articles["sent_conf"] = 1.0 - all_articles["sent_neu"]


def _share(flags) -> float:
    """Fraction of True values in a boolean Series."""
    return float(flags.mean())


# per-ticker sentiment summary, most positive ticker first.
ticker_summary = (
    all_articles.groupby("ticker")
    .agg(
        n=("sent_score", "size"),
        mean_score=("sent_score", "mean"),
        median_score=("sent_score", "median"),
        mean_neu=("sent_neu", "mean"),
        mean_conf=("sent_conf", "mean"),
        frac_strong=("sent_score", lambda s: _share(s.abs() >= 0.5)),
        frac_pos=("sent_score", lambda s: _share(s > 0.1)),
        frac_neg=("sent_score", lambda s: _share(s < -0.1)),
    )
    .sort_values("mean_score", ascending=False)
)
display(ticker_summary)

# plot 1: boxplot of sentiment score per ticker (same order as the table).
tickers = ticker_summary.index.tolist()
scores_by_ticker = all_articles.groupby("ticker")["sent_score"]
data = [scores_by_ticker.get_group(t).to_numpy() for t in tickers]

fig_box, ax_box = plt.subplots(figsize=(12, 4))
ax_box.boxplot(data, labels=tickers, showfliers=False)
ax_box.axhline(0, linewidth=1)
ax_box.set_title("finbert sentiment score distribution by ticker")
ax_box.set_ylabel("sent_score (pos - neg)")
fig_box.tight_layout()
plt.show()

# plot 2: mean sentiment and mean neutrality per ticker, side by side.
fig_bar, ax_bar = plt.subplots(figsize=(12, 4))
x = np.arange(len(tickers))
ax_bar.bar(x - 0.2, ticker_summary["mean_score"].to_numpy(), width=0.4, label="mean score")
ax_bar.bar(x + 0.2, ticker_summary["mean_neu"].to_numpy(), width=0.4, label="mean neutral prob")
ax_bar.axhline(0, linewidth=1)
ax_bar.set_xticks(x)
ax_bar.set_xticklabels(tickers)
ax_bar.set_title("average sentiment vs average neutrality by ticker")
fig_bar.tight_layout()
ax_bar.legend()
plt.show()


In [None]:
import time

# Time FinBERT on (at most) the first 200 summaries.
sample = all_articles["summary"].astype(str).head(200).tolist()

# perf_counter is monotonic, so the measurement cannot be distorted by a
# system clock adjustment the way time.time() can.
t0 = time.perf_counter()
_ = finbert_score(sample, batch_size=32, max_length=256)
t1 = time.perf_counter()

# Report the number of summaries actually scored: head(200) returns fewer rows
# when the table is shorter, and a hard-coded 200 would overstate throughput.
elapsed = t1 - t0
print(f"Seconds for {len(sample)} summaries:", round(elapsed, 3))
print("Summaries/sec:", round(len(sample) / elapsed, 3) if elapsed > 0 else float("nan"))

In [None]:
import pandas as pd

# size of the table plus its first and last ten rows.
print("shape:", all_articles.shape)
for preview in (all_articles.head(10), all_articles.tail(10)):
    display(preview)
# column names and dtypes.
print("\ncolumns:", list(all_articles.columns))
print("\ndtypes:\n", all_articles.dtypes)

# column names that occur more than once.
repeated_name = all_articles.columns.duplicated()
dupe_cols = all_articles.columns[repeated_name].tolist()
print("\nduplicate column names:", dupe_cols)

# keep only the first occurrence of each repeated name.
if dupe_cols:
    all_articles = all_articles.loc[:, ~all_articles.columns.duplicated()].copy()
    print("\nDropped duplicate columns. New columns:", list(all_articles.columns))

# day-level datetimes; unparseable values become NaT.
all_articles["date"] = pd.to_datetime(all_articles["date"], errors="coerce").dt.floor("D")
first_day, last_day = all_articles["date"].min(), all_articles["date"].max()
print("\nDate range:", first_day, "→", last_day)

# missing values per key column, worst first.
key_cols = ["date", "ticker", "summary", "sent_pos", "sent_neg", "sent_neu", "sent_score"]
missing = all_articles[key_cols].isna().sum().sort_values(ascending=False)
print("\nMissing values (key cols):")
display(missing)


def _mean_text_length(summaries) -> float:
    """Average number of characters per summary."""
    return summaries.astype(str).str.len().mean()


# article count, date span and average summary length per ticker.
ticker_stats = (
    all_articles.groupby("ticker")
    .agg(
        articles=("summary", "count"),
        min_date=("date", "min"),
        max_date=("date", "max"),
        avg_len=("summary", _mean_text_length),
    )
    .sort_values("articles", ascending=False)
)
display(ticker_stats)

# distribution of the sentiment outputs.
sentiment_cols = ["sent_pos", "sent_neg", "sent_neu", "sent_score"]
display(all_articles[sentiment_cols].describe())

# number of articles per (ticker, day), busiest first.
per_day = (
    all_articles.groupby(["ticker", "date"])
    .size()
    .reset_index(name="articles_that_day")
    .sort_values("articles_that_day", ascending=False)
)
print("\nTop 20 (ticker, day) by number of articles:")
display(per_day.head(20))

# how the daily counts are distributed.
print("\nDistribution of articles per day (all tickers):")
display(per_day["articles_that_day"].describe())

# look at the busiest (ticker, day) in detail.
if not per_day.empty:
    ex = per_day.iloc[0]
    tkr, day = ex["ticker"], ex["date"]
    print(f"\nExample heavy-news day: ticker={tkr}, date={day.date()}, count={int(ex['articles_that_day'])}")
    on_that_day = (all_articles["ticker"] == tkr) & (all_articles["date"] == day)
    display(all_articles.loc[on_that_day, ["date", "ticker", "sent_score", "summary"]].head(10))



In [None]:
import numpy as np
import pandas as pd

# map article dates onto each ticker's trading calendar.
def add_aligned_date(all_articles: pd.DataFrame, trading_calendar: dict) -> pd.DataFrame:
    """Return a copy of ``all_articles`` with alignment columns added.

    ``date`` is parsed (``errors="coerce"``) and floored to the day. For each
    ticker that has a non-empty calendar, ``aligned_date`` is the first
    calendar day on or after ``date``. Rows that end up without an
    ``aligned_date`` (ticker without calendar, invalid date, or date after
    the last calendar day) are dropped.

    Added columns: ``aligned_date``, ``shift_days`` (int32, whole days
    between the two dates) and ``moved_forward`` (``shift_days > 0``).
    """
    out = all_articles.copy()
    out["date"] = pd.to_datetime(out["date"], errors="coerce").dt.floor("D")

    # start with "not aligned" everywhere; groups fill in their own rows.
    out["aligned_date"] = pd.NaT

    for ticker, rows in out.groupby("ticker", sort=False):
        if ticker not in trading_calendar:
            continue

        sessions = np.array(
            pd.to_datetime(trading_calendar[ticker]).sort_values().unique(),
            dtype="datetime64[ns]"
        )
        if len(sessions) == 0:
            continue

        # side="left": same day when it is a trading day, else the next one.
        stamps = rows["date"].values.astype("datetime64[ns]")
        slot = np.searchsorted(sessions, stamps, side="left").astype(np.int64)
        reachable = slot < len(sessions)

        target = np.full(len(slot), np.datetime64("NaT", "ns"), dtype="datetime64[ns]")
        target[reachable] = sessions[slot[reachable]]

        out.loc[rows.index, "aligned_date"] = pd.to_datetime(target)

    # rows still at NaT could not be aligned.
    out = out.dropna(subset=["aligned_date"]).copy()

    # distance between publication day and aligned day.
    gap = out["aligned_date"] - out["date"]
    out["shift_days"] = gap.dt.days.astype("int32")
    out["moved_forward"] = out["shift_days"] > 0
    return out

# one row per (ticker, trading day): prices plus that day's aggregated news.
def build_panel(prices_by_ticker: dict, articles_aligned: pd.DataFrame) -> pd.DataFrame:
    """Left-join daily news aggregates onto every ticker's price rows.

    Articles are grouped by (ticker, aligned_date) into ``news_count``,
    ``sent_score_obs`` (mean score), ``sent_conf_obs`` (mean confidence) and
    ``sent_abs_obs`` (mean absolute score). Price days without articles get
    ``news_count = 0`` and ``has_news = 0``; their other news columns stay
    NaN. The per-ticker results are sorted by (ticker, date) and stacked.
    """
    def _mean_abs(scores) -> float:
        return float(np.mean(np.abs(scores)))

    # several articles can share one aligned day, so aggregate first.
    daily_news = (
        articles_aligned.groupby(["ticker", "aligned_date"])
        .agg(
            news_count=("summary", "size"),
            sent_score_obs=("sent_score", "mean"),
            sent_conf_obs=("sent_conf", "mean"),
            sent_abs_obs=("sent_score", _mean_abs),
        )
        .reset_index()
    )

    def _panel_for(ticker, prices):
        day_prices = prices.copy()
        day_prices["date"] = pd.to_datetime(day_prices["date"], errors="coerce").dt.floor("D")

        merged = day_prices.merge(
            daily_news[daily_news["ticker"] == ticker],
            left_on=["ticker", "date"],
            right_on=["ticker", "aligned_date"],
            how="left",
        )

        # days without a match have no news.
        merged["news_count"] = merged["news_count"].fillna(0).astype("int32")
        merged["has_news"] = (merged["news_count"] > 0).astype("int8")

        return merged.sort_values(["ticker", "date"]).reset_index(drop=True)

    panels = [_panel_for(ticker, prices) for ticker, prices in prices_by_ticker.items()]
    return pd.concat(panels, ignore_index=True)

# number of rows since the most recent news row, per ticker.
def add_days_since_news(panel: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``panel`` with a float32 ``days_since_news`` column.

    Within each ticker, and in the row order given, the value is 0 on a row
    with ``has_news`` set and grows by one on every following row until the
    next news row. Rows before a ticker's first news row are NaN.
    """
    out = panel.copy()

    # running count of news rows: constant between two news rows.
    out["event_group"] = out.groupby("ticker")["has_news"].cumsum()

    # position inside the current run.
    run_position = out.groupby(["ticker", "event_group"]).cumcount().astype("float32")

    # nothing to count from before the first news row.
    before_first = (out["event_group"] == 0) & (out["has_news"] == 0)
    out["days_since_news"] = run_position.mask(before_first)

    return out.drop(columns=["event_group"])

# carry the last observed sentiment forward, shrinking it on no-news rows.
def add_exponential_decay(panel: pd.DataFrame, lam: float = 0.03, baseline: float = 0.0) -> pd.DataFrame:
    """Return ``panel`` with a float32 ``sent_score_decay`` column.

    Each ticker is processed in date order. On a row with news and a non-NaN
    ``sent_score_obs`` the feature equals that observation; on any other row
    the previous feature value is pulled towards ``baseline`` by the factor
    ``exp(-lam)``. Before the first observation the value starts at
    ``baseline``. The per-ticker pieces are concatenated with a fresh index.
    """
    shrink = float(np.exp(-lam))

    def _carry_forward(observed, fresh):
        carried = np.empty(len(observed), dtype=float)
        level = baseline
        for pos, (value, is_fresh) in enumerate(zip(observed, fresh)):
            if is_fresh and not np.isnan(value):
                level = float(value)
            else:
                level = baseline + (level - baseline) * shrink
            carried[pos] = level
        return carried

    pieces = []
    for _, rows in panel.copy().groupby("ticker", sort=False):
        rows = rows.sort_values("date").copy()
        feature = _carry_forward(
            rows["sent_score_obs"].to_numpy(dtype=float),
            rows["has_news"].to_numpy(dtype=bool),
        )
        rows["sent_score_decay"] = feature.astype("float32")
        pieces.append(rows)

    return pd.concat(pieces, ignore_index=True)

# articles with aligned dates and shift statistics.
articles_aligned = add_aligned_date(all_articles, trading_calendar)


def _count_false(flags) -> int:
    """Number of False entries in a boolean Series."""
    return int((~flags).sum())


# per ticker: how many articles kept their day and how many moved forward.
shift_report = (
    articles_aligned.groupby("ticker")
    .agg(
        n_articles=("summary", "size"),
        same_day=("moved_forward", _count_false),
        moved_forward=("moved_forward", "sum"),
        mean_shift_days=("shift_days", "mean"),
        max_shift_days=("shift_days", "max"),
    )
    .reset_index()
    .sort_values("ticker")
)
display(shift_report)

# trading-day panel -> days since news -> decayed sentiment.
panel = (
    build_panel(prices_by_ticker, articles_aligned)
    .pipe(add_days_since_news)
    .pipe(add_exponential_decay, lam=0.03, baseline=0.0)
)

# per ticker: share of trading days with news and the longest gap.
coverage_report = (
    panel.groupby("ticker")
    .agg(
        n_trading_days=("date", "size"),
        n_news_days=("has_news", "sum"),
        pct_news_days=("has_news", "mean"),
        max_days_since_news=("days_since_news", "max"),
    )
    .reset_index()
    .sort_values("pct_news_days")
)
display(coverage_report)

# first rows of the finished panel.
peek_cols = ["ticker", "date", "has_news", "news_count", "sent_score_obs", "sent_score_decay", "days_since_news"]
display(panel[peek_cols].head(20))


In [None]:
# rows for which a gap is defined (i.e. after the ticker's first news day).
rows_with_gap = panel.dropna(subset=["days_since_news"])

# the 30 rows with the longest time since news.
gap_cols = ["ticker", "date", "days_since_news", "has_news", "sent_score_decay"]
gap_rows = rows_with_gap.sort_values("days_since_news", ascending=False)[gap_cols].head(30)
display(gap_rows)

# gap distribution per ticker, longest maximum first.
gap_stats = (
    rows_with_gap.groupby("ticker")["days_since_news"]
    .agg(["count", "mean", "median", "max"])
    .sort_values("max", ascending=False)
)
display(gap_stats)


In [None]:
import pandas as pd

# one ticker, in date order, with a 0..n-1 index so labels equal positions.
t = "AMT"
g = panel.loc[panel["ticker"] == t].sort_values("date").reset_index(drop=True)

# row with the longest gap (rows without a gap count as -1).
imax = g["days_since_news"].fillna(-1).idxmax()
row = g.loc[imax, ["date", "days_since_news", "sent_score_decay", "has_news"]]
print("max gap row:\n", row)

# closest news row at or before the max-gap row.
up_to_gap = g.loc[:imax]
prev_news = up_to_gap[up_to_gap["has_news"] == 1].tail(1)
# closest news row at or after the max-gap row.
from_gap = g.loc[imax:]
next_news = from_gap[from_gap["has_news"] == 1].head(1)

news_cols = ["date", "has_news", "news_count", "sent_score_obs"]

print("\nprevious news day:")
display(prev_news[news_cols])

print("\nnext news day:")
display(next_news[news_cols])


In [None]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# root-mean-squared error built on sklearn's mean_squared_error.
def rmse(y_true, y_pred) -> float:
    """Return the square root of ``mean_squared_error(y_true, y_pred)`` as a float."""
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# create next-day targets for modelling.
panel = panel.sort_values(["ticker", "date"]).reset_index(drop=True)
panel["log_adj_close"] = np.log(panel["adj_close"].astype(float))
# diff(-1) is (today - tomorrow); flipping the sign gives tomorrow's log return.
panel["target_nextday_logret"] = panel.groupby("ticker")["log_adj_close"].diff(-1) * -1
panel["target_up"] = (panel["target_nextday_logret"] > 0).astype("int8")
# the last row of every ticker has no "tomorrow".
panel = panel.dropna(subset=["target_nextday_logret"]).copy()

# define a volatility style target.
panel["target_abs_logret"] = panel["target_nextday_logret"].abs()

# create baseline price and volume features.
# NOTE(review): vol_z, sent_decay_z and sent_final_z below are standardized
# with each ticker's FULL-sample mean/std, i.e. statistics that include the
# validation and test periods. That is look-ahead information; consider
# fitting these statistics on the training window only.
panel["ret1"] = panel.groupby("ticker")["adj_close"].pct_change()
panel["vol_z"] = panel.groupby("ticker")["volume"].transform(lambda s: (s - s.mean()) / (s.std() + 1e-9))

# do ticker-wise z score for sentiment.
g = panel.groupby("ticker")["sent_score_decay"]
panel["sent_decay_z"] = (panel["sent_score_decay"] - g.transform("mean")) / (g.transform("std") + 1e-9)

# build a rolling baseline and surprise signal.
panel["sent_z_roll20"] = panel.groupby("ticker")["sent_decay_z"].transform(
    lambda s: s.rolling(20, min_periods=5).mean()
)
panel["sent_surprise"] = panel["sent_decay_z"] - panel["sent_z_roll20"]

# add simple return lags for momentum context.
for lag in [1, 2, 3, 5, 10]:
    panel[f"ret_lag{lag}"] = panel.groupby("ticker")["ret1"].shift(lag)

# add rolling mean and rolling std features.
panel["ret_roll5"] = panel.groupby("ticker")["ret1"].transform(lambda s: s.rolling(5, min_periods=5).mean())
panel["vol_roll5"] = panel.groupby("ticker")["ret1"].transform(lambda s: s.rolling(5, min_periods=5).std())

# optionally cap very long no-news gaps: beyond RESET_AFTER rows without news
# (or before any news at all) the sentiment feature is reset to 0.
RESET_AFTER = 60
panel["sent_score_final"] = panel["sent_score_decay"].where(panel["days_since_news"] <= RESET_AFTER, 0.0)
panel["sent_final_z"] = panel.groupby("ticker")["sent_score_final"].transform(
    lambda s: (s - s.mean()) / (s.std() + 1e-9)
)

# compare feature sets for volatility prediction.
PRICE_FEATS = [
    "ret1", "vol_z", "ret_roll5", "vol_roll5",
    "ret_lag1", "ret_lag2", "ret_lag3", "ret_lag5", "ret_lag10",
]
MISSINGNESS_FEATS = ["has_news", "news_count", "days_since_news"]
feature_sets = {
    "price_only": list(PRICE_FEATS),
    "plus_decay_raw": PRICE_FEATS + ["sent_score_decay"],
    "plus_decay_z": PRICE_FEATS + ["sent_decay_z"],
    "plus_surprise": PRICE_FEATS + ["sent_decay_z", "sent_surprise"],
    "plus_missingness": PRICE_FEATS + ["sent_decay_z", "sent_surprise"] + MISSINGNESS_FEATS,
    "plus_reset_rule": PRICE_FEATS + ["sent_final_z", "sent_surprise"] + MISSINGNESS_FEATS,
}

# drop rows created by lagging and rolling.
# Only the columns the models actually use are checked. A bare panel.dropna()
# would also look at sent_score_obs / sent_conf_obs / sent_abs_obs /
# aligned_date, which are NaN on every trading day WITHOUT news, and would
# therefore discard all no-news days, exactly the rows that the decay and
# missingness features are meant to handle. Using the union over all feature
# sets keeps the sample identical across the compared models.
model_cols = list(dict.fromkeys(col for feats in feature_sets.values() for col in feats))
target_cols = ["target_nextday_logret", "target_abs_logret"]
panel_ml = panel.dropna(subset=model_cols + target_cols).copy()
if panel_ml.empty:
    raise ValueError("No rows left for modelling after dropping incomplete feature rows.")

# split by time to avoid leakage.
dates = panel_ml["date"].sort_values().unique()
cut1 = dates[int(0.70 * len(dates))]
cut2 = dates[int(0.85 * len(dates))]

train = panel_ml[panel_ml["date"] <= cut1].copy()
val   = panel_ml[(panel_ml["date"] > cut1) & (panel_ml["date"] <= cut2)].copy()
test  = panel_ml[panel_ml["date"] > cut2].copy()

print(
    "splits:",
    train["date"].min().date(), "→", train["date"].max().date(),
    "|", val["date"].min().date(), "→", val["date"].max().date(),
    "|", test["date"].min().date(), "→", test["date"].max().date(),
)

results = []
for name, feats in feature_sets.items():
    Xtr, ytr = train[feats], train["target_abs_logret"]
    Xva, yva = val[feats], val["target_abs_logret"]
    Xte, yte = test[feats], test["target_abs_logret"]

    # train a simple ridge regression baseline; the scaler is fitted on the
    # training rows only because it lives inside the pipeline.
    reg = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=1.0, random_state=0)),
    ])
    reg.fit(Xtr, ytr)

    p_va = reg.predict(Xva)
    p_te = reg.predict(Xte)

    results.append({
        "features": name,
        "val_mae": mean_absolute_error(yva, p_va),
        "val_rmse": rmse(yva, p_va),
        "val_r2": r2_score(yva, p_va),
        "test_mae": mean_absolute_error(yte, p_te),
        "test_rmse": rmse(yte, p_te),
        "test_r2": r2_score(yte, p_te),
    })

results_df = pd.DataFrame(results).sort_values("test_rmse")
display(results_df)


In [None]:
# step: xgboost training + metrics (unmodified core logic)

import os
import numpy as np
import shap
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# turn off tokenizers parallelism so forked workers do not emit the warning.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# root-mean-squared error: square root of sklearn's mean squared error.
def rmse(y_true, y_pred) -> float:
    mse = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse))

# feature set whose model is trained and explained in this cell.
best_feats = feature_sets["plus_missingness"]
Xtr, Xte = train[best_feats], test[best_feats]
ytr = train["target_abs_logret"].astype(float).to_numpy()
yte = test["target_abs_logret"].astype(float).to_numpy()

# scalar base score (train target mean) to avoid shap parsing issues.
base = float(np.mean(ytr))

# xgboost regressor for volatility prediction.
xgb_params = dict(
    n_estimators=800,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    base_score=base,
    random_state=0,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(Xtr, ytr)

# test metrics for the fitted model.
pred = xgb_model.predict(Xte)
print("xgb test mae:", mean_absolute_error(yte, pred))
print("xgb test rmse:", rmse(yte, pred))
print("xgb test r2:", r2_score(yte, pred))


# step: xgboost shap explainability
def _shap_summaries(values, frame):
    """Draw the default shap summary plot followed by the bar variant."""
    shap.summary_plot(values, frame)
    shap.summary_plot(values, frame, plot_type="bar")


# tree explainer first; any failure (explainer or plotting) falls back to the
# model-agnostic permutation explainer on a subsample.
try:
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(Xte)
    _shap_summaries(shap_values, Xte)
except Exception as e:
    print("treeexplainer failed, using permutation explainer:", repr(e))

    # small background / explained samples for speed and stability.
    bg = Xtr.sample(n=min(200, len(Xtr)), random_state=0)
    ex = Xte.sample(n=min(500, len(Xte)), random_state=0)

    # NOTE: a later cell checks for `shap_exp` in globals() to detect this path.
    explainer = shap.Explainer(xgb_model.predict, bg, algorithm="permutation")
    shap_exp = explainer(ex)

    _shap_summaries(shap_exp.values, ex)


In [None]:
# step: lstm + transformer imports and reproducibility

import math
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# same rmse helper as in the earlier cell; redefining it here is harmless.
def rmse(y_true, y_pred) -> float:
    squared_error = mean_squared_error(y_true, y_pred)
    root = np.sqrt(squared_error)
    return float(root)

# seed python, numpy and torch rngs with the same value.
seed = 0
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)


# step: sequence building helpers (panel -> (n, t, f))

def _infer_time_col(df):
    if "date" in df.columns:
        return "date"
    if hasattr(df.index, "dtype") and "datetime" in str(df.index.dtype).lower():
        return None  # use index
    return None

def _ensure_sort(df, time_col):
    if time_col is None:
        return df.sort_index()
    return df.sort_values(time_col)

def _ensure_group_col(df):
    if "ticker" in df.columns:
        return "ticker"
    tmp = df.copy()
    tmp["ticker"] = "ALL"
    return "ticker", tmp

def make_sequences(df, feature_cols, target_col, seq_len=20):
    """Turn a panel frame into sliding windows of shape (n, seq_len, n_features).

    Rows are grouped by "ticker" (a single synthetic group is used when the
    column is absent), ordered in time within each group, and every run of
    `seq_len` consecutive rows becomes one sample whose target is the
    `target_col` value of the window's last row. Groups shorter than
    `seq_len` contribute nothing.

    Returns:
        (X, y): float32 arrays; X stacks the windows, y holds one target per window.

    Raises:
        ValueError: if no group is long enough to yield a window.
    """
    time_col = _infer_time_col(df)
    if "ticker" in df.columns:
        group_col, frame = "ticker", df
    else:
        group_col, frame = _ensure_group_col(df)

    windows, targets = [], []
    for _key, grp in frame.groupby(group_col):
        ordered = _ensure_sort(grp, time_col)
        feats = ordered[feature_cols].to_numpy(dtype=np.float32)
        labels = ordered[target_col].to_numpy(dtype=np.float32)

        n_rows = len(ordered)
        if n_rows < seq_len:
            continue

        # `end` is the exclusive end of the window; its last row is end - 1.
        for end in range(seq_len, n_rows + 1):
            windows.append(feats[end - seq_len : end])
            targets.append(labels[end - 1])

    if not windows:
        raise ValueError("No sequences were created. Check seq_len and per-ticker data length.")

    X_all = np.stack(windows, axis=0)
    y_all = np.array(targets, dtype=np.float32)
    return X_all, y_all

def split_val_from_train(train_df, val_frac=0.1):
    """Hold out the most recent rows of each ticker as a validation set.

    For every group the last `round(n * val_frac)` rows (at least one) after
    time-sorting go to validation; the earlier rows stay in training.

    Returns:
        (train_part, val_part): concatenated per-group frames.
    """
    time_col = _infer_time_col(train_df)
    if "ticker" in train_df.columns:
        group_col, frame = "ticker", train_df
    else:
        group_col, frame = _ensure_group_col(train_df)

    kept, held_out = [], []
    for _key, grp in frame.groupby(group_col):
        ordered = _ensure_sort(grp, time_col)
        n_rows = len(ordered)
        n_val = max(1, int(round(n_rows * val_frac)))
        held_out.append(ordered.iloc[-n_val:])
        # a group no larger than its holdout contributes an empty train slice.
        head = ordered.iloc[:-n_val] if n_rows > n_val else ordered.iloc[:0]
        kept.append(head)

    return pd.concat(kept, axis=0), pd.concat(held_out, axis=0)


# step: torch dataset and models (lstm + transformer)

class SeqDataset(Dataset):
    """Tensor-backed dataset pairing each sequence window with its target."""

    def __init__(self, X, y):
        # numpy -> torch without copying; callers pass X as (N, T, F).
        features = torch.from_numpy(X)
        targets = torch.from_numpy(y)
        self.X = features
        # trailing axis so a 1-d target vector becomes (N, 1).
        self.y = targets.unsqueeze(-1)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a small MLP head that outputs one value per sequence.

    The prediction is computed from the LSTM output at the final time step.
    """

    def __init__(self, n_features, hidden=128, layers=2, dropout=0.2):
        super().__init__()
        # inter-layer dropout is only passed to the LSTM when it is stacked.
        recurrent_dropout = 0.0 if layers <= 1 else dropout
        self.lstm = nn.LSTM(
            n_features,
            hidden,
            num_layers=layers,
            batch_first=True,
            dropout=recurrent_dropout,
        )
        half = hidden // 2
        head_layers = [
            nn.LayerNorm(hidden),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x):
        sequence_out, _state = self.lstm(x)
        # batch_first=True, so axis 1 is time; keep only the last step.
        return self.head(sequence_out[:, -1, :])

class SinusoidalPositionalEncoding(nn.Module):
    """Adds fixed sine/cosine positional encodings to a (batch, time, d_model) input.

    Even feature indices hold sin(position * freq) and odd indices hold
    cos(position * freq). The table is precomputed for `max_len` positions and
    stored as a non-trainable buffer `pe` of shape (1, max_len, d_model).

    Args:
        d_model: width of the encoded features (odd values are supported).
        max_len: longest sequence length `forward` accepts.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model, dtype=torch.float32)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = position * div  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # with an odd d_model there is one fewer cosine slot than sine slots, so
        # trim the angle table; assigning it untrimmed raised a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return `x` plus the encodings for its first `x.size(1)` positions.

        Raises:
            ValueError: if the sequence is longer than the precomputed table.
        """
        T = x.size(1)
        if T > self.pe.size(1):
            # fail loudly: slicing would otherwise hand back a shorter table and
            # trigger an opaque broadcast error (or silent broadcasting when
            # the table has a single row).
            raise ValueError(
                f"sequence length {T} exceeds positional encoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :T, :]

class TransformerRegressor(nn.Module):
    """Transformer encoder over a feature sequence with a scalar regression head.

    Inputs are linearly projected to `d_model`, given sinusoidal positional
    encodings, passed through a pre-norm GELU encoder stack, and the encoding
    of the final time step is mapped to a single output value.
    """

    def __init__(self, n_features, d_model=128, nhead=4, num_layers=2, dim_ff=256, dropout=0.1, max_len=512):
        super().__init__()
        self.inp = nn.Linear(n_features, d_model)
        self.pos = SinusoidalPositionalEncoding(d_model, max_len=max_len)

        layer_cfg = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
            norm_first=True,
        )
        self.enc = nn.TransformerEncoder(nn.TransformerEncoderLayer(**layer_cfg), num_layers=num_layers)

        half = d_model // 2
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, half),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half, 1),
        )

    def forward(self, x):
        encoded = self.enc(self.pos(self.inp(x)))
        # batch_first=True, so axis 1 is time; regress from the last step.
        return self.head(encoded[:, -1, :])


# step: training and prediction loops

@torch.no_grad()
def predict_torch(model, loader, device):
    """Run `model` over every batch of `loader` in eval mode, without gradients.

    Returns:
        (y_true, y_pred): flat numpy arrays concatenated in loader order.
    """
    model.eval()
    targets, outputs = [], []
    for features, labels in loader:
        batch_pred = model(features.to(device))
        outputs.append(batch_pred.detach().cpu().numpy().reshape(-1))
        # labels never leave the cpu, so they convert directly.
        targets.append(labels.numpy().reshape(-1))
    y_all = np.concatenate(targets)
    p_all = np.concatenate(outputs)
    return y_all, p_all

def train_model(model, train_loader, val_loader, device, epochs=30, lr=1e-3, weight_decay=1e-4, patience=5):
    """Train `model` with AdamW on MSE loss and early stopping on validation RMSE.

    After each epoch the validation RMSE is computed; an improvement of more
    than 1e-6 snapshots the weights (on cpu) and resets the patience counter.
    Training stops after `patience` consecutive non-improving epochs or
    `epochs` epochs, and the best snapshot (if any) is restored.

    Returns:
        The same `model` object, moved to `device`.
    """
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    best_rmse, best_weights = float("inf"), None
    stale_epochs = 0

    for _ in range(epochs):
        model.train()
        for features, targets in train_loader:
            features, targets = features.to(device), targets.to(device)
            optimizer.zero_grad(set_to_none=True)
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            # clip the global gradient norm before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        y_val, p_val = predict_torch(model, val_loader, device)
        epoch_rmse = rmse(y_val, p_val)

        improved = epoch_rmse < best_rmse - 1e-6
        if not improved:
            stale_epochs += 1
            if stale_epochs >= patience:
                break
            continue

        best_rmse = epoch_rmse
        best_weights = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
        stale_epochs = 0

    if best_weights is not None:
        model.load_state_dict(best_weights)
    return model


# step: build sequences, scale features, and create dataloaders

SEQ_LEN, BATCH, EPOCHS = 20, 512, 30
target_col = "target_abs_logret"

# use the existing validation split when one is defined in the notebook;
# otherwise carve the tail of each ticker's training rows off as validation.
if "val" not in globals():
    train_df, val_df = split_val_from_train(train, val_frac=0.1)
else:
    train_df, val_df = train, val

Xtr_seq, ytr_seq = make_sequences(train_df, best_feats, target_col, seq_len=SEQ_LEN)
Xva_seq, yva_seq = make_sequences(val_df, best_feats, target_col, seq_len=SEQ_LEN)
Xte_seq, yte_seq = make_sequences(test, best_feats, target_col, seq_len=SEQ_LEN)

# fit the scaler on training windows only, flattened to one row per time step.
Ntr, T, F = Xtr_seq.shape
scaler = StandardScaler().fit(Xtr_seq.reshape(-1, F))

def apply_scaler(X):
    """Apply the fitted module-level `scaler` to every time step of a 3-d array.

    The array is flattened to 2-d for the transform, then restored to its
    original three-axis shape and cast to float32.
    """
    n, t, f = X.shape
    flat = X.reshape(n * t, f)
    scaled = scaler.transform(flat)
    return scaled.reshape(n, t, f).astype(np.float32)

# scale all three splits with the train-fitted scaler.
Xtr_seq, Xva_seq, Xte_seq = (apply_scaler(arr) for arr in (Xtr_seq, Xva_seq, Xte_seq))

# only the training loader shuffles; val/test keep a fixed order.
train_ds = SeqDataset(Xtr_seq, ytr_seq)
val_ds = SeqDataset(Xva_seq, yva_seq)
test_ds = SeqDataset(Xte_seq, yte_seq)
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, drop_last=False)
val_loader = DataLoader(val_ds, batch_size=BATCH, shuffle=False, drop_last=False)
test_loader = DataLoader(test_ds, batch_size=BATCH, shuffle=False, drop_last=False)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# step: train and evaluate lstm model

lstm = train_model(
    LSTMRegressor(n_features=F, hidden=128, layers=2, dropout=0.2),
    train_loader,
    val_loader,
    device,
    epochs=EPOCHS,
    lr=1e-3,
    weight_decay=1e-4,
    patience=5,
)
y_true_lstm, y_pred_lstm = predict_torch(lstm, test_loader, device)
print("\nLSTM test mae:", mean_absolute_error(y_true_lstm, y_pred_lstm))
print("LSTM test rmse:", rmse(y_true_lstm, y_pred_lstm))
print("LSTM test r2:", r2_score(y_true_lstm, y_pred_lstm))


# step: train and evaluate transformer model

trf = train_model(
    TransformerRegressor(n_features=F, d_model=128, nhead=4, num_layers=2, dim_ff=256, dropout=0.1, max_len=512),
    train_loader,
    val_loader,
    device,
    epochs=EPOCHS,
    lr=7e-4,
    weight_decay=1e-4,
    patience=5,
)
y_true_trf, y_pred_trf = predict_torch(trf, test_loader, device)
print("\nTransformer test mae:", mean_absolute_error(y_true_trf, y_pred_trf))
print("Transformer test rmse:", rmse(y_true_trf, y_pred_trf))
print("Transformer test r2:", r2_score(y_true_trf, y_pred_trf))


# step: permutation importance for sequence models (lstm + transformer)

def perm_importance_sequence(
    model,
    X_seq,           # (N, T, F) scaled
    y_true,          # (N,)
    feature_names,   # list of len F
    device,
    batch_size=1024,
    n_repeats=3,
    shuffle_over_time=True,
    seed=0,
):
    """
    Permutation importance for a sequence model, measured as the increase in RMSE.

    For each feature index j (one per entry of `feature_names`) the values of
    X_seq[:, :, j] are shuffled `n_repeats` times and the model is re-scored:
      * shuffle_over_time=True  - values are permuted across all samples AND
        time steps (the column is flattened before shuffling);
      * shuffle_over_time=False - whole per-sample time series are permuted
        across samples, keeping each series' internal order.

    Returns:
        (base_rmse, rows): `base_rmse` is the RMSE on the unshuffled data and
        `rows` is a list of (feature, mean_delta_rmse, std_delta_rmse) tuples
        sorted by mean delta, largest first. The std uses ddof=1 and is 0.0
        when n_repeats <= 1.
    """
    rng = np.random.default_rng(seed)

    # the float32 cast of the targets is loop-invariant, so do it once.
    y32 = y_true.astype(np.float32)

    def _score(X):
        loader = DataLoader(SeqDataset(X, y32), batch_size=batch_size, shuffle=False)
        yt, yp = predict_torch(model, loader, device)
        return rmse(yt, yp)

    base_rmse = _score(X_seq)

    deltas = np.zeros((len(feature_names), n_repeats), dtype=np.float64)

    # one working copy for the whole run; each feature's column is restored
    # after its repeats instead of re-copying the full array every iteration.
    Xp = X_seq.copy()
    n_samples, n_steps = Xp.shape[0], Xp.shape[1]

    for j in range(len(feature_names)):
        original_col = X_seq[:, :, j]
        for r in range(n_repeats):
            if shuffle_over_time:
                flat = original_col.reshape(-1)
                perm = rng.permutation(flat.shape[0])
                Xp[:, :, j] = flat[perm].reshape(n_samples, n_steps)
            else:
                perm = rng.permutation(n_samples)
                Xp[:, :, j] = original_col[perm]

            # scoring fully consumes the loader before Xp is touched again.
            deltas[j, r] = _score(Xp) - base_rmse

        Xp[:, :, j] = original_col

    rows = []
    for j, name in enumerate(feature_names):
        rows.append((name, float(deltas[j].mean()), float(deltas[j].std(ddof=1) if n_repeats > 1 else 0.0)))

    rows.sort(key=lambda x: x[1], reverse=True)
    return base_rmse, rows

# compute and print permutation importance for lstm
lstm_base_rmse, lstm_pi = perm_importance_sequence(
    model=lstm,
    X_seq=Xte_seq,
    y_true=yte_seq,
    feature_names=list(best_feats),
    device=device,
    batch_size=1024,
    n_repeats=3,
    shuffle_over_time=True,
    seed=0,
)
print("\nLSTM permutation importance (delta RMSE vs baseline RMSE={:.6f})".format(lstm_base_rmse))
for feat, mean_d, std_d in lstm_pi[:20]:
    # the "+" format flag prints the true sign; a literal "+" prefix rendered
    # negative deltas (shuffling reduced the rmse) as "+-0.000123".
    print(f"  {feat:>30s}  {mean_d:+.6f}  (std {std_d:.6f})")

# compute and print permutation importance for transformer
trf_base_rmse, trf_pi = perm_importance_sequence(
    model=trf,
    X_seq=Xte_seq,
    y_true=yte_seq,
    feature_names=list(best_feats),
    device=device,
    batch_size=1024,
    n_repeats=3,
    shuffle_over_time=True,
    seed=1,
)
print("\nTransformer permutation importance (delta RMSE vs baseline RMSE={:.6f})".format(trf_base_rmse))
for feat, mean_d, std_d in trf_pi[:20]:
    print(f"  {feat:>30s}  {mean_d:+.6f}  (std {std_d:.6f})")


In [None]:
import numpy as np
import pandas as pd

# pick whichever shap output the xgboost cell produced: `shap_exp` exists only
# if the permutation-explainer fallback ran.
use_perm = "shap_exp" in globals()
if use_perm:
    X_show, sv = ex.copy(), shap_exp.values
else:
    X_show, sv = Xte.copy(), shap_values

# column position of the feature whose shap values are analysed.
j = X_show.columns.tolist().index("sent_surprise")

# attach the shap column, then bucket rows by vol_z terciles and by
# sent_surprise quantiles (duplicate bin edges are dropped).
df = X_show.assign(
    shap_sent=sv[:, j],
    vol_bucket=lambda d: pd.qcut(d["vol_z"], q=3, labels=["low_volz","mid_volz","high_volz"]),
    sent_bin=lambda d: pd.qcut(d["sent_surprise"], q=8, duplicates="drop"),
)

# mean shap value per (vol bucket, sentiment bin) cell.
grouped_shap = df.groupby(["vol_bucket","sent_bin"], observed=True)["shap_sent"]
out = grouped_shap.mean().reset_index()

display(out)




In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _interval_midpoint(iv):
    """Midpoint of a pandas Interval as a plain float."""
    return float(iv.mid)


df_plot = out.copy()

# numeric x coordinate for each sentiment bin.
df_plot["sent_mid"] = df_plot["sent_bin"].apply(_interval_midpoint)

# fixed low -> mid -> high ordering for the buckets.
order = ["low_volz", "mid_volz", "high_volz"]
df_plot["vol_bucket"] = pd.Categorical(df_plot["vol_bucket"], categories=order, ordered=True)
df_plot = df_plot.sort_values(["vol_bucket", "sent_mid"])

# one line per bucket on a shared axis.
fig, ax = plt.subplots(figsize=(8, 4))
for vb in order:
    sub = df_plot[df_plot["vol_bucket"] == vb]
    ax.plot(sub["sent_mid"], sub["shap_sent"], marker="o", label=vb)

ax.axhline(0, linewidth=1)
ax.set_title("line plot: sentiment surprise vs shap impact, split by vol_z")
ax.set_xlabel("sent_surprise (bin midpoint)")
ax.set_ylabel("mean shap(sent_surprise)")
fig.tight_layout()
ax.legend()
plt.show()



In [None]:
import numpy as np

# least-squares slope of mean shap(sent_surprise) against the sentiment bin
# midpoint, fitted separately inside each vol_z bucket.
slopes = []
for vb in ["low_volz", "mid_volz", "high_volz"]:
    sub = df_plot[df_plot["vol_bucket"] == vb]
    x = sub["sent_mid"].astype(float).to_numpy()
    y = sub["shap_sent"].astype(float).to_numpy()
    if len(x) < 2 or np.ptp(x) == 0:
        # a line needs at least two distinct x values; np.polyfit raises on an
        # empty bucket and returns an ill-conditioned fit for a single point,
        # so report the slope as undefined instead.
        m = float("nan")
    else:
        m = float(np.polyfit(x, y, 1)[0])
    slopes.append({"vol_bucket": vb, "slope_shap_per_sent": m})

slopes_df = pd.DataFrame(slopes)
display(slopes_df)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# plot the slopes computed from the shap table when they exist. The literal
# values are a snapshot from an earlier run and are used only as a fallback so
# this cell still renders on its own; previously they unconditionally replaced
# the freshly computed `slopes_df`, so the figure could disagree with the data.
if "slopes_df" not in globals():
    slopes_df = pd.DataFrame({
        "vol_bucket": ["low_volz","mid_volz","high_volz"],
        "slope_shap_per_sent": [-0.000144, -0.000263, -0.000229]
    })

plt.figure(figsize=(6,3))
plt.plot(slopes_df["vol_bucket"], slopes_df["slope_shap_per_sent"], marker="o")
plt.axhline(0, linewidth=1)
plt.title("line plot: slope of shap(sent_surprise) by volume bucket")
plt.ylabel("slope (shap per unit sentiment)")
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# rmse = square root of the mean squared error.
def rmse(y_true, y_pred) -> float:
    mse_value = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mse_value))

# fit xgb and return metrics.
def fit_eval_xgb(Xtr, ytr, Xte, yte, seed=0):
    """Train an XGBoost regressor on (Xtr, ytr) and score it on (Xte, yte).

    The base score is the mean of `ytr`; `seed` is passed as the model's
    random_state.

    Returns:
        dict with keys "mae", "rmse" and "r2" computed on the test split.
    """
    hyperparams = dict(
        n_estimators=800,
        max_depth=4,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        base_score=float(np.mean(ytr)),
        random_state=seed,
    )
    booster = xgb.XGBRegressor(**hyperparams)
    booster.fit(Xtr, ytr)
    y_hat = booster.predict(Xte)

    metrics = {}
    metrics["mae"] = mean_absolute_error(yte, y_hat)
    metrics["rmse"] = rmse(yte, y_hat)
    metrics["r2"] = r2_score(yte, y_hat)
    return metrics

def _shuffled_within_ticker(frame, col, gen):
    """Return `col` with its values permuted independently inside each ticker group."""
    return frame.groupby("ticker")[col].transform(lambda s: gen.permutation(s.to_numpy()))


# pick your best feature set.
best_feats = feature_sets["plus_missingness"]

Xtr, Xte = train[best_feats].copy(), test[best_feats].copy()
ytr = train["target_abs_logret"].astype(float).to_numpy()
yte = test["target_abs_logret"].astype(float).to_numpy()

# real model performance.
real_metrics = fit_eval_xgb(Xtr, ytr, Xte, yte, seed=0)
print("real:", real_metrics)

# placebo: shuffle every feature whose name contains "sent" within each ticker,
# so the values no longer line up with their original rows.
sent_cols = [c for c in best_feats if "sent" in c]
print("shuffling sentiment cols:", sent_cols)

train_pl, test_pl = train.copy(), test.copy()

rng = np.random.default_rng(0)
for c in sent_cols:
    # train first, then test, for each column (keeps the rng draw order).
    train_pl[c] = _shuffled_within_ticker(train_pl, c, rng)
    test_pl[c] = _shuffled_within_ticker(test_pl, c, rng)

Xtr_pl = train_pl[best_feats].copy()
Xte_pl = test_pl[best_feats].copy()

placebo_metrics = fit_eval_xgb(Xtr_pl, ytr, Xte_pl, yte, seed=0)
print("placebo:", placebo_metrics)

# repeat the placebo with 30 different seeds to get a small null distribution;
# the same seed drives both the shuffle and the model.
placebo_runs = []
for k in range(30):
    rng = np.random.default_rng(100 + k)
    trp, tep = train.copy(), test.copy()
    for c in sent_cols:
        trp[c] = _shuffled_within_ticker(trp, c, rng)
        tep[c] = _shuffled_within_ticker(tep, c, rng)

    m = fit_eval_xgb(trp[best_feats], ytr, tep[best_feats], yte, seed=100 + k)
    m["run"] = k
    placebo_runs.append(m)

placebo_df = pd.DataFrame(placebo_runs)
display(placebo_df.describe())

# compare real to placebo in a simple table.
placebo_scores = placebo_df[["mae", "rmse", "r2"]]
summary = pd.DataFrame([
    {"setting": "real", **real_metrics},
    {"setting": "placebo_mean", **placebo_scores.mean().to_dict()},
    {"setting": "placebo_std", **placebo_scores.std().to_dict()},
])
display(summary)


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# rmse helper: sqrt of sklearn's mean squared error, returned as a python float.
def rmse(y_true, y_pred) -> float:
    mean_sq = mean_squared_error(y_true, y_pred)
    return float(np.sqrt(mean_sq))

# rolling window evaluation across time.
def rolling_eval(panel_ml: pd.DataFrame, feats, target_col="target_abs_logret",
                 train_years=8, test_months=6, min_train_rows=2000, seed=0,
                 min_test_rows=500):
    """Expanding-window evaluation of an XGBoost regressor over time.

    The first cut-off is `train_years` after the earliest date. At each
    cut-off the model is trained on all rows strictly before it and tested on
    the following `test_months` months (cut-off inclusive, window end
    exclusive); the cut-off then advances by `test_months`. Windows with fewer
    than `min_train_rows` training rows or `min_test_rows` test rows are skipped.

    Args:
        panel_ml: frame with a "date" column, the `feats` columns and `target_col`.
        feats: feature column names.
        target_col: regression target column.
        train_years: history required before the first test window.
        test_months: length of each test window and step between windows.
        min_train_rows: minimum training rows for a window to be evaluated.
        seed: random_state passed to the model.
        min_test_rows: minimum test rows for a window to be evaluated
            (previously a hard-coded 500).

    Returns:
        DataFrame with one row per evaluated window and the columns
        train_end, test_end, train_rows, test_rows, mae, rmse, r2. The columns
        are present even when no window qualifies, so downstream merges and
        column selections keep working on an empty result.
    """
    out_cols = ["train_end", "test_end", "train_rows", "test_rows", "mae", "rmse", "r2"]

    df = panel_ml.sort_values("date").copy()
    # normalize to midnight so window boundaries compare on whole days.
    df["date"] = pd.to_datetime(df["date"]).dt.floor("D")

    start = df["date"].min()
    end = df["date"].max()
    step = pd.DateOffset(months=test_months)

    # start after we have enough training history.
    cursor = start + pd.DateOffset(years=train_years)
    rows = []

    while cursor + step <= end:
        train_end = cursor
        test_end = cursor + step
        # advance now so every path below (including skips) moves forward.
        cursor = test_end

        tr = df[df["date"] < train_end]
        te = df[(df["date"] >= train_end) & (df["date"] < test_end)]

        if len(tr) < min_train_rows or len(te) < min_test_rows:
            continue

        Xtr = tr[feats]
        ytr = tr[target_col].astype(float).to_numpy()
        Xte = te[feats]
        yte = te[target_col].astype(float).to_numpy()

        # scalar base score: mean of this window's training target.
        base = float(np.mean(ytr))
        model = xgb.XGBRegressor(
            n_estimators=800,
            max_depth=4,
            learning_rate=0.03,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            base_score=base,
            random_state=seed,
        )
        model.fit(Xtr, ytr)
        pred = model.predict(Xte)

        rows.append({
            "train_end": train_end.date(),
            "test_end": test_end.date(),
            "train_rows": len(tr),
            "test_rows": len(te),
            "mae": mean_absolute_error(yte, pred),
            "rmse": rmse(yte, pred),
            "r2": r2_score(yte, pred),
        })

    return pd.DataFrame(rows, columns=out_cols)

# rolling evaluation for the two competing feature sets.
feats_price = feature_sets["price_only"]
feats_full = feature_sets["plus_missingness"]

roll_price = rolling_eval(panel_ml, feats_price, seed=0)
roll_full = rolling_eval(panel_ml, feats_full, seed=0)

# tag each result table with its feature-set name before stacking them.
roll_price["model"] = "price_only"
roll_full["model"] = "plus_missingness"
roll = pd.concat([roll_price, roll_full], ignore_index=True)

display(roll.head(10))
per_model = roll.groupby("model")[["mae","rmse","r2"]]
display(per_model.agg(["mean","median","std"]))

# per-window deltas (full minus price), aligned on the window boundaries.
merged = (
    roll_price
    .merge(roll_full, on=["train_end","test_end"], suffixes=("_price","_full"))
    .assign(
        rmse_delta_full_minus_price=lambda d: d["rmse_full"] - d["rmse_price"],
        mae_delta_full_minus_price=lambda d: d["mae_full"] - d["mae_price"],
        r2_delta_full_minus_price=lambda d: d["r2_full"] - d["r2_price"],
    )
)

delta_view_cols = [
    "train_end","test_end","rmse_price","rmse_full","rmse_delta_full_minus_price",
    "mae_price","mae_full","mae_delta_full_minus_price",
    "r2_price","r2_full","r2_delta_full_minus_price",
]
display(merged[delta_view_cols].head(12))

rmse_delta = merged["rmse_delta_full_minus_price"]
print("delta rmse mean:", rmse_delta.mean())
print("delta rmse median:", rmse_delta.median())
