# Bias Response Curve Experiment

This notebook measures how adding a gender steering vector to GPT-2 Small’s hidden state changes its output.

- Loads GPT-2 Small via TransformerLens
- Computes a gender bias steering vector (" he" vs " she") at a chosen layer/hook site
- Sweeps α and injects at inference; reports logit differences Δ = logit(" he") − logit(" she")
- Compares to random and orthogonal directions (controls)
- Uses consistent BOS/tokenization and auto-detects decision position t\*
- Saves plot and prints basic stats


In [None]:
# Determinism env (set before Torch ops for cuBLAS determinism)
import os

os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

import numpy as np
import torch
import matplotlib.pyplot as plt
from transformer_lens import HookedTransformer

# Repro + determinism: seed NumPy and PyTorch, then request deterministic kernels.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
try:
    torch.use_deterministic_algorithms(True)
except Exception as err:
    # Still best effort (the run continues), but report the failure so that a
    # non-deterministic run is not mistaken for a deterministic one.
    print("Deterministic algorithms not enabled:", err)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Prefer CUDA when available; otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load GPT-2 Small, move it to the selected device, and switch to eval mode.
model = HookedTransformer.from_pretrained("gpt2-small").to(device).eval()

## Define Prompts and READ/WRITE Layer

- Shared prefix with trailing space; compute t\* from tokens (with BOS)
- Target IDs from leading-space forms
- Inject at `blocks.{INJECT_LAYER}.hook_resid_mid`; read final `blocks.{L-1}.hook_resid_post`
- Narrow alpha sweep near linear regime


In [None]:
# Tokenization convention for the main prefix (BOS token prepended when True).
PREPEND_BOS = True
prefix = "The doctor said that "

# Tokenize the main prefix once; t* is the index of its last token.
toks_prefix = model.to_tokens(prefix, prepend_bos=PREPEND_BOS).to(device)
T_STAR = toks_prefix.shape[1] - 1  # last token index for this input

# Target token IDs from the leading-space forms. to_single_token rejects a
# string that is not exactly one token, instead of silently keeping only the
# first sub-token as `to_tokens(...)[0, 0]` would.
he_id = int(model.to_single_token(" he"))
she_id = int(model.to_single_token(" she"))

# Inject early in the network, read at the output of the last block.
INJECT_LAYER = 3
READ_LAYER = model.cfg.n_layers - 1
INJECT_HOOK_NAME = f"blocks.{INJECT_LAYER}.hook_resid_mid"
READ_HOOK_NAME = f"blocks.{READ_LAYER}.hook_resid_post"

# Symmetric alpha grid around 0 (steering strength multipliers).
alphas = np.array([-0.5, -0.25, -0.125, -0.0625, 0, 0.0625, 0.125, 0.25, 0.5], dtype=float)

### Clean logit lens sanity check (final input token t\*)

- Clean logit lens = take the hidden state, apply the model’s final LayerNorm, then unembed to get next-token logits.
- t\* = last token of the input (position where next-token logits are predicted).
- From cache at read site: apply ln_final → unembed → get clean-lens logits.
- Compare with model’s true head logits at t\*.
- Pass if max(|lens − real|) < 1e-5.


In [None]:
@torch.no_grad()
def lens_logits_at_tstar(tokens, read_hook_name=READ_HOOK_NAME, apply_ln_final=True):
    """Return clean-lens logits at the last position of ``tokens``.

    A forward hook on ``read_hook_name`` copies the activation while the model
    runs with ``stop_at_layer=READ_LAYER + 1``. The copy is sliced to the last
    sequence position, optionally passed through ``model.ln_final``, and then
    through ``model.unembed``.

    Args:
        tokens: Token tensor; its second dimension is used as sequence length.
        read_hook_name: Name of the hook point whose activation is read.
        apply_ln_final: When True, apply ``model.ln_final`` before unembedding.

    Returns:
        The unembedded values for batch element 0 at the last position.
    """
    store = {}

    def _capture(act, hook):
        # Copy so later computation cannot alter what was read.
        store["resid"] = act.detach().clone()
        return act

    model.run_with_hooks(
        tokens,
        return_type=None,
        stop_at_layer=READ_LAYER + 1,
        fwd_hooks=[(read_hook_name, _capture)],
    )

    # Keep the position axis (length 1) so ln_final/unembed see a 3-D input.
    last_pos = store["resid"][:, -1:, :]
    lens_input = model.ln_final(last_pos) if apply_ln_final else last_pos
    return model.unembed(lens_input)[0, 0, :]


@torch.no_grad()
def head_logits_at_tstar(tokens):
    """Return the model's own logits for batch element 0 at the last position of ``tokens``."""
    full_logits = model(tokens, return_type="logits")
    # The last index along the sequence axis is t* for this input.
    return full_logits[0, -1, :]


# Logits at t* via two routes: the clean lens on the read-site residual, and
# the model's regular forward pass.
logits_lens = lens_logits_at_tstar(toks_prefix)
logits_head = head_logits_at_tstar(toks_prefix)

# Largest element-wise disagreement between the two routes.
max_abs_diff = torch.max(torch.abs(logits_lens - logits_head)).item()
print({"lens_vs_head_max_abs_diff": max_abs_diff})

# Sanity check: both routes must agree to within 1e-5.
assert max_abs_diff < 1e-5

## Get Activations for "He" and "She" at the Chosen Layer


In [None]:
# Prompt prefixes over which the contrastive " he" / " she" residual
# difference is accumulated. Each one ends with a trailing space.
PREFIX_LIST = [
    "The doctor said that ",
    "The nurse mentioned that ",
    "The engineer argued that ",
    "The teacher noted that ",
    "The manager reported that ",
    "The journalist said that ",
    "The lawyer stated that ",
    "The chef remarked that ",
    "The professor explained that ",
    "The scientist observed that ",
]


@torch.no_grad()
def residual_first_cont_token(prefix_text, continuation, layer, site="hook_resid_mid"):
    """
    Return residual at the FIRST token of the continuation (i.e., pronoun),
    at the given layer/site.

    Args:
        prefix_text: Prompt text that precedes the continuation.
        continuation: Text appended to ``prefix_text`` (e.g. " he").
        layer: Block index whose hook point is read.
        site: Hook point name inside the block.

    Returns:
        A cloned slice of the hooked activation for batch element 0 at the
        first continuation position.

    Raises:
        ValueError: If tokenizing ``prefix_text + continuation`` adds no token
            after the prefix, or does not start with the tokens of
            ``prefix_text`` alone. In either case the computed position would
            not point at the continuation.
    """
    toks_prefix_only = model.to_tokens(prefix_text, prepend_bos=True).to(device)
    pos_cont0 = toks_prefix_only.shape[1]  # index of first continuation token (t* + 1)

    toks = model.to_tokens(prefix_text + continuation, prepend_bos=True).to(device)

    # The position arithmetic above is only valid when the tokenizer keeps the
    # prefix tokens intact; a merge across the prefix/continuation boundary
    # would otherwise shift the target position without any error.
    if toks.shape[1] <= pos_cont0 or not torch.equal(toks[:, :pos_cont0], toks_prefix_only):
        raise ValueError(
            f"Tokenization of {prefix_text + continuation!r} does not extend the "
            f"tokenization of {prefix_text!r}; cannot locate the continuation token."
        )

    cache = {}

    def grab(activation, hook):
        cache["resid"] = activation.detach()
        return activation

    # Layers after `layer` are not needed, so stop right after it.
    _ = model.run_with_hooks(
        toks,
        return_type=None,
        stop_at_layer=layer + 1,
        fwd_hooks=[(f"blocks.{layer}.{site}", grab)],
    )

    return cache["resid"][0, pos_cont0, :].clone()

## Compute Bias and Orthogonal Vectors

- Contrastive: average `h(" he") − h(" she")` across prefixes, where `h` is the residual at the pronoun token (position t\*+1); normalize to unit length
- Gradient fallback: local decision-aligned direction
- Controls: random same-norm; orth via Gram–Schmidt


In [None]:
def build_v_bias_contrastive(layer=INJECT_LAYER):
    """Build the contrastive " he" minus " she" direction at ``layer``.

    For every prefix in ``PREFIX_LIST`` the residual at the first continuation
    token is taken for " he" and for " she", and their difference is added to
    a running total. The total is divided by its norm (plus 1e-8).

    Args:
        layer: Block index passed to ``residual_first_cont_token``.

    Returns:
        The normalized accumulated difference vector of size ``d_model``.
    """
    accum = torch.zeros(model.cfg.d_model, device=device)
    for prompt in PREFIX_LIST:
        he_resid = residual_first_cont_token(prompt, " he", layer)
        she_resid = residual_first_cont_token(prompt, " she", layer)
        accum += he_resid - she_resid
    # The epsilon keeps the division finite if the differences cancel to zero.
    return accum / (accum.norm() + 1e-8)

### Build Bias Vectors

- Main bias vector using contrastive method (he vs she differences)
- Random and orthogonal control vectors for comparison


In [None]:
# 1) normalize v_bias
v_bias_raw = build_v_bias_contrastive(INJECT_LAYER)
nb = v_bias_raw.norm()
v_bias = v_bias_raw / (nb + 1e-8)  # unit length; same epsilon guard as the controls below

# 2) orient v_bias so +alpha favors " he"
# The orientation probe needs delta_lens_at_tstar, whose definition appears
# further down in this notebook. Fail with an actionable message instead of a
# bare NameError when this cell is run before that definition.
if "delta_lens_at_tstar" not in globals():
    raise NameError(
        "delta_lens_at_tstar is not defined yet; run its definition "
        "(the 'Measurement function' cell) before orienting v_bias"
    )

eps = 1e-3
d_plus = delta_lens_at_tstar(+eps, v_bias)
d_minus = delta_lens_at_tstar(-eps, v_bias)

if d_plus < d_minus:  # slope is negative → flip
    v_bias = -v_bias

# 3) controls with same norm
v_rand = torch.randn_like(v_bias)
v_rand /= v_rand.norm() + 1e-8
# Gram–Schmidt step: remove the component of v_rand along v_bias, then renormalize.
v_orth = v_rand - (v_rand @ v_bias) * v_bias
v_orth /= v_orth.norm() + 1e-8
# (unit norm → alpha is comparable across bias/random/orth)

## Measurement function (single step at t\*)

- Inject at t\* only; cache read-site at t\*; `ln_final` then `unembed`
- Assert Δ(α=0) equals unmodified clean-lens Δ at t\*


In [None]:
@torch.no_grad()
def delta_lens_at_tstar(alpha, vector, prefix_text=prefix, inject_hook_name=INJECT_HOOK_NAME, read_hook_name=READ_HOOK_NAME):
    """Return logit(" he") − logit(" she") at t* after steering by ``alpha * vector``.

    The prefix is tokenized with the notebook-wide ``PREPEND_BOS`` setting so
    that it matches ``toks_prefix`` and the α=0 consistency check. During one
    hooked forward pass, ``alpha * vector`` is added to the activation at
    ``inject_hook_name`` at the last position only, and the activation at
    ``read_hook_name`` is copied. The copy at the last position goes through
    ``model.ln_final`` and ``model.unembed``.

    Args:
        alpha: Scalar multiplier for ``vector``.
        vector: Direction added to the activation at the inject site.
        prefix_text: Prompt text; its last token position is t*.
        inject_hook_name: Hook point where the vector is added.
        read_hook_name: Hook point whose activation is read.

    Returns:
        The difference between the ``he_id`` and ``she_id`` entries of the
        clean-lens logits, as a Python float.
    """
    # Compute local t* for this prefix
    toks = model.to_tokens(prefix_text, prepend_bos=PREPEND_BOS).to(device)
    local_tstar = toks.shape[1] - 1
    cache = {}

    def steer(act, hook):
        # Only the decision position is modified; other positions are untouched.
        act[:, local_tstar, :] = act[:, local_tstar, :] + alpha * vector
        return act

    def readh(act, hook):
        cache["resid"] = act.detach().clone()
        return act

    _ = model.run_with_hooks(
        toks,
        return_type=None,
        # Run far enough to pass through both the inject and the read layer.
        stop_at_layer=max(INJECT_LAYER, READ_LAYER) + 1,
        fwd_hooks=[(inject_hook_name, steer), (read_hook_name, readh)],
    )
    resid = model.ln_final(cache["resid"][:, local_tstar : local_tstar + 1, :])
    logits = model.unembed(resid)[0, 0, :]
    return float((logits[he_id] - logits[she_id]).item())


# With alpha = 0 the steering hook adds nothing, so the result must match the
# unmodified clean-lens delta at t*.
zero_delta = delta_lens_at_tstar(0.0, v_bias, prefix)

# Run the unmodified lens once and reuse its logits for both target tokens.
_unmod_logits = lens_logits_at_tstar(toks_prefix)
unmod_delta = float((_unmod_logits[he_id] - _unmod_logits[she_id]).item())

print({"zero_alpha_delta": zero_delta, "unmodified_delta": unmod_delta})
assert abs(zero_delta - unmod_delta) < 1e-6

## Sweep Alpha Values and Collect Results

- Sweep on main prefix and a neutral control ("Today ")
- Fit slopes over the four smallest |α| points; expect bias ≠ 0, random ≈ 0


In [None]:
def run_sweep(vectors, alpha_grid, prefix_text):
    """Measure the logit delta for every (alpha, vector) pair on one prefix.

    Args:
        vectors: Mapping from a label to a steering vector.
        alpha_grid: Iterable of alpha values, visited in order.
        prefix_text: Prompt passed through to ``delta_lens_at_tstar``.

    Returns:
        A dict with the same labels as ``vectors``; each value is the list of
        deltas in ``alpha_grid`` order.
    """
    series = {label: [] for label in vectors}
    for alpha in alpha_grid:
        strength = float(alpha)
        for label in vectors:
            measured = delta_lens_at_tstar(strength, vectors[label], prefix_text)
            series[label].append(measured)
    return series


# Steering directions under test: the bias vector plus the two controls.
vectors = {"bias": v_bias, "random": v_rand, "orth": v_orth}
results_main = run_sweep(vectors, alphas, prefix)
results_null = run_sweep(vectors, alphas, "Today ")

print("alpha grid:", alphas)
print("ranges main:", {k: (float(np.min(v)), float(np.max(v))) for k, v in results_main.items()})
print("ranges null:", {k: (float(np.min(v)), float(np.max(v))) for k, v in results_null.items()})

# Indices of the four smallest |alpha| values, used for the near-zero slope fit.
# The grid is symmetric, so |alpha| contains ties; a stable sort makes the
# choice among tied points deterministic (lower index first).
fit_sel = np.argsort(np.abs(alphas), kind="stable")[:4]


def slope_near_zero(y):
    """Least-squares slope of ``y`` against alpha over the ``fit_sel`` points.

    Fits ``y ≈ slope * alpha + intercept`` using only the entries of ``alphas``
    and ``y`` selected by the module-level ``fit_sel`` indices.

    Args:
        y: Sequence of measurements aligned with ``alphas``.

    Returns:
        The fitted slope as a Python float.
    """
    x_fit = alphas[fit_sel]
    y_fit = np.array(y)[fit_sel]
    # Design matrix columns: [alpha, 1] → coefficients [slope, intercept].
    design = np.stack([x_fit, np.ones_like(fit_sel, dtype=float)], axis=1)
    solution = np.linalg.lstsq(design, y_fit, rcond=None)[0]
    slope, _intercept = solution
    return float(slope)


# Near-zero slope of each steering direction on the main prefix.
slopes = dict(zip(results_main, map(slope_near_zero, results_main.values())))
print("near-zero slopes (main):", {label: round(slope, 6) for label, slope in slopes.items()})

## Plot Results

- Title includes inject/read sites and t\*
- Save as `brc_gpt2s_injL{...}_readL{...}_tstar.png`
- Print acceptance highlights


In [None]:
plt.style.use("seaborn-v0_8-whitegrid")
fig, ax = plt.subplots(figsize=(9, 6))

# One curve per steering direction; the bias curve is drawn slightly thicker.
colors = {"bias": "#0072B2", "random": "#D55E00", "orth": "#009E73"}
for name in ["bias", "random", "orth"]:
    ax.plot(
        alphas,
        results_main[name],
        label=name,
        color=colors[name],
        linewidth=2.5 if name == "bias" else 2,
        marker="o",
        markersize=3,
    )

ax.axhline(0, color="black", linestyle="--", linewidth=1)
ax.set_xlabel("alpha", fontsize=14)
ax.set_ylabel("Δ_logit = logit(' he') - logit(' she')", fontsize=14)

# Take the site names from the hook-name constants actually used for
# injection and reading, so the title cannot drift from the configuration.
# Hook names have the form "blocks.<layer>.<site>".
_inj_site = INJECT_HOOK_NAME.split(".", 2)[-1]
_read_site = READ_HOOK_NAME.split(".", 2)[-1]
ax.set_title(
    f"BRC (GPT-2 small) | inj L{INJECT_LAYER}:{_inj_site} → read L{READ_LAYER}:{_read_site} | t*={T_STAR}",
    fontsize=15,
    weight="bold",
)
ax.legend(frameon=True, fontsize=11)
ax.tick_params(axis="both", which="major", labelsize=12)

# Footnote below the axes recording the measurement settings.
note = f"clean logit lens (ln_final=on), BOS={PREPEND_BOS}, prefix='{prefix}'"
ax.text(0.01, -0.14, note, transform=ax.transAxes, fontsize=10, color="gray")

plt.tight_layout()
fig_path = f"brc_gpt2s_injL{INJECT_LAYER}_readL{READ_LAYER}_tstar.png"
plt.savefig(fig_path, dpi=300, bbox_inches="tight")
plt.show()

# Acceptance highlights.
print("Saved:", fig_path)
print("lens_vs_head_match:", max_abs_diff < 1e-5)
print("slopes near zero:", {k: round(v, 6) for k, v in slopes.items()})
print("alpha range:", alphas.tolist())