In [1]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

In [3]:
device = torch.device("cpu")

model_name = "gpt2"  # GPT-2 small
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2Model.from_pretrained(model_name).to(device)
model.eval()

hidden_dim = model.config.n_embd  # 768 for GPT-2 small
print("Hidden dim:", hidden_dim)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Hidden dim: 768


In [4]:
tokenizer.pad_token = tokenizer.eos_token

In [6]:
layer_idx = 5

block = model.h[layer_idx]
activations_list = []

def mlp_hook(module, input, output):
    activations_list.append(output.detach().cpu())

# Get the c_proj module of layer 5 MLP
target_module = block.mlp.c_proj

hook_handle = target_module.register_forward_hook(mlp_hook)

In [8]:
sentences = [
    "Mathematics is the queen of sciences.",
    "Sparse autoencoders can discover interpretable features.",
    "Ruth loves French history.",
]

enc = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

input_ids = enc["input_ids"].to(device)
attention_mask = enc["attention_mask"].to(device)

with torch.no_grad():
    _ = model(input_ids=input_ids, attention_mask=attention_mask)


In [9]:
len(activations_list), activations_list[0].shape

(2, torch.Size([3, 12, 768]))

In [11]:
# Concatenate across all forward passes (we only did one here)
all_acts = torch.cat(activations_list, dim=0)  # (batch, seq_len, hidden_dim)

batch_size, seq_len, hidden_dim = all_acts.shape
print("Per-token activation tensor shape:", all_acts.shape)

# Flatten (batch * seq_len, hidden_dim)
per_token_acts = all_acts.reshape(-1, hidden_dim)
print("Flattened per-token activations:", per_token_acts.shape)

Per-token activation tensor shape: torch.Size([6, 12, 768])
Flattened per-token activations: torch.Size([72, 768])


In [13]:
def get_mlp_activations(
    texts,
    model,
    tokenizer,
    layer_idx=5,
    device=None
):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    tokenizer.pad_token = tokenizer.eos_token

    enc = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    acts_list = []

    def hook_fn(module, input, output):
        acts_list.append(output.detach().cpu())

    target_module = model.h[layer_idx].mlp.c_proj
    handle = target_module.register_forward_hook(hook_fn)

    with torch.no_grad():
        _ = model(input_ids=input_ids, attention_mask=attention_mask)

    handle.remove()

    all_acts = torch.cat(acts_list, dim=0)  # (batch, seq_len, hidden_dim)
    per_token = all_acts.reshape(-1, all_acts.shape[-1])  # (N_tokens, hidden_dim)

    return per_token, attention_mask

# Example use:
per_token_acts, attn_mask = get_mlp_activations(
    sentences,
    model,
    tokenizer,
    layer_idx=5,
)

print("Per-token activations:", per_token_acts.shape)


Per-token activations: torch.Size([36, 768])
