In [5]:
!git clone https://github.com/ruth1445/interpretability.git


Cloning into 'interpretability'...
remote: Enumerating objects: 637, done.
remote: Counting objects: 100% (153/153), done.
remote: Compressing objects: 100% (131/131), done.
remote: Total 637 (delta 26), reused 121 (delta 11), pack-reused 484 (from 2)
Receiving objects: 100% (637/637), 590.14 MiB | 19.81 MiB/s, done.
Resolving deltas: 100% (70/70), done.


In [6]:
import os

# Work from the cloned repository so that relative paths used later
# (for example the corpus file under data/) resolve against it.
repo_root = "/content/interpretability"
os.chdir(repo_root)

In [7]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

'/content/interpretability'

In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_name = "gpt2"

# Use the end-of-sequence token as the padding token so the tokenizer can pad batches.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights, move them to the chosen device and
# put the network in evaluation mode.
model = GPT2Model.from_pretrained(model_name)
model = model.to(device)
model.eval()

def get_mlp_activations(texts, model, tokenizer, layer_idx=5, device=None):
    """Collect one block's MLP output for every non-padding token in ``texts``.

    A forward hook is attached to ``h[layer_idx].mlp.c_proj``, the batch is run
    through ``model`` once with gradients disabled, and the captured tensor is
    flattened to one row per real (non-padding) token.

    Args:
        texts: Text or list of texts, forwarded to ``tokenizer`` with padding
            and truncation enabled.
        model: Module exposing its blocks either directly as ``model.h`` or
            nested as ``model.transformer.h``. It is called with
            ``input_ids`` and ``attention_mask`` keyword arguments.
        tokenizer: Callable returning a mapping that contains ``"input_ids"``
            and ``"attention_mask"`` tensors.
        layer_idx: Index of the block whose MLP projection is hooked.
        device: Device the inputs are moved to. When ``None``, the device of
            the model's first parameter is used so inputs and weights always
            agree; if the model has no parameters, CUDA is used when
            available, else the CPU.

    Returns:
        A CPU tensor of shape ``(n_valid_tokens, hidden_dim)`` holding the
        hooked module's output for every position whose attention mask is
        non-zero, in batch-major order.

    Raises:
        Whatever the tokenizer or the model's forward pass raises. The hook is
        removed in every case.
    """
    if device is None:
        # Follow the model rather than guessing: sending inputs to CUDA while
        # the weights live on the CPU (or vice versa) fails in the forward pass.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    captured = []

    def hook_fn(module, inputs, output):
        # Detach and move to the CPU immediately so no graph or GPU memory is kept.
        captured.append(output.detach().cpu())

    # Bare models expose the blocks as `model.h`; wrappers nest the bare
    # model under `.transformer`. Support both layouts.
    blocks = getattr(model, "transformer", model).h
    target_module = blocks[layer_idx].mlp.c_proj
    handle = target_module.register_forward_hook(hook_fn)

    try:
        with torch.no_grad():
            model(input_ids=input_ids, attention_mask=attention_mask)
    finally:
        # Always detach the hook, even if the forward pass fails; a leaked hook
        # would keep firing on every later call that touches this module.
        handle.remove()

    all_acts = torch.cat(captured, dim=0)                   # (batch, seq_len, hidden_dim)
    per_token = all_acts.reshape(-1, all_acts.shape[-1])    # (batch * seq_len, hidden_dim)
    flat_mask = attention_mask.reshape(-1).cpu().bool()     # (batch * seq_len,)

    # Keep only the rows that correspond to real (non-padding) tokens.
    return per_token[flat_mask]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [4]:
import os

# Show which directory the notebook is currently running from.
print(os.getcwd())

/content


In [9]:
from pathlib import Path

corpus_path = Path("data/life3_0.txt")

# Keep every non-blank line of the corpus, with surrounding whitespace removed.
lines = []
with corpus_path.open("r", encoding="utf-8") as f:
    for raw_line in f:
        text = raw_line.strip()
        if text:
            lines.append(text)

print("Total lines in corpus:", len(lines))

Total lines in corpus: 9


In [10]:
import math

batch_size = 8
layer_idx = 5
target_tokens = 100_000          # stop once roughly this many vectors are collected

all_acts = []
total_tokens = 0

# Walk the corpus in fixed-size batches of lines, accumulating per-token
# activations until the corpus is exhausted or the token budget is reached.
for batch_no, start in enumerate(range(0, len(lines), batch_size)):
    batch = lines[start:start + batch_size]

    # acts has shape (N_batch_tokens, hidden_dim)
    acts = get_mlp_activations(
        batch,
        model=model,
        tokenizer=tokenizer,
        layer_idx=layer_idx,
        device=device,
    )

    all_acts.append(acts)
    total_tokens += acts.shape[0]

    if total_tokens >= target_tokens:
        print(f"Reached target: {total_tokens} tokens.")
        break

    # Report progress on the first batch and then every 50th batch.
    if batch_no % 50 == 0:
        print(f"Processed {start + len(batch)} lines, tokens so far: {total_tokens}")

# Stack the per-batch results into a single (N_tokens, hidden_dim) tensor.
all_acts_tensor = torch.cat(all_acts, dim=0)
print("Final activations shape:", all_acts_tensor.shape)


Processed 8 lines, tokens so far: 337
Final activations shape: torch.Size([381, 768])


In [11]:
# The output directory sits next to the current working directory; create it if missing.
save_dir = Path("../activations")
save_dir.mkdir(parents=True, exist_ok=True)

# The file name records which layer the activations were taken from.
save_path = save_dir.joinpath(f"layer{layer_idx}_gpt2_activations.pt")
torch.save(all_acts_tensor, save_path)

print("Saved activations to:", save_path)

Saved activations to: ../activations/layer5_gpt2_activations.pt
