In [5]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pyvene as pv
import matplotlib.pyplot as plt
device = torch.device("cpu")

# Set a random seed so later our outputs dont keep changing
torch.manual_seed(42)

<torch._C.Generator at 0x1228c62d0>

In [7]:
from pyvene import IntervenableModel

# get tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load gpt-2
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
model.eval() 

# wrap it in pyvene so we can analyze it later
empty_config = {}
pv_model = IntervenableModel(empty_config, model)



In [8]:

"""
i want to investigate which neurons are responsible for subject verb agreement
so just adding in some simple test cases for that here. 
"""



agreement_sentences = [
    ("The cat", " is"),
    ("The cats", " are"),
    ("The boy", " is"),
    ("The boys", " are"),
    ("The dog", " is"),
    ("The dogs", " are")
]

print("Prompts for subject verb agreement:")
for prompt, correct in agreement_sentences:
    print(f"  {prompt!r} → should prefer {correct!r}")

Prompts for subject verb agreement:
  'The cat' → should prefer ' is'
  'The cats' → should prefer ' are'
  'The boy' → should prefer ' is'
  'The boys' → should prefer ' are'
  'The dog' → should prefer ' is'
  'The dogs' → should prefer ' are'


In [11]:
# Get the baseline so we can compare to after we introduce noise
baseline_probs = []

for prompt, correct in agreement_sentences:
    # tokenize the prompt + a space if needed
    text = prompt + " "
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # run forward pass (no intervention yet!)
    with torch.no_grad():
        outputs = pv_model.model(**inputs)

    logits = outputs.logits[0]  # remove batch dim since it's always 1

    # get ids for both the correct and incorrect next tokens
    # correct is like " is" and incorrect would be the opposite
    correct_id = tokenizer.encode(correct.strip(), add_special_tokens=False)[0]

    # we pick one obvious incorrect opposite: if correct is "is", pick "are" and vice versa
    wrong = " are" if correct.strip() == "is" else " is"
    wrong_id = tokenizer.encode(wrong.strip(), add_special_tokens=False)[0]

    # probability of correct vs wrong using softmax
    probs = torch.softmax(logits[-1], dim=-1)
    correct_prob = probs[correct_id].item()
    wrong_prob = probs[wrong_id].item()

    baseline_probs.append((prompt, correct, correct_prob, wrong_prob))

    # i need to print to more decimals to see which option the model prefers, because it obviously
    # spread its probability among many thousands of possible next tokens
    print(f"{prompt!r}: P({correct.strip()}) = {correct_prob:.6f}, "
      f"P({wrong.strip()}) = {wrong_prob:.6f}")

'The cat': P(is) = 0.000081, P(are) = 0.000003
'The cats': P(are) = 0.000011, P(is) = 0.000026
'The boy': P(is) = 0.000070, P(are) = 0.000002
'The boys': P(are) = 0.000004, P(is) = 0.000028
'The dog': P(is) = 0.000078, P(are) = 0.000002
'The dogs': P(are) = 0.000008, P(is) = 0.000024


In [13]:
# We can now zero out one neuron and see how this affects the model's behavior

layer_name = "transformer.h.0.mlp.c_fc"  # first MLP in GPT-2
neuron_index = 42  # arbitrary neuron to intervene on

# This function will be applied to the activations at the chosen layer
def zero_one_neuron(tensor, _):
    # tensor is the activation output of the layer
    tensor[..., neuron_index] = 0.0
    return tensor

# Define the intervention using PyVene's Intervention object
intervention = pv.Intervention(
    target=layer_name,
    intervention_fn=zero_one_neuron,
    rep_type="output"  # intervene on the output of the module
)

print(f"Intervention ready: zeroing neuron {neuron_index} in {layer_name}")

Intervention ready: zeroing neuron 42 in transformer.h.0.mlp.c_fc
