In [1]:
import torch
import pyvene as pv
import pprint

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# pyvene helper that hands back the model config, its tokenizer and the model.
config, tokenizer, model = pv.create_gpt_neo()

# nn.Module.to moves the parameters in place and returns the same module,
# so rebinding `model` keeps it pointing at the identical object.
model = model.to(DEVICE)

# Dump the module tree so the layer layout is visible in the notebook.
pprint.pprint(model)

loaded GPTNeo model roneneldan/TinyStories-33M
GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-3): 4 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072

In [6]:
from typing import Dict

# Build one collection spec per transformer block, starting at layer 1
# (layer 0 is deliberately left out by the range start).
collect_specs = []
for layer_idx in range(1, config.num_layers):
    collect_specs.append(
        {
            "layer": layer_idx,
            "component": "block_output",
            "intervention_type": pv.CollectIntervention,
        }
    )

# Wrap the model so a forward pass returns the activations at those sites.
collect_model = pv.IntervenableModel(collect_specs, model=model)

# Contrast pair used to derive the steering direction.
p_plus, p_minus = " love", " hate"

# Tokenize both prompts as one batch (p_plus first, p_minus second).
# NOTE(review): no padding is requested, so this presumably relies on both
# prompts tokenizing to the same length - confirm if the prompts are changed.
contrast_inputs = tokenizer([p_plus, p_minus], return_tensors="pt").to(DEVICE)

# Run the collecting model at unit location 0 of the base input and keep only
# the collected activations from the returned dict.
collect_outputs = collect_model(
    base=contrast_inputs,
    unit_locations={"base": 0},
    return_dict=True,
)
res = collect_outputs["collected_activations"]

# Sanity checks: shape of the layer-1 collection and the model's head count.
print(res['layer.1.comp.block_output.unit.pos.nunit.1#0'].shape)
print(config.num_heads)

# Steering vector per collected site: entry 0 minus entry 1 along the first
# axis, flattened to 1-D.
# NOTE(review): presumably entry 0 is the p_plus activation and entry 1 the
# p_minus activation (matching the tokenizer batch order) - confirm.
diff: Dict[str, torch.Tensor] = {
    site: torch.reshape(acts[0] - acts[1], (-1,))
    for site, acts in res.items()
}


torch.Size([2, 2, 384])
16


In [7]:
# One additive intervention per transformer block from layer 1 upward:
# the base activation `b` is shifted by 10x the supplied source vector `s`.
actadd_specs = []
for layer_idx in range(1, config.num_layers):
    actadd_specs.append(
        {
            "layer": layer_idx,
            "component": "block_output",
            "intervention": lambda b, s: b + 10 * s,
        }
    )

# Wrap the model so generation can be steered at those sites.
intv_model = pv.IntervenableModel(actadd_specs, model=model)

# ActAdd on prompt (original setting)
prompt_inputs = tokenizer("I hate you because", return_tensors="pt").to(DEVICE)

# `diff` supplies the source representations directly; with
# output_original_output=True the call yields both the unintervened and the
# intervened generations.
generations = intv_model.generate(
    base=prompt_inputs,
    source_representations=diff,
    unit_locations={"sources->base": (0, 3)},
    output_original_output=True,
    max_length=32,
)
orig, intervened = generations

# Show both generations side by side.
# Labels use plain print(): pprint.pprint() on a str writes its repr, which
# wrapped only the first label in quotes and made the two headers inconsistent.
print('Unintervened generation:')
pprint.pprint(tokenizer.batch_decode(orig))

print('Intervened generation:')
# pprint is kept for the decoded text so long strings are wrapped readably.
pprint.pprint(tokenizer.batch_decode(intervened))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Unintervened generation:'
['I hate you because I don\'t want you to be my friend."\n'
 '\n'
 'The little girl was sad and went home. She told her mom what happened and']
Intervened generation:
['I hate you because batch of cookies are not good for you. You are a bad '
 "sister. I don't want to play with you anymore. I want to"]


In [8]:
# ActAdd on decoded region
# intervene_on_prompt=False leaves the prompt untouched; the selector below
# fires the intervention on every 10th decoding step instead.
orig, intervened = intv_model.generate(
    base=tokenizer("I hate you because", return_tensors="pt").to(DEVICE),
    source_representations=diff,
    unit_locations={"sources->base": (0, 3)},
    intervene_on_prompt=False,
    # One selector per registered intervention. The count is derived from the
    # same layer range used to build `intv_model`, rather than a hard-coded 3,
    # so the two cannot drift apart if the model depth changes.
    timestep_selector=[
        (lambda idx, o: idx % 10 == 0) for _ in range(1, config.num_layers)
    ],
    output_original_output=True,
    max_length=50,
)

# Show both generations side by side.
# Labels use plain print(): pprint.pprint() on a str writes its repr, which
# wrapped only the first label in quotes and made the two headers inconsistent.
print('Unintervened generation:')
pprint.pprint(tokenizer.batch_decode(orig))

print('Intervened generation:')
# pprint is kept for the decoded text so long strings are wrapped readably.
pprint.pprint(tokenizer.batch_decode(intervened))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Unintervened generation:'
['I hate you because I don\'t want you to be my friend."\n'
 '\n'
 'The little girl was sad and went home. She told her mom what happened and '
 'her mom said, "Don\'t worry, we can make a new friend tomorrow." The']
Intervened generation:
['I hate you because I don\'t want you to be my friend." batch of cookies and '
 'they both laughed. The end batch of cookies were so delicious that they made '
 'the batch of cookies and they both ate them together. batch of cookies were '
 'so tasty']
