# Ablating Token Induction Heads
In Section 4.2 of the paper, we show that ablating token induction heads leads to paraphrasing behavior in model generation. This is the notebook we used to generate the examples shown there.

In [6]:
import math
import random 
import numpy as np 
import torch 
import json 
from nnsight import LanguageModel
from utils import get_mean_head_values

# Seed Python, PyTorch and NumPy with the same value so reruns start from
# identical RNG state.
SEED = 10
for seed_fn in (random.seed, torch.manual_seed, np.random.seed):
    seed_fn(SEED)

In [3]:
# Hugging Face checkpoint to load. To switch models, replace the id below with
# one of these alternatives:
#   'meta-llama/Meta-Llama-3-8B'
#   'allenai/OLMo-2-1124-7B'
#   'EleutherAI/pythia-6.9b'
full_model = 'meta-llama/Llama-2-7b-hf'

# Checkpoint id with the organisation prefix stripped; used to build cache paths.
model_name = full_model.rsplit('/', 1)[-1]
model = LanguageModel(full_model, dispatch=True, device_map='cuda')

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.26s/it]


In [8]:
# Read the cached token-copying head list for the selected model.
token_heads_path = f'../cache/head_orderings/{model_name}/token_copying.json'
with open(token_heads_path) as heads_file:
    token_heads = json.load(heads_file)

In [10]:
# Per-head mean values for this model, later indexed as head_means[layer, head]
# and written over ablated heads. NOTE(review): the exact shape/dtype comes from
# utils.get_mean_head_values, which is not visible here -- confirm.
head_means = get_mean_head_values(model_name)

In [11]:
def ablated_generation(model, sequences, heads_to_ablate, head_means, max_toks=10):
    """Generate from `model` while mean-ablating the given attention heads.

    For every (layer, head) pair in `heads_to_ablate`, the slice of the
    attention output-projection *input* that belongs to that head is
    overwritten, at every sequence position, with `head_means[layer, head]`.
    Generation then continues from the patched activations.

    Args:
        model: nnsight `LanguageModel`. GPT-NeoX models (config.model_type ==
            'gpt_neox') are addressed via `model.gpt_neox.layers[i].attention.dense`;
            every other model via `model.model.layers[i].self_attn.o_proj`.
        sequences: prompts, passed straight through to `model.generate`.
        heads_to_ablate: iterable of (layer, head) pairs. May be empty, in
            which case no activations are patched.
        head_means: indexable as `head_means[layer, head]`; each entry must
            support `.cuda()` and broadcast into a `[bsz, seq_len, head_dim]`
            slice. (Presumably a tensor of per-head means -- confirm against
            `get_mean_head_values`.)
        max_toks: forwarded to `model.generate` as `max_new_tokens`.

    Returns:
        The saved `model.generator.output`, detached and moved to the CPU.
    """
    config = model.config
    n_heads = config.num_attention_heads
    hidden_size = config.hidden_size
    head_dim = hidden_size // n_heads

    # Choose the module layout from the model's own config rather than from a
    # notebook-level `model_name` global, so the function only depends on its
    # arguments.
    is_gpt_neox = getattr(config, 'model_type', None) == 'gpt_neox'

    # Group the heads by layer once (preserving their given order) instead of
    # re-scanning the full list for every layer.
    heads_by_layer = {}
    for layer, head in heads_to_ablate:
        heads_by_layer.setdefault(layer, []).append(head)

    with torch.no_grad():
        with model.generate(sequences, max_new_tokens=max_toks):
            # NOTE(review): nnsight's `.all()` is expected to apply the
            # interventions below at every generation step -- confirm against
            # the installed nnsight version.
            model.all()
            # Layers are visited in ascending order.
            for curr_layer in sorted(heads_by_layer):
                if is_gpt_neox:
                    o_proj = model.gpt_neox.layers[curr_layer].attention.dense
                else:
                    o_proj = model.model.layers[curr_layer].self_attn.o_proj

                # First positional input of the output projection:
                # [bsz, seq_len, model_dim]
                o_proj_inp = o_proj.inputs[0][0]

                # Split the model dimension into per-head chunks for ALL
                # positions: [bsz, seq_len, model_dim] -> [bsz, seq_len, n_heads, head_dim]
                bsz, seq_len = o_proj_inp.shape[0], o_proj_inp.shape[1]
                head_acts = o_proj_inp.view(bsz, seq_len, n_heads, head_dim)

                for h in heads_by_layer[curr_layer]:
                    head_acts[:, :, h, :] = head_means[curr_layer, h].cuda()

                # Replace the *input* of the output projection with the patched
                # activations, flattened back to [bsz, seq_len, model_dim].
                o_proj.inputs = ((head_acts.reshape(bsz, seq_len, hidden_size),), {})

            out = model.generator.output.save()

        return out.detach().cpu()

In [12]:
# Token ids used to assemble the copy prompt.
bos = model.tokenizer.bos_token_id
# Take the final id of the tokenised '\n'. The previous
# `'meta-llama' in model_name` branch could never run, because model_name has
# the organisation prefix stripped, and its `[1:][-1]` form selects the same id
# as `[-1]` whenever a leading BOS is present. One expression covers all cases.
newline = model.tokenizer('\n')['input_ids'][-1]

In [13]:
def ablated_token_generation(prompt, k=32, repeat_cutoff=1, repeated_ratio=1.3):
    """Print the model's continuation of a repeated prompt with token heads ablated.

    The prompt is tokenised and laid out as
    `BOS + prompt + newline + prompt + newline + prompt[:repeat_cutoff]`,
    then continued by `ablated_generation` with the first `k` entries of the
    notebook-level `token_heads` list ablated.

    Args:
        prompt: text to repeat.
        k: number of leading entries of `token_heads` to ablate.
        repeat_cutoff: how many prompt tokens seed the third copy.
        repeated_ratio: the generation budget is
            floor(len(prompt tokens) * repeated_ratio) new tokens.

    Returns:
        None; the decoded input and the decoded generation are printed.
    """
    tokenizer = model.tokenizer

    prompt_ids = tokenizer(prompt)['input_ids']
    if 'Llama' in model_name:
        # Drop the first id so BOS can be placed explicitly below.
        prompt_ids = prompt_ids[1:]

    copy_prompt = [bos, *prompt_ids, newline, *prompt_ids, newline, *prompt_ids[:repeat_cutoff]]
    token_budget = math.floor(len(prompt_ids) * repeated_ratio)
    generated = ablated_generation(
        model,
        [copy_prompt],
        token_heads[:k],
        head_means,
        max_toks=token_budget,
    )

    print('original:')
    print(tokenizer.decode(copy_prompt))
    print('\n')

    print('generated:')
    print(tokenizer.batch_decode(generated)[0])
    print()

In [14]:
# Example 1: a short Python snippet as the text to be copied.
prompt = """foo = []
for i in range(len(bar)):
    if i % 2 == 0:
        foo.append(bar[i])
"""
# Sanity check (disabled): with k=0 no heads are ablated, so this shows what
# the model generates without the intervention.
# ablated_token_generation(prompt, k=0, repeat_cutoff=2)
# repeat_cutoff=2 seeds the third copy with the first two prompt tokens.
ablated_token_generation(prompt, repeat_cutoff=2)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


original:
<s> foo = []
for i in range(len(bar)):
    if i % 2 == 0:
        foo.append(bar[i])

 foo = []
for i in range(len(bar)):
    if i % 2 == 0:
        foo.append(bar[i])

 foo =


generated:
<s> foo = []
for i in range(len(bar)):
    if i % 2 == 0:
        foo.append(bar[i])

 foo = []
for i in range(len(bar)):
    if i % 2 == 0:
        foo.append(bar[i])

 foo = [item for sublist in bar for item in sublist if i % 2 == 0]
print(foo)
\end{code}

Comment: Thank you for your answer, I have edited



In [15]:
# Example 2: a single prose sentence, run with the default settings
# (k=32 ablated heads, one seed token, 1.3x token budget).
prompt = "I have reread, not without pleasure, my comments to his lines, and in many cases have caught myself borrowing a kind of opalescent light from my poet's fiery orb"
ablated_token_generation(prompt)

original:
<s> I have reread, not without pleasure, my comments to his lines, and in many cases have caught myself borrowing a kind of opalescent light from my poet's fiery orb
 I have reread, not without pleasure, my comments to his lines, and in many cases have caught myself borrowing a kind of opalescent light from my poet's fiery orb
 I


generated:
<s> I have reread, not without pleasure, my comments to his lines, and in many cases have caught myself borrowing a kind of opalescent light from my poet's fiery orb
 I have reread, not without pleasure, my comments to his lines, and in many cases have caught myself borrowing a kind of opalescent light from my poet's fiery orb
 I have reread my comments on his lines, and I have caught myself many times borrowing from his fiery orb a kind of opalescent light.
—T.S. Eliot, letter to Ezra Pound, 26 November



In [16]:
# Example 3: a numbered vocabulary list. repeated_ratio=2 allows up to twice
# the prompt's token count in newly generated tokens.
prompt = "English vocab:\n 1. hospital\n 2. visage\n 3. unlucky\n 4. lesbian\n 5. know\n 6. wizard\n 7. unfair\n 8. sound\n 9. cortical\n 10. obsolete"
ablated_token_generation(prompt, repeated_ratio=2)

original:
<s> English vocab:
 1. hospital
 2. visage
 3. unlucky
 4. lesbian
 5. know
 6. wizard
 7. unfair
 8. sound
 9. cortical
 10. obsolete
 English vocab:
 1. hospital
 2. visage
 3. unlucky
 4. lesbian
 5. know
 6. wizard
 7. unfair
 8. sound
 9. cortical
 10. obsolete
 English


generated:
<s> English vocab:
 1. hospital
 2. visage
 3. unlucky
 4. lesbian
 5. know
 6. wizard
 7. unfair
 8. sound
 9. cortical
 10. obsolete
 English vocab:
 1. hospital
 2. visage
 3. unlucky
 4. lesbian
 5. know
 6. wizard
 7. unfair
 8. sound
 9. cortical
 10. obsolete
 English vocabulary:
1. She was in hospital for a long time after the accident.
2. Her face was covered in bandages.
3. The accident was very unlucky.
4. She was a lesbian.
5. He knows everything about the wizard.
6. It is unfair to treat people like that.
7. The sound of the engine was very loud.
8. I don't know why the doctor told me that I should eat more fish.
9. The new cortical implants can help the blind to see.
10. The

