In [1]:
import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes

from steering.eval_utils import evaluate_completions
from steering.utils import get_activation_steering, get_sae_diff_steering, remove_sae_feats, text_to_sae_feats, top_activations
from steering.patch import generate, get_loss

import plotly.express as px

# Turn off gradient tracking globally for the rest of the kernel session.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x105ea1c00>

In [2]:
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Load GPT-2 small as a HookedTransformer directly onto the chosen device.
model = HookedTransformer.from_pretrained(
    'gpt2-small',
    device=device,
)



Loaded pretrained model gpt2-small into HookedTransformer


In [3]:
# Name of the "resid_pre" hook at layer index 6; used as the steering site throughout.
hp_6 = tutils.get_act_name("resid_pre", 6)

# Pick the SAE registered under that hook name from the first element returned
# by the loader, and place it on the same device as the model's embedding matrix.
sae_6 = get_gpt2_res_jb_saes(hp_6)[0][hp_6].to(model.W_E.device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


In [8]:
# Steering activations contrasting the prompt "Anger" with the prompt "Calm" at hp_6.
anger_vs_calm = get_activation_steering(model, hp_6, pos_text="Anger", neg_text="Calm")

# Keep only the last position of the first batch entry, then restore two
# leading singleton axes so the result is again a 3-D tensor.
steering = anger_vs_calm[0, -1, :][None, None, :]

In [10]:
# Sample completions of a neutral prompt with the steering vector applied at scale 10.
generate(
    model,
    hp_6,
    prompt="I think",
    steering_vector=steering,
    scale=10,
)

100%|██████████| 5/5 [00:04<00:00,  1.12it/s]


['I think I have funded myself in all these unique ways in my life.\n\nHow it impacts people is',
 "I think he was very angry. He said, 'the Statue of the American flag is more of the symbol",
 "I think the issue doesn't even need to be two sides and arguments, even if it gets through to them",
 'I think some (preemption is a hate created by a child) try to say that in 1834 it',
 'I think Episode VII could serve as one great example of that being ARTIT FROM TRUR or others, what']

In [11]:
# Contrast pair for the "dragons" steering vector.
pos_text = "Dragons live in Berkeley"
neg_text = "The people live in Berkeley"

# Show how each prompt tokenises, then the token counts, so the two prompts
# can be compared position by position.
pos_tokens = model.to_str_tokens(pos_text)
neg_tokens = model.to_str_tokens(neg_text)
print(pos_tokens)
print(neg_tokens)
print(len(pos_tokens))
print(len(neg_tokens))

['<|endoftext|>', 'Dr', 'agons', ' live', ' in', ' Berkeley']
['<|endoftext|>', 'The', ' people', ' live', ' in', ' Berkeley']
6
6


In [12]:
# Rebuild `steering` from the dragons-vs-people prompt pair; this replaces the
# Anger/Calm vector defined earlier in the notebook.
steering = get_activation_steering(
    model,
    hp_6,
    pos_text=pos_text,
    neg_text=neg_text,
)
print(steering.size())

torch.Size([1, 6, 768])


In [26]:
# Prompt to complete while the full steering tensor is applied at scale 20.
text = "Thanks for asking about that! I moved to Berkeley, CA because"
gen_texts = generate(
    model,
    hp_6,
    prompt=text,
    steering_vector=steering,
    scale=20,
)
gen_texts

100%|██████████| 5/5 [00:04<00:00,  1.03it/s]


['Thanks for asking about that! I moved to Berkeley, CA because having a kid with Hubble is magical. He hasbecome Dragonwilly way back. With your',
 'Thanks for asking about that! I moved to Berkeley, CA because in a much different sense, my family often rode their dragons with their dragons, a groundcat,',
 'Thanks for asking about that! I moved to Berkeley, CA because of the genetics at such a young age. During the course of our relationship, we dragons headed back',
 'Thanks for asking about that! I moved to Berkeley, CA because Taris wanted to know the truth about me. December 18, 2012 21:23 Dragonsprine',
 "Thanks for asking about that! I moved to Berkeley, CA because I knew about it, but it didn't count as dancing. Witches just had to know what everyone"]

In [15]:
# SAE feature activations for the positive prompt at hook point hp_6.
sae_feats = text_to_sae_feats(model, sae_6, hp_6, pos_text)
sae_feats.size()

torch.Size([1, 6, 24576])

In [25]:
# Recompute the SAE activations here so this cell can be run on its own.
sae_feats = text_to_sae_feats(model, sae_6, hp_6, pos_text)

# Top 10 entries per position; unpacked as (values, indices).
top_v, top_i = top_activations(sae_feats, 10)

# Walk the first batch entry position by position, printing the feature
# indices followed by their activation values.
per_position = zip(top_i[0], top_v[0])
for pos, pair in enumerate(per_position):
    feat_ids, feat_vals = pair
    print(f"pos={pos}")
    print(feat_ids)
    print(feat_vals)

sae feats shape torch.Size([1, 6, 24576]) 

pos=0
tensor([23123,   979,   316,  7496, 23111, 23373,  9088, 16196,  2039, 10423])
tensor([419.6327, 401.9173, 349.7594, 329.6368, 327.3525, 305.8869, 251.9604,
        225.7122, 191.7625, 185.9452])
pos=1
tensor([ 1080, 19151, 19245, 17048, 22788, 10754, 22226, 17673,  3239, 19296])
tensor([43.6409, 20.0815, 13.3413,  7.4697,  4.6812,  3.5322,  2.1363,  1.8740,
         1.6652,  1.6404])
pos=2
tensor([13166,  2241, 18658, 23400, 23440, 22226, 12075,  3152, 11594, 15841])
tensor([13.4366, 13.3366, 10.8613,  8.4645,  8.1606,  5.2570,  4.6485,  4.6276,
         4.5905,  4.5372])
pos=3
tensor([16493,  4685, 12290, 20892,  6541,  1622, 11121, 10570,  7645,  4614])
tensor([26.5143, 17.7626, 10.2051,  7.5466,  5.7065,  5.5423,  4.4755,  4.3210,
         3.8644,  3.7869])
pos=4
tensor([ 8264,  5600, 13178,  7645,  1622, 11114,   253, 20144,  8961, 23374])
tensor([30.1238,  6.4868,  3.3841,  3.2089,  3.1472,  2.7278,  2.5045,  2.4639,
         2.25

In [17]:
# Features to strip from the steering tensor. Each tuple is
# (token position, SAE feature index, activation value), hand-copied from the
# top-activation printout for the positive prompt.
feats_to_remove = [
    (1, 1080, 43.6),      # Dr.
    (1, 19151, 20.0),     # <endoftext> something
    (1, 19245, 13.3),     # dr<something>
    (2, 13166, 13.4366),  # ...agon
]

clean_steering = remove_sae_feats(steering, sae_6, feats_to_remove)

In [23]:
# Same prompt as before, now steered with the feature-ablated vector at scale 2.
text = "Thanks for asking about that! I moved to Berkeley, CA because"
gen_texts = generate(
    model,
    hp_6,
    prompt=text,
    steering_vector=clean_steering,
    scale=2,
)
gen_texts

100%|██████████| 5/5 [00:04<00:00,  1.02it/s]


['Thanks for asking about that! I moved to Berkeley, CA because I always quote AV all about the time-tested related practices of having confirmation before you hang up your',
 'Thanks for asking about that! I moved to Berkeley, CA because of all my wonderful Play Activists taking the records from TalkTalk and TinyJam from I-',
 "Thanks for asking about that! I moved to Berkeley, CA because it's such a great city in class and I know being a former Milwaukie Graduate will",
 "Thanks for asking about that! I moved to Berkeley, CA because there's a great service there, and we think it's great to have our UCX engineers here",
 'Thanks for asking about that! I moved to Berkeley, CA because I had part of their research team working on clicks, but the best part of Google is, of']

In [33]:
# Sweep the steering scale: for each value, sample 10 completions of `text`
# and have evaluate_completions score them against the target concept.
scales = [0, 1, 5, 10, 20, 30]
mean_scores = []
for scale in scales:
    gen_texts = generate(model, hp_6, prompt=text, steering_vector=steering, scale=scale, n_samples=10)
    # The criterion is spelled "Berkeley" to match pos_text and the prompt;
    # the earlier "Berkley" typo gave the judge a different string than the
    # concept actually being steered towards.
    evals = evaluate_completions(gen_texts, criterion="Dragons live in Berkeley", prompt=text)
    print(evals)
    # Each evaluation is a dict with a 'score' entry; average them per scale.
    scores = [e['score'] for e in evals]
    mean = sum(scores) / len(scores)
    mean_scores.append(mean)
    print("mean", mean)

100%|██████████| 10/10 [00:11<00:00,  1.11s/it]


[{'score': 1, 'reason': "The generated text does not mention dragons living in Berkeley or any related concept. It diverges into a personal narrative about the speaker's dislike for startups and rent control, which is irrelevant to the criterion."}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley.'}, {'score': 1, 'reason': 'The text does not mention anything about dragons living in Berkeley. It only talks about moving to Berkeley and a personal wish regarding the stadium.'}, {'score': 1, 'reason': 'The text does not mention anything about dragons living in Berkeley.'}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley at all.'}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley, nor does it provide any relevant information about dragons. It is incoherent and unrelated to the criterion.'}, {'score': 2, 'reason': 'The text does not mention anything about dragons living in Berkeley. It only talks about movi

100%|██████████| 10/10 [00:11<00:00,  1.16s/it]


[{'score': 1, 'reason': 'The generated text does not mention anything about dragons living in Berkeley. It talks about a move to Berkeley for scholarly work, which is unrelated to the criterion.'}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley at all. It instead talks about moving to Berkeley for reasons related to midterms in Ann Arbor.'}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley at all. It focuses on the reasons for moving to Berkeley, CA and mentions New York, but there is no reference to dragons.'}, {'score': 2, 'reason': 'The generated text does not mention dragons living in Berkeley. It also contains fragmented and unclear statements, which further detracts from relevance to the criterion.'}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley at all. It talks about moving to Berkeley, CA, and mentions BYU and a decision blog, but there is no reference to dragons.'}, {'score': 1, 'reason':

100%|██████████| 10/10 [00:09<00:00,  1.03it/s]


[{'score': 1, 'reason': 'The generated text does not mention dragons living in Berkeley, nor does it provide any relevant information related to the criterion.'}, {'score': 1, 'reason': 'The completion does not address the criterion about dragons living in Berkeley. It instead talks about personal reasons for moving and unrelated topics like cancer-free alternatives and a reunion.'}, {'score': 1, 'reason': 'The completion does not mention dragons living in Berkeley at all. It talks about moving to Berkeley for a part-time job, which is unrelated to the criterion.'}, {'score': 1, 'reason': "The text does not mention dragons living in Berkeley. It focuses on the speaker's personal reasons for moving to Berkeley, such as fellowship and career opportunities."}, {'score': 2, 'reason': "The text mentions 'That Dragon' but does not provide clear or coherent information about dragons living in Berkeley. The context is confusing and does not directly support the criterion."}, {'score': 1, 'reas

100%|██████████| 10/10 [00:10<00:00,  1.07s/it]


[{'score': 8, 'reason': "The text implies that dragons live in Berkeley by stating 'I was a depressive, vivid dragon who didn't play,' which suggests the speaker considers themselves a dragon living there. However, the rest of the text is not directly related and does not further elaborate on dragons living in Berkeley."}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley. It only talks about moving to Berkeley for theater and fantasy, which is unrelated to the criterion.'}, {'score': 1, 'reason': 'The completion does not mention or imply anything about dragons living in Berkeley.'}, {'score': 1, 'reason': 'The text does not mention anything about dragons living in Berkeley. It only talks about the reasons for moving to Berkeley and some issues with the streetcar.'}, {'score': 7, 'reason': 'The text mentions dragons but does not explicitly state that they live in Berkeley. However, the mention of dragons in the context of Berkeley implies a possible connectio

100%|██████████| 10/10 [00:10<00:00,  1.04s/it]


[{'score': 1, 'reason': 'The text is incoherent and does not mention anything about dragons living in Berkeley.'}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley. It discusses dragonflies and appears unrelated to the criterion.'}, {'score': 6, 'reason': "The text makes an indirect reference to dragons with the phrase 'dragon brain,' but it is confusing and lacks clarity. The connection to dragons living in Berkeley is weak and not explicitly stated."}, {'score': 1, 'reason': 'The completion does not mention dragons living in Berkeley or anything related to dragons. It seems incoherent and unrelated to the criterion.'}, {'score': 9, 'reason': 'The completion mentions living in Berkeley and references dragons multiple times, which aligns well with the criterion. However, the text is somewhat confusing and could be clearer about the presence of dragons in Berkeley.'}, {'score': 3, 'reason': 'The completion mentions Berkeley, CA but the mention of dragons is v

100%|██████████| 10/10 [00:10<00:00,  1.07s/it]


[{'score': 1, 'reason': 'The generated text is incoherent and does not mention anything about dragons living in Berkeley. It lacks relevance to the criterion.'}, {'score': 1, 'reason': 'The text does not mention anything about dragons living in Berkeley. It also lacks clarity and coherence, making it difficult to understand.'}, {'score': 5, 'reason': "The text mentions 'dragonwheels,' which suggests some connection to dragons, but it is not clear or explicitly stated that dragons live in Berkeley."}, {'score': 7, 'reason': "The text mentions dragons indirectly through 'dragon dragons' and 'dragon shaped dragonagons,' which suggests a fantastical element associated with dragons. However, it does not explicitly state that dragons live in Berkeley."}, {'score': 1, 'reason': 'The text does not mention dragons living in Berkeley.'}, {'score': 1, 'reason': 'The generated text is incoherent and does not provide any information about dragons living in Berkley. It fails to address the criterion

In [34]:
# One loss per entry in `scales`, using the same steering tensor as the score sweep.
losses = get_loss(
    model,
    hp_6,
    steering_vector=steering,
    scales=scales,
)

loading dataset: NeelNanda/c4-code-20k
dataset loaded


100%|██████████| 6/6 [00:51<00:00,  8.53s/it]


In [36]:
# Trade-off plot: loss on the x-axis against the mean judge score on the
# y-axis, one marker per steering scale.
fig = px.line(
    x=losses,
    y=mean_scores,
    title="Dragon Score vs. Loss",
    labels={"x": "Loss", "y": "Mean Dragon Score"},
    markers=True,
)
# Fix the y-axis to the 1-9 band with integer ticks.
fig.update_yaxes(range=[1, 9], dtick=1)
fig.show()