In [1]:
import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes

from eval_utils import evaluate_completions

import plotly.express as px

# Inference-only notebook: turn autograd off globally for every cell below.
torch.set_grad_enabled(mode=False)

<torch.autograd.grad_mode.set_grad_enabled at 0x10499f490>

In [2]:
# Use the GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load GPT-2 small as a HookedTransformer directly onto the chosen device.
model = HookedTransformer.from_pretrained('gpt2-small', device=device)



Loaded pretrained model gpt2-small into HookedTransformer


In [3]:
layer = 7  # which layer's residual stream to hook; pick a layer you want.

# Activation name for the residual stream entering `layer`
# (the cell output shows the resulting key: 'blocks.7.hook_resid_pre').
hook_point = tutils.get_act_name("resid_pre", layer)

# Fetch the pretrained SAE(s) for that hook point; `saes` is keyed by hook name.
saes, sparsities = get_gpt2_res_jb_saes(hook_point)
print(saes.keys())

# Select the SAE for our hook and move it to the same device as the model's W_E.
sae = saes[hook_point].to(model.W_E.device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1/1 [00:00<00:00,  1.11it/s]

dict_keys(['blocks.7.hook_resid_pre'])





In [4]:
def top_acts_at_pos(text, pos=-1, silent=True, prepend_bos=True, n_top=10):
    """Return the strongest SAE feature activations for `text` at `hook_point`.

    Runs the module-level `model` on `text` with activation caching, takes the
    cached activation at `hook_point` for the first batch element, encodes it
    with the module-level `sae`, averages the feature activations over the
    selected positions, and returns the `n_top` largest.

    Args:
        text: input forwarded to `model.run_with_cache`.
        pos: position index to inspect (default -1, the last position).
            Pass None to average over every position.
        silent: accepted but not used by this function.
        prepend_bos: forwarded to `model.run_with_cache`.
        n_top: how many features to return.

    Returns:
        Tuple `(values, indices)` from `torch.topk` over the averaged
        feature activations.
    """
    # Only the cache is needed here; the logits are discarded.
    _, activations = model.run_with_cache(text, prepend_bos=prepend_bos)
    first_item = activations[hook_point][0]

    # Keep a leading axis in both branches so the mean over dim 0 is uniform:
    # all positions when pos is None, otherwise a single-row slice.
    selected = first_item if pos is None else first_item[pos].unsqueeze(0)

    averaged = sae(selected).feature_acts.mean(dim=0)
    values, indices = torch.topk(averaged, n_top)
    return values, indices

# Probe which features fire at the last position of a hostile prompt.
# 16077 is an anger feature
# 15001 is a hate feature
top_acts_at_pos(text=" I hate", pos=-1)

(tensor([31.1679, 17.5215, 10.2898,  8.4272,  7.8847,  5.8784,  4.9446,  4.8437,
          4.6148,  2.7552]),
 tensor([15001, 16279, 22393, 24046,  6436, 12919,  8835, 12659,  5518,  3868]))

In [6]:
# NOTE(review): the comments in the probing cell label 15001 as the "hate" feature
# and 16077 as the "anger" feature -- confirm 15001 is the intended id here.
anger_ft_id = 15001
anger_ft_val = 30  # activation value used to scale the decoder direction

# Decoder row for the chosen feature, i.e. its direction in the SAE's output space.
anger_vector = sae.W_dec[anger_ft_id]

# Steering vector: the feature direction scaled to the chosen activation value.
steering = anger_ft_val * anger_vector

In [35]:

def patch_hook(
    resid,
    hook,
    c,
    pos
):
    """Forward hook that adds the scaled steering vector to the residual stream.

    Args:
        resid: activation tensor passed to the hook, shape
            (batch, pos, d_model); overwritten in place.
        hook: hook point object passed alongside the activation (unused).
        c: scalar multiplier applied to the module-level `steering` vector.
        pos: accepted so callers can pass it, but currently ignored -- the
            vector is added at every position.

    Returns:
        The same `resid` tensor, after the steering vector has been added.
    """
    # Compute the shifted activations first, then write them back into the
    # existing tensor so the caller's tensor object is the one that changes.
    shifted = resid + c * steering
    resid[...] = shifted
    return resid


# Prompt and number of completions to sample under steering.
text = "I think"
n_samples = 10
gen_texts = []

# Bind the steering strength once. `pos` is forwarded, but patch_hook ignores it
# and steers every position.
steer_hook = partial(patch_hook, c=0.5, pos=2)

with model.hooks(fwd_hooks=[(hook_point, steer_hook)]):
    for sample_idx in range(n_samples):
        print("============")
        output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=20, verbose=False)
        print(output)
        gen_texts.append(output)

I think they should rename that spirit. The spirit of Thor, which they call Thor's Gazer and the
I think this is a hard write-up. I was born both on my mother's block and as a
I think I've found myself in a position where I can pin my hopes on streaming and cheer (as well
I think the growing number of people calling for Israel's destruction have the same vanity, and fear for its existence
I think it was Bayonetta 2, so I went wild and shrank from it as soon as I
I think forever is not worth a 1000 fucking years unless you're going to cut it 10 fmer they can
I think he should just get a nail of his own. Do we know what he hates most about horror movies
I think it's funny that people speak aloud you know people's opinions. I think it's really, trying
I think we are all 100% sick and tired of clever cartoon news that once provided a platform for brilliant memes
I think he's smarter than your average training marketer! RT: I own unlimited demandfor special training,


In [36]:
# Score each steered completion against the "anger" criterion; the displayed
# result is a list of dicts with 'score' and 'reason' entries.
evals = evaluate_completions(
    gen_texts,
    prompt=text,
    criterion="anger",
)
evals

[{'score': 1,
  'reason': 'The text does not exhibit any signs of anger. It merely expresses an opinion about renaming a spirit.'},
 {'score': 1,
  'reason': 'The text does not exhibit any signs of anger. It seems to be a neutral statement without any emotional intensity or aggressive language.'},
 {'score': 1,
  'reason': 'The text does not express any signs of anger. It is neutral and focuses on a hopeful and positive sentiment about streaming.'},
 {'score': 7,
  'reason': "The statement carries a tone that could be perceived as provocative and inflammatory, particularly with the phrase 'calling for Israel's destruction.' This can evoke strong emotional reactions, including anger, especially among those who are sensitive to the subject matter or have strong opinions about the Israel-Palestine conflict."},
 {'score': 2,
  'reason': 'The generated text does not exhibit significant anger. It contains a mild level of intensity and excitement, but it lacks the emotional intensity associat

In [37]:
# Pull the numeric score out of each evaluation record and report the average.
scores = [record['score'] for record in evals]
print("mean", sum(scores) / len(scores))

mean 2.9


In [39]:
# Safety latch: abort here so a "Run All" does not trigger the evaluation sweep
# below by accident. Comment the next line out to run it deliberately.
raise Exception("About to call gpt4 a bunch of times. Comment out this line to proceed.")

# Sweep configuration.
text = "I think"                               # prompt used for every generation
n_samples = 20                                 # completions sampled per coefficient
c_vals = [0.0, 0.1, 0.3, 0.5, 0.8, 1.0, 2.0]   # steering coefficients to try
mean_scores = []                               # one mean score per entry of c_vals

def patch_hook(resid, hook, c):
    """Forward hook: add `c * steering` to the activation at every position.

    Args:
        resid: activation tensor passed to the hook; overwritten in place.
        hook: hook point object passed alongside the activation (unused).
        c: scalar multiplier applied to the module-level `steering` vector.

    Returns:
        The same `resid` tensor after the addition.
    """
    # Build the shifted activations, then copy them back into the original tensor.
    shifted = resid + c * steering
    resid[...] = shifted
    return resid

# For each steering coefficient: sample completions with the hook active,
# score them for anger, and record the mean score.
for c in c_vals:
    print(f"Steering with c={c}")
    steer_hook = partial(patch_hook, c=c)
    gen_texts = []

    with model.hooks(fwd_hooks=[(hook_point, steer_hook)]):
        for sample_idx in range(n_samples):
            print("============")
            output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=20, verbose=False)
            print(output)
            gen_texts.append(output)

    # Evaluation happens outside the hook context; only generation is steered.
    evals = evaluate_completions(gen_texts, prompt=text, criterion="anger")
    print(evals)

    scores = [record['score'] for record in evals]
    mean = sum(scores) / len(scores)
    print("mean", mean)
    mean_scores.append(mean)

Steering with c=0.0
I think it was the PlanTheBlack10012D in immediate effect that gave game publishers a run for their
I think after the whole fiasco with Christian-Plagiarism... different churches demonisation has interfered
I think it's safe to say a lot of the design and function of the iPhone seems to have been put
I think that Mike Judge and Graham Cox did it too and did it right.

They did the right
I think a lot of Japanese people underestimate pre-digital choice because they assume zero choices when they see some one
I think it's fair to say we've gotten a lot close to full freshwater lakes around Louisiana and Mississippi,
I think there's a good chance that. I'd like to see them go on TV in more theaters.
I think might represent very positive progress for » your game's incentives in these old ways of trade and two fact
I think 2010 didn't have much impact on me. I grew up slowly breathing music. I went through a
I think the argument for locking 60fps would look good for there

In [40]:
# Load the train split of the pile-10k dataset.
data = load_dataset("NeelNanda/pile-10k", split="train")

# Tokenize with the model's tokenizer into length-128 sequences, then shuffle.
# 42 is passed positionally to shuffle -- presumably the seed; confirm.
tokenized_data = tutils.tokenize_and_concatenate(
    data,
    model.tokenizer,
    max_length=128,
).shuffle(42)


Map (num_proc=10):   0%|          | 0/10000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (80023 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (102873 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (113018 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (92793 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (90016 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequen

In [44]:
# Sequences per forward pass when measuring loss.
batch_size = 8

# Batches are drawn from the already-shuffled tokenized dataset.
loader = DataLoader(dataset=tokenized_data, batch_size=batch_size)

In [45]:
# Compute the mean language-modelling loss on the dataset for each steering
# coefficient, using the first `n_batches` batches of `loader`.
losses = []
n_batches = 10

for c in c_vals:
    print(f"Steering with c={c}")
    total_loss = 0.0
    n_seen = 0

    # Register the steering hook once per coefficient instead of once per batch.
    with model.hooks(fwd_hooks=[(hook_point, partial(patch_hook, c=c))]):
        for i, batch in enumerate(loader):
            # Check the budget *before* the forward pass: testing `i == n_batches`
            # afterwards would evaluate n_batches + 1 batches.
            if i >= n_batches:
                break
            loss = model(batch["tokens"], return_type="loss", prepend_bos=False)  # already prepended.
            total_loss += loss.item()
            n_seen += 1

    # Each `loss` is a single scalar per batch (hence `.item()`), so the mean is
    # taken over the number of batches actually seen -- not n_batches * batch_size.
    # An empty loader gives NaN rather than a ZeroDivisionError.
    losses.append(total_loss / n_seen if n_seen else float("nan"))


Steering with c=0.0
Steering with c=0.1
Steering with c=0.3
Steering with c=0.5
Steering with c=0.8
Steering with c=1.0
Steering with c=2.0


In [59]:
# Trade-off plot: x is the mean loss under steering, y is the mean anger score.
# Both lists are filled once per entry of c_vals, in the same order, so each
# point corresponds to one steering coefficient.
# The title previously said "vs. Steering Coefficient", but the x-axis is the loss.
fig = px.line(
    x=losses,
    y=mean_scores,
    title="Mean Anger Score vs. Loss (one point per steering coefficient)",
    labels={"x": "Loss", "y": "Mean Anger Score"},
)
fig.show()