In [1]:
import os
import sys
sys.path.append(os.path.abspath('..'))

import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
from tqdm import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes
from sae_lens import SparseAutoencoder, ActivationsStore

from steering.eval_utils import evaluate_completions
from steering.utils import text_to_sae_feats, top_activations, normalise_decoder, get_activation_steering
from steering.patch import generate, get_scores_and_losses, patch_resid

from sae_vis.data_config_classes import SaeVisConfig
from sae_vis.data_storing_fns import SaeVisData

import plotly.express as px

# Turn autograd off globally: the visible cells only generate text and score it,
# so no gradients are needed. As the last expression of the cell, this call also
# echoes the returned context-manager object as the cell output.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f2860dcb910>

In [2]:
# Pick the GPU when CUDA is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained "gemma-2b" weights into a HookedTransformer on that device.
model = HookedTransformer.from_pretrained("gemma-2b", device=device)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loaded pretrained model gemma-2b into HookedTransformer


In [3]:
def _load_sae(hook_point, scale_input):
    """Load the "gemma-2b-res-jb" SAE for `hook_point`, run it through
    `normalise_decoder` with the given `scale_input` flag, and move it to `device`."""
    sae = SparseAutoencoder.from_pretrained("gemma-2b-res-jb", hook_point)
    normalise_decoder(sae, scale_input=scale_input)
    return sae.to(device)


# Residual-stream hook points (post-block) used throughout the notebook.
hp6 = "blocks.6.hook_resid_post"
hp12 = "blocks.12.hook_resid_post"

# NOTE(review): the layer-12 SAE is normalised with scale_input=True while the
# layer-6 SAE uses scale_input=False -- confirm the asymmetry is intentional.
sae12 = _load_sae(hp12, scale_input=True)
sae6 = _load_sae(hp6, scale_input=False)



In [4]:
# Encode a wedding-themed sentence with the layer-12 SAE, then display the result of
# top_activations(...) (rendered below as a pair of tensors: values and feature indices).
wedding_feats = text_to_sae_feats(model, sae12, hook_point=hp12, text="I talk about weddings")
top_activations(wedding_feats)

(tensor([[[25.9705, 24.4213, 24.0417, 22.5076, 14.2997, 10.6286,  8.4462,
            6.7500,  5.1569,  4.9748],
          [12.4683,  9.5918,  6.3518,  4.3895,  2.6166,  2.4245,  1.9347,
            1.2328,  1.1585,  0.9953],
          [11.0023,  5.0192,  4.7240,  4.3666,  3.6535,  2.5568,  1.5029,
            1.3582,  0.9921,  0.9623],
          [ 7.2850,  3.8669,  2.9988,  2.5302,  2.3834,  1.9475,  1.9330,
            1.5817,  1.5285,  1.3745],
          [16.2181,  3.1115,  2.9495,  2.1953,  1.6427,  1.5833,  1.5298,
            1.4020,  1.3613,  1.3428]]], device='cuda:0'),
 tensor([[[ 8731, 10841,  5233,  3048,  1645, 11597,  6099,  7650,  6802,  8618],
          [ 5611, 13161, 12885,  4966, 10323, 12492,  7971, 10662,  3998, 14956],
          [ 4760, 15364,  7005,  7503, 10323, 14664, 15468,  4966,  3561,  7971],
          [ 6353, 15978, 10323, 15364,  6723,  3998, 14664,  2847,  7971, 16257],
          [ 9099,  9549,  9331, 10323, 10108,  7492, 13389, 12101,  6848, 13270]]],
   

In [5]:
# Steering vector built from row 9099 of the layer-12 SAE decoder ("weddings").
# The multiplier 16 is close to that feature's activation on the last token in the
# cell above; a multiplier of 75 was tried previously.
wedding_direction = sae12.W_dec[9099]
# Two leading singleton axes are prepended via None-indexing.
steering = (wedding_direction * 16)[None, None, :]

In [6]:
# Steering multiplier and prompt consumed by the generate(...) call in the next cell.
scale = 10
prompt = "I went up to my friend and said"

# Print the string tokens of the prompt as produced by the model's tokenizer.
prompt_str_tokens = model.to_str_tokens(prompt)
print(prompt_str_tokens)

['<bos>', 'I', ' went', ' up', ' to', ' my', ' friend', ' and', ' said']


In [7]:

# NOTE(review): the steering vector comes from the layer-12 SAE decoder but is passed
# with the block-9 hook point here ("blocks.12.hook_resid_post" was the alternative
# tried) -- confirm the layer mismatch is deliberate.
hp9 = "blocks.9.hook_resid_post"

generate(
    model,
    hp9,
    prompt,
    steering_vector=steering,
    scale=scale,
    insertion_pos=None,
    max_length=20,
)


100%|██████████| 5/5 [00:05<00:00,  1.13s/it]


['I went up to my friend and said, celebrating at is train in invitation. planning will tea, spa, and or anniversary henna leh photo',
 'I went up to my friend and said, day party and planner at planner of planner planning for invitation and gown planning.! planning band cake',
 'I went up to my friend and said day plans planning party of. 1 march from plan planning week weekend planner dream planning in budget reception',
 'I went up to my friend and said dressed cost cost cake my planner date re \n\n day dress cake with decoration ceremony feast made planning  ',
 'I went up to my friend and said. we are vibes and we playlist we vows we in. at story in you day in 2']

In [8]:
# Steering tensor derived from the prompts "Anger" and "Calm" at the layer-6 hook.
# Presumably an activation difference between the two prompts -- TODO confirm against
# get_activation_steering.
per_position_steering = get_activation_steering(model, hp6, "Anger", "Calm")
print(per_position_steering)

# Collapse dim 1 by summation, keeping it as a singleton axis.
act_steering = per_position_steering.sum(dim=1, keepdim=True)
print(act_steering.shape)
print(torch.norm(act_steering))

pad tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)
tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.4653, -0.9298, -0.2396,  ..., -1.5924,  0.3313, -0.1270]]],
       device='cuda:0')
torch.Size([1, 1, 2048])
tensor(91.0582, device='cuda:0')


In [9]:
# Generate with the "Anger"/"Calm" activation steering vector at the layer-6 hook.
scale = 2
prompt = "I think that"

generate(
    model,
    hp6,
    prompt,
    steering_vector=act_steering,
    scale=scale,
    insertion_pos=None,
    max_length=20,
)

100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


['I think that <b><b><b><b> -anger - - ... ... - - ... ... - 8 - -',
 'I think that <strong>aleinlickvieor -s origond between -s vreye osob...u thefts',
 'I think that felt for his trait -the..."      \nemasurusa as enderas ... > |  pris',
 'I think that made "ill or in - • • ..." 9-לס - - ... - oturus r',
 'I think that for thebacher-warranri"f( ... _____ sittor thereforvironsons(englansvin']

In [10]:
# Encode the single word "Anger" with the layer-6 SAE and display the result of
# top_activations(...) for it.
anger_feats = text_to_sae_feats(model, sae6, hook_point=hp6, text="Anger")
top_activations(anger_feats)

(tensor([[[24.4932, 23.9630, 23.2544, 21.0681, 20.9372, 20.6655, 20.3199,
           19.8953, 19.0510, 18.4377],
          [56.7609, 14.0818, 12.2185, 11.2966, 10.9838,  9.4307,  8.0595,
            7.1658,  7.1005,  5.8064]]], device='cuda:0'),
 tensor([[[ 3390, 15881,  5347, 16334,   556,  8704, 11785,  5624,  5396,  6877],
          [ 1062, 12753,  1213, 11968, 12167,  5915,  2491, 15173, 11912, 12312]]],
        device='cuda:0'))

In [11]:
# Steering vector built from row 1062 of the layer-6 SAE decoder ("anger"); the
# multiplier 56 is close to that feature's top activation in the cell above.
# NOTE(review): a second direction (index 12312, scaled by 10) used to be added here and
# is currently disabled. That line indexed sae12.W_dec, although 12312 appears in the
# layer-6 output above -- confirm which decoder was meant before re-enabling it.
anger_direction = sae6.W_dec[1062]
# Two leading singleton axes are prepended via None-indexing.
steering = (anger_direction * 56)[None, None, :]

In [12]:
# NOTE(review): this call still steers with `act_steering`, not the SAE-based
# `steering` rebuilt in the previous cell -- confirm that is the intended vector.
scale = 0.8
prompt = "I think"

generate(
    model,
    hp6,
    prompt,
    steering_vector=act_steering,
    scale=scale,
    insertion_pos=None,
    max_length=25,
)

100%|██████████| 5/5 [00:06<00:00,  1.32s/it]


['I think an even worse problem is that I feel very hateful towards other people, and would like the one person to become a god-',
 'I think this game is full of humor, and I definitely have learned this really hard from my daddy. I came up with a whole',
 'I think I may have found what is turning out to be ballot tabulation software.  (  <b><i>  this  </i></b>\n\n',
 'I think so too for Abu well that is how you teach\n1)Anger and violence\n\n\n10 years passed and anger to zero',
 'I think, the husband told me", it is the biological definition of a person - but quickly receipts. Anger is a weapon, more']

In [17]:
# Steering multipliers swept by the get_scores_and_losses(...) calls below;
# 0 is included as the first entry.
scales = [
    0,
    0.1, 0.2, 0.4, 0.6, 0.8,
    1, 1.5,
    2, 3, 4, 5,
]

In [18]:
# Sweep the SAE-based `steering` vector over every entry of `scales` at the layer-6
# hook, drawing 50 samples per scale and scoring them against the "Anger" criterion.
sae_sweep_kwargs = dict(
    steering_vector=steering,
    prompt=prompt,
    criterion="Anger",
    scales=scales,
    n_samples=50,
    insertion_pos=None,
    explanations=False,
)
mean_scores, all_scores, losses = get_scores_and_losses(model, hp6, **sae_sweep_kwargs)



loading dataset: NeelNanda/c4-code-20k
dataset loaded


100%|██████████| 12/12 [00:37<00:00,  3.16s/it]


scale 0


100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


["I think if you're feeling a palpable fear a lot, like always being in a state of high alert", 'I think I got it, at least for addition and multiplication:\n\n<math xmlns="http://www.', 'I think first I needed to find a wife that will look after a single drinker! Better yet, a user', 'I think it is necessary to include an image with your question.\n\nSir\nI approach u as humbly as', "I think two of the best recommendations that anyone who has elbow pain can do is massage therapy and new butcher'", 'I think this is another example of the Christmas wave bringing those "Hooray Hooray" and "Look at', 'I think the answers of both the questions are in definition. $V$ has a topology induced by some metric', 'I think it would make a great coffee break activity. Interact Sentence 2\n\nI think such a game is', "I think you should have a virtual substation, and then refer to it. It doesn't have to be", 'I think that Old Trafford will be the destination for Faith Uzoma\n\nI’m not sure if it', 'I 

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think m.x means the mode is more extreme,\nbecause if you shuffle the indices, e.', 'I think I see what kind of sht your talking about here.\n\nLowering headsets is normal due to', 'I think we’ve heard every reason racism has not affected Barça enough\n\n1: Manchester United should have been', 'I think the 49ers’ 22-10 win last week over Seattle did at least', "I think you're supposed to make like a snow flurrrrrrry. i stuck to this new-", 'I think you <strong>need</strong> to compile these lists using a static <code>.pm</code> with ways to', 'I think he should use his sleuther art, what do you think? if not, xie keen just', 'I think I have a yeast infection ( I think because I am on an anti ulcer med and doctor acted like', 'I think I am having the same problem. My machine only does it when I am on wireless, not trouble', "I think that image has given me a nice stretch of the imagination, and it's very MOCo of", "I think it's a Reddit badge. You can get it in your main terminal, o

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think its pretty unlikely... In absolutely all of the games that William has played, no one has complained for', "I think I was asked once about Macho Man Randy Savage's reaction to Alundra Blayze", 'I think Ghostbusters is a really good idea. Not just the animation but his expression. When you think about', 'I think they made both of them to look bad for C3-103 and her rioting was', "I think I went a little crazy. That's if the guy starts to complain I'll give him", 'I think it’s safe to say these days that not everyone gets so tweets me up by them after having', "I think it's interesting that there are two meditating Buddhas in this painting. Very picturesque. This artwork", 'I think that in order to have a self-opinionated and full of knowledge about a character it is necessary', 'I think that the second gene is the important one. I think that you might think of how the problem will', 'I think a part of my soul continues watching, Will of the Wisp is a great series and perhaps w

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


["I think you'd probably be better off updating cargo itself.\nI put all my dependencies in a linked", 'I think some a-hole property manager took the money from our agressment and got caught by the husband', 'I think based on other posts that this MIGHT be useful.\n\nI have a document group that is broken up', "I think this might be the actual bitemark for Leeway's Class masked at a specific point in", 'I think there must be something wrong with Amazon because I still have the same issue, even 15 years', 'I think most of us will agree that no one enjoys being shouted at, bullied or called names. I would', "I think i've found my limit for the impedance issue. Anyway seems crazy to me from the pic it", 'I think this might be a red herring. What made the client unhappy was the advisory made to split the capital', 'I think .5% would be too high for the deeper tone. The tone should be around 1%', 'I think you need to also manage your expectations here. Tech wise this game is four years behi

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think my mind has been whirled out of the oven by the noise of my opponent’s childish words', 'I think it is odd how small it is. I can see if I was bought at $15.', "I think most 'clinchers' (though I may be wrong about that) at least worked very hard", 'I think I’m done with the <em>Abbey Road</em> heat; I hope it’s only superficial', "I think that I'm starting to fly crazy!\n\nI think that I'm starting to fly crazy", 'I think AWP is going to be nice. It will be pretty wet for a few days. It is', 'I think your thinking names family members because the character also want their elder to not get in my face, please', 'I think so every day!\n\nI posted the same a couple of hours ago.\n\nI don’t', 'I think this is explained well and otherwise ours could turn out looking like this...\n<i>Hetalia draw', 'I think I was kind of thrown when my other vex recently broke about a year and half ago. Ive been', 'I think I might have should’ve been more demanding, I have only one leech at level 1

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think the Ralfs’ accusation over the past few years would be directed towards Wagner’s decision to make', 'I think I was a real bitch when I was pregnant.\n\nNo I wasn’t? Yes hahaha okay', 'I think you are over reacting.\n\nFrom the past, it was more of harassment and not the true emotions', 'I think pass drives already hell loud at this point lol\n\nI’d also be happy about zoris haircut', "I think nothing has gone THAT bad that I'd quit painting, but one of these days my son is", 'I think they really annoyed me at a recent all-white dinner.\n×</>\n<b>CLICK & F', "I think it's fair to say that 12-year-old me would be fuming right", 'I think that no one should be surprised at this beautiful pastor’s response. <strong>Who shouldn’t', "I think it's necessary she be at that, so read on.\nIf it's any consolation", 'I think CardsBoss is an over-priced POS. I just want it to go away so keep it going', "I think that is crap, TFS doesn't handle anything. Also someone is making a forum post 

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think that my negative feelings about the world are a real blow-up at many things, but especially at', 'I think wrestling colours macho PET SUDOCAN THREATEN YOU FOR YOUR own HISS for or or --', "I think I have a solution to make myself feel better about the prices I'm seeing about because of how", 'I think Elena.\n\n@ fan adreaming\n\n@Misunderstood\n\nOk ok\n\nZonushere I kno', "I think I get angry at these television figures constantly. It's fine if they go on about something you", 'I think I’m mad at EA right now. A group of 8 here and is telling them about', 'I think it’s an appropriate climate situation because it boils me that they want me to call him because there', "I think I've tried every single trig function over the last few days. Where did it go when I", "I think I have friends that I don't show off to others. Especially when it's my wife", "I think not enough hostile, but I'm also pissed because I just told someone who demanded I was trying", "I think I'm one of the f

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think about it "purple" and then feel bad that we live in a place over something about the ocean', 'I think alot about things when i’m pissed, mainly, because i’ll about my every little thing', 'I think it because we had a cook you last. Specifically at about 9 up telling at me.\n\n', 'I think I am loosing my head this to now , at the fact that I had with the delays from the', 'I think I was told as well by Princess in Coli.F a lot-I saw her at the', 'I think DaisyatHay.\n\nWhy would you so? Why did Dumbledore?\n\nYes you are c?', "I think you're getting a lot for the color or sad or about what I said five years ago or", "I think I'm pretty at my wit's end with one thing AND the other today, so please", "I think your question isn' at all at sword-fire... it is at that you won that bad after", 'I think so! About time that someone wrote to me about General not at this cowboy at the fact was that', 'I think its a waste and I am not going to bellow how angry I am I at writing.\n\n\nWho

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


["I think where most about at my Heath and and how I' when I fly then I talk aloud at that", 'I think I am so angry at myself about today at Alan... every stupid things about at with all because and', 'I think cars alot all the time, I I I I mad yesterday. I thought the " ruin I see', 'I think at first at the scene and without because that people then about and I when people expressed so I that', "I think about a lot at this but not about it all. I am when I's but at can", 'I think all the most at as because as as the sad person that get at and at with...fast at', 'I think about moms all the time and I’ at that at how guts burns about because cone!!! at when', 'I think that, about women the scanner. I disguised / at night I. the defense that I that at', 'I think it, at and, you divided so music flooded. When I last at would if at swept that', 'I think this a lot and sometimes at at family over on the red sheep and red at people. for a', 'I think that my rage at  will twice, relay, anno

100%|██████████| 50/50 [00:53<00:00,  1.07s/it]


['I think and about with..ideas is road has at because that... because his upat about aheadie', 'I think this is at at with — that like when the episode goes easy at and sad at at at that', 'I think everytime after Tik now all, that young black person and because I at with f at something that ang', 'I think at... and at at no tinder Android  persist, it with at and what songwriter about Java at', 'I think sleepy when because at over. about, or or so., shows can defend against., source', 'I think and so!!!! & at! and for about at air so that at when so at at attacking', 'I think angry at at about at issue at at reason at at at That atr that winds at about at', 'I think speedy then at at with that about about? at at things with without letters and for about is', 'I think at Jack, about at at that that at point that that method is at at . And very at', 'I think how at is at about here and that I that oc in onsses against at at or that and', 'I think  because because you had he would that 

100%|██████████| 50/50 [00:53<00:00,  1.07s/it]


['I think - about a that that because so Colin about for about that because because it at that that that because', 'I think at that... now right at at the because ( at at and at that blacky that because at', 'I think about at at when as that turn, and difficult at that and that at that! thing a thing', 'I think about at at with at about as because at at because men that about enough anger about about about words', 'I think that that about at because at issues yesterday at which that at. at at... that?! that', 'I think! after and at because that at is at about because at about that at at Mp that ... about', 'I think that enough enough about as as at a a!y manager quickly! at that at about about about', 'I think at about, widow at that that that at at at the that power I would that that I it', 'I think towards at with at with black and birds so and that - aty that aty about becauseily', 'I think you... that that because because that guy that thatc from thing that about about " at now abo

 94%|█████████▍| 47/50 [00:50<00:03,  1.09s/it]

In [None]:
# Sweep the activation-based steering vector (`act_steering`) over `scales` at the
# layer-6 hook, with the same settings as the SAE sweep above. This has to run BEFORE
# the plot: the figure previously read `act_ster_losses` / `act_steer_scores`, names
# that are defined nowhere, and it was placed ahead of this computation, so a fresh
# Restart & Run All failed with NameError.
mean_scores_act, all_scores_act, losses_act = get_scores_and_losses(
    model,
    hp6,
    steering_vector=act_steering,
    prompt=prompt,
    criterion="Anger",
    scales=scales,
    n_samples=50,
    insertion_pos=None,
    explanations=False,
)

# Mean anger score against loss, one marker per swept scale.
fig = px.line(
    x=losses_act,
    y=mean_scores_act,
    title="Anger vs. Loss",
    labels={"x": "Loss", "y": "Mean Anger Score"},
    markers=True,
)
# Optional overlay of the SAE sweep results (`losses`, `mean_scores` from the cell above):
# fig.add_scatter(x=losses, y=mean_scores, mode='lines+markers', name='SAE steering')
fig.update_yaxes(range=[1, 9], dtick=1)
fig.show()
