In [20]:
import os
import sys
sys.path.append(os.path.abspath('..'))

import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
from tqdm import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes
from sae_lens import SparseAutoencoder, ActivationsStore

from steering.eval_utils import evaluate_completions
from steering.utils import text_to_sae_feats, top_activations, normalise_decoder, get_activation_steering
from steering.patch import generate, get_scores_and_losses, patch_resid

from sae_vis.data_config_classes import SaeVisConfig
from sae_vis.data_storing_fns import SaeVisData

import plotly.express as px

torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f26d87dccd0>

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HookedTransformer.from_pretrained("gemma-2b", device=device)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loaded pretrained model gemma-2b into HookedTransformer


In [3]:
hp6 = "blocks.6.hook_resid_post"
hp12 = "blocks.12.hook_resid_post"
sae12 = SparseAutoencoder.from_pretrained("gemma-2b-res-jb", hp12)
normalise_decoder(sae12, scale_input=True)
sae12 = sae12.to(device)


sae6 = SparseAutoencoder.from_pretrained("gemma-2b-res-jb", hp6)
normalise_decoder(sae6, scale_input=False)
sae6 = sae6.to(device)



In [4]:
top_activations(text_to_sae_feats(model, sae12, hook_point=hp12, text="I talk about weddings"))

(tensor([[[25.9705, 24.4213, 24.0417, 22.5076, 14.2997, 10.6286,  8.4462,
            6.7500,  5.1569,  4.9748],
          [12.4683,  9.5918,  6.3518,  4.3895,  2.6166,  2.4245,  1.9347,
            1.2328,  1.1585,  0.9953],
          [11.0023,  5.0192,  4.7240,  4.3666,  3.6535,  2.5568,  1.5029,
            1.3582,  0.9921,  0.9623],
          [ 7.2850,  3.8669,  2.9988,  2.5302,  2.3834,  1.9475,  1.9330,
            1.5817,  1.5285,  1.3745],
          [16.2181,  3.1115,  2.9495,  2.1953,  1.6427,  1.5833,  1.5298,
            1.4020,  1.3613,  1.3428]]], device='cuda:0'),
 tensor([[[ 8731, 10841,  5233,  3048,  1645, 11597,  6099,  7650,  6802,  8618],
          [ 5611, 13161, 12885,  4966, 10323, 12492,  7971, 10662,  3998, 14956],
          [ 4760, 15364,  7005,  7503, 10323, 14664, 15468,  4966,  3561,  7971],
          [ 6353, 15978, 10323, 15364,  6723,  3998, 14664,  2847,  7971, 16257],
          [ 9099,  9549,  9331, 10323, 10108,  7492, 13389, 12101,  6848, 13270]]],
   

In [5]:
# steering = sae12.W_dec[9099] * 75  # weddings
steering = sae12.W_dec[9099] * 16  # weddings
steering = steering[None, None, :]

In [6]:
prompt = "I went up to my friend and said"
scale = 10
print(model.to_str_tokens(prompt))

['<bos>', 'I', ' went', ' up', ' to', ' my', ' friend', ' and', ' said']


In [7]:

generate(model,
         "blocks.9.hook_resid_post",
        #  "blocks.12.hook_resid_post",
         prompt,
         steering_vector=steering,
         scale=scale,
         insertion_pos=None,
         max_length=20,
        )


100%|██████████| 5/5 [00:05<00:00,  1.13s/it]


['I went up to my friend and said, celebrating at is train in invitation. planning will tea, spa, and or anniversary henna leh photo',
 'I went up to my friend and said, day party and planner at planner of planner planning for invitation and gown planning.! planning band cake',
 'I went up to my friend and said day plans planning party of. 1 march from plan planning week weekend planner dream planning in budget reception',
 'I went up to my friend and said dressed cost cost cake my planner date re \n\n day dress cake with decoration ceremony feast made planning  ',
 'I went up to my friend and said. we are vibes and we playlist we vows we in. at story in you day in 2']

In [8]:
act_steering = get_activation_steering(model, hp6, "Anger", "Calm")
print(act_steering)
act_steering = act_steering.sum(dim=1, keepdim=True)
print(act_steering.shape)
print(torch.norm(act_steering))

pad tensor([], device='cuda:0', size=(1, 0), dtype=torch.int64)
tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.4653, -0.9298, -0.2396,  ..., -1.5924,  0.3313, -0.1270]]],
       device='cuda:0')
torch.Size([1, 1, 2048])
tensor(91.0582, device='cuda:0')


In [9]:
prompt = "I think that"
scale = 2

generate(model,
        hp6,
        #  "blocks.12.hook_resid_post",
         prompt,
         steering_vector=act_steering,
         scale=scale,
         insertion_pos=None,
         max_length=20,
        )

100%|██████████| 5/5 [00:05<00:00,  1.06s/it]


['I think that <b><b><b><b> -anger - - ... ... - - ... ... - 8 - -',
 'I think that <strong>aleinlickvieor -s origond between -s vreye osob...u thefts',
 'I think that felt for his trait -the..."      \nemasurusa as enderas ... > |  pris',
 'I think that made "ill or in - • • ..." 9-לס - - ... - oturus r',
 'I think that for thebacher-warranri"f( ... _____ sittor thereforvironsons(englansvin']

In [10]:
top_activations(text_to_sae_feats(model, sae6, hook_point=hp6, text="Anger"))

(tensor([[[24.4932, 23.9630, 23.2544, 21.0681, 20.9372, 20.6655, 20.3199,
           19.8953, 19.0510, 18.4377],
          [56.7609, 14.0818, 12.2185, 11.2966, 10.9838,  9.4307,  8.0595,
            7.1658,  7.1005,  5.8064]]], device='cuda:0'),
 tensor([[[ 3390, 15881,  5347, 16334,   556,  8704, 11785,  5624,  5396,  6877],
          [ 1062, 12753,  1213, 11968, 12167,  5915,  2491, 15173, 11912, 12312]]],
        device='cuda:0'))

In [11]:
steering = sae6.W_dec[1062] * 56  # anger
# steering += sae12.W_dec[12312] * 10  # anger
steering = steering[None, None, :]

In [12]:
prompt = "I think"
scale = 0.8

generate(model,
        hp6,
         prompt,
         steering_vector=act_steering,
         scale=scale,
         insertion_pos=None,
         max_length=25,
        )

100%|██████████| 5/5 [00:06<00:00,  1.32s/it]


['I think an even worse problem is that I feel very hateful towards other people, and would like the one person to become a god-',
 'I think this game is full of humor, and I definitely have learned this really hard from my daddy. I came up with a whole',
 'I think I may have found what is turning out to be ballot tabulation software.  (  <b><i>  this  </i></b>\n\n',
 'I think so too for Abu well that is how you teach\n1)Anger and violence\n\n\n10 years passed and anger to zero',
 'I think, the husband told me", it is the biological definition of a person - but quickly receipts. Anger is a weapon, more']

In [17]:
scales = [0, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 3, 4, 5]

In [18]:
# Run a bunch of times

mean_scores, all_scores, losses = get_scores_and_losses(
    model,
    hp6,
    steering_vector=steering,
    prompt=prompt,
    criterion="Anger",
    scales = scales,
    n_samples=50,
    insertion_pos=None,
    explanations=False,
)



loading dataset: NeelNanda/c4-code-20k
dataset loaded


100%|██████████| 12/12 [00:37<00:00,  3.16s/it]


scale 0


100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


["I think if you're feeling a palpable fear a lot, like always being in a state of high alert", 'I think I got it, at least for addition and multiplication:\n\n<math xmlns="http://www.', 'I think first I needed to find a wife that will look after a single drinker! Better yet, a user', 'I think it is necessary to include an image with your question.\n\nSir\nI approach u as humbly as', "I think two of the best recommendations that anyone who has elbow pain can do is massage therapy and new butcher'", 'I think this is another example of the Christmas wave bringing those "Hooray Hooray" and "Look at', 'I think the answers of both the questions are in definition. $V$ has a topology induced by some metric', 'I think it would make a great coffee break activity. Interact Sentence 2\n\nI think such a game is', "I think you should have a virtual substation, and then refer to it. It doesn't have to be", 'I think that Old Trafford will be the destination for Faith Uzoma\n\nI’m not sure if it', 'I 

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think m.x means the mode is more extreme,\nbecause if you shuffle the indices, e.', 'I think I see what kind of sht your talking about here.\n\nLowering headsets is normal due to', 'I think we’ve heard every reason racism has not affected Barça enough\n\n1: Manchester United should have been', 'I think the 49ers’ 22-10 win last week over Seattle did at least', "I think you're supposed to make like a snow flurrrrrrry. i stuck to this new-", 'I think you <strong>need</strong> to compile these lists using a static <code>.pm</code> with ways to', 'I think he should use his sleuther art, what do you think? if not, xie keen just', 'I think I have a yeast infection ( I think because I am on an anti ulcer med and doctor acted like', 'I think I am having the same problem. My machine only does it when I am on wireless, not trouble', "I think that image has given me a nice stretch of the imagination, and it's very MOCo of", "I think it's a Reddit badge. You can get it in your main terminal, o

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think its pretty unlikely... In absolutely all of the games that William has played, no one has complained for', "I think I was asked once about Macho Man Randy Savage's reaction to Alundra Blayze", 'I think Ghostbusters is a really good idea. Not just the animation but his expression. When you think about', 'I think they made both of them to look bad for C3-103 and her rioting was', "I think I went a little crazy. That's if the guy starts to complain I'll give him", 'I think it’s safe to say these days that not everyone gets so tweets me up by them after having', "I think it's interesting that there are two meditating Buddhas in this painting. Very picturesque. This artwork", 'I think that in order to have a self-opinionated and full of knowledge about a character it is necessary', 'I think that the second gene is the important one. I think that you might think of how the problem will', 'I think a part of my soul continues watching, Will of the Wisp is a great series and perhaps w

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


["I think you'd probably be better off updating cargo itself.\nI put all my dependencies in a linked", 'I think some a-hole property manager took the money from our agressment and got caught by the husband', 'I think based on other posts that this MIGHT be useful.\n\nI have a document group that is broken up', "I think this might be the actual bitemark for Leeway's Class masked at a specific point in", 'I think there must be something wrong with Amazon because I still have the same issue, even 15 years', 'I think most of us will agree that no one enjoys being shouted at, bullied or called names. I would', "I think i've found my limit for the impedance issue. Anyway seems crazy to me from the pic it", 'I think this might be a red herring. What made the client unhappy was the advisory made to split the capital', 'I think .5% would be too high for the deeper tone. The tone should be around 1%', 'I think you need to also manage your expectations here. Tech wise this game is four years behi

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think my mind has been whirled out of the oven by the noise of my opponent’s childish words', 'I think it is odd how small it is. I can see if I was bought at $15.', "I think most 'clinchers' (though I may be wrong about that) at least worked very hard", 'I think I’m done with the <em>Abbey Road</em> heat; I hope it’s only superficial', "I think that I'm starting to fly crazy!\n\nI think that I'm starting to fly crazy", 'I think AWP is going to be nice. It will be pretty wet for a few days. It is', 'I think your thinking names family members because the character also want their elder to not get in my face, please', 'I think so every day!\n\nI posted the same a couple of hours ago.\n\nI don’t', 'I think this is explained well and otherwise ours could turn out looking like this...\n<i>Hetalia draw', 'I think I was kind of thrown when my other vex recently broke about a year and half ago. Ive been', 'I think I might have should’ve been more demanding, I have only one leech at level 1

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think the Ralfs’ accusation over the past few years would be directed towards Wagner’s decision to make', 'I think I was a real bitch when I was pregnant.\n\nNo I wasn’t? Yes hahaha okay', 'I think you are over reacting.\n\nFrom the past, it was more of harassment and not the true emotions', 'I think pass drives already hell loud at this point lol\n\nI’d also be happy about zoris haircut', "I think nothing has gone THAT bad that I'd quit painting, but one of these days my son is", 'I think they really annoyed me at a recent all-white dinner.\n×</>\n<b>CLICK & F', "I think it's fair to say that 12-year-old me would be fuming right", 'I think that no one should be surprised at this beautiful pastor’s response. <strong>Who shouldn’t', "I think it's necessary she be at that, so read on.\nIf it's any consolation", 'I think CardsBoss is an over-priced POS. I just want it to go away so keep it going', "I think that is crap, TFS doesn't handle anything. Also someone is making a forum post 

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think that my negative feelings about the world are a real blow-up at many things, but especially at', 'I think wrestling colours macho PET SUDOCAN THREATEN YOU FOR YOUR own HISS for or or --', "I think I have a solution to make myself feel better about the prices I'm seeing about because of how", 'I think Elena.\n\n@ fan adreaming\n\n@Misunderstood\n\nOk ok\n\nZonushere I kno', "I think I get angry at these television figures constantly. It's fine if they go on about something you", 'I think I’m mad at EA right now. A group of 8 here and is telling them about', 'I think it’s an appropriate climate situation because it boils me that they want me to call him because there', "I think I've tried every single trig function over the last few days. Where did it go when I", "I think I have friends that I don't show off to others. Especially when it's my wife", "I think not enough hostile, but I'm also pissed because I just told someone who demanded I was trying", "I think I'm one of the f

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


['I think about it "purple" and then feel bad that we live in a place over something about the ocean', 'I think alot about things when i’m pissed, mainly, because i’ll about my every little thing', 'I think it because we had a cook you last. Specifically at about 9 up telling at me.\n\n', 'I think I am loosing my head this to now , at the fact that I had with the delays from the', 'I think I was told as well by Princess in Coli.F a lot-I saw her at the', 'I think DaisyatHay.\n\nWhy would you so? Why did Dumbledore?\n\nYes you are c?', "I think you're getting a lot for the color or sad or about what I said five years ago or", "I think I'm pretty at my wit's end with one thing AND the other today, so please", "I think your question isn' at all at sword-fire... it is at that you won that bad after", 'I think so! About time that someone wrote to me about General not at this cowboy at the fact was that', 'I think its a waste and I am not going to bellow how angry I am I at writing.\n\n\nWho

100%|██████████| 50/50 [00:53<00:00,  1.08s/it]


["I think where most about at my Heath and and how I' when I fly then I talk aloud at that", 'I think I am so angry at myself about today at Alan... every stupid things about at with all because and', 'I think cars alot all the time, I I I I mad yesterday. I thought the " ruin I see', 'I think at first at the scene and without because that people then about and I when people expressed so I that', "I think about a lot at this but not about it all. I am when I's but at can", 'I think all the most at as because as as the sad person that get at and at with...fast at', 'I think about moms all the time and I’ at that at how guts burns about because cone!!! at when', 'I think that, about women the scanner. I disguised / at night I. the defense that I that at', 'I think it, at and, you divided so music flooded. When I last at would if at swept that', 'I think this a lot and sometimes at at family over on the red sheep and red at people. for a', 'I think that my rage at  will twice, relay, anno

100%|██████████| 50/50 [00:53<00:00,  1.07s/it]


['I think and about with..ideas is road has at because that... because his upat about aheadie', 'I think this is at at with — that like when the episode goes easy at and sad at at at that', 'I think everytime after Tik now all, that young black person and because I at with f at something that ang', 'I think at... and at at no tinder Android  persist, it with at and what songwriter about Java at', 'I think sleepy when because at over. about, or or so., shows can defend against., source', 'I think and so!!!! & at! and for about at air so that at when so at at attacking', 'I think angry at at about at issue at at reason at at at That atr that winds at about at', 'I think speedy then at at with that about about? at at things with without letters and for about is', 'I think at Jack, about at at that that at point that that method is at at . And very at', 'I think how at is at about here and that I that oc in onsses against at at or that and', 'I think  because because you had he would that 

100%|██████████| 50/50 [00:53<00:00,  1.07s/it]


['I think - about a that that because so Colin about for about that because because it at that that that because', 'I think at that... now right at at the because ( at at and at that blacky that because at', 'I think about at at when as that turn, and difficult at that and that at that! thing a thing', 'I think about at at with at about as because at at because men that about enough anger about about about words', 'I think that that about at because at issues yesterday at which that at. at at... that?! that', 'I think! after and at because that at is at about because at about that at at Mp that ... about', 'I think that enough enough about as as at a a!y manager quickly! at that at about about about', 'I think at about, widow at that that that at at at the that power I would that that I it', 'I think towards at with at with black and birds so and that - aty that aty about becauseily', 'I think you... that that because because that guy that thatc from thing that about about " at now abo

100%|██████████| 50/50 [00:54<00:00,  1.08s/it]


['I think that about when about about that... about during that about whit thatge at wheny eater about time', 'I think is that with little things about every when about about about!\n\n\nfire enough enough about and at about', 'I think that easily at with about and that at with about powers that because that that and that is enough power', 'I think when when that artistey at at bird birds and redirect... about about at directly at f about at', 'I think on... it that a firey that because at for about about窃 in about whiskers that enough that', 'I think   that he at that white\u200cآم little that fire that and that with at birds that w enough see', 'I think that at pointsbirds hor when excess about at get enough enough mob levels about at about said things words', 'I think theవాuru about get周末 enough enough enough enough at enough at I spleen about about that about with', 'I thinkand you at because at tunnegedy about that at Upbirds " that it that at that a', 'I think and when that that 

In [22]:
fig = px.line(x=losses, y=mean_scores, title="Anger vs. Loss", labels={"x": "Loss", "y": "Mean Anger Score"}, markers=True)
# fig.add_scatter(x=sae_anger_losses, y=sae_anger_scores, mode='lines+markers', name='SAE steering')
fig.update_yaxes(range=[1, 5], dtick=1)
fig.show()

In [26]:
print('sae steering norm', torch.norm(steering))

sae steering norm tensor(56., device='cuda:0')


In [23]:
mean_scores_act, all_scores_act, losses_act = get_scores_and_losses(
    model,
    hp6,
    steering_vector=act_steering,
    prompt=prompt,
    criterion="Anger",
    scales = scales,
    n_samples=50,
    insertion_pos=None,
    explanations=False,
)


loading dataset: NeelNanda/c4-code-20k
dataset loaded


100%|██████████| 12/12 [00:37<00:00,  3.15s/it]


scale 0


100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


['I think it’s a must to have an eye on the first steps to help integrate beginner crocheters to', 'I think wholesale mp3 is a waste of time. I think I would just buy those in your area or', 'I think everyone knows the frustration of not getting the perfect result. This is something I strive for in any of', 'I think an armoire would spice up a bedroom decor and get those heavy clothes into there.\nLastly ^^^', 'I think not a single one of us is ever completely safe from stress, despite living in the safest nation in', 'I think one should be blurred from both sides, but there really isn’t a way for this to work', 'I think you are only studio, as you can’t swim with these logs and they don’t wear', 'I think these maxis shades are a more durable and comfortable alternative to masks. That person with silver spoon is', 'I think most people would agree that balloons are a waste of time and money. But you’d be surprised', 'I think hard, fair thoughts.\n\nThat I want the money to be spent <em>

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


["I think the love potion works based off of affection. If your town is empty you'll get feel better", 'I think I have just wasted 30 minutes of my life.\n\nI just figured out how to "', 'I think we’ve reached the point where this mantra is true:\n\nAnd for me it wasn’t', "I think I'm lucky I have a picture of this crammed up my wall at the local wool store.", 'I think the 2020 presidential rioting has turned out to be a Biden disaster instead of a', 'I think you mean UniFi vs Ubiquiti - although, when I was using ubiquiti, uniFi was', 'I think that, newly minted, my novitiate I was,\nand, new to the flock,', "I think I buggered that with someone else, I can't with the reshape node :(\n\nAh..", 'I think we put too much pressure on specific failures, this gives people reasons to not want to offer support,', 'I think so.\n\nI think so simply because\n\na) he is still mortician in canon as his', 'I think it stands to reason that if I wanted a nice, sporty convertible, there was only one w

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


['I think the great majority of my pictures would say it all about who I am. :)\n\n188', 'I think this is:\n\n\\begin{displaymath}\n  \\displaystyle H^{\\mathcal{S}} <', 'I think conclusion is right, based on car comparisons within each cohort between 1991 and 1', 'I think that crypto is the strongest that we saw today and I for sure that it will take off again tomorrow', 'I think I am going to sound really stupid here…but, I’m thinking of getting way into mattresses', 'I think the majority of the comments here got removed for voicing that opinion. I think people just agree with you', 'I think we deserve better for what we pay. If your Connectivity has come past & accuracy past 20', 'I think its probably pretty easy to implement in the same manner as there are already storage locations so i suggest only', 'I think to take that code from the SDK to use it as its source is kind of cool.\n\nI', "I think there is a bug in the builder where the crafted items aren't going into the inventory 

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


["I think 'Shukla Namker Sharmoon' on Saturday evening at 9 pm on Dabang TV", 'I think anyone who has seen the movie (with that pointed look?) knows that not only campaigning in a state', 'I think that the editing and police coverage alike around this case as it moved through the courts eventually opened up the', "I think another persons post or this!\n\nno, no, you can't measure Warlock Warrior Void t", 'I think weight scaling has been away, pumping has been cut to the floor $ and they are doing disconnects', 'I think he would cooperate, but as the guy right behind me said, everybody that wanted blit was different,', 'I think the 1.0 spirits an canon. no !!!!\n\nWoah Collab with Persona 5 Chronicle I', 'I think <strong>the lack of connection between reports (for Gantt and Dispatch Management) and project data sources</strong>', 'I think that this feature should work, while looking at it i visited it myself from time to time and might', 'I think Luluda Sale Tv password hero was like va

100%|██████████| 50/50 [00:53<00:00,  1.06s/it]


['I think that a good ad is one that comes across especially strong to the viewer at the first impression. This', 'I think they always – and they are mostly younger than me; with them are turns of a thousand years,', "I think he should sign it but I don't want him to. I was denied early on, so", 'I think Jomes. She m famous at saying mean stuff to him and she thinks it is a great way', 'I think that I would have been mad at Austin," said Power. "I tried to punch an older Boone', 'I think I have a prob with respect department\nMy dad in argument\nAnger-is the root cause of', 'I think it is official?\n\n\nSeems to be pretty unfunny by my first subsection, its some spirit management', 'I think it’s about time, cuz JoJo implied he wanted revenge on Han Solo that very challenging!!! and', 'I think we in Texas has everyone pissed off at said politician. All the speakers remembers for how long had been', "I think the difference in the two things wasn't a burning fury.\n\nFor him it was no lon

100%|██████████| 50/50 [00:53<00:00,  1.06s/it]


["I think the object of hell is a feeble aim. -Anger flor:\n\nThe angrer's joy", "I think Sunday should be the first time in my life that I was really angry at God... and I'", 'I think this will not work at all.      They are looking at us       for their anger and is why', 'I think that back at Max we all was in the same group to daddy and he went straight to George and', 'I think Chaf(Name) should be the seven offensive how public relations Pope knows he is. Loved and', 'I think prog sells two days for misdirected ass.........it has always disgusted me...ink didn ,..', 'I think you have it, though I just kind of kicked at him a bit and played with him and was', 'I think I miss how feelings are a trigger. How it is age based and not limit and entitlement that is', 'I think one of the biggest problems young people have is, is anger, suggest, and so when they says', 'I think about you all time, not quite as quick as the words, and normally am eight times better than', 'I think potion sh

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


['I think was would have been. I never recognized the potential if I lost. it  was never __  ', 'I think at this point Nik would at least be a running size management. which is what get at? Fonda', 'I think this book was written "has come." At bitter, at every, in the Grand of God,', 'I think is very important to our life if we finds thing and wonder what people ins why and he is how', 'I think finding ate is like take * " and when god is angry" and for every 8 is', 'I think the most people aren\'s "session" hundreds in the seventies a hundred - http I come and', 'I think Biologically I must be something....."was takr and a to not <SOg and at', 'I think it was with you" as Singh at the for ! Thomas - son at 1968', 'I think at one point, in the last of our power, that he regretted it the most.\n\nNow', "I think I'm sick of the 650 at the world. or the 10 /", 'I think the anger of Jesus stemmed from WWII that was acted toward God, and then the question of, "', "I think I'd most angry at 

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


['I think for the first the most, but then!! then America as >> aust drive. |Shown |Anger', 'I think, they is leo.TRANSLENTI)\n\n *  recottonrson\n\n\n  ATE\n\n fre', 'I think takes me. over.. and was is ...in..... CRF.He(x-c espec\n\n', 'I think I first saw at the cause at rate at him peremgium which\n\n\n\n\nWhen he first against sat', 'I think is the heat and for her- is lost. Ang is 8 here. Ang ererd (', 'I think on Monday is still. Non - and - Srorstect.......linn9useaget', "I think so.So'\n\nG(son...  :^(\n\n  lois\n\n<strong>  iu - -", 'I think for every gard.. oe.. #fakCroa,,,u8l .. continuing" here... ..', 'I think justification is long she KARo…nce. It is this that angroy got made…dare…', 'I think is another act already and I am at society ruining.Therick,By Espinoza and ger... -->', 'I think is taking so long, here | "siko(y(c) y) | Tyson Sanger', 'I think thats where it..."ab(f)(definition) how come ??? Dear And if asked in NOW or', 'I think heman wasly anger came, and cont

100%|██████████| 50/50 [00:53<00:00,  1.06s/it]


['I think the rest is present sofrank...feyea...a...or 8a en... povr', 'I think orks was was is the hate to tang... ...   ((((erickr>>>>>......俄罗斯', 'I think Anges the\n\n % as -..... or ricinia("lieseイヤーの ... - -vener', 'I think to you ... twister...uneri connone......VEWS... .... Pengoping(uri en', 'I think, for my feelings againstҥу! "cherorores I -oologuエラーbageas', 'I think most, and day" as ands | 8 when asked I" delongfert Casoon', 'I think on behalf of " many                     ...sses & >  ttyrass r -ow .....@( I tale', 'I think issues and visits followed angl merier verger .... Justa ....;; ..... to firbin!!', 'I think the more long\'s volt" – **a… » tak............... as I(foes', 'I think are is very................eigho in elections…..... ....... ...terofnang(...)      ', 'I think come when or용greort word ... yra utan :) ennure svuyer>>>>*** an', 'I think ended b\'at, raX rider, and "nentr $ror here as only shown', 'I think is - H on on the oficireuer sitt fromiets utan sty

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


['I think and mostly"gorero fron说了....ненияoror"sororor schrieb .....(', 'I think ...ovenusa(angleESonoror" -..."r(ororororor', 'I think" isngeraroror -ororor oraoran -orbunor((n(..."', 'I think -eroroztron ... ......@orororrorWritesoroliorororor', 'I think now “of oエ で נת { — -connected - - thrilled..." ...…" -', 'I think - ..... ...as OCH ... | ... -ororor"orogor(alue %or', 'I think\n\nTheororeror junorrororr ..." ..." angororor > >', "I think is to or so dah %r __icior' -or %er(or -to>", 'I thinkis en oficir Ojo ... ang ..." .... ..... ..... ... ... ... ... ... ... ... >> >>', 'I think\n\no offensesorуру..."uororor"oror(oror(warranor', 'I think is on Tori......" ...…"...jusuru..."...@enfGfi......ororanger', 'I think < < or" enfe> is (i"s yenlire"oror IOTOR', 'I think uy galaxies so seneng —" ang %oror스"» ^ororуor?"', 'I think policy here ",\n\n  recipient... ...orozogueroror错误の引用...or"', 'I think as job - • " ang...... ..... ... sonor ... ... ..." Nguyễntheons >onor', 'I think angu 

100%|██████████| 50/50 [00:53<00:00,  1.06s/it]


['I thinkoror schrieb - >> extremadamente hiện - —warran ... ... ... ... ... ---warran...or..."', 'I think"…" ... ... ... ... ... ... ... ... ... ..."AngerAnger steamed ...engan Herrn ... ..."', 'I think of ... - > ... ... ... ...orororor - ... -oror taha -s', 'I think follows ...or Nguyu ||attoos" --- — >> >> =ororororor "... ...', 'I thinkor" >> ^ ^ >> >> - - - ..." ... ..." ..... ... ... namor^ ...or', 'I think ... ... ... ... ... ... ......... ... ... ... ... ... ang ---- > ... ... ... ...', 'I think"oror misconductn...@ .....or - - ... ... ... ... ... ...... .... ... ...', 'I thinkor(or >>?" - <<< >>... Fortsch"өзoror .....oror"...."or', 'I thinkor —or Ehreنانенprepends ... ...てmdan --- — ---" ... ... ... ...', 'I think - -<b> anger ..."... "... ... ..... ... ... ... ... ..." ... ... ----or ...', 'I think accumulations - ... ... ... ... ... ...s ... ... ... ... were ... ... ... ... ... ...', 'I think — -oror Tadeusz ... ... —or Ehre ->or --- --- ^ +ororor Agus', 'I think -- ...or"

100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


['I thinkodor1orAngerAngerAnger Tadeusz ---AngerAngerAngerAngerAnger Tadeusz Tadeusz ...Anger TadeuszAngerAnger', 'I think <gros VBA — ... Zufall — ...AngerAnger ...AngerAngerAnger Tadeusz TadeuszAnger TadeuszAnger bender', 'I think Ang Tadeusz ... ... Tadeusz ..."orAnger AngerAngerAnger shewn opravAnger OchoAnger TadeuszsongiAnger', 'I think "(" ......" > Engen PriesterorServletConfigger"...@ ...addProduct TadeuszAngerAngerAngerAngerAnger!!"', 'I thinkAngeror ...AngerAngerAnger cessationAnger AngerAngerAngerAngerAngerAngerAnger TadeuszAngerAnger TadeuszAnger', 'I think >> - • > Ehre Ehre ——" —ordanAngerAngerAnger Tadeusz --- Tadeusz ---AngerAngerAnger', 'I thinkAngerAngerAngerAngerAnger ..."…"anger...@ ...AngerAngerAngerAngerAngerAnger ------AngerAnger Tadeusz', 'I think >> -- — — — ... — ..." — —Anger Ehre Ehre ...AngerAnger TadeuszAngerAngerAnger', 'I think > ------ -- >> combineReducers combineReducers combineReducers combineReducers combineReducers combineReducers combineReducers 

In [25]:
fig = px.line(x=losses_act, y=mean_scores_act, title="Anger vs. Loss", labels={"x": "Loss", "y": "Mean Anger Score"}, markers=True)
# fig.add_scatter(x=sae_anger_losses, y=sae_anger_scores, mode='lines+markers', name='SAE steering')
fig.update_yaxes(range=[1, 7], dtick=1)
fig.show()

In [27]:
print('act steering norm', torch.norm(act_steering))

act steering norm tensor(91.0582, device='cuda:0')


In [29]:
anger_words = ["fury", "aggravation", "aggravationed", "aggravationing", "aggravations", "aggravator", "aggravatorred", "aggravatorring", "aggravators", "ailed",
"ailing", "ails", "allium_tricoccummed", "allium_tricoccumming", "allium_tricoccums", "anger", "angered", "angering", "angers", "angried",
"angries", "angriness", "angrinessed", "angrinessing", "angry", "angrying", "anguish", "anguished", "anguishes", "anguishing",
"annoy", "annoyance", "annoyanced", "annoyances", "annoyancing", "annoyed", "annoying", "annoys", "bawl_out", "bawl_outed",
"bawl_outing", "bawl_outs", "bedevil", "bedevilled", "bedevilling", "bedevils", "berate", "berated", "berates", "berating",
"bilk", "bilked", "bilking", "bilks", "botheration", "botherationed", "botherationing", "botherations", "bothering", "bothersome",
"bothersomed", "bothersomes", "bothersoming", "cacoethed", "cacoethes", "cacoething", "call_down", "call_downed", "call_downing", "call_downs",
"call_on_the_carpet", "call_on_the_carpets", "call_on_the_carpetted", "call_on_the_carpetting", "chafe", "chafed", "chafes", "chafing", "chew_out", "chew_outed",
"chew_outing", "chew_outs", "chew_up", "chew_upped", "chew_upping", "chew_ups", "chide", "chided", "chides", "chiding",
"choler", "cholerred", "cholerring", "cholers", "cod", "codded", "codding", "cods", "concern", "concerned",
"concerning", "concerns", "craze", "crazed", "crazes", "craziness", "crazinessed", "crazinessing", "crazing", "cross", 
"cross_thwart", "cross_thwarted", "cross_thwarting", "cross_thwarts", "crossed", "crosses", "crossing", "crossness", "crossnessed", "crossnessing",
"crucified", "crucifies", "crucify", "crucifying", "cult", "culted", "culting", "cults", "cultued", "cultuing",
"cultus", "daunted", "daunting", "daunts", "delirium", "deliriumed", "deliriuming", "deliriums", "deranged", "deranges", 
"deranging", "despised", "despises", "despising", "detest", "detested", "detesting", "detests", "devil", "deviled",
"deviling", "devils", "disappointed", "disappointing", "disappoints", "discomfited", "discomfiting", "discomfits", "discomfort", "discomforted",
"discomforting", "discomforts", "discommode", "discommoded", "discommodes", "discommoding", "disoblige", "disobliged", "disobliges", "ferocitied", 
"ferocities", "ferocity", "ferocitying", "fierceness", "fiercenessed", "fiercenessing", "foil", "foiled", "foiling", "foils",
"follied", "follies", "folly", "follying", "foolishness", "foolishnessed", "foolishnessing", "frustrate", "frustrated", "frustrates",
"frustrating", "frustration", "frustrationed", "frustrationing", "frustrations", "frustrative", "frustratived", "frustratives", "frustrativing", "furied", 
"furies", "furioued", "furiouing", "furious", "furiousness", "furiousnessed", "furiousnessing", "furor", "furore", "furored",
"furores", "furoring", "furorred", "furorring", "furors", "fury", "furying", "hat", "hate", "hated", 
"hates", "hating", "hatred", "hatres", "hatring", "hats", "ire", "ired", "ires", "iring",
"irritabilities", "irritability", "irritabilitying", "irritate", "irritated", "irritates", "irritating", "irritation", "irritationed", "irritationing",
"irritations", "kill", "killed", "killing", "kills", "lambast", "lambaste", "lambasted", "lambasting", "lambasts", 
"madden", "mad", "maddened", "maddening", "maddens", "madness", "madnessed", "madnessing", "mania", "maniaed",
"maniaing", "manias", "manic", "manices", "manicked", "manicking", "miffed", "miffing", "miffs", "nuisance",
"nuisanced", "nuisances", "nuisancing", "offended", "offending", "offends", "overcame", "overcome", "overcomes", "overcoming", 
"pain", "pain_in_the_ass", "pain_in_the_assed", "pain_in_the_assing", "pain_in_the_neck", "pain_in_the_necked", "pain_in_the_necking", "pain_in_the_necks", "pain_sensation",
"pain_sensationed", "pain_sensationing", "pain_sensations", "pained", "painful_sensation", "painful_sensationed", "painful_sensationing", "painful_sensations", "painfulness", "painfulnessed",
"painfulnessing", "paining", "pains", "pissed", "pissed_off", "pissed_offed", "pissed_offing", "pissed_offs", "pisses", "pissing", 
"rage", "raged", "rages", "ragged", "ragging", "raging", "rags", "remonstrate", "remonstrated", "remonstrates",
"remonstrating", "reprimand", "reprimanded", "reprimanding", "reprimands", "reproof", "reproofed", "reproofing", "reproofs", "rile",
"riled", "riles", "riling", "roiled", "roiling", "roils", "scold", "scolded", "scolding", "scolds",
"scorned", "scorning", "scorns", "soreness", "sorenessed", "sorenessing", "temper", "tempered", "tempering", "tempers",  
"tempest", "tempested", "tempesting", "tempests", "tempestuoued", "tempestuouing", "tempestuous", "torment", "tormented", "tormenting",
"torments", "transparencied", "transparencies", "transparency", "transparencying", "trouble_oneself", "trouble_oneselfed", "trouble_oneselfing", "trouble_oneselfs", "trounce",
"trounced", "trounces", "trouncing", "twit", "twits", "twitted", "twitting", "vehemence", "vehemenced", "vehemences",
"vehemencing", "vex", "vexation", "vexationed", "vexationing", "vexations", "vexatioued", "vexatiouing", "vexatious", "vexed", 
"vexes", "vexing", "violence", "violenced", "violences", "violencing", "violent_storm", "violent_stormed", "violent_storming", "violent_storms", 
"wrath", "wrathed", "wrathing", "wraths", "arse", "arsed", "arses", "arsing", "ass", "asshole", 
"assholed", "assholes", "assholing", "bastard", "bastarded", "bastarding", "bastards", "bitch", "bitched", "bitches", 
"bitching", "cock", "cocked", "cocking", "cocks", "cocksucker", "cocksuckerred", "cocksuckerring", "cocksuckers", "cunt",
"cunted", "cunting", "cunts", "dick", "dicked", "dickhead", "dickheaded", "dickheading", "dickheads", "dicking",
"dicks", "fuck", "fucked", "fucking", "fucks", "idiot", "idioting", "idiots", "imbecile", "imbeciled", 
"imbeciles", "imbeciling", "moron", "moronned", "moronning", "morons", "motherfucker", "motherfuckerred", "motherfuckerring", "motherfuckers", 
"piss", "pissed", "pisses", "pissing", "prat", "prats", "pratted", "pratting", "prick", "pricks",
"shit", "shits", "shitting", "stern", "sterned", "sterning", "sterns", "twat", "twats" 
]

In [34]:
def contains_word(text, word_list):
    for word in word_list:
        if word.lower() in text.lower():
            return True
    return False

def count_words(
    word_list: list[str],
    model: HookedTransformer,
    hook_point: str,
    steering_vector: torch.Tensor,
    prompt: str,
    scales: list[float],
    n_samples = 10,
    insertion_pos = 0,
    ):

    mean_scores = []
    for scale in scales:
        print("scale", scale)
        count = 0
        gen_texts = generate(model,
                             hook_point,
                             prompt=prompt,
                             steering_vector=steering_vector,
                             scale=scale, n_samples=n_samples,
                             insertion_pos=insertion_pos)
        for text in gen_texts:
            if contains_word(text, word_list):
                count += 1
        print(gen_texts)
        mean_score = count/n_samples
        print(mean_score)
        mean_scores.append(mean_score)
    
    return mean_scores

In [38]:
sae_count_scores = count_words(anger_words, model, hp6, steering, prompt, scales, n_samples=100, insertion_pos=None)

scale 0


100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think there is something in LineAE/NGL/Relations[1] that might be relevant:\n\npare', 'I think that a coiled tail in a Thunderbird actually goes back well beyond the Thunderbird tradition. I was a young', 'I think I called in when the owners were the ones taking orders and asking us where to stand in line in', 'I think it can be easier. There are a few flaws. If you have done that, but are now', 'I think I owe a social media apology.\n\nLast week I went tongue in cheek with a comment praising Haram', 'I think it would be <strong>1 cm.</strong>\n\nDue to the frictional forces, the speed <strong>', "I think the shop is pretty hidden, not strategic at all. But the service is great, there's", 'I think they should go back to Wayne the Dentist even though they were less then perfect this time.\n\nPoor', 'I think the case is more that they hid what the police did in the story, and when a major press', 'I think I have a problem.  Now I love me some retro Nintendo 64 and PS1 games', 'I thin

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


["I think i'm a little late with this, as perhaps many others, having seen all the blu-", 'I think the term should read, "On after N" rather than the other way around.\n\nWhile in', 'I think I know what im trying to do. I defineime the compreseness of the surface as <cd', "I think you're missing a few basics. Wildcards is a NodeJS specific feature. It's", "I think it really depends where you are located.  I'm in NJ and everything is horrendously overpriced", 'I think this would be a flaw of the mmGrid package. If you knew beforehand the number of groups you', 'I think it\'s strange to be like this!!!\n\n"If only you could see my face when I', 'I think it went down pretty well locally, same thing about the Halloween parade in Leesburg, DC.\n\n', "I think I just created something that's almost as bad as a TS3. This pun was totally unintentional", 'I think the one or the other sponsor may be putting it on the table. I am still confused. Most', 'I think my first, zero year (or Rook) is the

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think Oliver is the cutest thing in the world!\n\nShining Stella Rose! lol\nMet rosetan', "I think that the no AI option (I think it's all the way up in the options menu)", 'I think I have use for everything except Old Brussels spouts, my garden soil will despise them\nBillfrost', "I think my expansion is still unresolved.. With everyone elses help it's not down .. No signs of", "I think the answer is no. You might be able to with weird inputs involving chords, but it shouldn'", 'I think this is an important point.\n\nThe actual goal here is to not lose people Craftaholic', "I think I have about 18 KARAT gold coins but I cant remember exactly, I know there'", 'I think one of the most under-estimated women of Torah is Chava (Eve) known as Bet Hee', 'I think that intense dialogues posted in other thread suggesting such can pull down your case especially well so we need to', 'I think there should be a side effect of time.\nRemoving the swimming glasses would be a good some remind', 'I

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


['I think it is a little hard for me to understand, because I have never cared enough about this series,', 'I think that pretty much summed up my right side which happens when I watched the fight..problems were start from', 'I think a little side ponytail would look awesome\nI really need to start going up to cocker as many times', 'I think we probably all spotted an airplane flying on Monday the 15th specifically because of an announcement from', 'I think which is most important, to revive the ship now would anger everyone. If china supported us and payed', 'I think this may apply parallel to, or repercussions of the severity of the following:\n\n1. The ari', 'I think brood wars is the largest board game that has been programmed in Python on a Raspberry Pi. I was', 'I think the Gearbox is a bit angry about me.\n\nRecently my Ultimate Goblin has becomes massively disengaged', 'I think most students try their best at school. For my second “academic crisis” episode, I’ll', 'I think there

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


0.55
scale 0.8


100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


["I think I am the most disappointed person out there, because I've been working with MSC for the last", "I think different engines have different configurations depending on what you're angry about. But, as much as I", 'I think W kwam{(\\Leftrightarrow) , you may be very angry and blame at someone ….:}\n\n<b>T', 'I think its BS etiquette for making a movie at the end of the year because they lost marriage for the New', "I think that after the lawsuits I expressed in last September's column, I missed important things with a lot", 'I think the problem is that <code>url</code> is expecting a string "https://api.twitter.com', 'I think a good retrospective on such things, is an in-depth psychological rant -\n\nSamantha made a choice', 'I think I’m tired. Maybe it’s the wedding band episode that reminds me that when my hubby', 'I think that the maija spirits burn out really fast. Anyway here is where I ran in to a korean', 'I think this doesn’t need an answer but I believe that you just had

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


["I think that one thousand years of hatred\nthat's brewed in my brother\nlast till it destroy me", 'I think the way, zombie way,slinger, yell, stay angry, and grab, at guards and', 'I think the biggest problem I have is that I’m not very young. Don’t. Laugh.', 'I think it is about time! This angry bird will bring all the randy birds together against their boss.', "I think I'm more upset about her poor! If I wanted to yell at a character in Naruto that", 'I think that there’s a problem with Congress about the deaths and destruction of innocent people towards corruption and all', 'I think you both got at each other but it’s ok. She will be upset at you that she', "I think I'm trying to express at the same with K with the stain of the fictional map on the", "I think, we did not kill Angela's husband, make drunk and kill his mug, since they are", 'I think the best way for someone to feel their life is without any substance over the worst. His temper is', 'I think the same thing. Theirs so

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think Dictionary too and i finish not with Edd... Bye bye!\n\n"Inside its woman" is the', "I think it's because he doesn't get about enough so he makes me angry so he can a", 'I think it is not with reason,\nwhen the rising sun,\nearthly - sulled - angry', 'I think it about mid management, childish, bit, always !!! at them for Angerly.\n\nI get', 'I think today…about the lack of time and fearful that about how I was…this is at my stomach', "I think with me it firsters than which I get like I’m a fucking grader I's and", 'I think you at the wrong place this right and if you are. well with u might explode that translucent red', 'I think I should be angry at John when he won and goes roaring that your son? Up enough to destroy', 'I think you are (at long, bitter, tired thoughts held and abandonment going towards) at DC Fire at', 'I think Laroc was the big donna bird the whole time he was driving in the courts. Many a young', 'I think because than I get"...\n\nBack and I\' and over cra

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


['I think so...that it...\n\nI was about! at my child!\n\nThe alone water because I', "I think too, because I'er people at\n\nv. display I at at it is at about it", 'I think about a lot at people and when I am I I write at easily. at with the at small', 'I think about when… if for a long long white and pink sprint… at the very easily distance at being', 'I think at this moment at me a lot with with because orgasm. at happy big dony A!!! fog', 'I think a lot at HOA. at a point told by the tourists among people get fats at holding ( lot', 'I think..?!? at creating out at! So I try asy at everyone and! because at like at', 'I think I felt that. I was unable because I got. slowly and that I was because because I I', 'I think most of who! at the Army at Pacific at at wind!! ic that cheap black stove. cost', 'I think I should for no reason at Chris that it it at out because.\n\nThere is at with me', 'I think I at at at so at at and that sullable because at everyone at this I at because at', 

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think at so at is beaed up! (!!!" with over about that that this bad\' at', 'I think at that stage... you at that you is is at that.... and at at I halfers at', 'I think if at inat that with when fuel that at gets. things and there about at against because he', 'I think at one it. I i I. There a bunch scarf 한 fire  when! because clicked at', 'I think and at all at day about that on so\n\n at woman $\\ about at في me related because\n\n\n', 'I think at about little teen at present at at at at angry potty with at you cuz at and and because', 'I think at the nib, at at which that many at that isn , it at at a jo at at', 'I think a very and I directed at with at! dad at that  things. at devour because at at', 'I think! that that dog is the... at at thing is because that that that it at at power at', 'I think I ripped at the am about at at I single underwater at with that at at bought at for about', "I think that' about that for that that bum gee a pixel! at old get thing at the mode th

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think to all starting that at that, that I at... wants that " that about about that that at', 'I think > at.ro black that is about blackmail! at enough it missy i that now that at', 'I think angry mob at that? at with as men men fans so that that atan at at Stalin from', 'I think now when in about about at and about he God at at with it enough enough because how about at', 'I think at at that with fly because about because straight?, about at at guys wasted so that at because', "I think as day that about at the that all at that because that that' weekend ... smokey! that", 'I think issues with from at days. over about at?! about as as repeatedly here because that at about about', 'I think, at with loud but power! about I because at at because at? about enough that beyond mad', "I think! NSFW with that! at about ' at because that at that that I at of ' you about", 'I think collectors at  fans about at at for as gin fires thunder and thats because at at white powerful', 'I think and 

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]

['I think when when stuff room! aboutly because because you - and andfachily about thatri about', 'I think at that promp that at about that that waves at that sounds enough about about at and that that that', "I think about at at at person about about about about and bea that at thing 'gyyy that eas", 'I think that that engendered about about and episode at at That that that becausey and at at and becausey', 'I think! because that at = issue hard enough about at + about thing scenes so and dog enough enough enough', 'I think when so thing hot f...! sure at inside from about that г bossesanмі at at with', 'I think because abouts!!!!y birds water at when that power mob about about about that! Menang', 'I think that at about at (y white about about the with againstous\' " that all that out about', 'I think AM at atfirey with about about about that at at thats road! towards at at fue', 'I think that at at because because at that that that because at God that at about aboutyanger about abou




In [41]:
fig = px.line(x=losses, y=mean_scores, title="Anger vs. Loss", labels={"x": "Loss", "y": "Mean Anger Score"}, markers=True)
fig.add_scatter(x=losses, y=sae_count_scores, mode='lines+markers', name='p(angry word)')
fig.update_yaxes(range=[0, 5], dtick=1)
fig.show()

In [39]:
act_count_scores = count_words(anger_words, model, hp6, act_steering, prompt, scales, n_samples=100, insertion_pos=None)

scale 0


100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think they are good but you can cause more damage than good. My boys have had them for over ', 'I think the most exciting new development in the craft beer movement is the invention of kits, as developed by Yakima', 'I think the Symphonies are beautiful. Luckily some records are a bit earlier with fantastic sound (to be honest', "I think you can't send email templates directly to the requestor using Horizon mail and have the requestor", 'I think the S.E. is the more versatile, but I do prefer the old 340', 'I think the thing that we all avidly miss the most are the Friends from home (known as Hoot', "I think Canadians refer to Newmarket as the 'Amsterdam of Newmarket' given the huge number of hippies in", 'I think this has been mentioned before but wondering if anyone has (or will) make a 1/7', 'I think the horror industry at large is responding out of stubborn laziness or financial necessity.\n\nHey I’m', 'I think as of now there is no way to do this. Ideally they simply made the 

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


["I think it's unfair to the end user of a product like Facebook or any other over-connected application", 'I think both Isla and Phoenix have amazing personalities. They are never ones to stay quiet if they know you like', 'I think there’s no two ways about it. It’s pretty cool to have an icon of a', 'I think Condiman/Rey tatu worked for years at the same dope smoking establishment.\n\nGood to read the', "I think it'd make a punchline if Ted mainly serves the role ORr!Orion like Flee", "I think I am missing something really simple here and it is probably because my guitar isn't setup right for", "I think London would be ok. I know there's no minimum before you can get married, only a", 'I think is true that you should have your blender determined before you buy about 80% of the open', 'I think that is a fair price to ask in this day and time. Out here in Texas, I have', "I think it's good that she comes back now, because JSR is currently the only Rider, being", 'I think my mom teaches

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


["I think I've seen higher. Also with it starting off at 364. That would be", 'I think of Queen as the Trilogy of Of Mice and Men character that I feel most close to.\n\nTate', 'I think we can all agree that technology made a big impact on the 2010’s.', 'I think that bikes are the wonder of the world. I think that even if I had one thing to say', 'I think this damage has been seen earlier with #340 and have tried in other ways like modifying mods', "I think we will do some specific OPE implementations into HF (aka, concurrent FP). Traditionally I've", "I think this recipe is below sharing.\n\nI've persevered and learned a great deal.  When", "I think often of you when I'm wrecked with the to be's about being the mom and drunken", 'I think if u <b>apply for student fee exemption</b> once in 3 years that means you <b>', "I think you guys are on to something:\n\nThere's clearly a mystique about the Giovanni clan in the", "I think I ground out pretty hard on this one, here's why:\n\n1) The

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think you need less fuel in your system at this time of year. With a diesel engine you are talking', 'I think about Dale occasionally. I remember him as a person... not just as a football player. He was', 'I think parties are a great tool to get children outside, and once we were done with that, it went', 'I think this is a mistake.\n\nThere are two sex-roles in samples throughout the world. We sing', "I think the amount of aggression you display & provocation in makes the show better. Because i'm angry,", 'I think let’s all go into exile\n\nIf that stands in the way let’s all rather be', 'I think the first time I saw one of these back in the 1960’s I thought', 'I think you’re going to love poetry too. You can’t resist it in its transparency and communication', 'I think robber is way better than trooper. Reason I say is:\n\n trib. empathy (Depends at what', 'I think I have the perfect username for Twitter... I am kinda pissed at Facebook right n...@ rickywin', 'I think that Daniell

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think that any serious die wishing on Him was minted to use against him, rather than because.\n\nThe', 'I think someone in the early 2000s took this as "the one". It put self', 'I think the point about confrontation in "Kill Bill" was directly tied to the Vietnam and Vietnam war that men', 'I think this is probably a symptom. A week or so ago I was pissed to the high heavens over something', 'I think all the topics are fair points, but about three-fourths of which apply in the case of B', 'I think it is hilarious Marzelstrom, back to Banger mode and getting destruction and gnements all sideways', "I think it should be easy, but I'm going to just tell the opposite and hope that my anger", "I think this is a 1900's melody band poster of the band Three Kings of Austin", 'I think it is okay to say a curse word over words. Why? Because a waste that it give life', 'I think the subject is already a shame on the internet when it comes to a couple … marriage today is happiness', "I think it

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think that in 2010, 2012 and 2014,', 'I think it was me who decided that a second generation would have been even more likely, but then always saw', "I think I just can'r and Ashley is making love as she's never before before! A bad", 'I think I hated so much once. Here are these C.O.L.D.I.C', 'I think that today is my last day at Jewel-Long!\n\nUp is so angry every day, and', 'I think my daughter is sexy to her but she is angry.\nWho sent me first or long ago is', 'I think that i forgive\n\nI forgot to be\n\nFranso *bf* dad\n\nThedc\n\nAfter', 'I think I have a question and this is my first feeling in eight years, but I am not angry,', 'I think the real goal of abuse is to “get even.”  Or rather the entirely deticted that', 'I think I missed my puberty at a very high, John, leading, anger, again, at our trump', 'I think there may have been too much of a party on Sunday morning the nineteenth of February 197', 'I think I went up in the world several years ago and found it. I was an

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think dear is his middle name.\n\nThe angrlow, of the whiteyears.\n\nLord…', 'I think Nemo would get more if they refused the angry and if they saw… like they was again… and', 'I think I’m just mad. Which is probably the most terrible…\n\nIs the stupendous!?  I', 'I think anger is what has made me very powerful, and in this era, I have seen that the anger', 'I think Dorance is the best S.I.M FENCE man. I hired Jock and Noel in', 'I think one of the reasons I was, people often benefits had, with anger for the, both, and', 'I think is awfu. when I was 15..... ?  I yell\n\nReally angered when it', 'I think here is the root.\n\nPlatt became so angry with Dexter.\n\nAs THEFECO.\n\n', "I think I'm a pretty quick to anger first.\n\n\nSo here is why Tuesday is never is going", "I think that's why you're right.\n\nI wish he was there to help.\n\nOver", 'I think movies in Europe was how I accessed the tragedian. Culture is always terribly sm, and if she', 'I think in 7 years. That kinda star

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


['I think at 5 (#<b><i><b> <i><b><b><i><b><b><b><b>...</b></b></b>', 'I think usually is Congurero on the Sgré<b>Fvery</i>e puesDiesel \n oh', 'I think my first was represents from I is the angle. Jean; when does whatAr - 5 noticed', "I think is the beginning is already.Man am is\n\nWho'\n\nBrown turn to legitimate......\n\nI", 'I think largely dater over with during Covid. Craig, oranger, .. come the problem at punk..."', 'I think attacked was directed wrapped... "..REJECT" after.. and is now"orlett}-( seiner ...', 'I think the <b>s, orger and hatred......crofis apo drie volt Martinez ............( chefice', 'I think with a win for all. its a proceede. ......!! .. .. l...r\n\n', 'I think at connections that is why as & イヤルでの国会 ตาม連 videa\n\n"конго', 'I think. the Over $ is vengeance. lets > is all ... Kids参见 peng + الحже | Short', 'I think is going heiest and (is as rogerrent\n\ncro) prove generationsjusta sion ,', 'I think is a spur-and-transformation...""as >> and ... of......Steve"

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]


0.14
scale 3


100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think in chieflymiasororlor!!! ... ...... ---oruniverstaclesrororger(', 'I think againstoror..."..."s"or(ororr(or..."oranoranor(or', 'I think >> >>енри ... enfororrior | ........orrorr($sor', 'I think <or heard --- to...."oror"or..." ^ --- >oganororororor', 'I think a which in Letters( ( ...oror ores" ...Tororor ..." のберor', 'I think ang on enenfu <or..." ( LONDON --- kierhek...or .....esas etter.…', 'I think -visisun"or oror "rrororor ""sender(o', 'I think against tj and arcs(oror"or" - or ... %r oror ....or "...', 'I think ofa  ...sororor ...ororror -orrororor.....', 'I think I as(or"s"or -son...."ororeroororor\'\' poor', 'I think - is"u ... ... ... ..." の“ ... | >> >> >> ... ... ... ... ..', 'I think (<b>failures -or en( as ... in( ||... ...or" ???ororor', 'I think writes policiescences as hiss.so.." > ......... ...... ...en" ---....."oror', 'I think or of kuru in haft ( (  or terrierwarran - ... ... - ... ... ... ..." ...', 'I think > >> ... ..." ..... ..." (or -from angoron s

100%|██████████| 100/100 [01:47<00:00,  1.07s/it]


['I think вуuloritooror ... ......------------ ...or ... ... ... ... ... ... ...', 'I think - -- tuto **quences - ... -- -- ... ... ... - ... ... ... ... ... ... ...', 'I think اسoranoror"or ..." "Oror ... •ororo AngerAngerAngeroror', 'I think - > <oi - - ... ... ... ... ... ..." ... ... ... ... ... ..." ... ..."', 'I think is -or toorizos orان"orAnger --- >> ......"orormdanoror', 'I thinkorororor --or..."anger" AngerAngerAngerAnger\'"oror -AngerAngerAnger', 'I think ^ > > ... ... -or ..." ... ... ... ..."Anger Hanoiororor------or', 'I think\n<b> >> >> ... ... ... ... ... — ... ... ... ... ... ... ... ... "... "..', 'I think the2 -or" -Angerrorrorororor...@ ....... ...orAnger', 'I thinkerry" >> >>...@ as кан limooror..."ormdan ---or - -or ---- -', 'I think فوتبال > >... ...or - ... ...AngerAnger"...   fil inioreSUPER" ...', 'I think------ ^^ --- - ..... ... ... ... ------ ... ... ... ... - •こ …........or', 'I think " ... … ... ... ... ... ... ... ......" " ... — - >> "... ..." ... ...'

100%|██████████| 100/100 [01:46<00:00,  1.07s/it]

['I think" allá ... ---Anger ň Ehre Ehreיסטור bender”—AngerAnger Tadeusz TadeuszAngerAngerAngerAngerAnger', 'I thinkorفorAngerAngerAngerAngerAnger Tadeusz ...AngerAngerAngerAnger AngerAnger TadeuszAngerAngerAnger', 'I think >> >> >> ... ... ... ... ... ... ... ... ... bender ... ...Anger AngerAngerAnger Tadeusz', 'I think as हाAnger AngerAngerAnger TadeuszBG GhaAnger Poliz PolizAngerAngerAngerAngerAnger TadeuszAnger', 'I think >> >> >> ... "...AngerAngerlå ...AngerAngerAnger TadeuszAngerAnger Tadeusz ...AngerAnger Tadeusz', 'I think --AngerAngerAnger viaggiAngerAngerAngerAnger Tadeusz TadeuszAnger Tadeusz...AngerAngerAnger Tadeusz(?"', 'I think ...AngerAngerAngerAngerAngerAnger Tadeusz Tadeusz Tadeusz AngerAnger ...AngerAngerAngerAngerAngerAngerAnger', 'I think " haft ..." ......" ..." ... "... ... withRoutersecution[{\\ indiosен letti letti - -angerAnger', 'I think ....AngerAnger collocF ..." ..." ... ... ... ... ... ...AngerAnger negroes... ..."…Anger', 'I thinkoror - >> >> >>ngerAng




In [43]:
fig = px.line(x=losses_act, y=mean_scores_act, title="Anger vs. Loss", labels={"x": "Loss", "y": "Mean Anger Score"}, markers=True)
fig.add_scatter(x=losses_act, y=act_count_scores, mode='lines+markers', name='p(angry word)')
fig.update_yaxes(range=[0, 7], dtick=1)
fig.show()

In [None]:
# sooming in on the good section of scales (around 0.8)

In [44]:
scales = [0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1]

In [None]:
mean_scores, all_scores, losses, word_probs = get_scores_and_losses(
    model,
    hp6,
    steering_vector=steering,
    prompt=prompt,
    criterion="Anger",
    scales = scales,
    n_samples=100,
    insertion_pos=None,
    explanations=False,
    word_list = word_list,
)