In [1]:
import os
import sys
sys.path.append(os.path.abspath('..'))

import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes
from sae_lens import SparseAutoencoder, ActivationsStore

from steering.eval_utils import evaluate_completions
from steering.utils import get_activation_steering, get_sae_diff_steering, remove_sae_feats, text_to_sae_feats, top_activations
from steering.patch import generate, get_loss

from steering.utils import text_to_sae_feats, top_activations, normalise_decoder
from steering.patch import generate

import plotly.express as px

# Disable gradient tracking globally: the cells below only generate text and
# evaluate loss, nothing is trained.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fe1c6dd50f0>

In [2]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = HookedTransformer.from_pretrained(
    "gemma-2b",
    device=device,
)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loaded pretrained model gemma-2b into HookedTransformer


In [3]:
# Hook point whose activations are read and steered throughout this notebook.
hp6 = "blocks.6.hook_resid_post"
# Pretrained SAE for that hook point, loaded from the "gemma-2b-res-jb" release.
sae6 = SparseAutoencoder.from_pretrained("gemma-2b-res-jb", hp6)
# NOTE(review): the return value is discarded, so this presumably rescales the
# SAE decoder in place -- confirm against steering.utils.normalise_decoder.
normalise_decoder(sae6)
# Built from the SAE's config; not referenced by any other cell visible here.
activation_store = ActivationsStore.from_config(model, sae6.cfg)

# Move the SAE onto the same device the model was loaded on.
sae6 = sae6.to(device)

Resolving data files:   0%|          | 0/23032 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23032 [00:00<?, ?it/s]

## Anger

In [10]:
# Contrast pair used to build the activation-steering vector at hook point hp6.
pos_text, neg_text = "Anger", "Calm"

anger_activation_steering = get_activation_steering(
    model,
    hp6,
    pos_text,
    neg_text,
)
# Displayed as the cell output so the vector's dimensions can be checked.
anger_activation_steering.shape

torch.Size([1, 2, 2048])

In [11]:
# Quick qualitative check: sample completions with the activation-steering
# vector applied at a fixed scale.
generate(
    model,
    hp6,
    prompt="I think that",
    steering_vector=anger_activation_steering,
    scale=10,
)

100%|██████████| 5/5 [00:05<00:00,  1.08s/it]


["I think that's about it for Fe-Lange... although that doesn't seem to be the place where",
 "I think that is a typical launderette.\n\nThe answer is discrimination: she didn't launder it, the",
 "I think that it's still legal in prison. any other question?\nrefered multiple times as an act",
 "I think that's because the overall size of Londo is the same as 1/20 of Earth",
 'I think that means no tattoo you know ;) XD I am always going to love this tat :)\n\nTastic tattoo']

In [12]:
# Encode the positive prompt with the SAE, then display its strongest features.
pos_sae_feats = text_to_sae_feats(model, sae6, hook_point=hp6, text=pos_text)
top_activations(pos_sae_feats)

(tensor([[[24.4932, 23.9630, 23.2544, 21.0681, 20.9372, 20.6655, 20.3199,
           19.8953, 19.0510, 18.4377],
          [56.7609, 14.0818, 12.2185, 11.2966, 10.9838,  9.4307,  8.0595,
            7.1658,  7.1005,  5.8064]]], device='cuda:0'),
 tensor([[[ 3390, 15881,  5347, 16334,   556,  8704, 11785,  5624,  5396,  6877],
          [ 1062, 12753,  1213, 11968, 12167,  5915,  2491, 15173, 11912, 12312]]],
        device='cuda:0'))

In [29]:
# Build the SAE steering vector from the strongest SAE feature of `pos_text`.
#
# The feature id and activation used to be hard-coded (1062 and 56.7609),
# hand-copied from the output of the top_activations cell. They are now derived
# from the same call, so they stay in sync if the prompt, SAE or hook point
# changes and the cell no longer depends on a stale printed value.
top_acts, top_ids = top_activations(
    text_to_sae_feats(model, sae6, hook_point=hp6, text=pos_text)
)
# The displayed result is a (values, indices) pair indexed as
# [batch, position, rank] with rank 0 the largest activation.
# NOTE(review): the last position is taken as the prompt token itself and
# position 0 is presumably BOS -- confirm against the tokenizer.
anger_feature_id = top_ids[0, -1, 0].item()
anger_feature_act = top_acts[0, -1, 0].item()

# Decoder direction of that feature, scaled by its observed activation.
anger_sae_steering = sae6.W_dec[anger_feature_id, :] * anger_feature_act
print(anger_sae_steering.shape)
# Add two leading singleton axes so the vector has the same rank as
# anger_activation_steering.
anger_sae_steering = anger_sae_steering[None, None, :]

torch.Size([2048])


In [16]:
# Steering strengths swept by the loss and scoring cells below.
# NOTE(review): 0 presumably corresponds to the unsteered baseline -- confirm in steering.patch.
scales = [
    0, 1, 2, 4,
    8, 10, 15, 20,
]

In [13]:
# Loss under activation steering, evaluated once per entry in `scales`.
act_steer_losses = get_loss(
    model,
    hp6,
    steering_vector=anger_activation_steering,
    scales=scales,
)

loading dataset: NeelNanda/c4-code-20k


Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

dataset loaded


100%|██████████| 8/8 [00:25<00:00,  3.17s/it]


In [33]:
# Shared prompt for both scoring loops below (activation steering and SAE steering).
prompt = "I think you're"

In [17]:
# Mean "Anger" rating of steered completions, one entry per steering scale
# (activation-steering vector).
mean_scores = []
for scale in scales:
    # Sample completions at this steering strength.
    gen_texts = generate(
        model,
        hp6,
        prompt=prompt,
        steering_vector=anger_activation_steering,
        scale=scale,
        n_samples=10,
    )
    # Each evaluation is a dict; its 'score' entry is averaged below.
    evals = evaluate_completions(
        gen_texts,
        criterion="Anger",
        prompt=prompt,
    )
    print(evals)

    scores = [e['score'] for e in evals]
    mean = sum(scores) / len(scores)
    mean_scores.append(mean)
    print("mean", mean)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The text does not exhibit any signs of anger. It is neutral and focuses on providing information and advice.'}, {'score': 1, 'reason': 'The text is calm and constructive, showing no signs of anger or hostility.'}, {'score': 1, 'reason': 'The text does not express any anger. It is neutral and supportive, highlighting shared experiences and empathy.'}, {'score': 7, 'reason': "The text uses strong negative language like 'crazy' and 'sickening,' which conveys a significant amount of anger and disdain. However, without more context, it does not reach the highest levels of anger."}, {'score': 1, 'reason': 'The text is positive and supportive, with no indication of anger.'}, {'score': 2, 'reason': "The text does not exhibit significant anger. It may come across as slightly dismissive or blunt, but it doesn't convey strong emotions of anger."}, {'score': 1, 'reason': 'The text is neutral and focuses on describing a home. There is no indication of anger in the completio

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


mean 1.5


100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The text does not exhibit any signs of anger. It is a neutral and informative response about learning Spanish on Preply.'}, {'score': 1, 'reason': 'The text does not exhibit any signs of anger. It is a straightforward response to a query about ABS types and a technical manual.'}, {'score': 1, 'reason': 'The text is neutral and does not express any anger. It is an instructional statement about using wildcards in a cell.'}, {'score': 1, 'reason': 'The text does not contain any elements of anger. It is a neutral statement providing technical advice.'}, {'score': 1, 'reason': 'The text is positive and encouraging, with no signs of anger. The use of emojis further emphasizes the cheerful tone.'}, {'score': 1, 'reason': 'The text expresses a neutral or slightly agreeable tone without any indication of anger. The language used is calm and factual.'}, {'score': 1, 'reason': 'The text does not exhibit any signs of anger. It merely points out a potential mistake in a neu

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 2, 'reason': 'The text does not exhibit strong indications of anger. It seems more like a technical correction or explanation rather than an emotional outburst.'}, {'score': 2, 'reason': "The text contains a critical remark but it is framed in a relatively polite manner. There's no strong language or overtly angry tone."}, {'score': 2, 'reason': 'The text does not exhibit strong signs of anger. The statement is more observational and lacks any hostile or aggressive language.'}, {'score': 1, 'reason': 'The text expresses agreement and lacks any indication of anger. The tone is neutral and conversational, with no elements of hostility or frustration.'}, {'score': 2, 'reason': 'The text exhibits very little, if any, anger. The language used is neutral and does not convey strong negative emotions. It seems more confusing or playful rather than angry.'}, {'score': 1, 'reason': "The text is calm and does not express any anger. It talks about understanding someone's feelings and th

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The completion contains no elements of anger. It is a neutral and analytical statement.'}, {'score': 1, 'reason': 'The text does not exhibit any signs of anger. It is neutral and encouraging in tone.'}, {'score': 1, 'reason': 'The text does not contain any signs of anger. It is neutral and focuses on a technical suggestion.'}, {'score': 2, 'reason': 'The text suggests that the other person is making a mistake, but it is stated in a relatively neutral and constructive manner without displaying significant anger.'}, {'score': 1, 'reason': 'The text is positive and supportive, with no indication of anger.'}, {'score': 1, 'reason': 'The text does not express any anger. It seems to be a calm and neutral statement, suggesting that the person is overthinking something.'}, {'score': 2, 'reason': 'The text contains a mild tone of frustration or confusion, but it does not convey strong anger. The language is relatively neutral and does not include aggressive or highly em

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 2, 'reason': 'The generated text does not contain any clear expression of anger. It seems to discuss a topic related to COVID-19 and social distancing in a neutral manner.'}, {'score': 4, 'reason': "The text carries a mild tone of criticism and condescension, suggesting that the person being addressed is increasingly influenced by something negative ('that stuff'). However, it does not express strong anger or hostility."}, {'score': 2, 'reason': 'The text does not express significant anger. It appears to be a simple correction or clarification, with no aggressive or hostile language.'}, {'score': 1, 'reason': 'The text is neutral and provides an opinion on the importance of knowing how to perform certain exercises. There is no indication of anger in the language or tone used.'}, {'score': 1, 'reason': 'The text does not express any anger. It seems to be a reflective statement about personal experiences and struggles.'}, {'score': 2, 'reason': "The text expresses mild discont

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The completion does not display any anger. It seems to be a neutral or even whimsical statement about a future event, followed by a mention of a seemingly unrelated subject.'}, {'score': 1, 'reason': 'The text does not exhibit any signs of anger. It is a neutral statement pointing out a potential misunderstanding about the use of Ajax.'}, {'score': 1, 'reason': 'The text does not contain any elements of anger. It is neutral and appears to be discussing an observation about Keanu Reeves doing stunts.'}, {'score': 1, 'reason': 'The text does not convey any anger. It simply states an opinion about a hiring practice in a neutral manner.'}, {'score': 1, 'reason': 'The text is purely informational and does not express any anger. It discusses a technical aspect of submitting data from a javascript call.'}, {'score': 1, 'reason': "The text does not exhibit any signs of anger. It simply states an opinion and a reference to Mr. Shapiro's opinion without any emotional int

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 4, 'reason': "The text expresses a critical opinion, suggesting someone's performance is deteriorating. While it has a slightly negative tone, it lacks strong language or intense anger."}, {'score': 1, 'reason': 'The text does not convey any anger. It appears to be a fragmented and somewhat incoherent statement without any emotional intensity, let alone anger.'}, {'score': 1, 'reason': 'The text is purely informational and does not exhibit any anger. It is focused on discussing a scientific topic related to meerkats and their genetics.'}, {'score': 3, 'reason': "The text expresses mild frustration or disappointment, particularly with the phrase 'HS1s were a rip off.' However, it lacks strong or explicit expressions of anger."}, {'score': 1, 'reason': "The text is neutral and does not express any anger. It acknowledges both parties' viewpoints in a calm and respectful manner."}, {'score': 2, 'reason': 'The text expresses a mild critique and advice, but there is no strong indi

In [32]:
# Loss under SAE-feature steering, evaluated once per entry in `scales`.
sae_anger_losses = get_loss(
    model,
    hp6,
    steering_vector=anger_sae_steering,
    scales=scales,
)

loading dataset: NeelNanda/c4-code-20k
dataset loaded


100%|██████████| 8/8 [00:25<00:00,  3.18s/it]


In [34]:
# Mean "Anger" rating of steered completions, one entry per steering scale
# (SAE-feature steering vector).
mean_sae_scores = []
for scale in scales:
    # Sample completions at this steering strength.
    gen_texts = generate(
        model,
        hp6,
        prompt=prompt,
        steering_vector=anger_sae_steering,
        scale=scale,
        n_samples=10,
    )
    # Each evaluation is a dict; its 'score' entry is averaged below.
    evals = evaluate_completions(
        gen_texts,
        criterion="Anger",
        prompt=prompt,
    )
    print(evals)

    scores = [e['score'] for e in evals]
    mean = sum(scores) / len(scores)
    mean_sae_scores.append(mean)
    print("mean", mean)

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The text is neutral and lacks any elements of anger. It appears to be a calm discussion about a topic without any emotional intensity.'}, {'score': 1, 'reason': 'The text expresses agreement and understanding, with no signs of anger or frustration.'}, {'score': 1, 'reason': 'The text does not display any signs of anger. It simply states a technical observation and provides information about the status of a component.'}, {'score': 1, 'reason': 'The text is neutral and does not display any signs of anger. It appears to be a simple clarification or correction regarding a misunderstanding.'}, {'score': 1, 'reason': "The text does not convey any anger. It appears to be a neutral statement about someone's opinion and research on a historical topic."}, {'score': 2, 'reason': 'The text does not exhibit significant signs of anger. It is more of a casual, slightly critical question about the cost of an EE university, suggesting that the person might be overthinking the s

100%|██████████| 10/10 [00:10<00:00,  1.09s/it]


[{'score': 1, 'reason': 'The text does not display any signs of anger. It is neutral and seems to be providing a positive or supportive remark.'}, {'score': 1, 'reason': "The text does not exhibit any signs of anger. It simply presents a viewpoint about UCLA's general requirements in a neutral tone."}, {'score': 1, 'reason': 'The text does not contain any expressions of anger. It is a neutral statement discussing bandwidth requirements for a tablet newspaper.'}, {'score': 1, 'reason': 'The text does not exhibit any signs of anger. It appears to be a neutral statement about programming logic.'}, {'score': 3, 'reason': "The text mentions 'anger' in the context of defining envy, but it does not express anger itself. The tone is more explanatory and neutral."}, {'score': 1, 'reason': 'The text is technical and factual, lacking any emotional tone, particularly anger.'}, {'score': 1, 'reason': 'The text is calm and neutral, providing a straightforward opinion and a suggestion. There is no in

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': "The text is neutral and does not exhibit any signs of anger. It seems to be a calm explanation or clarification regarding dogs' behavior."}, {'score': 2, 'reason': "The text suggests that someone might be overreacting, which could imply a mild level of frustration or annoyance, but it doesn't convey strong anger. The tone remains relatively neutral and calm."}, {'score': 1, 'reason': 'The text does not exhibit any signs of anger. It is simply explaining a situation in a neutral manner.'}, {'score': 1, 'reason': "The text does not express any anger. It seems neutral and matter-of-fact, discussing reaching out to a 'grizzly dealer.'"}, {'score': 6, 'reason': "The text contains a mild insult ('talking out of your a*s'), which indicates some level of anger or frustration. However, it is not excessively aggressive or filled with intense anger."}, {'score': 2, 'reason': "The text mentions 'angryviper', which could be a username or handle, but it does not explicitly c

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The text does not express any noticeable anger. It appears to be a neutral or slightly confused response.'}, {'score': 4, 'reason': 'The text suggests that the person being addressed is angry, but it does so in a relatively mild and indirect manner. It does not contain strong language or aggressive tone that would indicate a high level of anger.'}, {'score': 1, 'reason': 'The text is neutral and does not contain any elements of anger. It provides a suggestion without any negative or aggressive language.'}, {'score': 3, 'reason': "The text expresses a mild criticism but does not contain strong language or aggressive tone that would indicate significant anger. The use of '(****)' suggests some level of frustration, but it is not intense."}, {'score': 5, 'reason': 'The text suggests anger towards a specific person, but it does not contain strong or explicit expressions of anger itself. The mention of anger is present but not intensely conveyed.'}, {'score': 2, 're

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 4, 'reason': "The text expresses disagreement and urgency about a potentially dangerous situation, but it does not display a high level of anger. The words 'wrong' and the unconventional terms 'Furius' and 'Madraicky' might suggest some frustration or strong feeling, but they are not sufficient to convey significant anger."}, {'score': 3, 'reason': 'The text expresses a mild disagreement but does not convey significant anger. The sentence structure is more focused on expressing a differing opinion rather than conveying any strong negative emotions.'}, {'score': 7, 'reason': "The text expresses a significant level of frustration and anger, particularly with the phrase 'causing more harm than anything else'. Words like 'nuclearraged' and the overall tone suggest a high degree of anger, although it's somewhat incoherent, which slightly diminishes the impact."}, {'score': 4, 'reason': "The text suggests a critical tone by implying the listener is 'blind,' but it does not express

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 5, 'reason': "The generated text appears disjointed and unclear, with a mix of phrases that don't form a coherent sentence. The inclusion of words like 'Furious' and 'fuming' suggests a level of anger or frustration, but the overall context is too fragmented to convey strong anger effectively."}, {'score': 1, 'reason': 'The text is largely incoherent and does not clearly convey any anger. The use of special characters and gibberish makes it difficult to interpret any strong emotional tone, let alone anger.'}, {'score': 7, 'reason': "The text displays a significant level of anger, especially with phrases like 'Island fuming,' 'Hell hell hell,' and 'I can yell.' These expressions suggest a high degree of frustration and anger, though it is not entirely coherent, which slightly reduces the intensity."}, {'score': 2, 'reason': "The text contains repetitive words and seems disjointed, but it does not convey anger. The use of 'fuming' could suggest frustration, but the context doe

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The text does not exhibit any signs of anger. It appears nonsensical and lacks coherent emotional content.'}, {'score': 2, 'reason': "The generated text repeats words and doesn't make much sense. The use of 'angrily' suggests a low level of anger, but the overall coherence is too poor to convey strong anger."}, {'score': 1, 'reason': 'The text generated does not contain any language or expressions that convey anger. It appears to be incomplete and repetitive without any emotional context.'}, {'score': 1, 'reason': 'The text is incoherent and does not convey any clear emotion, including anger. The mix of different languages and incomplete phrases makes it difficult to interpret any specific sentiment.'}, {'score': 1, 'reason': 'The text is incoherent and does not display any clear signs of anger. It seems to be a jumble of words and letters without any emotional intensity.'}, {'score': 1, 'reason': 'The text does not exhibit any clear signs of anger. The sentenc

100%|██████████| 10/10 [00:10<00:00,  1.08s/it]


[{'score': 1, 'reason': 'The completion does not contain any words or phrases that indicate anger. It appears to be nonsensical and lacks coherent emotional content.'}, {'score': 1, 'reason': 'The generated text is nonsensical and does not convey any clear meaning or emotion, including anger.'}, {'score': 1, 'reason': 'The text is largely nonsensical and does not convey any clear emotion, let alone anger. It appears to be a jumbled mix of words without coherence or intent to express anger.'}, {'score': 3, 'reason': "The text is largely incoherent and difficult to understand, but it does mention 'anger' and 'afraid' which could be interpreted as having a hint of frustration or emotional tension. However, the overall lack of clarity reduces the impact of any perceived anger."}, {'score': 1, 'reason': 'The text is nonsensical and does not convey any anger. It appears to be a random and incoherent string of words.'}, {'score': 5, 'reason': "The text suggests an intention to provoke someone

In [38]:
# Anger score against loss for both steering methods, one point per scale,
# joined in sweep order.
fig = px.line(
    x=act_steer_losses,
    y=mean_scores,
    title="Anger vs. Loss",
    labels={"x": "Loss", "y": "Mean Anger Score"},
    markers=True,
)
# A single px trace is unnamed and hidden from the legend; name it explicitly
# so the two methods can be told apart. Runs before add_scatter, so it only
# touches the activation-steering trace.
fig.update_traces(name="Activation steering", showlegend=True)
# Previously labelled with the placeholder 'New Line'.
fig.add_scatter(
    x=sae_anger_losses,
    y=mean_sae_scores,
    mode="lines+markers",
    name="SAE feature steering",
)
fig.update_yaxes(range=[1, 9], dtick=1)
fig.show()

In [45]:
# Same comparison as a pure scatter plot (no connecting lines).
fig = px.scatter(
    x=act_steer_losses,
    y=mean_scores,
    title="Anger vs. Loss",
    labels={"x": "Loss", "y": "Mean Anger Score"},
)
# A single px trace is unnamed and hidden from the legend; name it explicitly
# so the two methods can be told apart. Runs before add_scatter, so it only
# touches the activation-steering trace.
fig.update_traces(name="Activation steering", showlegend=True)
# Previously labelled with the placeholder 'New Line', which was doubly
# misleading on a markers-only plot.
fig.add_scatter(
    x=sae_anger_losses,
    y=mean_sae_scores,
    mode="markers",
    name="SAE feature steering",
)
fig.update_yaxes(range=[1, 9], dtick=1)
fig.show()