In [4]:
import os
import sys
sys.path.append(os.path.abspath('..')) # so we can import from parent directory

import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes

from steering.eval_utils import evaluate_completions

import plotly.express as px

# Disable gradient tracking globally for the rest of the session.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x109ca3130>

In [5]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load GPT-2 small as a HookedTransformer placed on the chosen device.
model = HookedTransformer.from_pretrained('gpt2-small', device=device)



Loaded pretrained model gpt2-small into HookedTransformer


In [7]:
# Residual-stream layer whose SAE is used below; change to inspect another layer.
layer = 7

# Name of the "resid_pre" activation at that layer; also the key into `saes`.
hook_point = tutils.get_act_name("resid_pre", layer)
saes, sparsities = get_gpt2_res_jb_saes(hook_point)

# Select the SAE for this hook point and put it on the same device as the
# model's embedding matrix.
sae = saes[hook_point].to(model.W_E.device)

100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


In [12]:
def top_acts_at_pos(text, pos=-1, silent=True, prepend_bos=True, n_top=10):
    """Return the strongest SAE feature activations for `text` at `hook_point`.

    Relies on the notebook-level `model`, `sae` and `hook_point`.

    Args:
        text: Prompt passed to `model.run_with_cache`.
        pos: Token position to read from the first batch element. `None`
            averages the feature activations over every position instead.
        silent: Currently unused; kept so existing callers keep working.
        prepend_bos: Forwarded to `model.run_with_cache`.
        n_top: Number of features to return.

    Returns:
        Tuple `(top_v, top_i)` from `torch.topk`: the `n_top` largest
        activation values and the indices of the corresponding features.
    """
    # Only the SAE's hook point is read, so cache just that activation instead
    # of every intermediate tensor in the model.
    _, cache = model.run_with_cache(
        text, prepend_bos=prepend_bos, names_filter=hook_point
    )
    resid = cache[hook_point][0]  # first (only) batch element
    if pos is None:
        hidden_state = resid
    else:
        # Keep a leading dimension so the mean below is a no-op for one position.
        hidden_state = resid[pos].unsqueeze(0)
    feature_acts = sae(hidden_state).feature_acts.mean(dim=0)
    top_v, top_i = torch.topk(feature_acts, n_top)
    return top_v, top_i

top_acts_at_pos("Anger", pos=-1)

# Candidate steering features (activation strengths are rounded):
# 16077: anger feature, activation ~18
# 21456: anger feature, activation ~16
# 15001: hate feature, activation ~32
#        NOTE(review): 15001 is not among the top-10 printed for "Anger";
#        presumably it was measured on a different prompt — confirm.
# 6857: possibly dread/shame/scare, activation ~12


(tensor([18.4649, 16.4535, 12.0989, 11.0684,  7.7472,  7.2738,  5.0492,  4.7868,
          4.7161,  4.6675]),
 tensor([16077, 21456,  6857, 23357, 19453, 14237, 12147, 21901, 20881,  9111]))

In [49]:
# SAE feature ids to steer with, and the strength used to scale each feature's
# decoder direction (one entry per id, in the same order).
steering_ft_ids = [16077, 21456, 15001]
steering_acts = [18, 16, 32]

# Decoder rows for the chosen features, one row per feature.
decoder_rows = sae.W_dec[steering_ft_ids, :]

# Create the scales on the decoder's device: a CPU tensor here would raise a
# device-mismatch error whenever the SAE lives on the GPU.
scales = torch.tensor(steering_acts, dtype=torch.float32, device=decoder_rows.device)

# Weighted sum of the decoder directions -> a single steering vector.
steering_vec = (decoder_rows * scales.unsqueeze(1)).sum(dim=0)

In [55]:
def patch_hook(resid, hook, c, pos):
    """Forward hook that adds `c * steering_vec` to the residual stream.

    Relies on the notebook-level `steering_vec`. `resid` is modified in place
    and also returned.

    Args:
        resid: Activation tensor indexed as (batch, position, feature).
        hook: Unused; present because it is the second positional argument the
            hook is called with.
        c: Scalar multiplier applied to `steering_vec`.
        pos: Position to steer. Negative values index from the end. `None`
            steers every position. A position outside the current sequence
            (in either direction) leaves `resid` unchanged.

    Returns:
        The (possibly updated) `resid` tensor.
    """
    if pos is None:
        # Steer every position at once.
        resid[:, :, :] = resid[:, :, :] + c * steering_vec
        return resid

    seq_len = resid.shape[1]
    # Guard both ends: an out-of-range negative index would otherwise raise.
    if -seq_len <= pos < seq_len:
        resid[:, pos, :] = resid[:, pos, :] + c * steering_vec
    return resid

text = "I think"

n_samples = 10
steering_coeff = 7.0  # multiplier passed to patch_hook as `c`
steering_pos = 0      # token position passed to patch_hook as `pos`
gen_texts = []

# Generation samples tokens; seed torch so a re-run reproduces the same
# completions (and therefore the same downstream scores).
torch.manual_seed(0)

steering_hook = (hook_point, partial(patch_hook, c=steering_coeff, pos=steering_pos))
with model.hooks(fwd_hooks=[steering_hook]):
    for sample_idx in range(n_samples):
        print("============")
        # NOTE(review): use_past_kv_cache=False presumably keeps the full
        # sequence in every forward pass so `steering_pos` always refers to
        # the same token — confirm.
        output = model.generate(
            text,
            prepend_bos=True,
            use_past_kv_cache=False,
            max_new_tokens=30,
            verbose=False,
        )
        gen_texts.append(output)
        print(output)

I think many people know that Cindy is there only because of her partners Karl Berman and Jason Id keys!! This worries me because of how tightly Katherine is tightly controlled
I think I hate Twitter a lot. I hate it because at a certain point when it's like this and somewhere in the middle: @MichaelKlee's
I think he totally misunderstood the resentment I expressed while reading Oppenheimer's article for the Encyclopedia of Modern History. Most of the outrage I felt for the von
I think that I feel that it should be a priority, because it personally kind of pisses me off, because it's kind of angering me, that
I think the President has to assume that he hates his Republican President, and believes all the ranc openly hates true conservatives the majority of the time. He may think
I think this attack was inspired by anti-Semitic Jew hatred. I hate adjustment the way I hate racismZ (Wow"). I hate sisters and sisters giving cares
I think the Transformers is running straight into the teeth 

In [58]:
# Score each steered completion against the "anger/hate" criterion, passing
# the original prompt along for context.
evals = evaluate_completions(
    gen_texts,
    prompt=text,
    criterion="anger/hate",
)
evals

[{'score': 4,
  'reason': "The text contains some elements of suspicion and concern regarding Cindy's presence and Katherine's control, but it does not express overt anger or hate. The language is more suggestive of worry rather than hostility."},
 {'score': 8,
  'reason': "The text explicitly uses the word 'hate' twice, indicating a strong negative emotion towards Twitter. This repetition and strong language contribute to a high score for anger/hate."},
 {'score': 5,
  'reason': 'The text expresses feelings of resentment and outrage, which are related to anger. However, the language used is not overly aggressive or hateful, hence a moderate score.'},
 {'score': 5,
  'reason': "The text expresses personal frustration and uses words like 'pisses me off' and 'angering me,' which indicate a moderate level of anger. However, it does not escalate to hate or extremely aggressive language."},
 {'score': 8,
  'reason': "The text contains strong negative language such as 'hates' and 'ranc openl

In [59]:
# Pull the numeric score out of every evaluation record and report the average.
scores = [entry['score'] for entry in evals]
mean_score = sum(scores) / len(scores)
print("mean", mean_score)

mean 6.0
