In [10]:
import os
import sys
sys.path.append(os.path.abspath('..'))

import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils as tutils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
from tqdm import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes
from sae_lens import SparseAutoencoder, ActivationsStore

from steering.eval_utils import evaluate_completions
from steering.utils import text_to_sae_feats, top_activations, normalise_decoder, get_activation_steering
from steering.patch import generate, get_scores_and_losses
from steering.visualization import Scatterplot

import plotly.express as px

# Turn autograd off globally: the cells below only run forward passes and
# generation, so no gradients are needed.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x3661adb40>

In [2]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load GPT-2 small as a HookedTransformer placed on the chosen device.
model = HookedTransformer.from_pretrained(
    'gpt2-small',
    device=device,
)



Loaded pretrained model gpt2-small into HookedTransformer


In [14]:
# Hook-point names: the residual stream entering block 6 (where we steer) and
# the residual stream leaving block 11 (used later to subtract the steering).
hp6 = tutils.get_act_name("resid_pre", 6)
hp_final = tutils.get_act_name("resid_post", 11)

# The first element returned by the loader is indexed by hook name; pull out
# the SAE for hp6 and move it to the same device as the model's embeddings.
sae6 = get_gpt2_res_jb_saes(hp6)[0][hp6].to(model.W_E.device)

100%|██████████| 1/1 [00:01<00:00,  1.35s/it]

Using Ghost Grads.





In [57]:
# SAE feature activations for the prompt "Anger" at the layer-6 hook.
sae_feats = text_to_sae_feats(model, sae6, hp6, "Anger")

# Top-10 activation values and their feature indices.
top_v, top_i = top_activations(sae_feats, 10)

# Iterate over the first entry of each result (presumably one row per token
# position — TODO confirm) and print the indices, then the values.
for pos, (i, v) in enumerate(zip(top_i[0], top_v[0])):
    print(f"pos={pos}", i, v, sep="\n")

pos=0
tensor([23123,   979,   316,  7496, 23111, 23373,  9088, 16196,  2039, 10423])
tensor([419.6327, 401.9173, 349.7594, 329.6368, 327.3525, 305.8869, 251.9604,
        225.7122, 191.7625, 185.9452])
pos=1
tensor([10136, 19151, 10754, 22788, 17048,  6013, 22205,  6100, 10048,  9047])
tensor([38.9773, 17.1096,  8.5333,  7.7863,  6.0905,  4.7187,  3.4404,  3.4185,
         3.3259,  3.0430])
pos=2
tensor([10131,  2936,  6415, 13617, 23140, 22226, 15841,  9442,  8610, 20961])
tensor([14.3681, 11.9981,  9.3857,  9.3441,  7.2721,  7.0280,  6.1561,  6.0701,
         4.8471,  4.5966])


In [16]:
# SAE feature activations for the prompt "Hate" at the layer-6 hook.
sae_feats = text_to_sae_feats(model, sae6, hp6, "Hate")

# Top-10 activation values and their feature indices.
top_v, top_i = top_activations(sae_feats, 10)

# Iterate over the first entry of each result (presumably one row per token
# position — TODO confirm) and print the indices, then the values.
for pos, (i, v) in enumerate(zip(top_i[0], top_v[0])):
    print(f"pos={pos}", i, v, sep="\n")

pos=0
tensor([23123,   979,   316,  7496, 23111, 23373,  9088, 16196,  2039, 10423])
tensor([419.6327, 401.9173, 349.7594, 329.6368, 327.3525, 305.8869, 251.9604,
        225.7122, 191.7625, 185.9452])
pos=1
tensor([12341, 19151, 22788,  8941,  5591, 10754, 11355,  1489, 18197, 16383])
tensor([42.7049, 18.9515, 12.0348,  3.5547,  3.4277,  3.2063,  0.9145,  0.7591,
         0.6729,  0.6694])
pos=2
tensor([17177, 17923,  7183,  8865, 24056, 22226, 13405,  1622, 21053,  4533])
tensor([19.7967, 19.3385, 13.9895, 10.2168,  9.6283,  6.0907,  5.3108,  5.0027,
         4.2855,  4.0866])


In [73]:
# (feature index, activation) pairs read off the top-activation printouts for
# the prompts "Hate" and "Anger" (top-ranked feature at pos=2 in each).
# NOTE(review): the hate activation was previously 19.3385, which in the "Hate"
# printout is the value of the *second*-ranked feature (17923); feature 17177
# itself is listed with 19.7967, so that value is used here.
hate_feat, hate_act = 17177, 19.7967
anger_feat, anger_act = 10131, 14.3681

# Single-concept alternatives:
# steering_vec = sae6.W_dec[hate_feat] * hate_act    # steer towards hate
# steering_vec = sae6.W_dec[anger_feat] * anger_act  # steer towards anger

# Combine both decoder directions, each scaled by its observed activation.
steering_vec = sae6.W_dec[hate_feat] * hate_act + sae6.W_dec[anger_feat] * anger_act

# Add two leading singleton axes so the vector passes patch_resid's 3-D check
# (assumes each W_dec row is 1-D — TODO confirm).
steering_vec = steering_vec[None, None, :]

prompt = "I think"

In [91]:
# def patch_resid_every(resid, hook, steering, c=1):
#     resid[:, :, :] = resid[:, :, :] + c * steering
#     return resid

def patch_resid(resid, hook, steering, c=1, pos=0):
    """Add a scaled steering tensor to the residual stream, starting at `pos`.

    Intended for use as a forward hook (via ``functools.partial`` to bind the
    keyword arguments). `resid` is modified in place and also returned.

    Args:
        resid: activation tensor indexed as [batch, position, d_model].
        hook: the hook point object supplied by the hooking machinery; unused.
        steering: 3-D tensor [batch, n_steer_positions, d_model]; its batch
            axis must broadcast against `resid`'s.
        c: scalar multiplier applied to `steering`.
        pos: first position (along axis 1 of `resid`) to patch. Negative
            values count from the end, like Python indexing. Positions outside
            the current sequence leave `resid` untouched.

    Returns:
        The same `resid` tensor, with `c * steering` added on positions
        ``pos : pos + n`` where ``n`` is the number of steering positions that
        fit before the end of the sequence.
    """
    assert steering.ndim == 3, "steering must be [batch, sequence_length, d_model]"

    seq_len = resid.shape[1]

    # Normalise negative positions. Previously a negative `pos` produced an
    # empty slice (silent no-op) or a broadcasting error with multi-token steering.
    if pos < 0:
        pos += seq_len

    # Nothing to patch when the start position lies outside this sequence.
    if pos < 0 or pos >= seq_len:
        return resid

    # Truncate the steering tensor if it would run past the end of the sequence.
    n_toks = min(seq_len - pos, steering.shape[1])
    end = pos + n_toks
    resid[:, pos:end, :] = resid[:, pos:end, :] + c * steering[:, :n_toks, :]

    return resid


In [95]:
# Multiplier applied to the steering vector inside the hook.
scale = 100
# Position in the residual stream at which steering starts.
pos = 2

# Show how the prompt is tokenised so the choice of `pos` can be checked
# against it. This must be the last expression in the cell to be displayed;
# previously it came first and its result was silently discarded.
model.to_str_tokens(prompt)

In [96]:
# One forward hook: add the scaled steering vector at the layer-6 hook point.
steer_in = partial(patch_resid, steering=steering_vec, c=scale, pos=pos)
fwd_hooks = [(hp6, steer_in)]

# Generation settings shared by every sample.
# NOTE(review): the KV cache is presumably disabled so each forward pass sees
# the full sequence and `pos` keeps referring to the same token — confirm.
gen_kwargs = dict(
    prepend_bos=True,
    use_past_kv_cache=False,
    max_new_tokens=20,
    verbose=False,
)

# Sample 10 continuations of the prompt with the hook active, printing and
# collecting each one.
gen_texts = []
with model.hooks(fwd_hooks=fwd_hooks):
    for _ in range(10):
        output = model.generate(prompt, **gen_kwargs)
        print(output)
        gen_texts.append(output)

I think towards its end, this episode swung Paris in a direction of not loosing it and focusing on the
I think towards the end of October [a year ahead of the Jordanian violence] we sent WikiLeaks a link to
I think over the past fourteen days we've reached a point where we're looked down upon and sometimes our backup
I think toward the end of my table, sitting here beating black, pleasantow, mostate guy and coming
I think toward the end of my six year, I realized I was on the wrong team. I wasn't
I think toward the end of what always feels like the the usual people's conversation, I think things got out
I thinkfully said it, with there being at least half a million of SNK-blowing, sav
I think towards the end of the episode we're running out of retcon content. Back to the air-
I thinkful applies to the job either way. I went on Google to calculate a list of anger pages on
I thinkfully or thinkingfully, this nasty writer has doomed me to another self-lennate material of


In [97]:
# Two forward hooks: add the scaled steering vector at the layer-6 hook point,
# then add its negation (same scale and position) at the final hook point.
steer_in = partial(patch_resid, steering=steering_vec, c=scale, pos=pos)
steer_out = partial(patch_resid, steering=(-steering_vec), c=scale, pos=pos)
fwd_hooks = [(hp6, steer_in), (hp_final, steer_out)]

# Generation settings shared by every sample.
# NOTE(review): the KV cache is presumably disabled so each forward pass sees
# the full sequence and `pos` keeps referring to the same token — confirm.
gen_kwargs = dict(
    prepend_bos=True,
    use_past_kv_cache=False,
    max_new_tokens=20,
    verbose=False,
)

# Sample 10 continuations of the prompt with both hooks active, printing and
# collecting each one.
gen_texts = []
with model.hooks(fwd_hooks=fwd_hooks):
    for _ in range(10):
        output = model.generate(prompt, **gen_kwargs)
        print(output)
        gen_texts.append(output)

I think and think for portions of this flight launched in 2003-2005 as an early approach to providing an which
I think…Roger Beatty. Yet, word about him is out. A dork of rabid, resource
I think broke selfish billionaires fail to compensate in the way the national-loaded politicians do when they fear virtue and
I think Christoph was busy making designs and not carrying anything but money. He is such a cool Geate
I think and themaly what they found between Mario and Lance -- an noname keyboard also has dark, fairly
I think- unfortunately there will not be lacking as it goes and look forward to working this out before then through
I think will look into how the Clash of the Titans comes together. It has back-and-forth rivalry
I think- I couldnt do nothing to halt Francis

I knows some people hate us then but how
I think-no.

It's fine if I do people so I always do. Why so fully
I think-we've done all we can to stick to something that Paul Buchheit called remains in Westworld
