In [37]:
import torch
from transformer_lens import HookedTransformer
from transformer_lens import utils

from functools import partial

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes

torch.set_grad_enabled(False)


<torch.autograd.grad_mode.set_grad_enabled at 0x3722c9cc0>

In [2]:
model: HookedTransformer = HookedTransformer.from_pretrained('gpt2-small', device='cpu')


Loaded pretrained model gpt2-small into HookedTransformer


In [23]:
# layer = 1
# prompt_pos = "Yes, I talk about wedding constantly"
# prompt_neg = "I do not talk about wedding constantly"
# prompt_pos = "Love "
# prompt_neg = "Hate"
prompt_pos = "Anger"
prompt_neg = "Calm"

In [24]:
logits, pos_cache = model.run_with_cache(prompt_pos)
# h_p = pos_cache["resid_pre", layer]

logits, neg_cache = model.run_with_cache(prompt_neg)
# h_n = neg_cache["resid_pre", layer]

# print(h_p.shape, h_n.shape)
# steering = h_p - h_n
# steering.shape

In [25]:
# c = 5
def residual_stream_patching_hook(
    resid,
    hook,
    c
):
    h_p = pos_cache[hook.name]
    h_n = neg_cache[hook.name]
    steering = h_p - h_n

    len_steering = steering.shape[1]
    
    # resid shape is (batch, pos, d_model)
    # print(resid.shape, steering.shape)
    resid[:, :len_steering, :] =  resid[:, :len_steering, :] + c * steering

    return resid

In [31]:
text = "I think you're"
n_samples = 7
hook_name = utils.get_act_name("resid_pre", 7)

with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook, c=5))]):
    # model('testing testing asdf;lkjasdf;l k')
    for i in range(n_samples):
        print("============")
        output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=20, verbose=False)
        print(output)

I think you're right that Labor has gained ground in Saturday and Sunday after the first round of regional budget talks was missed
I think you're not alone.

Now, protesters toward Donald Trump have become quite vocal. There have also been
I think you're either a fictional character, or an actual character from the comics.Fake Nuke Out. Anybody
I think you're going to find that smile at the end! That smile was created by Lady Cat in Stallion,
I think you're already aware that Dream Defenders has already vastly outperformed any media crowdfunding. Technically, 2 Posts on
I think you're serious about the vast majority of blue jeans and 60 makes between 4 and 9 flocks, and I
I think you're receiving this message because you closed more than five dozen anonymous senders holding a $40 situation. They


In [28]:
love_words = ["love", "like", "adore", "enjoy", "appreciate", "cherish", "admire", "care", "fancy", "favor", "prefer"]
hate_words = ["hate", "dislike", "detest", "abhor", "despise", "scorn", "loathe", "despise", "fuck you"]

def compute_metric(positive_words, layer, n_samples, factor):
    count = 0
    hook_name = utils.get_act_name("resid_pre", layer)

    with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook, c=factor))]):
        for i in range(n_samples):
            output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=25, verbose=False)
            for word in positive_words:
                if word in output.lower():
                    count += 1
                    break
        
    return count/n_samples

In [32]:
for l in range(model.cfg.n_layers):
    n_samples = 10
    score = compute_metric(hate_words, l, n_samples, factor=10)
    print(f"layer: {l}, socre: {score}")

layer: 0, socre: 0.0
layer: 1, socre: 0.0
layer: 2, socre: 0.0
layer: 3, socre: 0.0
layer: 4, socre: 0.0
layer: 5, socre: 0.0
layer: 6, socre: 0.0
layer: 7, socre: 0.0
layer: 8, socre: 0.0
layer: 9, socre: 0.0
layer: 10, socre: 0.0
layer: 11, socre: 0.0


In [35]:
layer = 7 # pick a layer you want.

sparse_autoencoder = SparseAutoencoder.from_pretrained(
    "gpt2-small-res-jb", f"blocks.{layer}.hook_resid_pre"
)

AttributeError: type object 'SparseAutoencoder' has no attribute 'from_pretrained'

In [45]:
hook_point = f"blocks.{layer}.hook_resid_pre"

saes, sparsities = get_gpt2_res_jb_saes(hook_point)

print(saes.keys())
sae = saes[hook_point]

100%|██████████| 1/1 [00:00<00:00,  1.14it/s]

dict_keys(['blocks.7.hook_resid_pre'])





In [48]:
### cache angner and then pass through sae, find anger feature, add anger feature during the forward pass.

hook_name = utils.get_act_name("resid_pre", layer)

logits, cache = model.run_with_cache("Anger")
anger_hidden_state = cache[hook_name][0, -1, :].unsqueeze(0)

In [49]:
anger_hidden_state.shape

torch.Size([1, 768])

In [63]:
feature_acts = sae(anger_hidden_state).feature_acts[0]
top_values, top_ids = torch.topk(feature_acts, 10)
print(top_values)
print(top_ids)

tensor([18.4649, 16.4535, 12.0989, 11.0684,  7.7472,  7.2738,  5.0492,  4.7868,
         4.7161,  4.6675])
tensor([16077, 21456,  6857, 23357, 19453, 14237, 12147, 21901, 20881,  9111])


In [60]:
print((feature_acts != 0).sum())

tensor(78)


In [65]:
l1_contribution = top_values[0]/feature_acts.sum()
l1_contribution

tensor(0.1017)