In [43]:
import torch
from torch.utils.data import DataLoader
from transformer_lens import HookedTransformer
from transformer_lens import utils
from transformer_lens.evals import make_pile_data_loader, evaluate_on_dataset

from functools import partial
from datasets import load_dataset
import tqdm

from sae_lens import SparseAutoencoder
from sae_lens.toolkit.pretrained_saes import get_gpt2_res_jb_saes

import plotly.express as px

torch.set_grad_enabled(False)


<torch.autograd.grad_mode.set_grad_enabled at 0x292bc0970>

In [2]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
# model: HookedTransformer = HookedTransformer.from_pretrained('gpt2-small', device='cpu')
device = 'cuda' if torch.cuda.is_available() else 'cpu' # mps will break when using model.generate()
model: HookedTransformer = HookedTransformer.from_pretrained('gpt2-small', device=device)


Loaded pretrained model gpt2-small into HookedTransformer


In [4]:
# layer = 1
# prompt_pos = "Yes, I talk about wedding constantly"
# prompt_neg = "I do not talk about wedding constantly"
# prompt_pos = "Love "
# prompt_neg = "Hate"
prompt_pos = "Anger"
prompt_neg = "Calm"

In [5]:
logits, pos_cache = model.run_with_cache(prompt_pos)
# h_p = pos_cache["resid_pre", layer]

logits, neg_cache = model.run_with_cache(prompt_neg)
# h_n = neg_cache["resid_pre", layer]

# print(h_p.shape, h_n.shape)
# steering = h_p - h_n
# steering.shape

In [6]:
# c = 5
def residual_stream_patching_hook(
    resid,
    hook,
    c
):
    h_p = pos_cache[hook.name]
    h_n = neg_cache[hook.name]
    steering = h_p - h_n

    len_steering = steering.shape[1]
    
    # resid shape is (batch, pos, d_model)
    # print(resid.shape, steering.shape)
    resid[:, :len_steering, :] =  resid[:, :len_steering, :] + c * steering

    return resid

In [7]:
text = "I think you're"
n_samples = 7
hook_name = utils.get_act_name("resid_pre", 7)

with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook, c=5))]):
    # model('testing testing asdf;lkjasdf;l k')
    for i in range(n_samples):
        print("============")
        output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=20, verbose=False)
        print(output)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


I think you're wrong! Have some sources on the subject. Contact us by a phone number to make sure things are
I think you're very sick to watch Solution Wizard VII, and I'm quite sure definitely not the worst work on Vo
I think you're either reading as a candidate, trying to make an unbiased decision or you are actively making a political statement
I think you're right about Tom-chan, Griffith, & Company, and that it needs more Dwarf fanfiction like
I think you're more liable to radical change to submission come 2020305 cislunar instrument (similar to SOLAR
I think you're lost. Write to me at paidhugh@heavybodyblog.com to ask my permission to
I think you're wrong.

It is indeed true that despite the fact that Modism is defined in terms of


In [8]:
love_words = ["love", "like", "adore", "enjoy", "appreciate", "cherish", "admire", "care", "fancy", "favor", "prefer"]
hate_words = ["hate", "dislike", "detest", "abhor", "despise", "scorn", "loathe", "despise", "fuck you"]

def compute_metric(positive_words, layer, n_samples, factor):
    count = 0
    hook_name = utils.get_act_name("resid_pre", layer)

    with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook, c=factor))]):
        for i in range(n_samples):
            output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=25, verbose=False)
            for word in positive_words:
                if word in output.lower():
                    count += 1
                    break
        
    return count/n_samples

In [9]:
# for l in range(model.cfg.n_layers):
#     n_samples = 10
#     score = compute_metric(hate_words, l, n_samples, factor=10)
#     print(f"layer: {l}, socre: {score}")

In [10]:
layer = 7 # pick a layer you want.

hook_name = utils.get_act_name("resid_pre", layer)
saes, sparsities = get_gpt2_res_jb_saes(hook_name)

print(saes.keys())
sae = saes[hook_name]
sea = sae.to(model.W_E.device)

100%|██████████| 1/1 [00:01<00:00,  1.91s/it]

dict_keys(['blocks.7.hook_resid_pre'])





In [15]:
# cache anger and then pass through sae, find anger feature, add anger feature during the forward pass.

logits, cache = model.run_with_cache("Anger")
anger_hidden_state = cache[hook_name][0, -1, :].unsqueeze(0)

feature_acts = sae(anger_hidden_state).feature_acts[0]
print(f'Num of activated features: {(feature_acts != 0).sum()}')

# get top 10 features
top_values, top_ids = torch.topk(feature_acts, 10)
print('\nTop 10 features:')
print(top_values)
print(top_ids)

# L1 contribution of top feature
l1_contribution = top_values[0]/feature_acts.sum()
print(f'\nL1 contribution of top feature: {l1_contribution}')


Num of activated features: 78

Top 10 features:
tensor([18.4649, 16.4535, 12.0989, 11.0684,  7.7472,  7.2738,  5.0492,  4.7868,
         4.7161,  4.6675])
tensor([16077, 21456,  6857, 23357, 19453, 14237, 12147, 21901, 20881,  9111])

L1 contribution of top feature: 0.10172753036022186


In [17]:
target_feature = top_ids[0]

# only top feature
steering = sae.W_dec[top_ids[0]] * top_values[0] # shape: [d_model]
# top 5 features
# steering = torch.stack([sae.W_dec[top_ids[i]] * top_values[i] for i in range(5)]).sum(dim=0)

steering = steering.to(model.cfg.device)
steering.shape

torch.Size([768])

In [18]:
# c = 5
def residual_stream_patching_hook_sae(
    resid,
    hook,
    c,
    pos
):    
    # resid shape is (batch, pos, d_model)
    # print(resid.shape, steering.shape)
    if pos < resid.shape[1]:
        resid[:, pos, :] = resid[:, pos, :] + c * steering

    return resid


text = "I think you're"
n_samples = 7

with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook_sae, c=20, pos=0))]):
    # model('testing testing asdf;lkjasdf;l k')
    for i in range(n_samples):
        print("============")
        output = model.generate(text, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=20, verbose=False)
        print(output)

I think you're upset with what you've created. Were you angry that this isn't his project and he wasn't
I think you're right, and there are far more important questions than attitudes. How is my therapist telling me about Kell
I think you're saying that the Flint protests were pointless, that they were motivated by anger because they were angry at ordinary
I think you're fucking angry at me and acting angry. You're angry against me angry for punching my ass because it
I think you're lying and yelling I'm over your Title. I'm outraged. Then this Equal Antifeminist
I think you're gonna find another angle, there are always so many marketers on every ad. And with them comes an
I think you're angering me. I think you are seeing anger over the supposed problem of your anger staking anger


In [87]:
def n_negative(sentences):
    data = sentiment_pipeline(sentences)
    n_neg = sum([1 for d in data if d['label'] == 'NEGATIVE'])
    # print("================")
    # print(sentences)
    # print(data)
    # print("================")
    return n_neg

In [88]:
love_words = ["love", "like", "adore", "enjoy", "appreciate", "cherish", "admire", "care", "fancy", "favor", "prefer"]
angry_words = ["angry", "mad", "pissed", "irritated", "annoyed", "frustrated", "enraged", "furious", "infuriated", "outraged", "resentful", "agitated", "aggravated", "bitter", "hate", "fuck"]
# angry_words = [
#     'hate', 'despise', 'loathe', 'detest', 'abhor', 'resent', 'dislike', 'disgust', 'anger', 'fury',
#     'rage', 'wrath', 'irritation', 'annoyance', 'frustration', 'bitterness', 'spite', 'vengeance',
#     'revenge', 'resentment', 'hostility', 'animosity', 'contempt', 'scorn', 'disdain', 'malice',
#     'venom', 'rancor', 'enmity', 'aversion', 'repulsion', 'revulsion', 'antipathy', 'ire',
#     'indignation', 'outrage', 'mad', 'furious', 'infuriated', 'incensed', 'irate', 'livid',
#     'seething', 'enraged', 'ballistic', 'pissed', 'aggravated', 'exasperated', 'disgusted',
#     'appalled', 'revolted', 'sickened', 'nauseated', 'fed up', 'sick and tired', 'bitter',
#     'vengeful', 'spiteful', 'vindictive', 'hostile', 'antagonistic', 'contemptuous', 'scornful',
#     'disdainful', 'evil', 'cruel', 'mean', 'nasty', 'vicious', 'vile', 'wicked', 'malicious',
#     'malevolent', 'hateful', 'venomous', 'caustic', 'virulent', 'toxic', 'noxious', 'poisonous',
#     'vitriolic', 'acrimonious'
# ]

batch_size = 128
tokens = model.to_tokens(text, prepend_bos=True)
# print(tokens.shape)
batch_tokens = tokens.repeat((batch_size, 1))
# print(batch_tokens)

def compute_metric(related_words, n_samples, coef, pos, max_new_tokens=20):
    count = 0

    with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook_sae, c=coef, pos=pos))]):
        for i in range(n_samples):
            output = model.generate(batch_tokens, prepend_bos=True, use_past_kv_cache=False, max_new_tokens=max_new_tokens, verbose=False)
            strings = model.to_string(output)

            count += n_negative(strings)

            # for s in strings:
            #     for word in related_words:
            #         if word in s.lower():
            #             count += 1
            #             break
        
    return count/(n_samples * batch_size)


n_samples = 1
max_new_tokens = 20
n_positions = 3
cs = [0, 1, 10] # [0, 0.5, 1, 5, 7, 10, 15, 20]

score_matrix = torch.zeros((n_positions, len(cs)))
for pos in range(n_positions):
    for ci, c in enumerate(cs):
        score = compute_metric(angry_words, n_samples, c, pos, max_new_tokens)
        score_matrix[pos, ci] = score

        print(f'pos: {pos}, c: {c}, socre: {score}')

pos: 0, c: 0, socre: 0.625
pos: 0, c: 1, socre: 0.546875
pos: 0, c: 10, socre: 0.703125
pos: 1, c: 0, socre: 0.4921875
pos: 1, c: 1, socre: 0.5
pos: 1, c: 10, socre: 0.640625
pos: 2, c: 0, socre: 0.5390625
pos: 2, c: 1, socre: 0.6015625
pos: 2, c: 10, socre: 0.640625


In [33]:
toks = model.to_str_tokens(text)
x_labels = toks + [f"pos_{i}" for i in range(len(toks), n_positions)]
fig = px.imshow(score_matrix, y=x_labels, x=[str(c) for c in cs], color_continuous_scale="RdBu", color_continuous_midpoint=0)
fig.show()

In [34]:

sum_over_pos = score_matrix.mean(0)
sum_over_pos.shape

px.line(y=sum_over_pos, x=cs, title="Sum of scores over positions", markers=True, labels={'x': "coefficient", "y": "angry score"}).show()

In [38]:
sum_over_pos = score_matrix.mean(0)
sum_over_pos.shape


px.line(y=score_matrix[:, 2:].mean(1), x=x_labels, title="Sum of scores over positions", markers=True, labels={'x': "pos", "y": "angry score"}).show()

Compute model loss using pile-10k dataset

In [80]:
# load data manually to allow loading a subset of the data
pile_data = load_dataset("NeelNanda/pile-10k", split="train[:1%]")
pile_data = pile_data.select(range(1)) # add this because my computer is extremely slow
print(len(pile_data))

dataset = utils.tokenize_and_concatenate(pile_data, model.tokenizer, max_length=30)
data_loader = DataLoader(dataset, batch_size=1, shuffle=False, drop_last=True)

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


1


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [81]:
# loss for original forward pass

running_loss = 0
total = 0
for batch in tqdm.tqdm(data_loader):
    loss = model(batch["tokens"].to(device), return_type="loss").mean()
    running_loss += loss.item()
    total += 1
    # print(loss)

print(running_loss / total)

100%|██████████| 110/110 [00:06<00:00, 15.91it/s]

4.514362415400418





In [82]:
# # Testing code calculating loss for fixed coef and pos
# coef = 20
# pos = 0

# with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook_sae, c=coef, pos=pos))]):
#     running_loss = 0
#     total = 0
#     for batch in tqdm.tqdm(data_loader):
#         loss = model(batch["tokens"].to(device), return_type="loss").mean()
#         running_loss += loss.item()
#         total += 1
#         # print(loss)

#     print(running_loss / total)


loss_matrix = torch.zeros((n_positions, len(cs)))

for pos in range(n_positions):
    for ci, c in enumerate(cs):

        with model.hooks(fwd_hooks=[(hook_name, partial(residual_stream_patching_hook_sae, c=c, pos=pos))]):
            running_loss = 0
            total = 0
            for batch in tqdm.tqdm(data_loader):
                loss = model(batch["tokens"].to(device), return_type="loss").mean()
                running_loss += loss.item()
                total += 1
                # print(loss)
            
            loss_matrix[pos, ci] = running_loss / total

        print(f'pos: {pos}, c: {c}, loss: {loss}')

100%|██████████| 110/110 [00:06<00:00, 15.80it/s]


pos: 0, c: 0, loss: 6.086109161376953


100%|██████████| 110/110 [00:06<00:00, 17.60it/s]


pos: 0, c: 1, loss: 6.0746564865112305


100%|██████████| 110/110 [00:06<00:00, 17.63it/s]


pos: 0, c: 10, loss: 6.130812644958496


100%|██████████| 110/110 [00:06<00:00, 17.61it/s]


pos: 1, c: 0, loss: 6.086109161376953


100%|██████████| 110/110 [00:06<00:00, 17.58it/s]


pos: 1, c: 1, loss: 6.098372459411621


100%|██████████| 110/110 [00:06<00:00, 16.82it/s]


pos: 1, c: 10, loss: 6.28828239440918


100%|██████████| 110/110 [00:06<00:00, 16.66it/s]


pos: 2, c: 0, loss: 6.086109161376953


100%|██████████| 110/110 [00:06<00:00, 15.93it/s]


pos: 2, c: 1, loss: 6.085055351257324


100%|██████████| 110/110 [00:06<00:00, 16.73it/s]

pos: 2, c: 10, loss: 6.163428783416748





In [94]:
# plot scatter plot of loss (x axis) vs score (y axis). with different colors for different c values, and different shapes for different pos values.

import pandas as pd
import numpy as np


n_c = len(cs)
n_pos = n_positions

cs_str = [str(c) for c in cs]
pos_str = [f"pos_{i}" for i in range(n_positions)]

# Random score and loss matrices
# np.random.seed(42)
# score_matrix = torch.randn(n_pos, n_c)
# loss_matrix = torch.randn(n_pos, n_c)

# Create a DataFrame
data = {
    'Loss': loss_matrix.numpy().flatten(),
    'Score': score_matrix.numpy().flatten(),
    'Coef': np.repeat(cs_str, n_pos), # ['a', 'b'] -> ['a', 'a', 'b', 'b']
    'Position': np.tile(pos_str, n_c) # ['a', 'b'] -> ['a', 'b', 'a', 'b']
}

df = pd.DataFrame(data)

# Map the position to marker shapes
# markers = ['circle', 'square', 'diamond', 'cross', 'x']
# df['Marker'] = df['Position'].apply(lambda x: markers[x % len(markers)])

# Plotting using Plotly Express
fig = px.scatter(
    df, x='Loss', y='Score', color='Coef',
    labels={'Coef': 'Coef', 'Marker': 'Position'},
    title='Scatter plot of Loss vs Score',
)

fig.update_traces(marker=dict(size=10))  # Adjust marker size
fig.show()


In [None]:
 # would be cool to plot attention score vs sentiment score.