In [1]:
%%capture
%pip install -e ../../..

In [1]:
from lmexp.models.implementations.gpt2small import GPT2Tokenizer, ProbedGPT2
from lmexp.generic.probing import train_probe
from lmexp.generic.caa import get_caa_vecs
from lmexp.generic.hooked_model import run_simple_steering
from datetime import datetime
import random

# Load model and tokenizer

These classes have already implemented all the probing-related methods so we won't have to add more hooks + they are ready to use with our vector extraction and steering functions.

In [3]:
model = ProbedGPT2()
tokenizer = GPT2Tokenizer()

In [4]:
model.get_n_layers()

12

# Training a linear probe

## Generate some data

Let's see whether we can get a date/time probe vector

In [14]:
def gen_labeled_text(n):
    # date as text, date as utc timestamp in seconds, sample randomly from between 1990 and 2022
    start_timestamp = datetime(2013, 1, 1).timestamp()
    end_timestamp = datetime(2016, 1, 1).timestamp()
    labeled_text = []
    for i in range(n):
        timestamp = start_timestamp + (end_timestamp - start_timestamp) * random.random()
        date = datetime.fromtimestamp(timestamp)
        # date like "Monday 15th November 2021 8AM"
        text = date.strftime("Today is a %A. It's the %dth of %B, %Y. The time is %I %p. This is the point in time when")
        label = timestamp
        labeled_text.append((text, label))
    # normalize labels to have mean 0 and std 1
    labels = [label for _, label in labeled_text]
    mean = sum(labels) / len(labels)
    std = (sum((label - mean) ** 2 for label in labels) / len(labels)) ** 0.5
    labeled_text = [(text, (label - mean) / std) for text, label in labeled_text]
    return labeled_text

In [16]:
data = gen_labeled_text(10_000)
print(data[0])

("Today is a Sunday. It's the 19th of April, 2015. The time is 08 AM. This is the point in time when", 0.9344513470153523)


## Training

In [19]:
probe = train_probe(
    labeled_text=data,
    model=model,
    tokenizer=tokenizer,
    layer=4,
    n_epochs=20,
    batch_size=128,
    lr=1e-2,
    save_to=None,
    token_position=-2
)

running on device: cuda:0


100%|██████████| 78/78 [00:04<00:00, 15.70it/s]


Epoch 0, mean loss: 4.097890421295166


100%|██████████| 78/78 [00:04<00:00, 15.84it/s]


Epoch 1, mean loss: 0.9344183967590332


100%|██████████| 78/78 [00:04<00:00, 15.87it/s]


Epoch 2, mean loss: 0.8330812377929687


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 3, mean loss: 0.7460678833007812


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 4, mean loss: 0.6499895660400391


100%|██████████| 78/78 [00:04<00:00, 15.87it/s]


Epoch 5, mean loss: 0.5841311962127685


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 6, mean loss: 0.4985210781097412


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 7, mean loss: 0.459724365234375


100%|██████████| 78/78 [00:04<00:00, 15.89it/s]


Epoch 8, mean loss: 0.3679254432678223


100%|██████████| 78/78 [00:04<00:00, 15.89it/s]


Epoch 9, mean loss: 0.3543036449432373


100%|██████████| 78/78 [00:04<00:00, 15.89it/s]


Epoch 10, mean loss: 0.29407937202453616


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 11, mean loss: 0.25365611743927


100%|██████████| 78/78 [00:04<00:00, 15.87it/s]


Epoch 12, mean loss: 0.2445849018096924


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 13, mean loss: 0.2106756420135498


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 14, mean loss: 0.2012092818260193


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 15, mean loss: 0.21058159761428832


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 16, mean loss: 0.1422152379989624


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 17, mean loss: 0.13628376512527465


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]


Epoch 18, mean loss: 0.11741883687973022


100%|██████████| 78/78 [00:04<00:00, 15.88it/s]

Epoch 19, mean loss: 0.1060804295539856





## Using the vector

In [20]:
direction = probe.weight[0]
bias = probe.bias

In [21]:
bias

Parameter containing:
tensor([0.0246], device='cuda:0', requires_grad=True)

In [34]:
run_simple_steering(
    text=["The current date is"],
    model=model,
    tokenizer=tokenizer,
    layer=4,
    multiplier=-3,
    vector=direction.detach(),
    max_n_tokens=10,
    save_to=None
)

[{'input': 'The current date is',
  'output': 'The current date is the date of the first occurrence',
  'layer': 4,
  'multiplier': -3}]

In [36]:
run_simple_steering(
    text=["The current date is"],
    model=model,
    tokenizer=tokenizer,
    layer=4,
    multiplier=4,
    vector=direction.detach(),
    max_n_tokens=10,
    save_to=None
)

[{'input': 'The current date is',
  'output': 'The current date is the next year, the next',
  'layer': 4,
  'multiplier': 4}]

# CAA

## Let's get some contrast pairs

Let's try an easy direction - positive vs negative sentiment

In [37]:
GOOD = [
    "The weather is really nice",
    "I'm so happy",
    "This cake is absolutely delicious",
    "I love my friends",
    "I'm feeling great",
    "I'm so excited",
    "This is the best day ever",
    "I really like this gift",
    "Croissants are my favorite",
    "The movie was fantastic",
    "I got a promotion at work",
    "My vacation was amazing",
    "The concert exceeded my expectations",
    "I'm grateful for my family",
    "This book is incredibly engaging",
    "The restaurant service was excellent",
    "I'm proud of my accomplishments",
    "The sunset is breathtakingly beautiful",
    "I passed my exam with flying colors",
    "This coffee tastes perfect",
]

BAD = [
    "The weather is really bad",
    "I'm so sad",
    "This cake is completely inedible",
    "I hate my enemies",
    "I'm feeling awful",
    "I'm so anxious",
    "This is the worst day ever",
    "I dislike this gift",
    "Croissants are disgusting",
    "The movie was terrible",
    "I got fired from work",
    "My vacation was a disaster",
    "The concert was a huge disappointment",
    "I'm frustrated with my family",
    "This book is incredibly boring",
    "The restaurant service was horrible",
    "I'm ashamed of my mistakes",
    "The weather is depressingly gloomy",
    "I failed my exam miserably",
    "This coffee tastes awful",
]

In [38]:
dataset = [
    (text, True) for text in GOOD
] + [
    (text, False) for text in BAD
]

## Getting the CAA vectors

In [39]:
vectors = get_caa_vecs(
    labeled_text=dataset,
    model=model,
    tokenizer=tokenizer,
    layers=range(3, 8),
    save_to=None              
)

100%|██████████| 40/40 [00:00<00:00, 109.17it/s]


## Using the CAA vectors

In [40]:
run_simple_steering(
    text=["I think that this cat is"],
    model=model,
    tokenizer=tokenizer,
    layer=6,
    multiplier=-2,
    vector=vectors[6],
    max_n_tokens=20,
    save_to=None,
)

[{'input': 'I think that this cat is',
  'output': "I think that this cat is a bit of a mess. It's not a good cat. It",
  'layer': 6,
  'multiplier': -2}]

In [41]:
run_simple_steering(
    text=["I think that this cat is"],
    model=model,
    tokenizer=tokenizer,
    layer=6,
    multiplier=2,
    vector=vectors[6],
    max_n_tokens=20,
    save_to=None,
)

[{'input': 'I think that this cat is',
  'output': 'I think that this cat is a great addition to the collection of cats that we have in our collection',
  'layer': 6,
  'multiplier': 2}]