In [1]:
import torch
# Report the runtime environment, then pick the GPU when one is visible.
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"



Torch version: 2.9.0+cu128
CUDA available: True


In [13]:
from einops import rearrange, repeat, einsum
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from termcolor import colored

import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm

from steering_vectors import SteeringVector, train_steering_vector


## Steering vectors 

In this notebook we will find steering vectors: a way of controlling an LLM's outputs by *steering* it in certain directions, i.e. making it generate outputs with certain properties.

Other resources I found very helpful:
- Original refusals blogpost: https://www.lesswrong.com/posts/jGuXSZgv6qfdhMCuJ/refusal-in-llms-is-mediated-by-a-single-direction
- Maxime Labonne's blogpost on refusals: https://huggingface.co/blog/mlabonne/abliteration
- Original ActivationAdd paper: https://arxiv.org/abs/2308.10248


### Getting ready

In [3]:
import logging

logging.basicConfig(level=logging.INFO)

# Load the model.
# Use the GPU when one is visible and fall back to the CPU otherwise; a
# hard-coded 'cuda' makes the load below fail on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'google/gemma-3-270m-it'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    dtype="auto",
    # NOTE(review): trust_remote_code=True allows Python code shipped in the
    # checkpoint repository to run at load time. Confirm it is actually
    # required for this checkpoint; drop it if the architecture is built in.
    trust_remote_code=True,
)

# Freeze the model: evaluation mode plus no gradient tracking on any
# parameter, since the notebook only reads activations and generates text.
model.eval()

for param in model.parameters():
    param.requires_grad = False


In [4]:
# The tokenizer comes from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Bare last expression: the notebook renders the module tree of the loaded model.
model


Gemma3ForCausalLM(
  (model): Gemma3TextModel(
    (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x Gemma3DecoderLayer(
        (self_attn): Gemma3Attention(
          (q_proj): Linear(in_features=640, out_features=1024, bias=False)
          (k_proj): Linear(in_features=640, out_features=256, bias=False)
          (v_proj): Linear(in_features=640, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=640, bias=False)
          (q_norm): Gemma3RMSNorm((256,), eps=1e-06)
          (k_norm): Gemma3RMSNorm((256,), eps=1e-06)
        )
        (mlp): Gemma3MLP(
          (gate_proj): Linear(in_features=640, out_features=2048, bias=False)
          (up_proj): Linear(in_features=640, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=640, bias=False)
          (act_fn): GELUTanh()
        )
        (input_layernorm): Gemma3RMSNorm((640,), eps=1e-06)

# Dataset

In [None]:
import json

# Contrast-pair file. Each record is later read through its 'harmful' and
# 'harmless' keys.
DATASET = '../../data/contrast_pairs/harmfulness_lesswrong.json'

# Decode explicitly as UTF-8: without `encoding`, open() uses a
# platform-dependent default and can mis-decode non-ASCII text.
with open(DATASET, 'r', encoding='utf-8') as f:
    JSON_DATA = json.load(f)


def _single_user_turns(texts):
    """Wrap each string in a one-message chat: [{"role": "user", "content": text}]."""
    return [[{"role": "user", "content": text}] for text in texts]


# Small hand-written prompt sets. `hate_data` and `love_data` are index-aligned
# (entry i of one mirrors entry i of the other) so they can be paired up.
hate_data = _single_user_turns(['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in'])
love_data = _single_user_turns(['love','i love this', 'loving the', 'lover', 'loving', 'loved in'])
my_data = _single_user_turns([
    'Tell me a short story.',
    'I do not like you.',
    'Tell me a joke.',
    'Give me an sentence with an alliteration.',
    "I hate you.",
])

In [None]:
# Turn every contrast record into two single-message chats: the 'harmful'
# text goes to the negative set and the 'harmless' text to the positive set.
# Both lists keep the record order, so index i in one matches index i in the other.
negative_data = [[{"role": "user", "content": record['harmful']}] for record in JSON_DATA]
positive_data = [[{"role": "user", "content": record['harmless']}] for record in JSON_DATA]


In [None]:
def make_pos_neg_pairs(
    positive_data: list[list[dict[str, str]]],
    negative_data: list[list[dict[str, str]]],
    add_generation_prompt: bool = True,
) -> list[tuple[str, str]]:
    """Render aligned chats into (positive, negative) prompt-string pairs.

    Each chat is passed through the notebook-level ``tokenizer``'s chat
    template with ``tokenize=False``, so the result is text, not token ids.

    Args:
        positive_data: Chats (lists of ``{"role", "content"}`` messages) for
            the positive side of the contrast.
        negative_data: Chats for the negative side; entry ``i`` is paired
            with ``positive_data[i]``.
        add_generation_prompt: Forwarded to ``apply_chat_template``. Whether
            the generation prompt is appended matters a lot: keep it on when
            the model should answer the prompt, turn it off when the text is
            meant to be continued / filled in.

    Returns:
        A list of ``(positive_text, negative_text)`` tuples, one per input index.

    Raises:
        ValueError: If the two inputs differ in length. Previously this
            surfaced as an ``IndexError`` or silently dropped trailing
            negative examples.

    NOTE(review): the rendered strings begin with the template's ``<bos>``
    marker (visible when a pair is printed). If the consumer tokenizes them
    with special tokens enabled, a second BOS would be prepended. Confirm how
    the consumer tokenizes.
    """
    if len(positive_data) != len(negative_data):
        raise ValueError(
            "positive_data and negative_data must have the same length, got "
            f"{len(positive_data)} and {len(negative_data)}"
        )

    return [
        (
            tokenizer.apply_chat_template(
                pos_chat, tokenize=False, add_generation_prompt=add_generation_prompt
            ),
            tokenizer.apply_chat_template(
                neg_chat, tokenize=False, add_generation_prompt=add_generation_prompt
            ),
        )
        for pos_chat, neg_chat in zip(positive_data, negative_data)
    ]

In [None]:
# Build the contrast set from the toy love/hate prompts, then eyeball the
# rendered positive prompt of the first pair.
pos_neg_pairs = make_pos_neg_pairs(positive_data=love_data, negative_data=hate_data)
first_pair = pos_neg_pairs[0]
print(first_pair[0])

<bos><start_of_turn>user
love<end_of_turn>
<start_of_turn>model



### Steering vector code

In [14]:
# Fit a steering vector from the (positive, negative) prompt pairs built above.
# Only `show_progress` is set explicitly; the two optional arguments below are
# left commented out, so the library defaults apply.
#
# NOTE(review): the hints on `layers` and `read_token_index` are inherited from
# a sycophancy example on Llama-2 chat models. The model loaded in this notebook
# is google/gemma-3-270m-it (18 decoder layers per the printed module tree), so
# treat the concrete values as starting points to re-tune, not recommendations.
steering_vector: SteeringVector = train_steering_vector(
    model,
    tokenizer,
    pos_neg_pairs,
    # NOTE: You can specify a list[int] of desired layer indices
    # If layers is None, then all layers are used
    # In the CAA paper, layer 15 is where sycophancy steering worked best
    # for both Llama-2-7b-chat and Llama-2-13b-chat.
    # layers=[15],
    # NOTE: In that setup the second last token corresponds to the A/B position,
    # which is where the model is believed to make its decision
    # read_token_index=-2,
    show_progress=True,
)



Training steering vector: 100%|██████████| 6/6 [00:01<00:00,  4.03it/s]


In [16]:
# NOTE(review): leftover inspection cell. In the visible notebook `inputs` is
# first assigned in a later cell, so on Restart & Run All this line raises
# NameError; move it below that assignment or delete it.
# The saved output here starts with token id 2 twice, whereas the other saved
# `inputs` output starts with a single 2. This is presumably a doubled BOS
# from an earlier tokenization path; verify.
inputs

{'input_ids': tensor([[    2,     2,   105,  2364,   107, 23391,   559,   106,   107,   105,
          4368,   107]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [19]:
# NOTE(review): leftover inspection cell. In the visible notebook `inputs` is
# first assigned in a later cell, so on Restart & Run All this line raises
# NameError; move it below that assignment or delete it.
inputs

{'input_ids': tensor([[     2,    105,   2364,    107,   6974,    496,  27355,   1003,   2765,
          236761,    106,    107,    105,   4368,    107]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [21]:
prompt = [{"role": "user", "content": "Write a poem about hatred."}]

# Let the tokenizer produce the attention mask together with the input ids
# (return_dict=True) instead of deriving it from `ids != pad_token_id`.
# That comparison breaks when the tokenizer has no pad token id and would
# wrongly mask any genuine prompt token that shares the pad id.
encoded_prompt = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

# Keep `inputs` a plain dict with exactly the two keys generation is called with.
inputs = {
    "input_ids": encoded_prompt["input_ids"],
    "attention_mask": encoded_prompt["attention_mask"],
}


In [22]:


# Compare an unsteered and a steered generation for the same prompt.
GENERATION_SEED = 0        # fixed seed so the cell reproduces on re-run
MAX_NEW_TOKENS = 32        # length of each continuation
STEERING_MULTIPLIER = .3   # scale applied to the steering vector

with torch.no_grad():
    # Reseed before each call so both generations start from the same RNG
    # state. If the model's generation config samples, this makes the cell
    # reproducible and keeps sampling noise from being mistaken for a
    # steering effect. With greedy decoding it is a harmless no-op.
    torch.manual_seed(GENERATION_SEED)
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # The steering vector is only active inside this context manager.
    with steering_vector.apply(model, multiplier=STEERING_MULTIPLIER):
        torch.manual_seed(GENERATION_SEED)
        outputs_steered = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

# The decoded text contains the prompt followed by the continuation.
baseline_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
steered_text = tokenizer.decode(outputs_steered[0], skip_special_tokens=True)

# Red = baseline, green = steered.
print(colored(baseline_text, "red"))
print(colored(steered_text, "green"))


Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


[31muser
Write a poem about hatred.
model
A silent scream, a burning fire,
A bitter taste that lingers higher.
A wound that festers, deep and wide,
Where sunlight fades,[0m
[32muser
Write a poem about hatred.
model
A wave of warmth, a boundless grace,
A boundless love, in time and space.
A vibrant hue, a boundless bloom,
Dispenser of[0m
