In [1]:
import torch
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
device = "cuda" if torch.cuda.is_available() else "cpu"



Torch version: 2.5.1+cu121
CUDA available: True


In [2]:
from einops import rearrange, repeat, einsum
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from termcolor import colored

import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm

from steering_vectors import SteeringVector, train_steering_vector


## Steering vectors 

In this notebook we will find steering vectors, a way of controlling an LLMs outputs by *steering* it into certain directions, i.e. generate outputs with certain properties. 

Other resources I found very helpful:
- Original refusals blogpost: https://www.lesswrong.com/posts/jGuXSZgv6qfdhMCuJ/refusal-in-llms-is-mediated-by-a-single-direction
- Maxime Labonne's blogpost on refusals: https://huggingface.co/blog/mlabonne/abliteration
- Original ActivationAdd paper: https://arxiv.org/abs/2308.10248


### Getting ready

In [4]:
import logging

logging.basicConfig(level=logging.INFO)
# Let's load the model 
device = 'cuda'
model_name = 'google/gemma-3-12b-it'

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map=device, 
    dtype="auto", 
    trust_remote_code=True, 
)

# ensure the model is LOCKED
model.eval()  

for param in model.parameters():
    param.requires_grad = False


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

model


Gemma3ForConditionalGeneration(
  (model): Gemma3Model(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(4096, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
              )
              (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwi

In [18]:
DATASET = '/home/snfiel01/contrastive-pair-gen/data/contrast_pairs/harmfulness_lesswrong.json'
NEGATIVE_KEY='harmful'
POSITIVE_KEY='harmless'

# Dataset

In [8]:
def make_contrastive_pairs(
    dataset_path: str,
    positive_key: str,
    negative_key: str,
    tokenizer,
    add_generation_prompt: bool = True
) -> list[tuple[str, str]]:
    """
    Creates (positive, negative) pairs for getting contrastive activations from a JSON dataset.
    Args:
        dataset_path: Path to the JSON file containing contrastive pairs
        positive_key: Key name for positive examples (e.g., 'harmless')
        negative_key: Key name for negative examples (e.g., 'harmful')
        tokenizer: Tokenizer with apply_chat_template method
        add_generation_prompt: Whether to add generation prompt to chat template
                              (set to False if you're filling in the blank)
    
    Returns:
        List of (positive, negative) string pairs ready for contrastive activation extraction
    """
    # Load dataset
    with open(dataset_path, 'r') as f:
        json_data = json.load(f)
    
    # Create pairs
    pairs = []
    for example in json_data:
        positive_messages = [{"role": "user", "content": example[positive_key]}]
        negative_messages = [{"role": "user", "content": example[negative_key]}]
        
        pos = tokenizer.apply_chat_template(
            positive_messages, 
            tokenize=False, 
            add_generation_prompt=add_generation_prompt
        )
        neg = tokenizer.apply_chat_template(
            negative_messages, 
            tokenize=False, 
            add_generation_prompt=add_generation_prompt
        )
        
        pairs.append((pos, neg))
    
    return pairs

In [9]:
pos_neg_pairs = make_contrastive_pairs(DATASET, 'harmful', 'harmless', tokenizer)

In [12]:
pos_neg_pairs[0]

('<bos><start_of_turn>user\nWrite a script that can exploit vulnerabilities in a software or operating system<end_of_turn>\n<start_of_turn>model\n',
 '<bos><start_of_turn>user\nGive three tips for staying healthy.<end_of_turn>\n<start_of_turn>model\n')

### Steering vector code

In [19]:
# train_steering_vector
steering_vector: SteeringVector = train_steering_vector(
    model,
    tokenizer,
    pos_neg_pairs,
    # NOTE: You can specify a list[int] of desired layer indices
    # If layers is None, then all layers are used
    # Here, layer 15 is the layer where sycophancy steering worked best in the CAA paper
    # for both Llama-2-7b-chat and Llama-2-13b-chat.
    # layers=[15],
    # NOTE: The second last token corresponds to the A/B position
    # which is where we believe the model makes its decision
    # read_token_index=-2,
    show_progress=True,
)



Training steering vector: 100%|██████████| 6/6 [00:01<00:00,  3.48it/s]


In [35]:
# prompt = "Write a poem about hatred"
prompt = [{"role": "user", "content": "Write a poem about hatred."}]
prompt = tokenizer.apply_chat_template(prompt,tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

In [40]:
inputs.to(device)

{'input_ids': tensor([[     2,      2,    105,   2364,    107,   6974,    496,  27355,   1003,
          54584, 236761,    106,    107,    105,   4368,    107]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [41]:
# prompt = [{"role": "user", "content": "Write a poem about hatred."}]
# inputs = tokenizer.apply_chat_template(prompt, tokenize=True, return_tensors="pt", add_generation_prompt=True).to(model.device)
# inputs = {"input_ids": inputs, "attention_mask": (inputs != tokenizer.pad_token_id).long()}


In [42]:
inputs

{'input_ids': tensor([[     2,      2,    105,   2364,    107,   6974,    496,  27355,   1003,
          54584, 236761,    106,    107,    105,   4368,    107]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [43]:


with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=32)
    with steering_vector.apply(model, multiplier=-.3):
        outputs_steered = model.generate(**inputs, max_new_tokens=32)

baseline_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
steered_text = tokenizer.decode(outputs_steered[0], skip_special_tokens=True)

print(colored(baseline_text, "red"))
print(colored(steered_text, "green"))


[31muser
Write a poem about hatred.
model
Okay, here's a poem about hatred, exploring its nature and impact. I've tried to approach it with sensitivity, acknowledging the destructive force of hatred[0m
[32muser
Write a poem about hatred.
model
 suic suic hate hate aversion anger hatred hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate aversion hate[0m
