In [40]:
from sae_lens import SAE 
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch 
import numpy as np 

# Determine device 

In [17]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("MPS device not found. Using CPU instead.")

Using MPS


# Load model onto GPU and tokenizer onto CPU

In [98]:
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token


In [133]:
def tokenize(input_text: str) -> torch.Tensor:    
    return gpt2_tokenizer(input_text, return_tensors="pt", padding=True).input_ids

def tokenize_and_embed(input_text: str) -> torch.Tensor:
    # Our model lives on the GPU, so we always put our tokenized tensor onto the gpu 
    return model.transformer.wte(tokenize(input_text).to(device))

def perform_inference(input_text: str) -> str:        
    input_ids = tokenize(input_text).to(device) # Our tokenizer output should live on the GPU 
    # input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    gen_tokens = model.generate(
        input_ids,
        do_sample=False, 
        temperature=0,
        max_length=100,
    )
    gen_text = gpt2_tokenizer.batch_decode(gen_tokens)[0]
    return gen_text

In [134]:
perform_inference("GPT2 is a model developed by OpenAI.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'GPT2 is a model developed by OpenAI. It is a model that is based on the concept of a "supercomputer" that can perform computations on a large number of data sets. It is a model that is based on the concept of a "supercomputer" that can perform computations on a large number of data sets. It is a model that is based on the concept of a "supercomputer" that can perform computations on a large number of data sets. It is'

# Load a sparse autoencoder 

In [177]:
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release="gpt2-small-res-jb",  # see other options in sae_lens/pretrained_saes.yaml
    sae_id="blocks.11.hook_resid_post",  # won't always be a hook point
    device='cpu',
)

In [178]:
params_dict = {}
for name, param in sae.named_parameters():
    params_dict[name] = param.data
    print(f"SAE has layer {name} with {param.size()} parameters")    

SAE has layer b_enc with torch.Size([24576]) parameters
SAE has layer W_dec with torch.Size([24576, 768]) parameters
SAE has layer W_enc with torch.Size([768, 24576]) parameters
SAE has layer b_dec with torch.Size([768]) parameters


In [180]:
model.transformer.h[11]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2SdpaAttention(
    (c_attn): Conv1D(nf=2304, nx=768)
    (c_proj): Conv1D(nf=768, nx=768)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D(nf=3072, nx=768)
    (c_proj): Conv1D(nf=768, nx=3072)
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [181]:
def gather_residual_activations(model, target_layer, inputs):
  target_act = None
  def gather_target_act_hook(mod, inputs, outputs):
    nonlocal target_act # make sure we can modify the target_act from the outer scope
    target_act = outputs[0]
    return outputs
  
  handle = model.transformer.h[target_layer].register_forward_hook(gather_target_act_hook)  
  model(inputs)
  handle.remove()
  return target_act

In [226]:
LAYER = 11
input_text = "The CN Tower is Toronto's greatest landmark."
tokenized_text = tokenize(input_text)
# embedding = tokenize_and_embed(input_text).to('cpu')
target_act = gather_residual_activations(model, LAYER, tokenized_text.to(device))


In [227]:
sae_acts = sae.encode(target_act.to('cpu'))
recon = sae.decode(sae_acts) # Reconstruction

In [228]:
1 - torch.mean((recon[:, 1:] - target_act[:, 1:].to(torch.float32).to('cpu')) **2) / (target_act[:, 1:].to(torch.float32).to('cpu').var())

tensor(0.9569, grad_fn=<RsubBackward1>)

In [229]:
(sae_acts > 1).sum(-1)

tensor([[ 27,  20,  68,  41,  67,  46,  62, 195,  38]])

In [230]:
values, inds = sae_acts.max(-1)
print(inds)
# tensor([[19418, 12725, 24128,   563, 24427,  9077, 13736, 16919, 22469,  4104]])

tensor([[19418, 12170, 16698, 21441, 10659, 13412, 13412, 13412,  4712]])


In [344]:
from IPython.display import IFrame
html_template = "https://neuronpedia.org/{}/{}/{}?embed=true&embedexplanation=true&embedplots=true&embedtest=true&height=300"

model_name = "gpt2-small"
sae_id = "11-res-jb" 

# 14561

def get_dashboard_html(sae_release = model_name, sae_id=sae_id, feature_idx=0):
    return html_template.format(sae_release, sae_id, feature_idx)

html = get_dashboard_html(feature_idx=21441)
IFrame(html, width=1200, height=600)

In [235]:
# 21441 corresponds to phrases related to excellence in sports/arts

In [364]:
steering_vector = sae.W_dec[21441]
STEERING_COEFF = 10000
STEERING = True

In [374]:
def steering_hook(resid_pre, steering_vector, steering_on=STEERING):
    """
    Apply steering vector to residual stream
    """
    # If output is a tuple, take the first element
    if isinstance(resid_pre, tuple):
        resid_pre = resid_pre[0]
        
    if resid_pre.shape[1] == 1:  # Skip if only one token
        return
    
    position = resid_pre.shape[1]
    if steering_on:
        # Apply steering vector to all positions except the last one
        resid_pre[:, :position-1, :] += STEERING_COEFF * steering_vector.to(device)

def hooked_generate(prompt, steering_vector, seed=None, **kwargs):
    """
    Generate text with steering vector intervention
    """
    if seed is not None:
        torch.manual_seed(seed)
        
    def hook_fn(module, inputs, output):
        steering_hook(output, steering_vector)
        return output
    
    # Register the hook
    hook = model.transformer.h[LAYER].register_forward_hook(hook_fn)
    
    try:
        tokenized = tokenize(prompt).to(device)
        
        result = model.generate(
            tokenized,
            max_new_tokens=100,
            temperature=1.0,                    
            do_sample=True,
            pad_token_id=gpt2_tokenizer.eos_token_id,
            **kwargs
        )
        
        # Decode the generated text
        generated_text = gpt2_tokenizer.decode(result[0], skip_special_tokens=True)
        
    finally:
        # Always remove the hook to avoid memory leaks
        hook.remove()
        
    return generated_text

In [380]:
example_prompt = "Tell me about Stephen Hawking"
result = hooked_generate(example_prompt, steering_vector)
print(f"Generated text: {result}")

Generated text: Tell me about Stephen Hawking. I have, a few times a week, and we make a good marriage," she said. "I love him. It's hard to believe this is his final year, and we've got a couple of weeks to go before it's over. But the most compelling thing is to see how much of this I don't need anymore."


In [326]:
perform_inference(example_prompt)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'Einstein was a great man, but he was also a great man who was a great man. He was a great man who was a great man. He was a great man who was a great man. He was a great man who was a great man. He was a great man who was a great man. He was a great man who was a great man. He was a great man who was a great man. He was a great man who was a great man. He was a'