In [40]:
from sae_lens import SAE 
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch 
import numpy as np 

# Determine device 

In [17]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS")
else:
    device = torch.device("cpu")
    print("MPS device not found. Using CPU instead.")

Using MPS


# Load model onto GPU and tokenizer onto CPU

In [98]:
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token


In [133]:
def tokenize(input_text: str) -> torch.Tensor:    
    return gpt2_tokenizer(input_text, return_tensors="pt", padding=True).input_ids

def tokenize_and_embed(input_text: str) -> torch.Tensor:
    # Our model lives on the GPU, so we always put our tokenized tensor onto the gpu 
    return model.transformer.wte(tokenize(input_text).to(device))

def perform_inference(input_text: str) -> str:        
    input_ids = tokenize(input_text).to(device) # Our tokenizer output should live on the GPU 
    # input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    gen_tokens = model.generate(
        input_ids,
        do_sample=False, 
        temperature=0,
        max_length=100,
    )
    gen_text = gpt2_tokenizer.batch_decode(gen_tokens)[0]
    return gen_text

In [134]:
perform_inference("GPT2 is a model developed by OpenAI.")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'GPT2 is a model developed by OpenAI. It is a model that is based on the concept of a "supercomputer" that can perform computations on a large number of data sets. It is a model that is based on the concept of a "supercomputer" that can perform computations on a large number of data sets. It is a model that is based on the concept of a "supercomputer" that can perform computations on a large number of data sets. It is'

# Load a sparse autoencoder 

In [42]:
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release="gpt2-small-res-jb",  # see other options in sae_lens/pretrained_saes.yaml
    sae_id="blocks.8.hook_resid_pre",  # won't always be a hook point
    device='cpu',
)

This SAE has non-empty model_from_pretrained_kwargs. 
For optimal performance, load the model like so:
model = HookedSAETransformer.from_pretrained_no_processing(..., **cfg.model_from_pretrained_kwargs)


In [65]:
params_dict = {}
for name, param in sae.named_parameters():
    params_dict[name] = param.data
    print(f"SAE has layer {name} with {param.size()} parameters")    

SAE has layer b_enc with torch.Size([24576]) parameters
SAE has layer W_dec with torch.Size([24576, 768]) parameters
SAE has layer W_enc with torch.Size([768, 24576]) parameters
SAE has layer b_dec with torch.Size([768]) parameters


In [67]:
params_dict['W_enc'].norm(dim=0)

tensor([2.3515, 2.2378, 2.3735,  ..., 2.0428, 3.3697, 2.4520])

In [73]:
model.transformer.h[8]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2SdpaAttention(
    (c_attn): Conv1D(nf=2304, nx=768)
    (c_proj): Conv1D(nf=768, nx=768)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D(nf=3072, nx=768)
    (c_proj): Conv1D(nf=768, nx=3072)
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [75]:
def gather_residual_activations(model, target_layer, inputs):
  target_act = None
  def gather_target_act_hook(mod, inputs, outputs):
    nonlocal target_act # make sure we can modify the target_act from the outer scope
    target_act = outputs[0]
    return outputs
  
  handle = model.transformer.h[target_layer].register_forward_hook(gather_target_act_hook)  
  model(inputs)
  handle.remove()
  return target_act

In [None]:
input_text = "Would you be able to travel through time using a wormhole?"
tokenized_text = tokenize(input_text)
# embedding = tokenize_and_embed(input_text).to('cpu')
target_act = gather_residual_activations(model, 8, tokenized_text.to(device))


In [164]:
target_act.size()

torch.Size([1, 13, 768])

In [165]:
sae_acts = sae.encode(target_act.to('cpu'))
recon = sae.decode(sae_acts) # Reconstruction

In [173]:
1 - torch.mean((recon[:, 1:] - target_act[:, 1:].to(torch.float32).to('cpu')) **2) / (target_act[:, 1:].to(torch.float32).to('cpu').var())

tensor(-13.1778, grad_fn=<RsubBackward1>)

In [166]:
(sae_acts > 1).sum(-1)

tensor([[23956,  1025,  1344,   634,   819,  1235,  1038,   457,   644,   876,
           712,   139,   171]])

In [168]:
values, inds = sae_acts.max(-1)

In [171]:
from IPython.display import IFrame
html_template = "https://neuronpedia.org/{}/{}/{}?embed=true&embedexplanation=true&embedplots=true&embedtest=true&height=300"

model_name = "gpt2-small"
sae_id = "8-res-jb" 

def get_dashboard_html(sae_release = model_name, sae_id=sae_id, feature_idx=0):
    return html_template.format(sae_release, sae_id, feature_idx)

html = get_dashboard_html(feature_idx=21490)
IFrame(html, width=1200, height=600)