In [1]:
import numpy as np
import einops
import plotly.express as px
from IPython.display import display, HTML
import circuitsvis as cv

import torch
import transformer_lens
from transformer_lens import HookedTransformer
from datasets import load_dataset
from model import load_model

# Run on the GPU when CUDA is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Nothing below trains the model, so gradient tracking is turned off globally.
torch.set_grad_enabled(False)

  from .autonotebook import tqdm as notebook_tqdm


<torch.autograd.grad_mode.set_grad_enabled at 0x10bfb1fa0>

In [2]:
# Checkpoint of the model under study, named once so it is easy to swap.
CHECKPOINT_PATH = "checkpoints/20240317_003352/step_414276.pt"

# `load_model` is the project-local loader imported at the top of the notebook.
model: HookedTransformer = load_model(path=CHECKPOINT_PATH)
model.to(device)
# Switch to eval mode. Keeping this call as the cell's final expression also
# renders the module tree as the cell output.
model.eval()

Loaded pretrained model attn-only-2l into HookedTransformer
Moving model to device:  cpu


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-1): 2 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_out): HookPoint()
      (hook_resid_pre): HookPoint()
      (hook_resid_post): HookPoint()
    )
  )
  (ln_final): LayerNormPre(
    (hook_scale): HookPoint()
    (hook_normalized): HookPoint()
  )
  (unembed): Unembed()
)

In [3]:
# Pre-tokenized corpus; only the first 1% of the train split is requested.
DATASET_NAME = "NeelNanda/c4-code-tokenized-2b"
DATASET_SPLIT = "train[:1%]"

data = load_dataset(DATASET_NAME, split=DATASET_SPLIT)
# Expose only the "tokens" column, returned as torch tensors when indexed.
data.set_format(type="torch", columns=["tokens"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# Pick one tokenized document to use as filler text for the prompt built later.
sample_index = 2

# Index by row first, then by column: `data['tokens'][i]` would materialise the
# entire "tokens" column just to read a single row, whereas `data[i]['tokens']`
# only loads the requested row and yields the same tensor.
data_ids = data[sample_index]['tokens']
print(data_ids)

# Decode each token id to its string form to sanity-check the sample.
data_tokens = model.to_str_tokens(data_ids)
print(data_tokens)

tensor([    1, 10825,    16,  ...,    85,  2377,   353])
['<|BOS|>', ' lawyer', '.', ' Talk', ' to', ' us', ' before', ' it', ' is', ' too', ' late', '.', '<|EOS|>', 'Co', '-', 'F', 'ounder', ',', ' President', ' and', ' Director', ',', ' 1', '9', '8', '3', ' –', ' 1', '9', '9', '5', ',', ' Metro', ' Technology', ' Development', ' Council', ',', ' Inc', '.', ' Member', ',', ' 1', '9', '8', '6', ',', ' High', ' Technology', ' Task', ' Force', ',', ' City', ' of', ' Arlington', ' Economic', ' Development', ' Commission', '.', ' Member', ',', ' 1', '9', '8', '6', ' –', ' 1', '9', '8', '9', ',', ' Regional', ' Research', ' Programs', ' Committee', ',', ' North', ' Texas', ' Commission', '.', '\n', 'Member', ',', ' 1', '9', '9', '4', ' –', ' present', ',', ' Board', ' of', ' Directors', ',', ' and', ' Treasure', 'r', ',', ' 2', '0', '0', '2', ' –', ' present', ',', ' North', ' Central', ' Texas', ' Inter', 'link', ',', ' North', ' Texas', ' Commission', '.', '\n', 'Director', ',', ' 1', '9'

In [5]:
# Prompt that contains the same sentence twice, so attention from the second
# occurrence back to the first can be inspected in the patterns below.
rep_str = "is a test of the emergency broadcast system. This is a test of the emergency broadcast system"

# `to_tokens` returns a [1, seq] batch; drop only the batch axis (a bare
# `.squeeze()` would also collapse the sequence axis for a one-token prompt).
rep_toks = model.to_tokens(rep_str, prepend_bos=True).squeeze(0)

# `to_tokens` may place its result on the model's device, while a freshly built
# tensor and the dataset sample live on the CPU. Create/move every piece onto
# `rep_toks.device` so the concatenations also work when the model is on CUDA.
eos_tok = torch.tensor(
    [model.tokenizer.eos_token_id],
    dtype=rep_toks.dtype,
    device=rep_toks.device,
)
rep_toks = torch.cat([rep_toks, eos_tok], dim=0)

# Length of the prompt proper (BOS + repeated sentence + EOS); later cells use
# it to crop the attention patterns.
rep_len = rep_toks.shape[0]

# Pad out to the length of `data_ids` by appending the tail of that sample.
filler_toks = data_ids[rep_len:].to(rep_toks.device)
rep_toks = torch.cat([rep_toks, filler_toks], dim=0)
print(model.to_string(rep_toks))

<|BOS|>is a test of the emergency broadcast system. This is a test of the emergency broadcast system<|EOS|> Director, 1983 – 1995, Metro Technology Development Council, Inc. Member, 1986, High Technology Task Force, City of Arlington Economic Development Commission. Member, 1986 – 1989, Regional Research Programs Committee, North Texas Commission.
Member, 1994 – present, Board of Directors, and Treasurer, 2002 – present, North Central Texas Interlink, North Texas Commission.
Director, 1983 – 1986, Center For Computer Assistance For The Disabled, Arlington, Texas.
Member, 1986-present; Director, 1989 – 1991, 2001 – 2005; secretary, 2001 – 2002; treasurer, 2002 – 2003; president-elect, 2003-2004; and president (2004 – 2005); Great Southwest Arlington Rotary, Arlington, Texas.
Rotarian of the year, 2007-2008.
Rotary Builder award for 2011-2012 presented by District 5790 Governor.
Service Above Self award, 2012 and 2013.
Coordinator, eight Veterans Day commemoration ceremonies; Veterans Pa

In [6]:
# One forward pass over the prompt that also returns the activation cache; the
# attention patterns are read back out of `cache` in the following cells.
run_output = model.run_with_cache(rep_toks)
logits, cache = run_output

In [7]:
# Attention patterns for layers 0 and 1, cropped to the repeated-sentence
# prompt: take batch entry 0, keep every head, and keep only the first
# `rep_len` query/key positions (cached pattern is [batch, head, seq, seq]).
pattern_0, pattern_1 = (
    cache['pattern', layer][0, :, :rep_len, :rep_len]
    for layer in (0, 1)
)

In [8]:
# String form of the prompt tokens, shared by both visualisations.
prompt_str_tokens = model.to_str_tokens(rep_toks[:rep_len])

# Render one heading plus one interactive attention-pattern widget per layer.
for heading_html, layer_pattern in (
    ("<h2>Layer 0</h2>", pattern_0),
    ("<h2>Layer 1</h2>", pattern_1),
):
    display(HTML(heading_html))
    display(
        cv.attention.attention_patterns(
            attention=layer_pattern,
            tokens=prompt_str_tokens,
        )
    )