# Summary

Follows on from Annotate.ipynb, focussing on tracking vectors across layers and positions.

# Setup
(No need to read)

In [None]:
# Pin NumPy to the 1.23 release series before the other packages are installed.
%pip install "numpy == 1.23.*"



In [None]:
%pip install git+https://github.com/neelnanda-io/TransformerLens.git


Collecting git+https://github.com/neelnanda-io/TransformerLens.git
  Cloning https://github.com/neelnanda-io/TransformerLens.git to /tmp/pip-req-build-7kwnxxml
  Running command git clone --filter=blob:none --quiet https://github.com/neelnanda-io/TransformerLens.git /tmp/pip-req-build-7kwnxxml
  Resolved https://github.com/neelnanda-io/TransformerLens.git to commit a5147baea899f16f0db34b1a7b4e3464d3fd4b30
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting accelerate>=0.23.0 (from transformer-lens==0.0.0)
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beartype<0.15.0,>=0.14.1 (from transformer-lens==0.0.0)
  Downloading beartype-0.14.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import sympy as sp
import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

In [None]:
import plotly.express as px
def imshow(tensor, renderer=None, xaxis="", yaxis="", **kwargs):
    """Show `tensor` as a heatmap on a red/blue colour scale with midpoint 0.

    `xaxis` and `yaxis` are the axis labels, any extra keyword arguments are
    forwarded to `px.imshow`, and `renderer` is passed to the figure's `show`.
    """
    axis_labels = {"x": xaxis, "y": yaxis}
    fig = px.imshow(
        utils.to_numpy(tensor),
        color_continuous_midpoint=0.0,
        color_continuous_scale="RdBu",
        labels=axis_labels,
        **kwargs,
    )
    fig.show(renderer)

def line(tensor, renderer=None, xaxis="", yaxis="", **kwargs):
    """Draw `tensor` as a line plot.

    `xaxis` and `yaxis` are the axis labels, any extra keyword arguments are
    forwarded to `px.line`, and `renderer` is passed to the figure's `show`.
    """
    axis_labels = {"x": xaxis, "y": yaxis}
    fig = px.line(utils.to_numpy(tensor), labels=axis_labels, **kwargs)
    fig.show(renderer)

def scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    """Draw a scatter plot of `y` against `x`.

    `xaxis`, `yaxis` and `caxis` label the x axis, y axis and colour axis.
    Any extra keyword arguments are forwarded to `px.scatter`, and `renderer`
    is passed to the figure's `show`.
    """
    x_values, y_values = utils.to_numpy(x), utils.to_numpy(y)
    axis_labels = {"x": xaxis, "y": yaxis, "color": caxis}
    fig = px.scatter(x=x_values, y=y_values, labels=axis_labels, **kwargs)
    fig.show(renderer)

### Model Setup

Using a small pythia model to keep diagrams simple.

In [None]:
import torch
from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained pythia-70m-deduped checkpoint through the transformers API.
hfmodel = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")

Downloading (…)lve/main/config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

In [None]:
# Build the HookedTransformer on `device`, handing it the Hugging Face model
# object loaded in the previous cell via `hf_model`.
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    hf_model=hfmodel,
    device=device,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model EleutherAI/pythia-70m-deduped into HookedTransformer


# Example

Using the sentence "Dublin is the capital city of Ireland" to explore the relationships between input tokens and the internal representation of words, and the dependency on earlier tokens as well as general knowledge.

The expected continuation token " Ireland" is not the top ranked token, but many of the high ranked tokens are in the right area.

In [None]:
# Print the tokenised prompt and answer together with the model's top-ranked
# next tokens; no BOS token is prepended to the prompt.
utils.test_prompt(
    " Dublin is the capital city of",
    "Ireland",
    model,
    prepend_bos=False,
)

Tokenized prompt: [' Dublin', ' is', ' the', ' capital', ' city', ' of']
Tokenized answer: [' Ireland']


Top 0th token. Logit: 22.56 Prob: 11.67% Token: | the|
Top 1th token. Logit: 21.26 Prob:  3.19% Token: | London|
Top 2th token. Logit: 21.12 Prob:  2.78% Token: | Dublin|
Top 3th token. Logit: 20.95 Prob:  2.33% Token: | New|
Top 4th token. Logit: 20.87 Prob:  2.17% Token: | Ireland|
Top 5th token. Logit: 20.45 Prob:  1.42% Token: | B|
Top 6th token. Logit: 20.42 Prob:  1.38% Token: | England|
Top 7th token. Logit: 20.33 Prob:  1.26% Token: | a|
Top 8th token. Logit: 20.28 Prob:  1.20% Token: | Belfast|
Top 9th token. Logit: 20.19 Prob:  1.10% Token: | Paris|


Running the same calculation manually. Note that utils.test_prompt includes the answer token in the prompt so we do the same here, but focus on the logit in the second-last position. Leaving the last token out of the prompt gives the same rankings, but affects the logit slightly.

In [None]:
# The full sentence, answer token included, tokenised without a BOS token.
utterance = " Dublin is the capital city of Ireland"
tokens = model.to_tokens(utterance, prepend_bos=False)
# Show the token ids next to their string forms.
(tokens, model.to_str_tokens(utterance, prepend_bos=False))


(tensor([[24523,   310,   253,  5347,  2846,   273, 11011]]),
 [' Dublin', ' is', ' the', ' capital', ' city', ' of', ' Ireland'])

Logits tensor has 1 batch, 7 positions, and 50304 tokens. We are focussing on the 6th position (which has index 5).

In [None]:
# Forward pass that also returns a cache of intermediate activations;
# the batch dimension is kept on the cached tensors.
logits, cache = model.run_with_cache(tokens, remove_batch_dim=False)
logits.size()


torch.Size([1, 7, 50304])

### Transformer Block 5

In [None]:
tokenid_Ireland = model.to_tokens(" Ireland", prepend_bos=False)
# Logit assigned to " Ireland" at batch 0, position 5.
logit_Ireland = logits[0, 5, tokenid_Ireland]
tokenid_Ireland, logit_Ireland


(tensor([[11011]]), tensor([[20.8713]], grad_fn=<IndexBackward0>))

The logit is the inner product between the unembedding vector for token " Ireland"
$$\textrm{unembed_Ireland} = \overline{\text{Ireland}}  $$

and the result from applying a transformer T to the embedding token for " of"
$$x_{out} = T(\text{Dublin is the capital city}) \underline{of} $$

and adding a bias term $\beta$.

That is
$$< \overline{\text{Ireland}} , T(\text{Dublin is the capital city}) \underline{of} > + \beta = 20.87 $$




In [None]:
# Row of the transposed unembedding matrix for " Ireland"; the two leading
# singleton dimensions introduced by the 2-D token-id index are dropped.
unembed_Ireland = model.W_U.T[tokenid_Ireland][0, 0]
# Output of the final layer norm at batch 0, position 5.
x_out = cache["ln_final.hook_normalized"][0, 5]
# Inner product plus the unembedding bias for the same token.
model.b_U[tokenid_Ireland] + torch.dot(unembed_Ireland, x_out)

tensor([[20.8713]], grad_fn=<AddBackward0>)

We use a convention that the residual vector at layer i, position j of the transformer is $x^i_j$.

The last step of the transformer operation is to apply Layer Normalisation (LN) to the residual vector in the last block (position 5, layer-id 5)
$$x_{out} = LN(x^5_5)$$

The standard definition for layer normalization is:
$$LN(x) = \frac{x-\textrm{E}[x]}{\sqrt{\textrm{Var}[x]+\epsilon}}*\gamma+\beta.$$
The transformer lens model rolls $\gamma$ into other weights, and bias $\beta$ seems to apply after LN.

In [None]:
# Residual vector after block 5 at batch 0, position 5.
x55 = cache["blocks.5.hook_resid_post"][0,5]
eps : float = 1e-5
def Var(x):
  """Return the (biased) variance of the 1-D tensor `x`.

  This is the mean squared deviation from the mean, E[(x - E[x])^2].
  """
  return (x - x.mean()).pow(2).mean()

def LN(x, eps=eps):
  """Layer-normalise the 1-D tensor `x` without a learned scale or bias.

  Computes (x - E[x]) / sqrt(Var[x] + eps). `eps` is *added* to the variance
  so the denominator stays positive even when the variance is zero.
  """
  return (x - x.mean()) / (Var(x) + eps).sqrt()
# The cached final-layer-norm output should agree element-wise with LN
# applied to the block-5 residual vector.
torch.isclose(x_out, LN(x55)).all()

tensor(True)

We write the embedding vector for the token " of" as $\underline{\text{of}}$. This is the input to the first layer of the transformer at position 5:
 $$x^0_5 =  \underline{\text{of}}$$


In [None]:

tokenid_of = model.to_tokens(" of", prepend_bos=False)
# Drop the batch and position dimensions to leave a single embedding row.
embed_of = model.W_E[tokenid_of][0, 0]
# The input to block 0 at position 5 should be exactly that row.
torch.eq(cache["blocks.0.hook_resid_pre"][0, 5], embed_of).all()


tensor(True)

Each transformer block contributes to the result by adding to the residual vector. Define $\Delta x^i_5$ as the contribution of transformer block i:
$$x_5^5 = \underline{\text{ of}} + \sum_{i=0}^{5} \Delta x_5^i $$

There are different ways to retrieve this from the transformer lens cache, with slightly different results

In [None]:
def delta(position, block):
  """Return what `block` added to the residual vector at `position` (batch 0).

  Computed as the cached residual after the block minus the residual before it.
  """
  resid_after = cache[f"blocks.{block}.hook_resid_post"][0, position]
  resid_before = cache[f"blocks.{block}.hook_resid_pre"][0, position]
  return resid_after - resid_before
def delta2(position, block):
  """Return the sum of the cached MLP and attention outputs of `block` at `position` (batch 0)."""
  mlp_part = cache[f"blocks.{block}.hook_mlp_out"][0, position]
  attn_part = cache[f"blocks.{block}.hook_attn_out"][0, position]
  return mlp_part + attn_part

# Compare the two definitions at position 2, block 2: first with the default
# tolerances, then with an absolute tolerance of 1e-6.
by_subtraction, by_components = delta(2, 2), delta2(2, 2)
(
    torch.isclose(by_subtraction, by_components).all(),
    torch.isclose(by_subtraction, by_components, atol=1e-6).all(),
)

(tensor(False), tensor(True))

To verify they match up:

In [None]:
# Rebuild the final residual vector at position 5: start from the " of"
# embedding and add the contribution of blocks 0..5 in order.
r = embed_of
for layer in range(6):
  # Out-of-place addition, so embed_of itself is never modified.
  r = r + delta(5, layer)

torch.isclose(r, x55).all()

tensor(True)

The residual input to the first transformer block matches the row from the embedding matrix as expected.

#### Inner Product with Layer Normalization

Based on [notes here](https://github.com/prior-technology/SymbolicTransformer/blob/main/notebooks/reexamine_layer_norm.ipynb) the inner product with the LN of sum of vectors should be as follows:

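$$ < x, LN (a + b) > = \sqrt{N} \frac{<x,c(a)> + <x, c(b)>}{\sqrt{|c(a+b)|^2 + \epsilon }}  $$

Here $c(v) = v - \textrm{E}[v]$ is the vector $v$ with its mean subtracted, and $N$ is the length of the residual vector (512 for this model).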




$$\textrm{logit} = < \overline{\text{ Ireland}} ,LN (x_5^5)> + \beta = 20.8713 $$

Where final residual $$x_5^5 = \underline{\text{ of}} + \sum_{i=0}^{5} \Delta x_5^i $$

So
$$  logit = \sqrt{N}
\frac{< \overline{\text{ Ireland}}, \underline{\text{ of}} > + \sum_{i=0}^{5} <\overline{\text{ Ireland}}, \Delta x_5^i >}{\sqrt{<c(x_5^5),c(x_5^5)> + \epsilon}} + \beta$$

Let $\lambda = \frac{1}{\sqrt{<c(x_5^5),c(x_5^5)> + \epsilon}}$

In [None]:
def center(v):
  """Return `v` with its mean subtracted, so the result has zero mean."""
  return v - torch.mean(v)


In [None]:
from IPython.display import Markdown, Latex, display
import numpy as np

# Final residual vector at position 5 and the scale factor
# lambda = 1 / sqrt(<c(x), c(x)> + epsilon) from the formula above.
x55 = cache["blocks.5.hook_resid_post"][0,5]
epsilon = 0.00001
l = 1/torch.sqrt(torch.dot(center(x55),center(x55)) + epsilon)
print(f"Lambda = {l.item()}")

# A Markdown object is only rendered automatically when it is the last
# expression of a cell, so the intermediate ones go through display().
N = x55.shape[-1]  # length of the residual vector (512 for this model)
display(Markdown("$\\sqrt{" + f"{N}" + "} = " + f"{np.sqrt(N)}$"))
r=torch.dot(unembed_Ireland, embed_of).item()
prefix=r"<\overline{\text{Ireland}},\underline{\text{of}}> ="
display(Markdown(f"$${prefix} {r}$$"))


# Accumulate the inner product of the " Ireland" unembedding vector with the
# contribution of each block at position 5.
for i in range(0,6):
  d=torch.dot(unembed_Ireland, delta(5,i)).item()
  # Backslash is escaped: "\D" is not a valid escape sequence in a normal string.
  print(f"<Ireland,\\Delta x{i}>={d}")
  r = r + d

bias = model.b_U[tokenid_Ireland].item()
print (f"bias = {bias}")

print ("Expected: 20.87")
# Apply the layer-norm scaling sqrt(N) * lambda, then add the unembedding bias.
r=r*l * np.sqrt(N)
r=r+bias
print(f"Result: {r}")

Lambda = 0.0116585623472929
<Ireland,\Delta x0>=3.751769781112671
<Ireland,\Delta x1>=-0.10114288330078125
<Ireland,\Delta x2>=0.11024236679077148
<Ireland,\Delta x3>=17.597177505493164
<Ireland,\Delta x4>=-1.415786623954773
<Ireland,\Delta x5>=41.282501220703125
bias = 4.6844482421875
Expected: 20.87
Result: 20.871273040771484


We can further decompose the contribution at layer 5 between MLP output and Attention (which uses information from other residual streams)

$$\Delta x^5_5 = M^5_5 + A^5_5$$

In [None]:
def attn(position, block):
  """Return the cached attention output of `block` at `position` (batch 0)."""
  attn_out = cache[f"blocks.{block}.hook_attn_out"]
  return attn_out[0, position]

def mlp(position, block):
  """Return the cached MLP output of `block` at `position` (batch 0)."""
  mlp_out = cache[f"blocks.{block}.hook_mlp_out"]
  return mlp_out[0, position]

# Layer 5 is named explicitly so this cell does not depend on a loop variable
# left behind by whichever cell happened to run before it.
last_block = 5
print(torch.dot(unembed_Ireland, delta(5,last_block)).item())
print(torch.dot(unembed_Ireland, attn(5,last_block)).item())
print(torch.dot(unembed_Ireland, mlp(5,last_block)).item())

41.282501220703125
16.106090545654297
25.176414489746094
