|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>Dealing with multitoken word embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

In [None]:
# load the pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# load the pretrained GPT-2 language model and switch it to evaluation mode
# (.eval() returns the model itself, so the call can be chained)
model = AutoModelForCausalLM.from_pretrained('gpt2').eval()

# show the model architecture as the cell output
model

# Exploring multitoken word tokenization

In [None]:
# Target words (intended to be multitoken) and their tokens.
# The variants differ in a leading space and in space vs. hyphen;
# the trailing comment on each entry is its index in the list.
targetwords = [
  ' toothpaste',    # 0
  'toothpaste',     # 1
  'time machine',   # 2
  ' time machine',  # 3
  'time-machine',   # 4
  ' time-machine',  # 5
]

# token IDs for each word, stored as a list of lists because the token counts vary
targtoks = []
for word in targetwords:

  # tokenize this word and add its token IDs to the list
  targtoks += [ tokenizer.encode(word) ]

  # report the number of tokens, the decoded text of each token, and the token IDs
  print(f'"{word}" comprises {len(targtoks[-1])} tokens:\n    {[tokenizer.decode(t) for t in targtoks[-1]]} -> {targtoks[-1]}\n')

In [None]:
# one sentence per target word: a shared prefix followed by the target
# (the prefix ends in a space, so targets that begin with a space produce a double space)
sentences = [ "I'd like to read a story about that famous " + tw for tw in targetwords ]
n_sentences = len(sentences)


# use the end-of-sequence token as the padding token, then tokenize all
# sentences as one padded batch of PyTorch tensors
tokenizer.pad_token = tokenizer.eos_token
tokens = tokenizer(sentences, padding=True, return_tensors='pt')

# number of token positions in each (padded) sentence
seq_len = tokens['input_ids'].shape[-1]

# show the tokenizer output
tokens

In [None]:
# display the list of sentences (one per target word)
sentences

# Find the final target tokens in the sentences

In [None]:
# targetlocs[senti] is the index (in the padded token sequence) of the FINAL token
# of a target found in sentence senti. Every target is compared against every
# sentence, so if several targets match the same sentence, the last match is kept.
targetlocs = np.zeros(n_sentences,dtype=int)

# convert each target's token list to a tensor once, instead of on every comparison
targtensors = [torch.tensor(t) for t in targtoks]

# loop over sentences
for senti in range(n_sentences):

  # count matches so that a sentence without any match can be reported
  n_matches = 0

  # loop over target words
  for targi in range(len(targtoks)):

    # number of tokens in this target
    targlen = len(targtoks[targi])

    # slide a window of targlen tokens over the sentence (ti is the exclusive end of the window)
    for ti in range(targlen,seq_len+1):

      # see if the window matches the mini-sequence of target tokens
      if torch.equal(tokens['input_ids'][senti,ti-targlen:ti],targtensors[targi]):
        targetlocs[senti] = ti-1
        n_matches += 1
        print(f"Sentence {senti} containts target {targi} at index {ti-1:2}: '{tokenizer.decode(tokens['input_ids'][senti,ti-targlen:ti])}'")

  # make a missing match visible; otherwise targetlocs[senti] silently stays at its initial 0
  if n_matches==0:
    print(f"Warning: no target found in sentence {senti}; targetlocs[{senti}] is left at 0")

# hint: try removing ' toothpaste' from the target list!

# Forward pass and get target-word activations

In [None]:
# forward pass on the tokenized batch without gradient tracking,
# requesting the hidden states as part of the model output
with torch.no_grad():
  outputs = model(output_hidden_states=True, **tokens)

# keep a short handle to the hidden states (indexed by layer in the cells below)
hs = outputs.hidden_states

In [None]:
# inspect the size of the first hidden-state tensor
# (presumably sentences x token positions x embedding dimensions -- confirm against the output)
hs[0].shape

In [None]:
# Activation store, indexed as [layer, sentence, embedding dimension, token],
# where token 0 is the final target token and token 1 is the token just before it.
# The embedding size is read from the first hidden state rather than an arbitrary layer.
n_states = model.config.n_layer+1
targetActs = np.zeros((n_states,n_sentences,hs[0].shape[-1],2))

# loop over sentences
for senti in range(n_sentences):

  # position of the final target token in this sentence
  # NOTE: if this is 0, then finalloc-1 is -1, which indexes the END of the token sequence
  finalloc = targetlocs[senti]

  # loop over layers
  for layeri in range(n_states):

    # grab the activation from the final target token
    targetActs[layeri,senti,:,0] = hs[layeri][senti,finalloc,:].numpy()

    # and the second-last token
    targetActs[layeri,senti,:,1] = hs[layeri][senti,finalloc-1,:].numpy()

# confirm the size of the result
targetActs.shape

# One quick visualization (difference vector norms)

In [None]:
# Note: the reasoning and math of 'diffNorms' is explained in detail in lecture "Path length and logit token prediction"

# initialize: one row per layer, and one column per comparison
#   column 0: final target token, change from the previous layer
#   column 1: second-last token, change from the previous layer
#   column 2: final vs. second-last token within the same layer
# (row 0 is never filled in and is excluded from the plot)
diffNorms = np.zeros((model.config.n_layer+1,3))


# loop over layers (starting at 1 because each layer is compared against the one before it)
for layeri in range(1,targetActs.shape[0]):

  # calculate the difference vector from the previous layer (final target token)
  diffVects = targetActs[layeri,:,:,0] - targetActs[layeri-1,:,:,0]

  # calculate its norm over the embedding dimensions, then average over sentences
  diffNorms[layeri,0] = np.linalg.norm(diffVects,axis=1).mean()


  ## repeat for the second-last token
  diffVects = targetActs[layeri,:,:,1] - targetActs[layeri-1,:,:,1]
  diffNorms[layeri,1] = np.linalg.norm(diffVects,axis=1).mean()


  ## repeat for final target tokens within this layer
  diffVects = targetActs[layeri,:,:,0] - targetActs[layeri,:,:,1]
  diffNorms[layeri,2] = np.linalg.norm(diffVects,axis=1).mean()


# plot the difference vector norms
# (raw strings keep the LaTeX backslash in '\Delta' from being read as an invalid escape sequence)
layers = range(1,targetActs.shape[0])
fig,ax = plt.subplots(figsize=(10,4))
ax.plot(layers,diffNorms[1:,0],'ks-',markerfacecolor=[.7,.9,.7],markersize=10,linewidth=.4,label=r'Final token ($\Delta$ layer)')
ax.plot(layers,diffNorms[1:,1],'ko-',markerfacecolor=[.9,.7,.7],markersize=10,linewidth=.4,label=r'2nd-last token ($\Delta$ layer)')
ax.plot(layers,diffNorms[1:,2],'k^-',markerfacecolor=[.7,.7,.9],markersize=10,linewidth=.4,label=r'Last two tokens ($\Delta$ token)')
ax.set(title='How much the embeddings changed',xlabel='Layer',ylabel=r'$\Delta$ vector norm')
ax.legend()

plt.show()