|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating token embeddings</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: Do nouns or adjectives have longer trajectories?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

import requests
import spacy

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# vector matplotlib: render inline figures as SVG (vector graphics) so they stay sharp when zoomed
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Text and token batches

In [None]:
# import tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# import Frankenstein text
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of silently handing
# an error page to the tokenizer as if it were the book.
response = requests.get('https://www.gutenberg.org/cache/epub/84/pg84.txt', timeout=30)
response.raise_for_status()
text = response.text
# TODO (exercise): tokenize `text` with `tokenizer`.
# Later cells index this as `tokens[0, ...]`, so it presumably needs a leading
# batch dimension (e.g. a 1 x nTokens tensor) -- confirm against your solution.
tokens =

# TODO (exercise): fill in both {} placeholders (total and unique token counts);
# an empty {} inside an f-string is a SyntaxError, so this line cannot run as-is.
print(f'There are {} GPT tokens, {} of which are unique.')

In [None]:
# nlp module from spaCy: load the small English pipeline
# (the 'en_core_web_sm' model must already be installed in this environment)
nlp = spacy.load('en_core_web_sm')

# tokenize and report count
# note: spaCy tokens are not GPT tokens, so this count need not match the GPT token count
doc = nlp(text)
print(f'There are {len(doc):,} spaCy tokens.')

In [None]:
# batch parameters (TODO exercise: choose values)
# batchsize   : number of sequences per word class; later cells loop over range(batchsize)
# context_pre : column index of the target token in each sequence (later cells read [:,context_pre]),
#               i.e. presumably the number of context tokens kept before the target
# context_pst : presumably the number of context tokens kept after the target -- confirm
batchsize   =
context_pre =
context_pst =

In [None]:
# initialize batches
# TODO (exercise): give each batch a shape and dtype. Rows are indexed by sequence
# (batch_noun[seqiN,:]) and column `context_pre` is read later as the target token.
# NOTE: torch.zeros defaults to float32; these batches are later passed to the model
# as token ids, so an integer dtype (torch.long) is presumably needed -- confirm.
batch_noun = torch.zeros
batch_adje = torch.zeros

# initialize counters
# (number of noun / adjective sequences stored so far; also the next row to fill)
seqiN = 0
seqiA = 0


# loop over the tokens
# TODO (exercise): complete the range
for idx in range(

  # get the text for this token
  txt =

  ### filter
  # skip short tokens
  if : continue

  # skip subwords (this and the next token must start with space)
  if  | : continue


  ### get part of speech and populate vector for nouns or adjectives
  # ([0].pos_ reads the coarse part-of-speech tag of the first spaCy token
  #  in whatever text is passed to nlp)
  pos = nlp( )[0].pos_

  # if it's a noun and we don't have enough
  if pos=='NOUN':
    if seqiN
      batch_noun[seqiN,:] = tokens[0,]
      seqiN += 1

  # if it's an adjective and we don't have enough
  # TODO (exercise): mirror the noun branch using batch_adje / seqiA


  # quit early if there's enough data
  if
    break

In [None]:
# show some examples
# (column `context_pre` of each batch is the target-token position; b is one entry from it)
# TODO (exercise): complete the print calls, e.g. by decoding b back to text
print('Some nouns:')
for b in batch_noun[:15,context_pre]:
  print

print('\nSome adjectives:')
for b in batch_adje[:15,context_pre]:
  print(

# Exercise 2: Import model and get hidden state activations

In [None]:
# pick the compute device first: GPU if one is visible, otherwise CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# load GPT2-large and move it onto that device (.to() returns the same module)
model = AutoModelForCausalLM.from_pretrained('gpt2-large').to(device)

# switch to evaluation mode; as the last expression this also displays the model summary
model.eval()

In [None]:
# Get the activations and hidden states
# TODO (exercise): complete both forward passes. Later cells read `.hidden_states`
# from these outputs, which the model only returns when asked (output_hidden_states=True),
# and the inputs must live on the same device as the model.
with torch.no_grad():
  out_noun = model(batch_noun
  out_adje = model(

In [None]:
# number of hidden-state tensors returned by the model
# (per the transformers docs this tuple holds the embedding output plus one entry per
#  transformer block, so it is one more than the number of blocks)
nLayers = len(out_noun.hidden_states)

In [None]:
# inspect the shape of one hidden-state tensor; later cells index it as
# [sequence, token position, embedding dimension]
out_noun.hidden_states[3].shape

# Exercise 3: PCA on the activation vectors

In [None]:
# number of rows is layers X batchsize X 2
# (one row per layer, per sequence, per word class: noun / adjective)
numRows =

# initializations
# all_acts    : one activation vector (length model.config.n_embd) per row
# lookuptable : per-row identifiers -> column 0 = layer index,
#               column 1 = which target (presumably a code for noun vs. adjective)
# rowi        : next row to fill
all_acts = np.zeros((numRows,model.config.n_embd))
lookuptable = np.zeros((numRows,2),dtype
rowi = 0


# loop over sequences
for seqi in range(batchsize):
  for layeri in range(nLayers):

    ### NOUNS
    # get the activation vector for target token in this layer in this sequence
    # TODO (exercise): same indexing as the adjective line below, applied to out_noun
    all_acts[rowi,:] = out_noun.

    # identifiers
    lookuptable[rowi,0] = layeri # which layer
    lookuptable[rowi,1] =        # which target
    rowi += 1


    ### ADJECTIVES
    # hidden state of layer `layeri`, sequence `seqi`, at the target position `context_pre`,
    # moved to CPU and converted to a NumPy vector
    all_acts[rowi,:] = out_adje.hidden_states[layeri][seqi,context_pre,:].cpu().numpy().squeeze()
    lookuptable[rowi,0] =  # which layer
    lookuptable[rowi,1] =  # which target
    rowi += 1


In [None]:
# sanity check: both arrays are created with numRows rows, so the first dimensions should match
lookuptable.shape, all_acts.shape

In [None]:
# PCA with 20 components just to show variance explained
# NOTE(review): PCA() as written keeps all components; n_components must be passed
# for the fit to be limited to 20 as this comment and the xticks (0-20) suggest.
pca = PCA().fit(all_acts)
# NOTE(review): explained_variance_ratio_ is a fraction (0-1); it still needs scaling
# by 100 to match the "Percent variance explained" axis label.
scree = pca.explained_variance_ratio_ # convert to percent

# and plot
# TODO (exercise): pass the data to plot as the first argument of plt.plot
plt.figure(figsize=(8,3))
plt.plot(,'ks',markerfacecolor=[.9,.7,.7],markersize=10)

plt.gca().set(xlabel='Component number',xticks=range(0,21,2),
              ylabel='Percent variance explained',title='Scree plot')

plt.show()

In [None]:
# project to 2D
# NOTE(review): transform() returns one row per activation vector and one column per
# principal component; as written proj2d keeps every component, so it still has to be
# reduced to the first two PCs before the 2-D trajectory code that follows can use it.
proj2d = pca.transform(all_acts) # first two columns or first two rows??

# compare shapes before and after projection
print(all_acts.shape,proj2d.shape)

# Exercise 4: Visualize the average trajectories

In [None]:
# average projection per layer; axes are (layer, PC1/PC2, word class: 0 = nouns, 1 = adjectives)
aveTrajectories = np.zeros((nLayers,2,2))

# TODO (exercise): replace THISLAYER / ISNOUN with boolean row masks
# (presumably built from the columns of lookuptable)
for i in range(nLayers):
  aveTrajectories[i,:,0] = proj2d[ THISLAYER & ISNOUN ,:].mean(axis=0) # nouns
  aveTrajectories[i,:,1] = # same for adjectives

# calculate distances between noun and adjective tokens
# (noun_pts and adj_pts are each nLayers x 2: one averaged PC1/PC2 point per layer)
noun_pts = aveTrajectories[:,:,0]
adj_pts  = aveTrajectories[:,:,1]
# TODO (exercise): per-layer Euclidean distance between the two point sets
pos_distances =

In [None]:
# one figure with three panels: layer space, PC state-space, noun-vs-adjective distances
fig,axs = plt.subplots(1,3,figsize=(14,4))

### plot the trajectories in "layer space"
# TODO (exercise): index aveTrajectories as [:, PC, word class] so each line matches its label
axs[0].plot(aveTrajectories[:,0,0],label='Nouns PC1')
axs[0].plot(aveTrajectories,label='Nouns PC2')
axs[0].plot(,label='Adjectives PC1')
axs[0].plot(,label='Adjectives PC2')
axs[0].set(xlim=[0,nLayers-1],xlabel='Layer',ylabel='Projection (a.u.)',
           title='PC projections in "layer space"')
axs[0].legend()


### trajectories in PC state-space
# plot the trajectory for nouns
# TODO (exercise): supply the PC1 and PC2 coordinates (see the completed adjective line below)
axs[1].plot(,'r',zorder=-3,label='Nouns')
# markers grow in size and darken in color with layer index
h = axs[1].scatter(aveTrajectories[:,0,0],aveTrajectories[:,1,0],marker='s',
               s=np.linspace(20,120,nLayers),c=np.arange(nLayers),cmap='Reds')

# repeat for adjectives
axs[1].plot(aveTrajectories[:,0,1],aveTrajectories[:,1,1],'b',zorder=-3,label='Adjectives')
# TODO (exercise): same scatter arguments as for nouns, using the adjective coordinates
axs[1].scatter(,cmap='Blues')

axs[1].set(xlabel='PC1',ylabel='PC2',title='Trajectories in "PC space"')
axs[1].legend()
# colorbar is built from the noun scatter handle and maps color to layer index
fig.colorbar(h,ax=axs[1],label='Layer',pad=.01)



### show distances
# TODO (exercise): supply the data arguments for both calls
axs[2].plot(,'g',linewidth=2,zorder=-6)
axs[2].scatter(,marker='s',s=100,c=np.arange(nLayers),cmap='Greens')
axs[2].set(xlabel='Layer',ylabel='Euclidean distance',title='Nouns vs. adjectives distances',ylim=[-1,None])

plt.tight_layout()
plt.show()

# Exercise 5: Calculate trajectory distances

In [None]:
# initialize
# (rows = steps between consecutive layers, columns = word class: 0 nouns, 1 adjectives)
distances = np.zeros((nLayers-1,2))

# using for-loops for clarity
# i is the later layer of each consecutive pair, j the word class
for i in range(1,nLayers):
  for j in range(2):

    # x points
    # (PC1 coordinate at the previous and the current layer)
    x1 = aveTrajectories[i-1,0,j]
    x2 = aveTrajectories[i,0,j]

    # y points
    # TODO (exercise): same as above for the PC2 coordinate
    y1 =
    y2 =

    # euclidean distances
    # TODO (exercise): fill in the coordinate differences
    distances[i-1,j] = np.sqrt( ()**2 + ()**2 )


# plotting
# NOTE: the axis labels say "Log distance", so the plotted values are presumably
# log-transformed distances -- confirm.
_,axs = plt.subplots(1,2,figsize=(10,4))
axs[0].plot(,'ks-',markerfacecolor=[.9,.7,.7],linewidth=.2,label='Nouns')
axs[0].plot(,'ko-',markerfacecolor=[.7,.9,.7],linewidth=.2,label='Adjectives')
axs[0].legend()
axs[0].set(xlabel='Layer',ylabel='Log distance',title='Trajectory distances')

# TODO (exercise): plot the noun-minus-adjective difference per layer step
axs[1].plot(,'k^-',linewidth=.2,markersize=9,markerfacecolor=[.7,.7,.9])
axs[1].set(xlabel='Layer',ylabel='Log-distance difference',title='Noun - adj differences')

plt.tight_layout()
plt.show()