|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating layers</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: Token-related similarities across layers</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# request SVG (vector) output for inline matplotlib figures
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Load model, implant hooks, get activations

In [None]:
# load GPT2 model and tokenizer
# NOTE(review): the model checkpoint is 'gpt2-xl' but the tokenizer checkpoint is 'gpt2';
#   presumably both use the same vocabulary -- confirm
model = AutoModelForCausalLM.from_pretrained('gpt2-xl')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# put the model in evaluation mode
model.eval()

# variable for the number of embedding dimensions
# (value left blank in this helper; nEmb is used later to slice columns, e.g. vars[:nEmb])
nEmb =

In [None]:
# Define a hook function to store QKV vectors
# dictionary filled by the hooks: key f'attn_{layer_number}_qvk' -> numpy array of the hooked output
activations = {}

def implant_hook(layer_number):
  """Create a forward hook that stores a module's output in `activations`.

  Parameters
  ----------
  layer_number
    Label inserted into the dictionary key f'attn_{layer_number}_qvk'.

  Returns
  -------
  hook : callable
    Function with the (module, inputs, output) forward-hook signature. Each call
    writes the numpy version of `output` into the module-level `activations`
    dictionary, overwriting any earlier entry stored under the same key.
  """
  def hook(module, inputs, output):
    # detach from the autograd graph, then move to host memory before converting:
    # .numpy() raises for tensors that are not on the CPU, and .cpu() is a no-op
    # for tensors that already are
    activations[f'attn_{layer_number}_qvk'] = output.detach().cpu().numpy()
  return hook


# surgery ;)

In [None]:
# generated by Claude.ai
# Each sentence contains the word "her" preceded by a space, at a position that
# varies from sentence to sentence.
# NOTE(review): several sentences pair "her" with "his" (e.g. "her on his success");
#   the strings are used as model input, so they are left exactly as they are.
sentences = [
    "I saw her at the market.",
    "She gave her the book.",
    "They asked her for advice.",
    "We invited her to dinner.",
    "The dog followed her home.",
    "They asked her to join.",
    "He saw her at the park yesterday.",
    "Did you give her your address?",
    "I haven't seen her in ages.",
    "I told her the truth.",
    "They congratulated her on his success.",
    "She recognized her immediately.",
    "The teacher praised her for his work.",
    "I met her last summer.",
    "The child hugged her tightly.",
    "They warned her about the danger.",
    "She drove her to the airport.",
    "We waited for her for hours.",
    "The cat scratched her accidentally.",
    "They surprised her with a gift.",
    "She called her on the phone.",
    "The jury found her not guilty.",
    "I remembered her from school.",
    "They elected her as president.",
    "She forgave her for his mistake.",
    "The police questioned her yesterday.",
    "I helped her with his homework.",
    "They spotted her in the crowd.",
    "She visited her in the hospital.",
    "The manager promoted her last week.",
    "I trusted her completely.",
    "They respected her for his honesty.",
    "She taught her how to swim.",
    "The bird attacked her suddenly.",
    "I greeted her warmly.",
    "They supported her through difficult times.",
    "She ignored her at the party.",
    "The judge sentenced her to community service.",
    "I photographed her during the event.",
    "They believed her despite the evidence.",
    "She surprised her on his birthday.",
    "The guard stopped her at the entrance.",
    "I missed her terribly.",
    "They watched her leave the building.",
    "She accompanied her to the concert.",
    "The crowd cheered her enthusiastically.",
    "I described her to the police.",
    "They thanked her for his help.",
    "She admired her for his courage.",
    "The committee nominated her for the award.",
    "I married her last spring.",
    "They informed her about the changes.",
    "She introduced her to the parents.",
    "The author based the character on her."
]

# first token ID produced when encoding ' her' (note the leading space)
# presumably ' her' encodes to a single token -- confirm
target_token = tokenizer.encode(' her')[0]
print(f'There are {len(sentences)} sentences.')

In [None]:
# padding=True below requires a pad token; reuse the end-of-sequence token for it
tokenizer.pad_token = tokenizer.eos_token

# tokenize all sentences into one padded batch of PyTorch tensors
tokens = tokenizer(
    sentences,
    padding = True,
    return_tensors = 'pt',
)

# forward pass with gradient tracking disabled (~1 min for gpt2-xl in cpu);
# the model's return value is not kept
with torch.no_grad():
  model(**tokens)

In [None]:
# unique values for each layer type
# note: all pairwise products of these values are distinct (1, 2, 3, 4, 6, 9)
qLoc = 1
kLoc = 2
vLoc = 3

# a vector mask
# (value left blank in this helper)
vectorMask =

# outer product to create a matrix with unique values for each interaction
# (column vector times row vector; entries are compared later against products such as qLoc*kLoc)
matrixMask = vectorMask[:,None] @ vectorMask[None,:]
# np.triu keeps an upper triangle; the remaining argument is left blank in this helper
matrixMask = np.triu(matrixMask,

# Exercise 2: A function to get layer-specific activations

In [None]:
## function to get activations for target and non-target tokens
# Returns two arrays with one row per sentence: the first for the target token,
# the second for a non-target token. Several right-hand sides are left blank in this helper.

def get_QKV_activations(whichlayer):
  # whichlayer: presumably the layer index used to pick an entry of `activations` -- confirm

  # initialize
  # (shapes left blank; rows are indexed by sentence in the loop below)
  actsAll_trg = np.zeros(
  actsAll_non =

  # loop over sentences b/c target position varies
  for senti in range(len(sentences)):

    # find the index of the target token (convert to list, then .index to find)
    targidx =

    # TARGET get the activation for this token
    actsAll_trg[senti,:] =

    # NON-TARGET get the activation for this token
    # (which token counts as non-target is not specified in this helper)
    actsAll_non[senti,:] =

  return actsAll_trg,actsAll_non

# Exercise 3: Gather statistics from each layer

In [None]:
# same bins for all histograms
edges = np.linspace(-1,1,101)

# every histogram matrix is layers-by-bins (one fewer bin than edges)
histShape = (model.config.n_layer,len(edges)-1)

# within-matrix histograms, targets then non-targets
yQQ_trg,yKK_trg,yVV_trg = (np.zeros(histShape) for _ in range(3))
yQQ_non,yKK_non,yVV_non = (np.zeros(histShape) for _ in range(3))

# between-matrix histograms, targets then non-targets
yQK_trg,yQV_trg,yKV_trg = (np.zeros(histShape) for _ in range(3))
yQK_non,yQV_non,yKV_non = (np.zeros(histShape) for _ in range(3))

# summary statistics indexed as [0=target / 1=non-target, layer, 3 slots (presumably Q, K, V)]
statShape = (2,model.config.n_layer,3)
all_vars = np.zeros(statShape)
all_mean = np.zeros(statShape)



for layeri in range(model.config.n_layer):

  ### get activations from this layer
  actsAll_trg,actsAll_non = get_QKV_activations(


  ### get variances
  vars = actsAll_trg.var(axis=0)
  all_vars[0,layeri,0] = np.mean(vars[:nEmb])
  all_vars[0,layeri,1] = np.mean(
  all_vars[0,layeri,2] = np.mean(

  vars = actsAll_non.var(axis=0)
  all_vars[1,layeri,0] =
  all_vars[1,layeri,1] =
  all_vars[1,layeri,2] =


  ### get means
  meenz =
  all_mean[0,layeri,0] = np.mean(meenz[
  all_mean[0,layeri,1] = np.mean(meenz[
  all_mean[0,layeri,2] = np.mean

  meenz = actsAll_non.var(axis=0)



  ### calculate cosine similarity
  # TARGET
  actsAllNorm = actsAll_trg / np.linalg.norm()
  cossim_trg = actsAllNorm.T @

  # NON-TARGET
  actsAllNorm =
  cossim_non =



  ### Extract unique similarities
  # TARGET within
  QQcs_trg = cossim_trg[matrixMask==qLoc*qLoc]
  KKcs_trg = cossim_trg[
  VVcs_trg =

  # cross-terms
  QKcs_trg = cossim_trg[matrixMask==qLoc*kLoc]
  QVcs_trg = cossim_trg[
  KVcs_trg =

  # NON-TARGET within
  QQcs_non =
  KKcs_non =
  VVcs_non =

  # cross-terms
  QKcs_non =
  QVcs_non =
  KVcs_non =



  ### Generate histograms for target and non-target

  ## TARGET
  # within-matrix histograms
  yQQ_trg[layeri,:],_ = np.histogram(QQcs_trg,bins=edges)
  yKK_trg[layeri,:],_ = np.histogram()
  yVV_trg[layeri,:],_ = np.histogram(

  # and between-matrix
  yQK_trg[layeri,:],_ = np.histogram(QKcs_trg,bins=edges)
  yQV_trg[layeri,:],_ = np.
  yKV_trg[layeri,:],_ =


  ## NON-TARGET
  # within-matrix histograms
  yQQ_non[layeri,:],_ = np.histogram(QQcs_non,=edges)
  yKK_non[layeri,:],_ =
  yVV_non[layeri,:],_ =

  # and between-matrix
  yQK_non[layeri,:],_ =
  yQV_non[layeri,:],_ =
  yKV_non[layeri,:],_ =


# Exercise 4: Visualize the cosine similarity histograms

In [None]:
# 2x2 grid: top row Q-Q, bottom row K-V; left column targets, right column nontargets
fig,axs = plt.subplots(2,2,figsize=(12,6))


# one line per layer in each panel; line color is the plasma colormap evaluated at
# layeri/n_layer. The y-data arguments are left blank in this helper.
for layeri in range(model.config.n_layer):

  #### plotting
  # QQ targets
  axs[0,0].plot(edges[:-1],,color=mpl.cm.plasma(layeri/model.config.n_layer))
  axs[0,0].set(xlim=edges[[0,-1]],xticks=[],yticks=[],ylabel='Count (a.u.)',title='Q-Q targets')


  # QQ nontargets
  axs[0,1].plot(edges[:-1],,color=mpl.cm.plasma(layeri/model.config.n_layer))
  axs[0,1].set(xlim=edges[[0,-1]],xticks=[],yticks=[],title='Q-Q nontargets')

  # KV targets
  axs[1,0].plot(,color=mpl.cm.plasma(layeri/model.config.n_layer))
  axs[1,0].set(xlim=edges[[0,-1]],xlabel='Cosine similarity',ylabel='Count (a.u.)',yticks=[],title='K-V targets')

  # KV nontargets
  axs[1,1].plot(
  axs[1,1].set(xlim=edges[[0,-1]],xlabel='Cosine similarity',yticks=[],title='K-V nontargets')


# manually adjust the y-lims, one row of panels at a time
for rowi in range(2):

  # left column (targets): show only the bottom quarter of the auto-scaled range
  upper = axs[rowi,0].get_ylim()[1]
  axs[rowi,0].set(ylim=[0,upper/4])

  # right column (nontargets): start at zero, leave the top automatic
  axs[rowi,1].set(ylim=[0,None])


# colorbar for line color (layer number), attached to every panel
cmap = mpl.colormaps['plasma']
norm = mpl.colors.BoundaryNorm(np.arange(model.config.n_layer), cmap.N)
sm = mpl.cm.ScalarMappable(cmap=cmap, norm=norm)
for ax in axs.flatten():
  cbar = fig.colorbar(sm, ax=ax, pad=.01)


# finalize the plot
plt.tight_layout()
plt.show()

In [None]:
# two side-by-side images: Q-Q target (left) and Q-Q nontarget (right)
fig,axs = plt.subplots(1,2,figsize=(12,3.5))

# left panel: x axis spans the histogram edges, y axis spans the layers
# (the image data argument is left blank in this helper)
h = axs[0].imshow(,origin='lower',aspect='auto',
              extent=[edges[0],edges[-1],0,model.config.n_layer])
axs[0].set(xlabel='Cosine similarity',ylabel='Layer',title='Q-Q target')
fig.colorbar(h,ax=axs[0],pad=.01,label='Count')

# right panel: the imshow call is left blank in this helper
# NOTE(review): as written, `h` in the colorbar call below still refers to the left-panel image


axs[1].set(xlabel='Cosine similarity',ylabel='Layer',title='Q-Q nontarget')
fig.colorbar(h,ax=axs[1],pad=.01,label='Count')

plt.tight_layout()
plt.show()

# Exercise 5: Visualize the means and variances

In [None]:
# activation means per layer: one panel for targets, one for non-targets
_,axs = plt.subplots(1,2,figsize=(12,4.5))

# i selects the panel; the data arguments are left blank in this helper
for i in range(2):
  # NOTE(review): all_mean is initialized as a 3-D array (2, n_layer, 3); the indexing
  #   that picks one panel and one of Q/K/V is presumably meant to be filled in -- confirm
  axs[i].plot(all_mean,'ko',markerfacecolor=[.9,.7,.7,.7],markersize=10,label='Q')
  axs[i].plot(,'ks',markerfacecolor=[.7,.9,.7,.7],markersize=10,label='K')
  axs[i].plot(,label='V')
  # same y-limits in both panels; scaling the minimum by 1.5 widens the range only when it is negative
  axs[i].set(xlabel='Layer',ylabel='Activation means',ylim=[all_mean.min()*1.5,all_mean.max()*1.1])
  axs[i].legend()

axs[0].set(title='Targets')
axs[1].set(title='Non-targets')

plt.tight_layout()
plt.show()

In [None]:
# activation variances per layer: one panel for targets, one for non-targets
_,axs = plt.subplots(1,2,figsize=(12,4.5))

# i selects the panel; the data and marker arguments of each plot call are left blank in this helper
for i in range(2):
  axs[i].plot(label='Q')
  axs[i].plot(label='K')
  axs[i].plot(label='V')
  # same y-limits in both panels: slightly below zero up to 110% of the largest stored variance
  axs[i].set(xlabel='Layer',ylabel='Activation variance',ylim=[-.01,all_vars.max()*1.1])
  axs[i].legend()

axs[0].set(title='Targets')
axs[1].set(title='Non-targets')

plt.tight_layout()
plt.show()