|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Identifying latent factors</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: GED for category isolation across layers</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.linalg
import scipy.stats as stats
from sklearn.decomposition import PCA

from tqdm import tqdm # for progress bar

import torch
from transformers import GPT2Model, GPT2Tokenizer

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the pretrained model and its matching tokenizer (both identified by the name 'gpt2')
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Exercise 1: Tokenize the training and test texts

In [None]:
# Training sentences (generated by Claude.ai).
# The list has two halves of equal length: 54 sentences that contain "him",
# followed by the same 54 sentences with "him" replaced by "her".
# The order matters: the first half is treated as the "him" category and the
# second half as the "her" category.
sentences = [
    "I saw him at the market.",
    "She gave him the book.",
    "They asked him for advice.",
    "We invited him to dinner.",
    "The dog followed him home.",
    "They asked him to join.",
    "He saw him at the park yesterday.",
    "Did you give him your address?",
    "I haven't seen him in ages.",
    "I told him the truth.",
    "They congratulated him on his success.",
    "She recognized him immediately.",
    "The teacher praised him for his work.",
    "I met him last summer.",
    "The child hugged him tightly.",
    "They warned him about the danger.",
    "She drove him to the airport.",
    "We waited for him for hours.",
    "The cat scratched him accidentally.",
    "They surprised him with a gift.",
    "She called him on the phone.",
    "The jury found him not guilty.",
    "I remembered him from school.",
    "They elected him as president.",
    "She forgave him for his mistake.",
    "The police questioned him yesterday.",
    "I helped him with his homework.",
    "They spotted him in the crowd.",
    "She visited him in the hospital.",
    "The manager promoted him last week.",
    "I trusted him completely.",
    "They respected him for his honesty.",
    "She taught him how to swim.",
    "The bird attacked him suddenly.",
    "I greeted him warmly.",
    "They supported him through difficult times.",
    "She ignored him at the party.",
    "The judge sentenced him to community service.",
    "I photographed him during the event.",
    "They believed him despite the evidence.",
    "She surprised him on his birthday.",
    "The guard stopped him at the entrance.",
    "I missed him terribly.",
    "They watched him leave the building.",
    "She accompanied him to the concert.",
    "The crowd cheered him enthusiastically.",
    "I described him to the police.",
    "They thanked him for his help.",
    "She admired him for his courage.",
    "The committee nominated him for the award.",
    "I married him last spring.",
    "They informed him about the changes.",
    "She introduced him to the parents.",
    "The author based the character on him.",

## same sentences but with "her"
## (only "him" is swapped; all other words, including "his", are left as they were)

    "I saw her at the market.",
    "She gave her the book.",
    "They asked her for advice.",
    "We invited her to dinner.",
    "The dog followed her home.",
    "They asked her to join.",
    "He saw her at the park yesterday.",
    "Did you give her your address?",
    "I haven't seen her in ages.",
    "I told her the truth.",
    "They congratulated her on his success.",
    "She recognized her immediately.",
    "The teacher praised her for his work.",
    "I met her last summer.",
    "The child hugged her tightly.",
    "They warned her about the danger.",
    "She drove her to the airport.",
    "We waited for her for hours.",
    "The cat scratched her accidentally.",
    "They surprised her with a gift.",
    "She called her on the phone.",
    "The jury found her not guilty.",
    "I remembered her from school.",
    "They elected her as president.",
    "She forgave her for his mistake.",
    "The police questioned her yesterday.",
    "I helped her with his homework.",
    "They spotted her in the crowd.",
    "She visited her in the hospital.",
    "The manager promoted her last week.",
    "I trusted her completely.",
    "They respected her for his honesty.",
    "She taught her how to swim.",
    "The bird attacked her suddenly.",
    "I greeted her warmly.",
    "They supported her through difficult times.",
    "She ignored her at the party.",
    "The judge sentenced her to community service.",
    "I photographed her during the event.",
    "They believed her despite the evidence.",
    "She surprised her on his birthday.",
    "The guard stopped her at the entrance.",
    "I missed her terribly.",
    "They watched her leave the building.",
    "She accompanied her to the concert.",
    "The crowd cheered her enthusiastically.",
    "I described her to the police.",
    "They thanked her for his help.",
    "She admired her for his courage.",
    "The committee nominated her for the award.",
    "I married her last spring.",
    "They informed her about the changes.",
    "She introduced her to the parents.",
    "The author based the character on her."
]

# row indices of the two categories: the first half of `sentences` holds the
# "him" sentences, the second half holds the "her" sentences
him_sentences,her_sentences = np.split(np.arange(len(sentences)),[len(sentences)//2])

print(f'There are {len(sentences)} sentences.')

In [None]:
# token ids of the two target words (encoded with their leading space)
target_token_him,target_token_her = [ tokenizer.encode(word) for word in (' him',' her') ]
print(f'The target token indices are {target_token_him} and {target_token_her}\n')

# batching sentences of unequal length requires a padding token;
# reuse the end-of-sequence token for that purpose
tokenizer.pad_token = tokenizer.eos_token

# tokenize all training sentences into one padded batch of PyTorch tensors
train_tokens = tokenizer(sentences,return_tensors='pt',padding=True)

In [None]:
# import fineweb, look through tokens, find 50 "him" and 50 "her", take tokens before and after for context
!pip install datatrove
from datatrove.pipeline.readers import ParquetReader

numDocs = 2000 # how many documents to retrive; each doc has ~750 tokens
data_reader = ParquetReader('hf://datasets/HuggingFaceFW/fineweb/data',limit=numDocs)

# join all texts into one token vector
tokens = np.array([],dtype=int)
for t in data_reader():
  tokens = np.append(tokens,tokenizer.encode(t.text))

In [None]:
# find all "him" and "her" tokens (positions within the long token vector)
him_tokens = np.where(tokens==target_token_him[0])[0]
her_tokens = np.where(tokens==target_token_her[0])[0]

# let's see a few, in context (5 tokens before the target, the target, and 4 after)
for i in him_tokens[:10]:
  # clamp the start at 0: a negative start index would wrap around to the end
  # of the vector and produce an empty slice for targets in the first 5 positions
  print(tokenizer.decode(tokens[max(i-5,0):i+5]))

In [None]:
# create a batch of fixed-length context windows around the target tokens
numPerCat = 50 # number of test examples per category ("him" rows first, then "her" rows)
numPre    =  5 # tokens kept before the target, so the target sits at column index 5
winLength = 10 # total window length (tokens before + target + tokens after)

def _targets_with_full_window(positions):
  # keep only the target positions whose entire window lies inside `tokens`;
  # a window cut off by either end of the token vector cannot fill a batch row
  positions = np.asarray(positions)
  return positions[ (positions>=numPre) & (positions-numPre+winLength<=len(tokens)) ]

him_picks = _targets_with_full_window(him_tokens)[:numPerCat]

# "her" tokens have source index offset to avoid risk of using the same sentence
her_picks = _targets_with_full_window(her_tokens[len(her_tokens)//2:])[:numPerCat]

# fail early with a clear message instead of an IndexError/broadcast error in the loop
if len(him_picks)<numPerCat or len(her_picks)<numPerCat:
  raise ValueError(f'Need {numPerCat} usable targets per category but found '
                   f'{len(him_picks)} "him" and {len(her_picks)} "her"; load more documents.')

batch = np.zeros((2*numPerCat,winLength),dtype=int)
for i in range(numPerCat):
  batch[i,:] = tokens[him_picks[i]-numPre:him_picks[i]-numPre+winLength]
  batch[numPerCat+i,:] = tokens[her_picks[i]-numPre:her_picks[i]-numPre+winLength]

# create a tokens dictionary (same keys as the tokenizer output; no padding, so the mask is all ones)
test_tokens = {}
test_tokens['input_ids'] = torch.tensor(batch)
test_tokens['attention_mask'] = torch.ones_like(test_tokens['input_ids'])

test_tokens['input_ids'].shape

In [None]:
# decode a handful of batch rows to eyeball the test examples
for rowi in (16,34,60,90):
  print(f'Example {rowi}:',tokenizer.decode(test_tokens['input_ids'][rowi]))

# Exercise 2: Get model activations for train and test

In [None]:
# hook function to store the MLP activations (the output of each block's mlp.c_fc module)
activations = {}
def implant_hook_mlp(layer_number):
  """Create a forward hook that stores a module's output under 'mlp_{layer_number}'.

  The returned hook detaches the output tensor, moves it to the CPU, converts
  it to a numpy array, and writes it into the module-level `activations`
  dictionary, replacing whatever the previous forward pass stored there.
  """
  def hook(module, input, output):
    # .cpu() does nothing for CPU tensors, but .numpy() fails without it
    # when the model lives on another device
    activations[f'mlp_{layer_number}'] = output.detach().cpu().numpy()
  return hook

# remove hooks left over from a previous run of this cell; otherwise
# re-running it would stack duplicate hooks on every layer
for handle in globals().get('hook_handles',[]):
  handle.remove()

# hook it! (the handles are kept so the hooks can be removed later)
numLayers = model.config.n_layer
hook_handles = []
for layeri in range(numLayers):
  hook_handles.append( model.h[layeri].mlp.c_fc.register_forward_hook(implant_hook_mlp(layeri)) )

In [None]:
# The forward hooks write into the shared `activations` dictionary on every
# forward pass, so each dataset's results are snapshotted right after its pass.
# A shallow copy is enough: the hooks store a new array per pass rather than
# modifying the previously stored arrays.

# TRAIN tokens to define GED vectors
with torch.no_grad(): model(**train_tokens)
train_activations = activations.copy()

# TEST tokens to evaluate out-of-sample performance
with torch.no_grad(): model(**test_tokens)
# copy (not alias) so that a later forward pass cannot silently overwrite the test data
test_activations = activations.copy()

In [None]:
# confirm dataset sizes, using the 'mlp_5' entry as an example
train_shape = train_activations['mlp_5'].shape
test_shape  = test_activations['mlp_5'].shape
print('Train activations are size:',train_shape)
print('Test activations are size:',test_shape)

# Exercise 3: A function for PCA

In [None]:
def dimred_PCA(layerNum,varThresh=99):
  """Extract the target-token activations of one layer and run a PCA on them.

  For every training sentence, the activation vector at the position of the
  target token (" him" or " her") is taken from `train_activations`. A PCA is
  then fit to the resulting sentences-by-neurons matrix.

  Parameters
  ----------
  layerNum : int
    Layer index, used to look up the key f'mlp_{layerNum}' in `train_activations`.
  varThresh : float, optional
    Percent of variance (0-100) that the retained components must exceed.
    Defaults to 99.

  Returns
  -------
  trainacts : np.ndarray
    Sentences-by-neurons matrix of target-token activations.
  PCA_eigvecs : np.ndarray
    Principal axes, one per column (sorted as returned by sklearn's PCA).
  numComps2keep : int
    Smallest number of leading components whose cumulative explained variance
    exceeds `varThresh` percent (all components if the threshold is never exceeded).

  Raises
  ------
  ValueError
    If a sentence contains neither target token.
  """

  # --- get target activations
  layerActs = train_activations[f'mlp_{layerNum}']
  trainacts = np.zeros((len(sentences),layerActs.shape[-1]))

  for senti in range(len(sentences)):

    # find the index of either of the target tokens
    tokenIds = train_tokens['input_ids'][senti].numpy()
    targBool = (tokenIds==target_token_him[0]) | (tokenIds==target_token_her[0])
    targidx = np.where(targBool)[0]

    if len(targidx)==0:
      raise ValueError(f'Sentence {senti} contains neither target token.')

    # then get the activation (first occurrence, in case the target appears more than once)
    trainacts[senti,:] = layerActs[senti,targidx[0],:]


  # FYI, manual approach, which is considerably slower than sklearn.
  # # --- PCA via eigendecomposition of covariance matrix
  # d,PCA_eigvecs = scipy.linalg.eigh( np.cov(trainacts.T) )

  # # sort the values and vectors
  # idx = d.argsort()[::-1]
  # d = np.real(d[idx])
  # PCA_eigvecs = np.real(PCA_eigvecs[:,idx]) # sort the columns, not the rows!

  # # transform the eigenvalues to cumulative % variance explained
  # varExplained = d*100/np.sum(d)
  # cumVarExplained = np.cumsum(varExplained)


  # --- PCA via sklearn's PCA()
  pca = PCA().fit(trainacts)
  cumVarExplained = np.cumsum(100*pca.explained_variance_ratio_)
  PCA_eigvecs = pca.components_.T # pca.components_ have eigenvectors in the rows


  # how many components to explain more than varThresh% of the variability?
  # np.where gives the 0-based index of the first component that crosses the
  # threshold; the number of components is that index plus one.
  overThresh = np.where(cumVarExplained>varThresh)[0]
  if len(overThresh)>0:
    numComps2keep = overThresh[0] + 1
  else:
    numComps2keep = len(cumVarExplained) # threshold never exceeded: keep everything

  # --- output the activations, eigenvectors, and number of components to keep
  return trainacts,PCA_eigvecs,numComps2keep

# Exercise 4: A function for GED

In [None]:
def _sorted_GED(S,R):
  # generalized eigendecomposition of the pair (S,R), sorted by descending eigenvalue
  evals,evecs = scipy.linalg.eigh(S,R)
  order = evals.argsort()[::-1]
  return evals[order],evecs[:,order] # sort the columns, not the rows!


def sourcesep_GED_train(layerNum,trainacts=None,PCA_eigvecs=None,numComps2keep=None):
  """Separate the "him" and "her" categories with a generalized eigendecomposition.

  The training activations are compressed onto the leading principal
  components, one covariance matrix is computed per category, and two GEDs are
  run: "him" covariance relative to the regularized "her" covariance, and the
  reverse.

  Parameters
  ----------
  layerNum : int
    Layer to analyze. Used to compute the PCA inputs via `dimred_PCA(layerNum)`
    when they are not passed in.
  trainacts, PCA_eigvecs, numComps2keep : optional
    The three outputs of `dimred_PCA` for this layer. If any of them is
    omitted, all three are recomputed from `layerNum`, so the result never
    depends on leftover module-level variables.

  Returns
  -------
  ged_proj_him, ged_proj_her : np.ndarray
    Compressed activations projected onto the top him>her and her>him GED vectors.
  evecsHim, evecsHer : np.ndarray
    GED eigenvectors (columns), sorted by descending eigenvalue.
  r :
    Result of `stats.pearsonr` between the two patterns mapped back to MLP space.
  evalsHim[0], evalsHer[0] : float
    Largest eigenvalue of each GED.
  """

  # --- PCA inputs: use the caller's, or compute them for this layer
  if trainacts is None or PCA_eigvecs is None or numComps2keep is None:
    trainacts,PCA_eigvecs,numComps2keep = dimred_PCA(layerNum)

  # --- create the regularized covariance matrices

  # compress down to numComps2keep dimensions
  topPCs = PCA_eigvecs[:,:numComps2keep]
  lowD_acts = trainacts @ topPCs

  # covariance matrices
  himProjcov = np.cov(lowD_acts[him_sentences,:].T)
  herProjcov = np.cov(lowD_acts[her_sentences,:].T)

  # regularization: shrink towards a scaled identity matrix.
  # The scale is the mean eigenvalue, computed as trace/n. That equals the mean
  # of the eigenvalues without running a non-symmetric eigensolver, which can
  # return complex values for rank-deficient covariance matrices.
  regu_gam = .01
  himProjcovS = (1-regu_gam)*himProjcov + regu_gam*(np.trace(himProjcov)/numComps2keep)*np.eye(numComps2keep)
  herProjcovS = (1-regu_gam)*herProjcov + regu_gam*(np.trace(herProjcov)/numComps2keep)*np.eye(numComps2keep)


  # --- generalized eigendecomposition
  #     (note: using regularized matrices only for "R" (second input))

  # HIM > HER
  evalsHim,evecsHim = _sorted_GED(himProjcov,herProjcovS)

  # HER > HIM
  evalsHer,evecsHer = _sorted_GED(herProjcov,himProjcovS)

  # project the data onto the top GED vectors
  ged_proj_him = lowD_acts @ evecsHim[:,0]
  ged_proj_her = lowD_acts @ evecsHer[:,0]

  # find the pattern in the covariance matrix (through the PC vectors to return to MLP space)
  mlpPattern_him = topPCs @ himProjcov @ evecsHim[:,0]
  mlpPattern_her = topPCs @ herProjcov @ evecsHer[:,0]

  r = stats.pearsonr(mlpPattern_him,mlpPattern_her)

  # --- output the projections, vectors, and correlations
  return ged_proj_him,ged_proj_her,evecsHim,evecsHer,r,evalsHim[0],evalsHer[0]

# Exercise 5: Loop over layers

In [None]:
# initialize a results matrix of size layers x measures x 2
# Second axis (measure):
#   0: TRAIN t-test on the him>her GED projection ("him" vs. "her" sentences)
#   1: TRAIN t-test on the her>him GED projection
#   2: correlation between the "him" and "her" MLP-space patterns
#   3: TEST t-test on the him>her GED projection
#   4: TEST t-test on the her>him GED projection
#   5: top GED eigenvalues
#   6: number of PCA components kept
# Third axis:
#   measures 0-4: [magnitude of the statistic, p-value]
#   measure 5:    [him>her eigenvalue, her>him eigenvalue]
#   measure 6:    only index 0 is filled
ged_results = np.zeros((numLayers,7,2))


# loop over layers
for layeri in tqdm(range(numLayers),desc='Transformer blocks'):


  ### --- training data to get GED vectors
  # PCA (the outputs are stored in module-level variables that are reused further down in this loop)
  trainacts,PCA_eigvecs,numComps2keep = dimred_PCA(layeri)

  # GED
  ged_proj_him,ged_proj_her,evecsHim,evecsHer,r,evHim,evHer = sourcesep_GED_train(layeri)

  # statistical results and their p-values
  # (independent-samples t-test comparing the "him" sentences against the "her" sentences;
  #  the sign of t is discarded, only its magnitude is stored)
  tres = stats.ttest_ind(ged_proj_him[him_sentences],ged_proj_him[her_sentences])
  ged_results[layeri,0,0] = abs( tres.statistic )
  ged_results[layeri,0,1] = tres.pvalue

  tres = stats.ttest_ind(ged_proj_her[him_sentences],ged_proj_her[her_sentences])
  ged_results[layeri,1,0] = abs( tres.statistic )
  ged_results[layeri,1,1] = tres.pvalue

  # correlation between the two patterns (magnitude only)
  ged_results[layeri,2,0] = abs( r.statistic )
  ged_results[layeri,2,1] = r.pvalue

  # top eigenvalues
  ged_results[layeri,5,0] = evHim
  ged_results[layeri,5,1] = evHer

  # number of components to reach 99% variance explained
  ged_results[layeri,6,0] = numComps2keep


  ### --- out-of-sample evaluation
  # get target activations (target token is always in index 5)
  testacts = test_activations[f'mlp_{layeri}'][:,5,:]

  # project down to numComps2keep dimensions
  # (same PCA vectors as for the training data; nothing is re-fit on the test data)
  actsProj = testacts @ PCA_eigvecs[:,:numComps2keep]

  # then project onto the top GED vectors
  ged_proj_him_test = actsProj @ evecsHim[:,0]
  ged_proj_her_test = actsProj @ evecsHer[:,0]

  # ttests (the first 50 rows of the test batch are "him" examples, the remaining rows are "her")
  tres = stats.ttest_ind(ged_proj_him_test[:50],ged_proj_him_test[50:])
  ged_results[layeri,3,0] = abs( tres.statistic )
  ged_results[layeri,3,1] = tres.pvalue

  tres = stats.ttest_ind(ged_proj_her_test[:50],ged_proj_her_test[50:])
  ged_results[layeri,4,0] = abs( tres.statistic )
  ged_results[layeri,4,1] = tres.pvalue


In [None]:
# number of retained PCA components per layer
_,ax = plt.subplots(figsize=(10,4))

numCompsPerLayer = ged_results[:,6,0]
ax.plot(numCompsPerLayer,'kh-',markerfacecolor=[.9,.7,.9],linewidth=.5,markersize=12)

ax.set(xlabel='Transformer layer',ylabel='Count',xlim=[-.5,numLayers-.5],
       title='Number of components to explain 99% variability')

plt.show()

# Exercise 6: Visualize GED results

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3.5))

# NOTE: layers are numbered from 1 on the x-axis of this figure
xtix = np.arange(1,numLayers+1)
pvalthresh = .05/numLayers # significance threshold, divided by the number of layers tested

# one entry per t-test series: (row in ged_results, marker, face color, legend label)
ttest_series = [
  (0,'ko',[.7,.7,.9],'TRAIN him>her'),
  (1,'ks',[.9,.7,.7],'TRAIN her>him'),
  (3,'k^',[.7,.9,.7],'TEST him>her'),
  (4,'kd',[.7,.9,.9],'TEST her>him'),
]

for rowi,marker,color,label in ttest_series:
  tvals = ged_results[:,rowi,0]
  # every layer is either significant or not, so layers with p exactly at the
  # threshold (or NaN) still get a marker (a red x)
  sig = ged_results[:,rowi,1] < pvalthresh

  axs[0].plot(xtix,tvals,color=color)
  axs[0].plot(xtix[sig],tvals[sig],marker,markerfacecolor=color,linewidth=.5,label=label)
  axs[0].plot(xtix[~sig],tvals[~sig],'rx')

axs[0].legend(ncols=2)
axs[0].set(ylim=[0,50],xlabel='Layer',ylabel='|t| value',title='Magnitude of t-tests')

# correlation
rvals = ged_results[:,2,0]
sig = ged_results[:,2,1] < pvalthresh

# draw the connecting line first so that the significance markers end up on top of it
axs[1].plot(xtix,rvals,'k-',linewidth=.5)
axs[1].plot(xtix[sig],rvals[sig],'ko',markerfacecolor=[.7,.9,.9],label='significant')
axs[1].plot(xtix[~sig],rvals[~sig],'rx',label='non-significant')
axs[1].legend()
axs[1].set(xlabel='Layer',ylabel='|r| value',title='Correlation between "him" and "her" patterns')

plt.tight_layout()
plt.show()

In [None]:
# largest GED eigenvalue per layer, for both contrasts
_,ax = plt.subplots(figsize=(10,4))
layerIdx = np.arange(numLayers)

# top eigenvalues for him > her (nudged left so the two series don't sit on top of each other)
ax.plot(layerIdx-.1,ged_results[:,5,0],'ko-',markerfacecolor=[.7,.9,.7],
        linewidth=.5,markersize=8,label='him > her')

# top eigenvalues for her > him (nudged right)
ax.plot(layerIdx+.1,ged_results[:,5,1],'ks-',markerfacecolor=[.7,.7,.9],
        linewidth=.5,markersize=8,label='her > him')

ax.legend()
ax.set(xlabel='Transformer layer',ylabel='Eigenvalue magnitude',xlim=[-.5,numLayers-.5],
       title='Largest eigenvalue from GEDs')

plt.show()