|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Investigating neurons and dimensions</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Grammar tuning in MLP neurons?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import requests
import scipy.stats as stats

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# high res matplotlib: ask the inline backend to emit figures in 'svg' format
# (this affects every figure drawn later in the notebook)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Get nouns and verbs

In [None]:
# one checkpoint name shared by the tokenizer and the model
modelName = 'EleutherAI/gpt-neo-125m'

# Eleuther's tokenizer; the padding id is set to the first token id of a single space
tokenizer = AutoTokenizer.from_pretrained(modelName)
spaceTokens = tokenizer.encode(' ')
tokenizer.pad_token_id = spaceTokens[0]

# load in GPTneo and switch it to evaluation mode
model = AutoModelForCausalLM.from_pretrained(modelName)
model.eval()

# embedding dimensionality, as reported by the model config
embed_dim = model.config.hidden_size

In [None]:
# lists of verbs
url = 'https://raw.githubusercontent.com/david47k/top-english-wordlists/refs/heads/master/top_english_verbs_lower_10000.txt'

# fail loudly on a bad download instead of silently treating an error page as
# a word list, and don't let a stalled connection hang the notebook
response = requests.get(url,timeout=30)
response.raise_for_status()

# one word per line; strip stray whitespace, skip blank lines, keep the first 100
verbs = [w.strip() for w in response.text.splitlines() if w.strip()][:100]
verbs

In [None]:
# repeat for nouns
url = 'https://raw.githubusercontent.com/david47k/top-english-wordlists/refs/heads/master/top_english_nouns_lower_10000.txt'

# fail loudly on a bad download instead of silently treating an error page as
# a word list, and don't let a stalled connection hang the notebook
response = requests.get(url,timeout=30)
response.raise_for_status()

# one word per line; strip stray whitespace, skip blank lines, keep the first 100
nouns = [w.strip() for w in response.text.splitlines() if w.strip()][:100]
nouns

# Exercise 2: Implant a hook

In [None]:
# echo the model object so its module tree can be inspected
# (later cells reach into model.transformer.h[...].mlp.c_fc)
model

In [None]:
# number of MLP 'expansion' units: leading dimension of the c_fc weight in block 8
expansionLayer = model.transformer.h[8].mlp.c_fc
nneurons = expansionLayer.weight.shape[0]

In [None]:
# a hook function to grab the activations
# (dictionary is filled in by the hook; each forward pass overwrites the entry)
activations = {}

def implant_hook(layer_number):
  """Build a forward hook that records an MLP block's `c_fc` output.

  The returned function is meant to be passed to `register_forward_hook` on a
  module that has a `c_fc` sub-layer. Every time it fires it applies `c_fc` to
  the module's first positional input (no activation function is applied here)
  and stores the result in `activations[f'mlp_{layer_number}_x']`.

  Parameters
  ----------
  layer_number : int
    Layer index; used only to build the dictionary key.

  Returns
  -------
  hook : callable
    Function with the `(module, input, output)` forward-hook signature.
  """
  def hook(module, input, output):

    # get the activations; `input` is the tuple of positional arguments and
    # `output` (the module's own result) is not needed
    acts = module.c_fc(input[0])  # [batch, seq, 4xembed_dim]

    # store in the dictionary, detached so that a forward pass run outside
    # torch.no_grad() doesn't leave an autograd graph attached to the stored tensor
    activations[f'mlp_{layer_number}_x'] = acts.detach()
  return hook


# pick the layer to hook
layer2hook = 8
hookName = f'mlp_{layer2hook}_x'

# if this cell ran before, remove the old hook first; otherwise every re-run
# stacks one more hook and each forward pass repeats the extra c_fc computation
if 'hookHandle' in globals():
  hookHandle.remove()

# surgery ;)  (keep the returned handle so the hook can be removed again)
hookHandle = model.transformer.h[layer2hook].mlp.register_forward_hook(implant_hook(layer2hook))

In [None]:
# sanity check: push one short sentence through the model
text = 'Are you not entertained?'
tokens = tokenizer.encode(text,return_tensors='pt')

# the forward pass fires the hook; the model's own output is discarded
with torch.no_grad():
  model(tokens)

# shape of what the hook captured
activations[hookName].shape

# Exercise 3: Get activations for all words

In [None]:
# preallocate the results: [word class (0=verbs, 1=nouns), word index, neuron]
all_activations = np.zeros((2,len(verbs),nneurons))

# the two word lists, in the order of the first axis above
wordLists = (verbs,nouns)

# loop over word positions; at each position process the verb, then the noun
for wordi in range(len(verbs)):
  for classi,wordList in enumerate(wordLists):

    # forward pass on this single word (fires the hook)
    wordTokens = tokenizer.encode(wordList[wordi],return_tensors='pt')
    with torch.no_grad():
      model(wordTokens)

    # average over the token axis, then store the result as a numpy row
    captured = activations[hookName]
    all_activations[classi,wordi,:] = captured.mean(dim=1).squeeze().detach().numpy()


In [None]:
_,(axVerbs,axNouns) = plt.subplots(1,2,figsize=(12,4))

# left panel: image of the verb activations (rows are words, columns are neurons)
verbActs = all_activations[0]
axVerbs.imshow(verbActs,aspect='auto',vmin=-2,vmax=1)
axVerbs.set(xlabel='Neurons',ylabel='Verbs (index)',title='Verbs activations')

# right panel: each neuron's activation averaged over the nouns
nounMeans = all_activations[1].mean(axis=0)
axNouns.plot(nounMeans,'ko',markerfacecolor=[.7,.7,.7,.6])
axNouns.set(xlim=[-6,nneurons+7],xlabel='Neurons',ylabel='Activation',title='Mean activations over all nouns')

plt.tight_layout()
plt.show()

# Exercise 4: Compare activations with t-tests

In [None]:
# shapes before and after differencing along the word-class axis
# (a cell echoes only its last expression, so show both in one tuple)
all_activations.shape, np.diff(all_activations,axis=0).shape

In [None]:
# run the t-test
# difference along the first axis (index 1 minus index 0), tested against zero across words
classDiff = np.diff(all_activations,axis=0)
t,p = stats.ttest_1samp(classDiff,popmean=0,axis=1)

In [None]:
# p-value threshold (Bonferroni correction for multiple comparisons)
pThresh = .05/nneurons

# boolean masks over the p-values, computed once and reused for x and y
notSig = p>pThresh
isSig  = p<pThresh

fig,ax = plt.subplots(figsize=(10,4))

# x is the index along the second axis of each selected entry, y is its t-value
ax.plot(np.nonzero(notSig)[1],t[notSig],'rs',markersize=4,markerfacecolor=[.9,.6,.6],label='Non-significant')
ax.plot(np.nonzero(isSig)[1],t[isSig],'bo',markersize=6,markerfacecolor=[.5,.5,.9],label='Significant')

ax.legend()
ax.set(xlabel='Neuron',ylabel='T-value',xlim=[-7,nneurons+8],
       title='Statistical significance of NOUN-VERB in MLP activations')
plt.show()

In [None]:
# indices (into the flattened t array) of the largest and smallest t-values
maxTneuron = t.argmax()
minTneuron = t.argmin()
maxTneuron,minTneuron

# Exercise 5: Test generalizability using a heatmap in new text

In [None]:
# source of the passage: https://en.wikipedia.org/wiki/Randomness
text = "In common usage, randomness is the apparent or actual lack of definite pattern or predictability in information.[1][2] A random sequence of events, symbols or steps often has no order and does not follow an intelligible pattern or combination. Individual random events are, by definition, unpredictable, but if there is a known probability distribution, the frequency of different outcomes over repeated events (or 'trials') is predictable. For example, when throwing two dice, the outcome of any particular roll is unpredictable, but a sum of 7 will tend to occur twice as often as 4. In this view, randomness is not haphazardness; it is a measure of uncertainty of an outcome. Randomness applies to concepts of chance, probability, and information entropy."
tokens = tokenizer.encode(text,return_tensors='pt')

# forward pass; the hook refreshes the stored activations for this passage
with torch.no_grad():
  model(tokens)

In [None]:
# min-max scale the activations

def _unitScale(x):
  """Shift and rescale `x` so its smallest value maps to 0 and its largest to 1."""
  lo,hi = x.min(),x.max()
  return (x-lo) / (hi-lo)

# per-token activations (first batch item) of the two selected neurons
activationMax = _unitScale(activations[hookName][0,:,maxTneuron])
activationMin = _unitScale(activations[hookName][0,:,minTneuron])

In [None]:
# calculate letter width
# throwaway figure, used only to measure one rendered monospace character
fig,ax = plt.subplots(figsize=(10,2))

# draw a single character and get its bounding box from the renderer
probe = ax.text(0,0,'n',fontsize=12,fontfamily='monospace')
extent = probe.get_window_extent(renderer=fig.canvas.get_renderer())

# push the two bbox corners, [(x0,y0),(x1,y1)], through the inverse axes transform
toAxes = ax.transAxes.inverted()
corners = toAxes.transform(extent.get_points())

# width of one character = difference of the transformed x-coordinates
en_width = corners[1,0] - corners[0,0]

# the measuring figure is no longer needed
plt.close(fig)

In [None]:
# layout settings
tokensPerLine = 20   # start a new row after this many tokens
lineStep = .12       # vertical step between rows
tokenGap = .015      # small horizontal gap after each token

# running position of the next token, and how many tokens are on the current row
x_pos = 0
y_pos = 1
tokenCount = 0

# setup the figure: two stacked panels without axis decorations
fig, axs = plt.subplots(2,1,figsize=(10,6))
for ax in axs:
  ax.axis('off')

# (axis, colormap, scaled activations) for each panel
panels = (
  (axs[0], mpl.cm.Reds,  activationMax),
  (axs[1], mpl.cm.Blues, activationMin),
)

for toki,tokenId in enumerate(tokens[0]):

  # decode the token and estimate its drawn width from the character count
  word = tokenizer.decode(tokenId)
  word_width = en_width*len(word)
  x_center = x_pos+word_width/2

  # same text in both panels; the background color encodes that panel's activation
  for ax,cmap,acts in panels:
    ax.text(x_center, y_pos, word, fontsize=12, ha='center', va='center',fontfamily='monospace',
            bbox = dict(boxstyle='round,pad=.3', facecolor=cmap(acts[toki]), edgecolor='none', alpha=.8))

  # advance along the row
  tokenCount += 1
  x_pos += word_width + tokenGap

  # end of the line; move down one row and start again from the left
  if tokenCount>=tokensPerLine:
    y_pos -= lineStep
    x_pos = 0
    tokenCount = 0


plt.show()