|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 6:</h2>|<h1>Intervention (causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Editing hidden states</h1>|
|<h2>Lecture:</h2>|<h1><b>Activation patching with indirect object identification</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# have the inline backend emit figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import the model and create tokens

In [None]:
# run on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# a single checkpoint name drives both the model and the tokenizer,
# so the two cannot drift apart if the model size is changed later
model_name = 'gpt2-xl'

model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# number of transformer blocks (used to size the patching experiment)
n_layers = model.config.n_layer

# switch to inference mode; left as the last expression so the cell still shows the model
model.eval()

In [None]:
# two prompts that differ only in which name is the subject of the second clause
text_ME = 'When Mike and Emma went to the cafe, Mike gave a coffee to'
text_EM = 'When Mike and Emma went to the cafe, Emma gave a coffee to'

# token ids of the two candidate completions (leading space included)
ids_M = tokenizer.encode(' Mike')
ids_E = tokenizer.encode(' Emma')

# the logit comparison reads one vocabulary index per name; if a name were split
# into several tokens, taking element [0] would silently target only a word-piece
assert len(ids_M)==1 and len(ids_E)==1, \
  f'Target names must be single tokens (got {ids_M} and {ids_E})'

target_M = ids_M[0]
target_E = ids_E[0]

# tokenized prompts as batch-of-one tensors on the model's device
tokensME = tokenizer.encode(text_ME,return_tensors='pt').to(device)
tokensEM = tokenizer.encode(text_EM,return_tensors='pt').to(device)

# the experiment compares activations at matching positions across the two prompts
assert tokensME.shape == tokensEM.shape, \
  f'Prompts must tokenize to the same length (got {tuple(tokensME.shape)} and {tuple(tokensEM.shape)})'

# Get "clean" data on texts (no patching)

In [None]:
# unpatched forward passes on both prompts (ME first, then EM), keeping the hidden states
with torch.no_grad():
  outME,outEM = [ model(toks,output_hidden_states=True) for toks in (tokensME,tokensEM) ]

# hidden states of the clean "ME" run, reused later as the patching reference
hs_ME = outME.hidden_states

# quick look at what the model returns and at the size of one hidden-state tensor
outME.keys(), hs_ME[3].shape

In [None]:
# logits at the final token position of each clean run
finalLogits_ME = outME.logits[0,-1]
finalLogits_EM = outEM.logits[0,-1]

# " Mike" minus " Emma": positive values mean the model favors " Mike"
logitDiff_ME = finalLogits_ME[target_M] - finalLogits_ME[target_E]
logitDiff_EM = finalLogits_EM[target_M] - finalLogits_EM[target_E]

print(f'Logit difference for text "ME": {logitDiff_ME:6.3f}')
print(f'Logit difference for text "EM": {logitDiff_EM:6.3f}')

# Run the IOI experiment over layers

In [None]:
# initializations
confirmManipulation = np.zeros((n_layers,2))
logitDiffs = np.zeros(n_layers)


# --- Step 1: record the clean "ME" output of every transformer block (final token only).
# The block outputs are captured with hooks instead of being read from
# outME.hidden_states, because in Hugging Face GPT-2 the last entry of hidden_states
# is taken after the final layer norm (ln_f). Patching that vector into the last block
# would push it through ln_f a second time.
cleanME_blockOut = [None]*n_layers

def make_capture_hook(layer_idx):
  """Return a forward hook that stores the final-token output of block `layer_idx`."""
  def capture(module, input, output):
    cleanME_blockOut[layer_idx] = output[0][0,-1,:].detach().clone()
  return capture

capture_handles = []
try:
  for layer_idx in range(n_layers):
    capture_handles.append( model.transformer.h[layer_idx].register_forward_hook(make_capture_hook(layer_idx)) )
  with torch.no_grad():
    model(tokensME)
finally:
  # always detach the hooks, even if the forward pass fails
  for capture_handle in capture_handles:
    capture_handle.remove()


# --- Step 2: patch one block at a time while processing the "EM" prompt.
def make_patch_hook(clean_vec):
  """Return a forward hook that overwrites the block's final-token output with `clean_vec`."""
  def patch(module, input, output):
    hs = output[0].clone()
    hs[0,-1,:] = clean_vec
    return (hs,*output[1:])
  return patch

# loop over layers
for layeri in range(n_layers):

  # implant the hook
  handle = model.transformer.h[layeri].register_forward_hook(make_patch_hook(cleanME_blockOut[layeri]))

  # forward pass with hook; the result gets its own name so that the clean outEM
  # from the unpatched run is not overwritten
  try:
    with torch.no_grad():
      outPatched = model(tokensEM,output_hidden_states=True)
  finally:
    # remove the hook no matter what, so it cannot leak into later forward passes
    handle.remove()
  hs_patched = outPatched.hidden_states

  # confirmation: first element should be zero, second non-zero
  # (.item() converts the 0-d device tensors to Python floats before storing in NumPy)
  confirmManipulation[layeri,0] = (hs_patched[layeri+1][0,-1,10] - hs_ME[layeri+1][0,-1,10]).item()
  confirmManipulation[layeri,1] = (hs_patched[layeri+1][0,-2,10] - hs_ME[layeri+1][0,-2,10]).item()

  # now for the logit-difference test
  logitDiffs[layeri] = (outPatched.logits[0,-1,target_M] - outPatched.logits[0,-1,target_E]).item()


In [None]:
# sanity check :)
# one row per transformer block; both columns are (patched run minus clean "ME" run)
# for hidden-state dimension 10: column 0 at the final token, column 1 at the penultimate token
confirmManipulation

In [None]:
# visualization (explicit figure/axes interface)
fig,ax = plt.subplots(figsize=(11,4))

# reference levels: logit differences from the two unpatched runs
ax.axhline(logitDiff_EM.cpu(),color='b',label='Clean "EM"')
ax.axhline(logitDiff_ME.cpu(),color='r',label='Clean "ME"')

# one marker per transformer block: logit difference after patching that block
ax.plot(logitDiffs,'ko',markerfacecolor=[.9,.7,.9],markersize=10,label='A patched to B')

# zero line separating the two preferences, with a label on each side
ax.axhline(0,linestyle='--',color='gray',linewidth=.5)
ax.text(0,.1,'Prefer "Mike"',fontsize=12,va='bottom')
ax.text(0,-.1,'Prefer "Emma"',fontsize=12,va='top')

ax.set(xlabel='Transformer block',ylabel='Logit difference',title='Reversing logit bias towards target-Mike')
ax.legend()
plt.show()