<a href="https://colab.research.google.com/github/sc22lg/ML-Notebooks/blob/gpt2-small-paper-recreation/semantic_attention_recreation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A Recreation of the Results of: The Self-Hating Attention Head: A Deep Dive in GPT-2 - Matteo Migliarini July 2025
by Leo Gott

Original publication can be found [here](https://www.lesswrong.com/posts/wxPvdBwWeaneAsWRB/the-self-hating-attention-head-a-deep-dive-in-gpt-2-1)

### Overall idea:
"gpt2-small's head L1H5 directs attention to semantically similar tokens and actively suppresses self-attention"
### Results to re-create:
- Create inputs to ellicit expected behaviour
- Use inputs to identify heads performing behaviour in gpt2-small (expected head L1H5)
- Perform mean-ablation of preceding components to find which components effect L1H5

### Setup:

In [4]:
import os
import sys
from pathlib import Path

import pkg_resources

installed_packages = [pkg.key for pkg in pkg_resources.working_set]
if "transformer-lens" not in installed_packages:
    %pip install transformer_lens==2.11.0 einops eindex-callum jaxtyping git+https://github.com/callummcdougall/CircuitsVis.git#subdirectory=python

import pandas as pd
import circuitsvis as cv
import einops
import numpy as np
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from transformer_lens import (
    ActivationCache,
    FactoredMatrix,
    HookedTransformer,
    HookedTransformerConfig,
    utils,
)
from transformer_lens.hook_points import HookPoint

In [5]:
import random as rand
import plotly.express as px
from IPython.display import display

### 1.1 Generate input prompt

In [6]:
semantic_words_file = pd.read_csv('semantic_words.csv', header=None)
print(semantic_words_file.to_string())

         0           1          2          3           4           5
0   Monday     Tuesday  Wednesday   Thursday      Friday    Saturday
1      red        blue      green     silver       white        Blue
2     1918        1920       1930       1943        1998        2000
3      You          He        his        she         her       their
4    Italy     Iceland    Austria     Mexico       Spain      France
5      dog         cat      horse    hamster        fish      lizard
6       60          65         69         70          71          90
7    angry       happy        sad    excited       bored    stressed
8      car         bus        van      truck   motorbike   aeroplane
9     rose       tulip       lily      daisy      orchid   sunflower
10  guitar       piano     violin       drum       flute     trumpet
11  soccer  basketball     tennis   baseball       rugby      hockey
12  circle      square   triangle  rectangle     hexagon     octagon
13   chair       table       sofa 

In [7]:
# Create shuffled list of tokens
n_sequences = 30
n_tokens = 16
n_rows = semantic_words_file.shape[0]

inputs = np.empty((n_sequences, n_tokens), dtype=tuple)

for i in range(n_sequences):
  subset = semantic_words_file.sample(4)
  for j in range(n_tokens):
    category_list = subset.sample(1)
    category = category_list.index[0]
    token = category_list.iloc[0].sample(1).values[0]
    inputs[i, j] = (category, token)
# print(inputs)

In [8]:
#create masks representing where tokens in an input share a category
masks = np.zeros((n_sequences, n_tokens, n_tokens))

for seq in range(n_sequences):
  for i in range(n_tokens):
    for j in range(n_tokens):
      if inputs[seq, i][0] == inputs[seq, j][0] and inputs[seq, i][1] != inputs[seq, j][1] and i > j: # ensures upper triangle is 0s
        masks[seq, i, j] = 1

In [9]:
show_mask = 1
fig = px.imshow(masks[show_mask], labels=dict(x="Token Index", y="Token Index", color="Same Category")) # Added labels dictionary
fig.update_layout(xaxis = dict(
        tickmode = 'array',
        tickvals = list(range(n_tokens)),
        ticktext = [inputs[show_mask, i][1] for i in range(n_tokens)] # Use tokens from inputs for x-axis
    ),
    yaxis = dict(
        tickmode = 'array',
        tickvals = list(range(n_tokens)),
        ticktext = [inputs[show_mask, i][1] for i in range(n_tokens)] # Use tokens from inputs for y-axis
    )
)
fig.show()

### Load & test gpt2-small:

In [10]:
model = HookedTransformer.from_pretrained("gpt2-small", device="cpu")
print(model.cfg)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-small into HookedTransformer
HookedTransformerConfig:
{'NTK_by_parts_factor': 8.0,
 'NTK_by_parts_high_freq_factor': 4.0,
 'NTK_by_parts_low_freq_factor': 1.0,
 'act_fn': 'gelu_new',
 'attention_dir': 'causal',
 'attn_only': False,
 'attn_scale': 8.0,
 'attn_scores_soft_cap': -1.0,
 'attn_types': None,
 'checkpoint_index': None,
 'checkpoint_label_type': None,
 'checkpoint_value': None,
 'd_head': 64,
 'd_mlp': 3072,
 'd_model': 768,
 'd_vocab': 50257,
 'd_vocab_out': 50257,
 'decoder_start_token_id': None,
 'default_prepend_bos': True,
 'device': 'cpu',
 'dtype': torch.float32,
 'eps': 1e-05,
 'experts_per_token': None,
 'final_rms': False,
 'from_checkpoint': False,
 'gated_mlp': False,
 'init_mode': 'gpt2',
 'init_weights': False,
 'initializer_range': 0.02886751345948129,
 'load_in_4bit': False,
 'model_name': 'gpt2',
 'n_ctx': 1024,
 'n_devices': 1,
 'n_heads': 12,
 'n_key_value_heads': None,
 'n_layers': 12,
 'n_params': 84934656,
 'normalization_type

In [11]:
# run model on selected sequence & cache activation
sequence_index = 1
test_input = ' '.join([inputs[sequence_index, i][1] for i in range(n_tokens)])
print("input: " + test_input)
input_tokens = model.to_tokens(test_input)
logits, cache = model.run_with_cache(input_tokens)

input: nurse Iceland dentist white Spain silver Italy green doctor lizard Spain lizard blue Spain silver Spain


In [12]:
layer1_patterns = cache["pattern", 1]
print(layer1_patterns.shape)
print(input_tokens.shape)
print(input_tokens.squeeze())
str_tokens = model.to_str_tokens(input_tokens.squeeze())

display(
    cv.attention.attention_patterns(
        tokens=str_tokens,
        attention=layer1_patterns.squeeze(),
        attention_head_names=[f"L1H{i}" for i in range(12)],
    )
)

torch.Size([1, 12, 18, 18])
torch.Size([1, 18])
tensor([50256,    77, 12321, 17333, 38408,  2330,  8602,  8465,  8031,  4077,
         6253, 42406,  8602, 42406,  4171,  8602,  8465,  8602])


### Pattern detector:

In [14]:
print(f"layers: {model.cfg.n_layers}")
print(f"heads per layer: {model.cfg.n_heads}")

layers: 12
heads per layer: 12


In [None]:
def semantic_head_detector(cache: ActivationCache):
  scores = np.zeros((model.cfg.n_layers, model.cfg.n_heads))

  # calculate attention score for current input for each attention head
  for layer in range(model.cfg.n_layers):
    for head in range(model.cfg.n_heads):
      attention_pattern = cache["pattern", layer][head]
      expected_attention = attention_pattern[t.from_numpy(masks[sequence_index]).bool()]
      scores[layer, head] = t.mean(expected_attention)
  return scores
