In [1]:
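# One-time environment setup for the GPU t-SNE dependency (uncomment to run):
# !pip install faiss-gpu==1.6.5
# !pip install tsnecuda==3.0.0 --no-deps
# !pip install mkl
# MKL must also be installed on the system; see https://github.com/eddelbuettel/mkl4deb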

In [2]:
import json
import days.w3d2.w3d2_tests as w3d2_tests
import transformers
import torch
from einops import rearrange, reduce, repeat

from sklearn.decomposition import PCA
from tsnecuda import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd

In [3]:
# Load the stock GPT-2 tokenizer and register the two extra tokens this
# notebook relies on. The public `add_tokens` API is used instead of the
# private `_add_tokens` hook, which is an implementation detail of the
# tokenizer class.
tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_tokens(["[BEGIN]", "[END]"])
# "[END]" doubles as both the padding token and the end-of-sequence token.
tokenizer.pad_token = "[END]"
tokenizer.eos_token = "[END]"
# 50258 is the pad_token_id
# 50257 is the BEGIN token id
# Fail loudly if the ids above ever stop holding.
assert tokenizer.convert_tokens_to_ids("[BEGIN]") == 50257
assert tokenizer.pad_token_id == 50258

# Read the pre-tokenized test set from JSON (the file name suggests an
# OpenWebText subset) and keep only the first 512 columns of every row.
with open("../w3d2/test_tokens_owt_subset.json") as token_file:
    raw_token_rows = json.load(token_file)
test_tokens = torch.LongTensor(raw_token_rows)[:, :512]

# Sample passage used as a fixed text input. The adjacent string literals are
# joined by the parser into one string; the explicit "\n" escapes are the only
# line breaks in the resulting text.
sentence = (
    "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly "
    "normal, thank you very much. They were the last people you’d expect to be involved in anything "
    "strange or mysterious, because they just didn’t hold with such nonsense.\n"
    "Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy "
    "man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin "
    "and blonde and had nearly twice the usual amount of neck, which came in very useful as she "
    "spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys "
    "had a small son called Dudley and in their opinion there was no finer boy anywhere.\n"
    "The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was "
    "that somebody would discover it. They didn’t think they could bear it if anyone found out about "
    "the Potters. Mrs. Potter was Mrs. Dursley’s sister, but they hadn’t met for several years; in fact, "
    "Mrs. Dursley pretended she didn’t have a sister, because her sister and her good-for-nothing "
    "husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the "
    "neighbors would say if the Potters arrived in the street. The Dursleys knew that the Potters had a "
    "small son, too, but they had never even seen him. This boy was another good reason for keeping "
    "the Potters away; they didn’t want Dudley mixing with a child like that."
)

# Token ids of the passage, as produced by the tokenizer configured earlier.
sentence_tokens = tokenizer(sentence).input_ids

In [4]:
# Model switch:
#   True  -> use GPT
#   False -> use 2-layer attention-only model
use_gpt = False

# Each branch sets the model, the vocabulary size and maximum sequence length
# used by later cells, and the two callables that map ids to embedding vectors.
if not use_gpt:
    model = w3d2_tests.get_minigpt("../w3d2/model.pt")
    VOCAB_SIZE, MAX_SEQ_LEN = 50259, 512
    token_embed_fn, pos_embed_fn = model.token_embedding, model.pos_embedding
else:
    model = transformers.GPT2Model.from_pretrained('gpt2')
    VOCAB_SIZE, MAX_SEQ_LEN = 50257, 1024
    token_embed_fn, pos_embed_fn = model.wte, model.wpe

In [44]:
# Look up the embedding of every token id in [0, VOCAB_SIZE) in one call.
# no_grad keeps autograd from tracking the lookup.
all_token_ids = torch.arange(VOCAB_SIZE)
with torch.no_grad():
    all_token_embeddings = token_embed_fn(all_token_ids)
all_token_embeddings.shape

torch.Size([50259, 256])

In [45]:
# Look up the embedding of every position index in [0, MAX_SEQ_LEN) in one
# call. no_grad keeps autograd from tracking the lookup.
all_position_ids = torch.arange(MAX_SEQ_LEN)
with torch.no_grad():
    all_position_embeddings = pos_embed_fn(all_position_ids)
all_position_embeddings.shape

torch.Size([512, 256])

In [52]:
# Gather, for every layer and every head in it, the column slice of the
# output-projection weight that belongs to that head. Slices are collected in
# (layer, head) order.
per_head_slices = []
for layer in model.blocks:
    output_matrix = layer.project_output.weight
    head_size = layer.head_size
    per_head_slices.extend(
        output_matrix[:, head * head_size:(head + 1) * head_size]
        for head in range(layer.n_heads)
    )

# Join the slices side by side along the last axis, then transpose so each
# original weight column becomes one row; detach() drops the autograd link.
head_output_vecs = torch.concat(per_head_slices, dim=-1).T.detach()

In [53]:
# Stack every vector to be visualised into one matrix, one vector per row:
# token embeddings first, then position embeddings, then the per-head output
# vectors. The row order matters: later cells build their label lists in this
# same order.
X = torch.concat([all_token_embeddings, all_position_embeddings, head_output_vecs], dim=0)

# Reduce each row to its coordinates on the first two principal components.
pca = PCA(n_components=2)
projected = pca.fit_transform(X)
# Swap the axes so that index 0 / index 1 select a whole component at once.
projected_transpose = rearrange(projected, "x y -> y x")

In [54]:
# Embed the same stacked matrix with t-SNE, using the estimator's defaults.
tsne = TSNE()
tsne_input = X
tsne_result = tsne.fit_transform(tsne_input)

# Swap the axes so that index 0 / index 1 select a whole component at once.
tsne_result_transpose = rearrange(tsne_result, "x y -> y x")

In [49]:
# Decode each token id in [0, VOCAB_SIZE) on its own, giving one string per id.
word_list = tokenizer.batch_decode(list(range(VOCAB_SIZE)))

In [57]:
# Hover labels for the position-embedding rows.
position_names = [f"<pos{pos}>" for pos in range(MAX_SEQ_LEN)]

# One (layer, head, index-within-head) triple per head-output row, enumerated
# layer by layer, head by head, then index by index.
head_coords = [
    (layer_i, head, i)
    for layer_i, layer in enumerate(model.blocks)
    for head in range(layer.n_heads)
    for i in range(layer.head_size)
]

# Per-row hover label, and the coarser per-head label used for colouring.
head_output_names = [f"head {layer_i}.{head} output-{i}" for layer_i, head, i in head_coords]
head_output_types = [f"head {layer_i}.{head}" for layer_i, head, _ in head_coords]

In [58]:
# Row labels and row categories, concatenated in the order
# tokens -> positions -> head outputs.
row_names = word_list + position_names + head_output_names
row_types = ["token"] * VOCAB_SIZE + ["position"] * MAX_SEQ_LEN + head_output_types

# One column per projected coordinate, alongside the labels.
word_dict = {
    'name': row_names,
    'pca component 1': projected_transpose[0],
    'pca component 2': projected_transpose[1],
    'tsne component 1': tsne_result_transpose[0],
    'tsne component 2': tsne_result_transpose[1],
    'type': row_types,
}
df = pd.DataFrame(data=word_dict)
df

Unnamed: 0,name,pca component 1,pca component 2,tsne component 1,tsne component 2,type
0,!,1.259115,0.015668,46.439877,4.521946,token
1,"""",1.481495,0.000313,9.825200,42.903072,token
2,#,0.723518,0.366266,5.977057,19.531647,token
3,$,0.678572,0.337494,37.817890,-10.819846,token
4,%,1.023928,-0.002171,2.698970,23.573929,token
...,...,...,...,...,...,...
51278,head 1.7 output-27,1.193664,0.064126,12.372522,38.760639,head 1.7
51279,head 1.7 output-28,1.286109,-0.047953,12.271119,38.863560,head 1.7
51280,head 1.7 output-29,1.239856,-0.192347,11.708760,38.981422,head 1.7
51281,head 1.7 output-30,1.182586,-0.147669,12.351735,38.919960,head 1.7


In [61]:
# Columns of `df` to plot; change these two names to plot the PCA columns.
x_col = "tsne component 1"
y_col = "tsne component 2"
axis_labels = {x_col: "Component 1", y_col: "Component 2"}

# Scatter of every row, coloured by its "type" column, with the row name
# shown on hover.
fig = px.scatter(
    df,
    x=x_col,
    y=y_col,
    hover_name="name",
    color="type",
    labels=axis_labels,
    width=650,
    height=650,
)
# Shrink the markers; the selector limits the update to marker-mode traces.
fig.update_traces(marker={"size": 3}, selector={"mode": "markers"})
fig.show()

32


torch.Size([256, 512])