|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Unembeddings (vectors to tokens)</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# numerical arrays and plotting
import numpy as np
import matplotlib.pyplot as plt

# ask the inline plotting backend to render figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Import GPT2 model and tokenizer, and get dimensions

In [None]:
from transformers import GPT2Model,GPT2Tokenizer

# load the tokenizer and the model from the same pretrained checkpoint
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt2 = GPT2Model.from_pretrained(model_name)

In [None]:
# token-embedding ("wte") weights, detached from the autograd graph and exposed as a NumPy array
wte_weights = gpt2.wte.weight
embeddings = wte_weights.detach().numpy()

In [None]:
# find the size parameters in .config
# (a bare expression on the last line of a cell is shown as that cell's output)
gpt2.config

In [None]:
# collect the size properties used in the later exercises, then print one per line
size_report = [
  f'Embedding dimensions: {gpt2.config.n_embd}',
  f'Vocab size: {gpt2.config.vocab_size}',
  f'Size of embeddings matrix: {embeddings.shape}',
]
print(*size_report,sep='\n')

# Exercise 2: Real and random unembeddings

In [None]:
# unembeddings matrix as the transpose of the (real) embeddings
unembeddings = embeddings.T

# check whether transposing makes a copy:
# the ids differ because .T returns a new array *object*, but that object is a
# view onto the same data as `embeddings` (nothing is copied), which is what
# the shared-memory check reports
print('id of embeddings:  ',id(embeddings))
print('id of unembeddings:',id(unembeddings))
print('shares memory:     ',np.shares_memory(embeddings,unembeddings))

In [None]:
# Gaussian random numbers arranged as (embedding dimensions) x (vocab size)
rand_shape = (gpt2.config.n_embd,gpt2.config.vocab_size)
unembeddingsRand = np.random.randn(*rand_shape)

# compare the sizes of the three matrices
for size_line in (f'         Size of embeddings matrix: {embeddings.shape}',
                  f'Size of random unembeddings matrix: {unembeddingsRand.shape}',
                  f'  Size of real unembeddings matrix: {unembeddings.shape}'):
  print(size_line)

# Exercise 3: California embedding

In [None]:
# pick a word
seedword = ' California'

# its token index (kept in the list form returned by the tokenizer)
seed_idx = tokenizer.encode(seedword)

# make sure it's one token: the later cells index row 0 of the dot products
# and take a flat argmax, both of which assume a single-token seed
assert len(seed_idx)==1, f'"{seedword}" was split into {len(seed_idx)} tokens; pick a single-token word'
seed_idx

In [None]:
# look up the row of the embeddings matrix that belongs to the seed token
embed_vector = embeddings[seed_idx,:]

# visualize the weights: one dot per embedding dimension, colored by absolute value
n_dims = gpt2.config.n_embd
fig,ax = plt.subplots(figsize=(10,3))
ax.scatter(range(n_dims),embed_vector.squeeze(),s=30,c=abs(embed_vector),cmap='RdPu')
ax.set(xlabel='Embedding dimension',ylabel='Embedding weight',xlim=[-3,n_dims+2],
       title=f'Embedding (GPT2) of "{tokenizer.decode(seed_idx)}"')
plt.show()

In [None]:
# dot product of the embedding vector with every column of the random unembeddings matrix
dpRand = np.matmul(embed_vector,unembeddingsRand)

# the "next token" is the column with the largest dot product (unscaled cosine similarity)
nextTokenRand_idx = dpRand.argmax()
nextTokenRand = tokenizer.decode(nextTokenRand_idx)

# show how the matrix sizes line up in the multiplication
print('embed_vector  X  unembeddings  =  dotproducts')
print(f'  {embed_vector.shape}       {unembeddingsRand.shape}      {dpRand.shape}')

In [None]:
# same projection, now onto the real (transposed-embeddings) unembeddings matrix
dpReal = np.matmul(embed_vector,unembeddings)

# token whose unembedding column gives the largest dot product
nextTokenReal_idx = dpReal.argmax()
nextTokenReal = tokenizer.decode(nextTokenReal_idx)

In [None]:
# (header, result) pairs: first the random matrix, then the real one
report = (
  ('** Random unembeddings matrix:',
   f'   "{tokenizer.decode(seed_idx)}" has largest dot product with token "{nextTokenRand}"\n'),
  ('** Real unembeddings matrix:',
   f'   "{tokenizer.decode(seed_idx)}" has largest dot product with token "{nextTokenReal}"'),
)

for header,result in report:
  print(header)
  print(result)

In [None]:
# plot it!
# one panel per unembeddings matrix: (dot products, index of the winning token, panel label)
panels = [ (dpRand,nextTokenRand_idx,'Random'),
           (dpReal,nextTokenReal_idx,'Real') ]

_,axs = plt.subplots(1,2,figsize=(12,3))

for ax,(dotprods,top_idx,label) in zip(axs,panels):

  # the dot products are stored as a 1-row matrix; take that row as a vector
  dots = dotprods[0]

  # x-axis length comes from the data being plotted, so x and y always match
  n_tokens = len(dots)

  ax.scatter(range(n_tokens),dots,s=30,c=abs(dots),cmap='RdPu',alpha=.4)

  # mark the token with the largest dot product
  ax.axvline(top_idx,linestyle='--',color='k',alpha=1/3)
  ax.plot(top_idx,dots[top_idx],'gv')

  ax.set(xlabel='Unembedding dimension',ylabel='Dot product',xlim=[-11,n_tokens+10],
         title=f'({label}) dot products with "{tokenizer.decode(seed_idx)}"')

plt.tight_layout()
plt.show()

# Exercise 4: Find top-10 unembeddings

In [None]:
# argsort is ascending, so the ten largest dot products sit at the end; flip them to descending order
ranked = np.argsort(dpReal[0])
top10 = ranked[-10:][::-1]

# list each top token with its dot product
for i in top10:
  print(f'Dot product {dpReal[0,i]:6.3f} for token "{tokenizer.decode(i)}"')

# Exercise 5: Generate a token sequence

In [None]:
# sequence length (total number of tokens in the text, including the seed)
seq_len = 10

# initial seed
nextword = 'budget'

# initialize the string that will accumulate the text
text = nextword

# tokenize the seed once; keep only its first token (as a one-element list),
# because the loop below reads row 0 of the dot products
token = tokenizer.encode(nextword)[:1]


# loop to create the sequence
for i in range(seq_len-1):

  # step 1: get embedding vector of the current token
  embed_vector = embeddings[token,:]

  # step 2: project onto unembedding matrix (dot products)
  dp = embed_vector @ unembeddings

  # step 3: find top10 projections
  top10 = np.argsort(dp[0])[::-1][:10]

  # step 4: randomly pick one for next token
  aRandomToken = np.random.choice(top10)
  nextword = tokenizer.decode(aRandomToken)

  # step 5: append the text
  text += nextword

  # step 6: carry the token *index* forward. Re-encoding the decoded string is
  # not guaranteed to give back the same single token, so skip that round trip.
  token = [aRandomToken]

# print the final result!
print('Our very philosophically meaningful text:\n',text)

# Repeat with random unembeddings

In [None]:
# initialize the seed word and the string that will accumulate the text
nextword = 'budget'
text = nextword

# tokenize the seed once; keep only its first token (as a one-element list),
# because the loop below reads row 0 of the dot products
token = tokenizer.encode(nextword)[:1]

# loop (seq_len is the total number of tokens, including the seed)
for i in range(seq_len-1):

  # step 1: get embedding vector of the current token
  embed_vector = embeddings[token,:]

  # step 2: project onto RANDOM unembedding matrix
  dp = embed_vector @ unembeddingsRand

  # step 3: find top10 projections
  top10 = np.argsort(dp[0])[::-1][:10]

  # step 4: randomly pick one for next token
  aRandomToken = np.random.choice(top10)
  nextword = tokenizer.decode(aRandomToken)

  # step 5: append the text
  text += nextword

  # step 6: carry the token *index* forward. Re-encoding the decoded string is
  # not guaranteed to give back the same single token, so skip that round trip.
  token = [aRandomToken]

print('Our randomly meanderingful text:\n',text)