|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Exploring position embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# configure the inline matplotlib backend to emit figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import GPT-2 model and extract its position embedding matrix

In [None]:
from transformers import GPT2Model

# load the pretrained GPT-2 model
gpt2 = GPT2Model.from_pretrained('gpt2')

# the Word Position Embeddings (wpe) weights, detached and converted
# to a NumPy array so the rest of the notebook can work in NumPy
wpe_weights = gpt2.wpe.weight
positions = wpe_weights.detach().numpy()

# Exercise 1: Histogram of embeddings cosine similarities

In [None]:
# (same computation as in "embed_positionEmbeddings.ipynb")

# --- similarity between token positions ("time series") ---
# scale every row to unit length; the pairwise dot products of the
# unit-length rows are then the cosine similarities
row_norms = np.linalg.norm(positions,axis=1,keepdims=True)
Pnorm1 = positions / row_norms
cossim_tokens = Pnorm1 @ Pnorm1.T

# --- similarity between embedding dimensions ---
# same idea, but normalizing (and comparing) the columns
col_norms = np.linalg.norm(positions,axis=0,keepdims=True)
Pnorm0 = positions / col_norms
cossim_embeds = Pnorm0.T @ Pnorm0

In [None]:
# draw the two similarity matrices side by side (same figure as in the previous code file)
fig,axs = plt.subplots(1,2,figsize=(12,5))

# one entry per panel: (matrix to show, label used on both axes, panel title)
panels = [
  (cossim_tokens, 'Token index ("time")', '$S_c$ over "time"'),
  (cossim_embeds, 'Embedding index',      '$S_c$ across embeddings'),
]

for ax,(simmat,axlabel,paneltitle) in zip(axs,panels):

  # color limits fixed to [-1,1] so both panels share the same scale
  h = ax.imshow(simmat,vmin=-1,vmax=1)
  ax.set(xlabel=axlabel,ylabel=axlabel,title=paneltitle)

  # colorbar attached to this panel, with ticks every .5
  ch = fig.colorbar(h,ax=ax,pad=.02,fraction=.046)
  ch.ax.tick_params(labelsize=10)
  ch.ax.set_yticks(np.arange(-1,1.1,.5))

plt.tight_layout()
plt.show()

In [None]:
# small demo about extracting the upper triangle:
# np.triu_indices_from returns the (row,col) indices of every element above
# the diagonal (k=1 skips the diagonal itself), regardless of its value.
# Selecting by position matters here: indexing with np.nonzero(np.triu(A,1))
# would silently drop upper-triangle elements that happen to equal zero.
A = np.random.randint(0,9,(4,4))
print(A)
print('')
upper_vals = A[np.triu_indices_from(A,k=1)]
upper_vals

In [None]:
# get the unique cosine similarity values from the upper triangle
# (k=1 skips the trivial diagonal; the elements are selected by position,
#  so similarities that happen to be exactly zero are kept as well)
unique_cs_embeds = cossim_embeds[np.triu_indices_from(cossim_embeds,k=1)]
unique_cs_tokens = cossim_tokens[np.triu_indices_from(cossim_tokens,k=1)]

# get their distributions (100 bins; *_hy are the counts, *_hx the bin edges)
embed_hy,embed_hx = np.histogram(unique_cs_embeds,100)
token_hy,token_hx = np.histogram(unique_cs_tokens,100)

# bin centers and widths: plt.bar centers each bar on its x-value, so using
# the centers (not the left edges) puts every bar over the bin it counts
embed_centers = (embed_hx[:-1]+embed_hx[1:])/2
token_centers = (token_hx[:-1]+token_hx[1:])/2
embed_width = embed_hx[1]-embed_hx[0]
token_width = token_hx[1]-token_hx[0]

# visualize!
plt.figure(figsize=(12,4))
plt.bar(embed_centers,embed_hy,width=embed_width,alpha=.4,label='$S_c$ across embeddings')
plt.bar(token_centers,token_hy,width=token_width,alpha=.4,label='$S_c$ across "time"')
plt.plot(embed_centers,embed_hy)
plt.plot(token_centers,token_hy)

plt.legend()
plt.gca().set(xlim=[-1,1],xlabel='Cosine similarity',ylabel='Count',title='Distributions of $S_c$ in the positional embeddings matrix')
plt.show()

# Exercise 2: Create a shuffled cosine similarity distribution

In [None]:
# start from an independent, flattened copy of the embeddings
# (the shuffle below works in place, so `positions` itself must not be touched)
randomEmbeds = positions.copy().ravel()

# scramble all the values across the whole vector
np.random.shuffle(randomEmbeds)

# fold the vector back into the shape of the original matrix
randomEmbeds = np.reshape(randomEmbeds,positions.shape)


In [None]:
# top panel: the GPT-2 matrix; bottom panel: its shuffled counterpart
_,axs = plt.subplots(2,1,figsize=(8,7))

matrices = (positions,randomEmbeds)
titles   = ('GPT-2 position embeddings matrix','Shuffled embeddings matrix')

for ax,M,title in zip(axs,matrices,titles):
  # transposed so that token position runs along the x-axis;
  # identical color limits make the two panels directly comparable
  ax.imshow(M.T,aspect='auto',vmin=-.1,vmax=.1)
  ax.set(xlabel='Token position',ylabel='Dimensions',title=title)

plt.tight_layout()
plt.show()

In [None]:
# cosine similarity between the columns of the shuffled matrix:
# scale each column to unit length, then take all pairwise dot products
shuffled_col_norms = np.linalg.norm(randomEmbeds,axis=0,keepdims=True)
Rnorm0 = randomEmbeds / shuffled_col_norms
cossim_random = Rnorm0.T @ Rnorm0

In [None]:
# get the unique cosine similarity values from the upper triangle
# (k=1 skips the trivial diagonal; the elements are selected by position,
#  so similarities that happen to be exactly zero are kept as well)
unique_cs_random = cossim_random[np.triu_indices_from(cossim_random,k=1)]

# get their distribution (100 bins; random_hy are the counts, random_hx the bin edges)
random_hy,random_hx = np.histogram(unique_cs_random,100)

# bin centers and widths: plt.bar centers each bar on its x-value, so using
# the centers (not the left edges) puts every bar over the bin it counts
embed_centers = (embed_hx[:-1]+embed_hx[1:])/2
random_centers = (random_hx[:-1]+random_hx[1:])/2
embed_width = embed_hx[1]-embed_hx[0]
random_width = random_hx[1]-random_hx[0]

# visualize!
plt.figure(figsize=(12,4))
plt.bar(embed_centers,embed_hy,width=embed_width,alpha=.4,label='$S_c$ across embeddings')
plt.bar(random_centers,random_hy,width=random_width,alpha=.4,label='$S_c$ in shuffled vectors')
plt.plot(embed_centers,embed_hy)
plt.plot(random_centers,random_hy)

plt.legend()
plt.gca().set(xlim=[-1,1],xlabel='Cosine similarity',ylabel='Count',title='Distributions of $S_c$ in the positional embeddings matrix')
plt.show()

# Exercise 3: Find similar pairs

In [None]:
# reminder: positions matrix is size [index,embedding]

# rank the entries of the similarity matrix from largest to smallest;
# np.triu(...,1) zeroes the diagonal and lower triangle first, so the
# top-ranked entries are (row,col) pairs from the upper triangle
upper_cs = np.triu(cossim_embeds,1)
sortidx = np.argsort(upper_cs,axis=None)[::-1]
xx,yy = np.unravel_index(sortidx,cossim_embeds.shape)

# 10 evenly spaced ranks from 0 (the largest value) to 200
ranks = np.linspace(0,200,10).astype(int)

for dimx,dimy in zip(xx[ranks],yy[ranks]):

  # report this pair and its similarity
  pairname = f'({dimx},{dimy})'
  print(f'Cossim of {cossim_embeds[dimx,dimy]:.3f} in pair {pairname}')

  # one embedding dimension against the other, across all token positions
  plt.plot(positions[:,dimx],positions[:,dimy],'.-',alpha=.5,label=pairname)

# adjustments
plt.gca().set(xlabel='Embedding dimension "x"',ylabel='Embedding dimension "y"')
plt.legend(fontsize=9)
plt.show()