|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>Position embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# high-resolution plots: ask the inline matplotlib backend to emit figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import GPT-2 model and extract its position embedding matrix

In [None]:
# load the model: the pretrained 'gpt2' checkpoint through the
# Hugging Face `transformers` GPT2Model class
from transformers import GPT2Model
gpt2 = GPT2Model.from_pretrained('gpt2')
# bare name as the last line of the cell -> the notebook displays the model's repr
gpt2

In [None]:
# position embeddings: the weight matrix of the model's `wpe` layer,
# detached from autograd and exposed as a NumPy array
positions = gpt2.wpe.weight.detach().numpy()

In [None]:
# check the size of this matrix
# (the plots below treat axis 0 as token position and axis 1 as embedding dimension)
positions.shape

In [None]:
# draw the whole matrix as an image; it is transposed so that token position
# runs along the x-axis and embedding dimension along the y-axis
fig,ax = plt.subplots(figsize=(10,4))
ax.imshow(positions.T,aspect='auto',vmin=-.2,vmax=.2)
ax.set(xlabel='Token position',ylabel='Dimensions',title='GPT-2 position embedding matrix')
plt.show()

# Visualize some position vectors

In [None]:
# 3x4 grid of panels; each one shows a randomly chosen embedding dimension
# traced across all token positions
_,axs = plt.subplots(3,4,figsize=(16,6))

# rows of the matrix are token positions, columns are embedding dimensions
n_positions,n_dims = positions.shape

for a in axs.flatten():

  # a random embedding dimension (i.e., a random column of the matrix)
  randidx = np.random.randint(n_dims)

  # plot that dimension's weight at every token position;
  # the legend names the dimension, because the x-axis already is token position
  a.plot(positions[:,randidx],'k',label=f'Embedding dimension {randidx}')
  a.axhline(0,linestyle='--',color='gray',zorder=-3)

  a.set(xticks=[],yticks=[0],xlim=[0,n_positions])
  a.legend(fontsize=10)


# x-axis label on one plot only (the bottom-right panel), addressed explicitly
# instead of relying on the loop variable left over from the last iteration
axs[-1,-1].set_xlabel('Start of text <---> end of context window',fontsize=12)
plt.tight_layout()
plt.show()

# Similarities across vectors

In [None]:
# Cosine similarity is the dot product of unit-length vectors, so each matrix
# is normalized along one axis and then multiplied with its own transpose.

# rows scaled to unit length -> similarity between token positions ("time series")
Pnorm1 = positions / np.linalg.norm(positions,axis=1,keepdims=True)
cossim_tokens = Pnorm1 @ Pnorm1.T

# columns scaled to unit length -> similarity between embedding dimensions
Pnorm0 = positions / np.linalg.norm(positions,axis=0,keepdims=True)
cossim_embeds = Pnorm0.T @ Pnorm0


# one panel per similarity matrix: (matrix, label for both axes, title)
panels = (
  (cossim_tokens,'Token index ("time")','$S_c$ over "time"'),
  (cossim_embeds,'Embedding index','$S_c$ across embeddings'),
)

fig,axs = plt.subplots(1,2,figsize=(12,5))

for ax,(simmat,axlabel,title) in zip(axs,panels):

  # the image, on a fixed [-1,1] color scale
  h = ax.imshow(simmat,vmin=-1,vmax=1)
  ax.set(xlabel=axlabel,ylabel=axlabel,title=title)

  # slim colorbar with ticks every 0.5
  ch = fig.colorbar(h,ax=ax,pad=.02,fraction=.046)
  ch.ax.tick_params(labelsize=10)
  ch.ax.set_yticks(np.arange(-1,1.1,.5))

plt.tight_layout()
plt.show()

# Sinusoidal embeddings as defined in the "Attention Is All You Need" paper

In [None]:
# Sinusoidal position embeddings:
#   PE[pos,2k]   = sin( pos / 10000^(2k/d) )
#   PE[pos,2k+1] = cos( pos / 10000^(2k/d) )
# The result has the same shape and dtype as the learned matrix `positions`,
# so the two can be plotted and compared side by side.
positionsFormula = np.zeros_like(positions)
n_pos,d = positionsFormula.shape

# token position ("time")
th = np.arange(n_pos)

# even dimension indices (2k); each one starts a (sin,cos) pair
pair_starts = np.arange(0,d,2)

# denominator scaling factors, one per pair
denoms = 10000 ** (pair_starts / d)

# phase angles: one row per token position, one column per (sin,cos) pair
angles = th[:,None] / denoms

# sines fill the even columns, cosines the odd columns
# (the slice on the cosine side drops the unpaired last column when d is odd)
positionsFormula[:,0::2] = np.sin(angles)
positionsFormula[:,1::2] = np.cos(angles[:,:d//2])



#### and visualize
_,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: the full matrix as an image (embedding dimensions on the y-axis)
axs[0].imshow(positionsFormula.T,vmin=-1,vmax=1)
axs[0].set(ylabel='Embedding dimensions',xlabel='Token order ("time")',title='All position embeddings')

# right panel: four evenly spaced columns drawn as curves over token order
pos2show = np.linspace(200,600,4,dtype=int)
h = axs[1].plot(positionsFormula[:,pos2show])
axs[1].set(ylabel='Weight value',xlabel='Token order ("time")',xlim=[0,len(th)],title='A few position embeddings')

# mark each selected column in the image with a dashed line in its curve's color
for curve,p in zip(h,pos2show):
  axs[0].axhline(p,linestyle='--',color=curve.get_color(),linewidth=1.8)


plt.tight_layout()
plt.show()

In [None]:
# same plot as earlier with the learned embeddings, now using the
# formula-based (sinusoidal) embeddings

_,axs = plt.subplots(3,4,figsize=(16,6))

# size everything from the matrix that is actually plotted
# (rows are token positions, columns are embedding dimensions)
n_positions,n_dims = positionsFormula.shape

for a in axs.flatten():

  # a random embedding dimension (i.e., a random column of the matrix)
  randidx = np.random.randint(n_dims)

  # plot that dimension's value at every token position;
  # the legend names the dimension, because the x-axis already is token position
  a.plot(positionsFormula[:,randidx],'k',label=f'Embedding dimension {randidx}')
  a.axhline(0,linestyle='--',color='gray',zorder=-3)

  a.set(xticks=[],yticks=[0],xlim=[0,n_positions])
  a.legend(fontsize=10)


# x-axis label on one plot only (the bottom-right panel), addressed explicitly
# instead of relying on the loop variable left over from the last iteration
# NOTE(review): the learned-embedding figure labels this same axis
# 'Start of text <---> end of context window' -- confirm which wording is intended
axs[-1,-1].set(xlabel='<-- present     ...     past -->')
plt.tight_layout()
plt.show()