|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>Exploring GPT2 and BERT embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# higher-res plots: have the inline backend emit figures in 'svg' format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import GPT-2 and BERT models and tokenizers

In [None]:
from transformers import GPT2Model,GPT2Tokenizer
# load the pretrained 'gpt2' model and the tokenizer of the same name
# (the trailing "G" marks GPT2 objects throughout the notebook)
gpt2 = GPT2Model.from_pretrained('gpt2')
tokenizerG = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
from transformers import BertTokenizer, BertModel
# load the pretrained 'bert-base-uncased' tokenizer and the model of the same name
# (the trailing "B" marks BERT objects throughout the notebook)
tokenizerB = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# check out the GPT2 model: print its text representation to inspect its components
print(gpt2)

In [None]:
# check out the BERT model: print its text representation to inspect its components
print(bert)

# How long are the tokens?

In [None]:
# report each tokenizer's vocabulary size (comma as thousands separator)
for label,tkz in (('BERT',tokenizerB),('GPT2',tokenizerG)):
  print(f'{label} has {tkz.vocab_size:,} tokens.')

In [None]:
# GPT: character length of every token string in the vocabulary.
# The array is built directly from the vocabulary keys, so it always has exactly
# one entry per key (it does not rely on vocab_size matching the number of keys).
token_lengths = np.array([ len(word) for word in tokenizerG.encoder.keys() ],dtype=int)

# unique values and counts for bar plot
uniqueLengthsG,tokenCountG = np.unique(token_lengths,return_counts=True)



# BERT: same computation, but the vocabulary is accessed via get_vocab() (note difference from GPT!)
token_lengths = np.array([ len(word) for word in tokenizerB.get_vocab().keys() ],dtype=int)

# unique values and counts for bar plot
uniqueLengthsB,tokenCountB = np.unique(token_lengths,return_counts=True)

In [None]:
# draw the barplots: one row per tokenizer, same styling and x-range in both
_,axs = plt.subplots(2,1,figsize=(8,6))

# (axis, token lengths, token counts, panel title)
panels = (
  (axs[0],uniqueLengthsG,tokenCountG,'GPT2 token lengths (clipped at 22)'),
  (axs[1],uniqueLengthsB,tokenCountB,'BERT token lengths'),
)

for ax,lengths,counts,panel_title in panels:
  ax.bar(lengths,counts,color='k',edgecolor='gray')
  ax.set(xlabel='Token length (chars)',ylabel='Token count',title=panel_title,
         xlim=[0,22])

plt.tight_layout()
plt.show()

# Extract and visualize the embeddings matrices

In [None]:
# GPT2: the Word Token Embeddings matrix lives in the model's `wte` attribute
embeddingsG = gpt2.wte.weight.detach().numpy()

# BERT: the word token embeddings matrix (note the different attribute path compared to GPT2!)
embeddingsB = bert.embeddings.word_embeddings.weight.detach().numpy()

# print their sizes
for label,matrix in (('BERT',embeddingsB),('GPT2',embeddingsG)):
  print(f'{label} embedding matrix is of size {matrix.shape}.')

In [None]:
fig,axs = plt.subplots(2,1,figsize=(10,8))

# one image per model (GPT first, then BERT); each matrix is transposed so that
# tokens run along the x-axis and embedding dimensions along the y-axis
matrices = (
  (embeddingsG,'GPT-2 embeddings matrix'),
  (embeddingsB,'BERT embedding matrix'),
)

for ax,(matrix,panel_title) in zip(axs,matrices):
  # identical color limits in both panels so the images are directly comparable
  h = ax.imshow(matrix.T,aspect='auto',vmin=-.15,vmax=.15)
  ax.set(xlabel='Tokens',ylabel='Dimensions',title=panel_title)
  fig.colorbar(h,ax=ax,pad=.01)

plt.tight_layout()
plt.show()

# Comparing the same token index

In [None]:
# pick a random token index; the same row number is read from both embedding matrices
ridx = np.random.randint(10000,20000)

# embedding vectors stored at that row, and the text each tokenizer decodes that index to
vecB = embeddingsB[ridx,:]
vecG = embeddingsG[ridx,:]
wordB = tokenizerB.decode(ridx)
wordG = tokenizerG.decode(ridx)

_,axs = plt.subplots(1,2,figsize=(12,4))
ax_lines,ax_scatter = axs

# left panel: both vectors as a function of embedding dimension
ax_lines.plot(vecB,label='BERT',linewidth=.5)
ax_lines.plot(vecG,label='GPT2',linewidth=.5)
ax_lines.legend()
ax_lines.set(xlabel='Embeddings dimension',ylabel='Embedding value',xlim=[0,embeddingsB.shape[1]],title=f'Token {ridx}')

# right panel: one vector against the other, dimension by dimension
ax_scatter.plot(vecB,vecG,'s',markerfacecolor=[.7,.9,.7])
ax_scatter.set(xlabel=f'BERT ("{wordB}")',ylabel=f'GPT2 ("{wordG}")',
               title='Embedding comparison')

plt.show()

In [None]:
# how about the same word?
token = 'the'

# encode the word with each tokenizer; BERT's result is read at position 1, GPT2's at position 0
# (presumably because BERT's encode() puts a special token first -- print the full lists to confirm)
idsB = tokenizerB.encode(token)
idsG = tokenizerG.encode(token)
token_idxB = idsB[1]
token_idxG = idsG[0]

for label,token_idx in (('BERT',token_idxB),('GPT2',token_idxG)):
  print(f'{label}: "{token}" is index {token_idx}')

In [None]:
# embedding vectors of the selected word in each model
vecB = embeddingsB[token_idxB,:]
vecG = embeddingsG[token_idxG,:]

# their correlation (corrcoef returns a 2x2 matrix; element [0,1] is the BERT-vs-GPT2 coefficient)
corr = np.corrcoef(vecB,vecG)

# axis labels show what each tokenizer decodes its index back to
labelB = f'BERT ("{tokenizerB.decode(token_idxB)}")'
labelG = f'GPT2 ("{tokenizerG.decode(token_idxG)}")'

plt.plot(vecB,vecG,'s',markerfacecolor=[.7,.9,.9])
plt.gca().set(xlabel=labelB,ylabel=labelG,
           title=f'Embedding comparison (r = {corr[0,1]:.2f})')

plt.show()

# Distributions of embeddings characteristics

In [None]:
def _density_curve(values,bins=100):
  """Histogram `values` and return (bin centers, density) for line plotting.

  np.histogram returns bins+1 edges for `bins` density values. Each density value
  is paired with the midpoint of its bin; pairing it with the left edge instead
  would shift the curve left by half a bin width (a different amount for each
  dataset, because bin widths depend on the data range).
  """
  density,edges = np.histogram(values,bins=bins,density=True)
  centers = (edges[:-1] + edges[1:]) / 2
  return centers,density


_,axs = plt.subplots(1,2,figsize=(12,3.5))

# compare the embeddings variances (one value per row of each embeddings matrix)
xB,yB = _density_curve(embeddingsB.var(axis=1))
xG,yG = _density_curve(embeddingsG.var(axis=1))

axs[0].plot(xB,yB,linewidth=2,label='BERT')
axs[0].plot(xG,yG,linewidth=2,label='GPT2')
axs[0].set(xlabel='Variance',ylabel='Density',xlim=[0,None],ylim=[0,None],title='Variances distributions')
axs[0].legend()

# compare the embeddings means (one value per row of each embeddings matrix)
xB,yB = _density_curve(embeddingsB.mean(axis=1))
xG,yG = _density_curve(embeddingsG.mean(axis=1))

axs[1].plot(xB,yB,linewidth=2,label='BERT')
axs[1].plot(xG,yG,linewidth=2,label='GPT2')
axs[1].axvline(0,color=[.7,.7,.7],linestyle='--') # reference line at a mean of zero
axs[1].set(xlabel='Average',ylabel='Density',ylim=[0,None],title='Means distributions')
axs[1].legend()


plt.show()