|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Embedding spaces</h1>|
|<h2>Lecture:</h2>|<h1><b>Pretrained embeddings (GloVe)</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# download a small GloVe model (Wikipedia + Gigaword, 50D)

# NOTE: If the import below fails, uncomment and run the following !pip... line,
# then restart your session (from the Runtime menu) and comment the pip line out again.
# !pip install gensim

import gensim.downloader as api
# load the pretrained model by name; every later cell accesses it through `glove`
glove = api.load('glove-wiki-gigaword-50')

In [None]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# ask the inline matplotlib backend to render figures in SVG format
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Explore the glove variable

In [None]:
# list the names of all attributes and methods available on the glove object
# (the list is shown as the cell output)
dir(glove)

In [None]:
# report how many entries the key->index mapping holds
num_items = len(glove.key_to_index)
print(f'The dictionary contains {num_items} items.' )

# preview the first 50 keys (shown as the cell output)
list(glove.key_to_index)[:50]

# Explore the vocab

In [None]:
# draw 10 random vocabulary indices (repeats are possible) and show each token
random_indices = np.random.randint(0,len(glove.key_to_index),10)
for token_index in random_indices:
  token = glove.index_to_key[token_index]
  print(f'Index {token_index:>6} is "{token}"')

In [None]:
# distribution of token character lengths
# (built with a comprehension so no loop variables leak into the notebook
#  namespace and overwrite names such as `word` or `idx`)
token_lengths = np.array([ len(token) for token in glove.key_to_index ],dtype=int)

# counts for the bar plot: the unique lengths and how many tokens have each length
uniqVals,uniqCounts = np.unique(token_lengths,return_counts=True)


# visualize the distribution of lengths
# NOTE: the bar heights are the natural log of the counts, so the y-axis is
# labeled as a log count rather than a raw count.
plt.figure(figsize=(12,4))
plt.bar(uniqVals,np.log(uniqCounts),width=uniqVals[1]-uniqVals[0],facecolor=[.9,.7,.9],edgecolor='k')
plt.gca().set(xlabel='Word length (num characters)',ylabel='Log count')

plt.show()

# Explore the embeddings matrix

In [None]:
# report the shape of the embeddings matrix
emb_shape = glove.vectors.shape
print(f'The embeddings matrix is {emb_shape}')

# look up the index of one word through the key->index mapping
apple_index = glove.key_to_index["apple"]
print(f'The word "apple" has index #{apple_index}')

# the same lookup through the accessor method (shown as the cell output)
glove.get_index('apple')

In [None]:
# heatmap of the whole embeddings matrix, transposed so that word index runs
# along the x-axis and embedding dimension along the y-axis
fig,ax = plt.subplots(figsize=(12,4))
heatmap = ax.imshow(glove.vectors.T,vmin=-1,vmax=1,aspect='auto')
ax.set(ylabel='Dimension',xlabel='Word index',title='Embeddings matrix')
fig.colorbar(heatmap,ax=ax,pad=.01)
plt.show()

In [None]:
# summary statistics per row of the embeddings matrix:
# axis=1 reduces over the columns (the embedding dimensions), so each row
# (one word's vector) yields one mean and one standard deviation
emb_mean = glove.vectors.mean(axis=1)
emb_std  = glove.vectors.std(axis=1)


# seaborn draws the joint distribution; pandas holds the two named columns
import seaborn as sns
import pandas as pd

df = pd.DataFrame({'Mean':emb_mean,'std':emb_std})

# scatter of mean vs. std with marginal distributions
sns.jointplot(data=df,x='Mean',y='std',alpha=.2)
plt.show()

# Explore individual embeddings vectors

In [None]:
# choose the word to look up
word = 'banana'

# find which row of the embeddings matrix belongs to that word
wordidx = glove.key_to_index[word]

# pull out that row: it is the word's embedding vector
thisWordVector = glove.vectors[wordidx]

# print the raw vector values
print(f'The embedding vector for "{word}" is\n {thisWordVector}')

In [None]:
# even easier ;)
# index the glove object directly with the word instead of going through
# key_to_index and the vectors matrix
thisWordVector = glove[word]

print(f'The embedding vector for "{word}" is\n {thisWordVector}')

In [None]:
# visualize the embedding vector of `word`
# NOTE: the data is looked up with glove[word] rather than the separately
# stored `wordidx`, so the plotted vector always matches the word in the title,
# even if `word` has been reassigned since `wordidx` was computed.
plt.figure(figsize=(10,4))
plt.plot(glove[word],'ks',markersize=10,markerfacecolor=[.7,.7,.9])

plt.xlabel('Dimension')
plt.ylabel('Embedding value')
plt.title(f'Embedding vector for "{word}"')
plt.show()

# Relationships across embedding vectors

In [None]:
# pick three words
word1 = 'banana'
word2 = 'apple'
word3 = 'cosmic'


# setup the figure subplot geometry:
# one wide panel on the top row, two side-by-side panels on the bottom row
fig = plt.figure(figsize=(10,7))
gs = GridSpec(2,2)
ax0 = fig.add_subplot(gs[0,:])
ax1 = fig.add_subplot(gs[1,0])
ax2 = fig.add_subplot(gs[1,1])

# plot the embeddings by dimension
# NOTE: the loop variable is deliberately not called `word`, so it does not
# overwrite the notebook-level `word` used by other cells.
for thisword in (word1,word2,word3):
  ax0.plot(glove[thisword],'s-',label=thisword)

ax0.set(xlabel='Dimension',title='Embeddings',xlim=[-1,glove.vectors.shape[1]+1])
ax0.legend()


# plot the embeddings by each other (one marker per embedding dimension)
# bottom-left: word1 vs. word2
cossim = glove.similarity(word1,word2)
ax1.plot(glove[word1],glove[word2],'ko',markerfacecolor=[.9,.7,.7])
ax1.set(xlabel=word1,ylabel=word2,title=f'Cosine similarity = {cossim:.3f}')

# bottom-right: word1 vs. word3
cossim = glove.similarity(word1,word3)
ax2.plot(glove[word1],glove[word3],'ko',markerfacecolor=[.7,.9,.7])
ax2.set(xlabel=word1,ylabel=word3,title=f'Cosine similarity = {cossim:.3f}')

# final touches
plt.tight_layout()
plt.show()

# Methods to identify similar and dissimilar words

In [None]:
# most similar words ("similar" is high cosine similarity)
# topn=9 requests the nine top-ranked matches (shown as the cell output)
glove.most_similar('fashion',topn=9)

In [None]:
# One of these things is not like the others...
lists = [
  [ 'apple','banana','pirate','peach' ],
  [ 'apple','banana','peach','kiwi','starfruit' ],
  [ 'apple','banana','pirate','peach','kiwi','starfruit' ],
  [ 'apple','banana','orange','kiwi' ],
]

# for each list, report the top match for the list as a whole
# and the word flagged as not belonging with the others
for word_list in lists:
  print(f'In the word list {word_list}:')

  closest_word = glove.most_similar(word_list,topn=1)[0][0]
  print(f'  The most similar word is "{closest_word}"')

  odd_one_out = glove.doesnt_match(word_list)
  print(f'  and the non-matching word is "{odd_one_out}"\n')