|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>Understanding nn.Embedding and nn.Linear</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import torch
import torch.nn as nn

import numpy as np
import matplotlib.pyplot as plt
# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Create the two tensor objects

In [None]:
# size of the vocabulary, and an embedding dimensionality set to the
# integer part of the square root of the vocab size
vocab_size = 5000
embed_dim = int(vocab_size ** 0.5)
print(f'{vocab_size:,} tokens in the vocab, and {embed_dim} embedding dimensions.')

In [None]:
# the two layers take their size arguments in opposite orders:
#   nn.Embedding(num_embeddings, embedding_dim) -> weights arranged as [in  X out]
#   nn.Linear(in_features, out_features)        -> weights arranged as [out X in]
E = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
L = nn.Linear(in_features=embed_dim, out_features=vocab_size)

# layer summaries, followed by the shapes of the two weights matrices
print(E, L, sep='\n')
print(f'\nEmbedding matrix size: {E.weight.shape}')
print(f'Linear matrix size:    {L.weight.shape}')

# Explore their attributes

In [None]:
# number of attribute/method names exposed by each layer object
tuple(len(dir(layer)) for layer in (E, L))

In [None]:
# list the names of all attributes and methods of the embedding layer
dir(E)

In [None]:
# attribute names of each layer type
attrE = dir(E)
attrL = dir(L)

# names found in one listing but not in the other (listing order is kept)
onlyInE = [name for name in attrE if name not in attrL]
onlyInL = [name for name in attrL if name not in attrE]

print('Attributes in Embedding that are not in Linear:')
for name in onlyInE:
  print('  ' + name)


print('\n\nAttributes in Linear that are not in Embedding:')
for name in onlyInL:
  print('  ' + name)

# Indexing their vectors

In [None]:
# show both weights matrices, separated by an empty line
print(E.weight, '', L.weight, sep='\n')

In [None]:
# pick one token index and look up its embedding vector;
# the lookup emulates multiplying a one-hot vector by the weights matrix
tokenidx = torch.tensor([14], dtype=torch.long)
E(tokenidx)

In [None]:
# L(tokenidx)  <- calling the layer is not a row lookup
# instead, index the token's row of the Linear weights matrix directly
L.weight[tokenidx]

In [None]:
# explicit one-hot encoding: a column vector of zeros
# with a single 1 at the token's index
oneHotVect = torch.zeros(vocab_size,1)
oneHotVect[tokenidx] = 1

# transposed into a row vector and multiplied by the weights matrix,
# the one-hot vector picks out the token's row
torch.matmul(oneHotVect.t(), L.weight)

# Distributions of weights at initialization

In [None]:
# extract histograms to get empirical distributions
# (np.histogram returns the counts and the bin edges; there is one more edge than counts)
Ey,Ex = np.histogram(E.weight.flatten().detach(),100)
Ly,Lx = np.histogram(L.weight.flatten().detach(),100)

# plot each count at the center of its bin; plotting against the left
# edges (Ex[:-1]) would shift every curve by half a bin width
Ecenters = (Ex[:-1]+Ex[1:])/2
Lcenters = (Lx[:-1]+Lx[1:])/2

plt.figure(figsize=(10,4))
plt.plot(Ecenters,Ey,label='nn.Embedding')
plt.plot(Lcenters,Ly,label='nn.Linear')

# x-axis limits follow the range of the Embedding weights
plt.gca().set(xlim=[np.min(Ex),np.max(Ex)],xlabel='Weight value',ylabel='Count',
              title='Distributions of weights at initialization')
plt.legend()
plt.show()

In [None]:
# Note: More on weights initialization in video
# "Weight initializations" in section "Pretrain LLMs"

# theoretical values, computed from the number of input features of L
kaimingBound = np.sqrt(1/embed_dim)
kaimingStd = np.sqrt(1/embed_dim)/np.sqrt(3)

# empirical values, computed from the weights matrix itself
Lweights = L.weight.detach()

print('*** Statistics of nn.Linear weights ***\n')
print(f'Kaiming expected std: {kaimingStd:.4f}')
print(f'Observed std of L: {torch.std(Lweights):.4f}')

print(f'\nKaiming expected bounds: {kaimingBound:.4f}')
print(f'Observed min/max vals: [{torch.min(Lweights):.4f},{torch.max(Lweights):.4f}]')

In [None]:
# same comparison for the Embedding weights, against the Xavier prediction
xavierStd = np.sqrt(2/(vocab_size+embed_dim))
observedStdE = torch.std(E.weight.detach())

print('*** Statistics of nn.Embeddings weights ***\n')
print(f'Xavier expected std: {xavierStd:.4f}')
# the observed value does not match: the weights are normal, not Xavier
print(f'Observed std of E: {observedStdE:.4f}')

In [None]:
# a separate Linear layer (so that L is not overwritten) whose weights are
# re-drawn from a normal distribution to match the properties of E
Ln = nn.Linear(in_features=embed_dim, out_features=vocab_size)
nn.init.normal_(Ln.weight, mean=0.0, std=1.0)

In [None]:
# empirical distribution of the re-initialized Linear weights
Lny,Lnx = np.histogram(Ln.weight.flatten().detach(),100)

def _binCenters(edges):
  """Return the midpoint of each histogram bin, given the array of bin edges."""
  return (edges[:-1]+edges[1:])/2

# plot each count at the center of its bin; plotting against the left
# edges (e.g., Ex[:-1]) would shift every curve by half a bin width
plt.figure(figsize=(10,4))
plt.plot(_binCenters(Ex),Ey,label='nn.Embedding')
plt.plot(_binCenters(Lx),Ly,label='nn.Linear orig')
plt.plot(_binCenters(Lnx),Lny,label='nn.Linear weight init')

# x-axis limits follow the range of the Embedding weights
plt.gca().set(xlim=[np.min(Ex),np.max(Ex)],xlabel='Weight value',ylabel='Count',
              title='Distributions of weights after re-initializing nn.Linear')
plt.legend()
plt.show()

In [None]:
# both created via nn.Parameter
# (the `??` prefix is IPython syntax that displays the source code of an object;
#  it only works inside IPython/Jupyter, not in plain Python)
??nn.Embedding
??nn.Linear

In [None]:
# Conclusions:
#   1) Embedding weights are initialized as standard normal, not Xavier normal.
#   2) nn.Embedding and nn.Linear are "different" for practical reasons, but they reflect the same thing (trainable random weights).