|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Build a GPT</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: GPT-2 trained weights distributions</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM

# render inline matplotlib figures as SVG (vector graphics);
# this call configures the inline backend for every figure drawn after it
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# load the SMALL pretrained GPT-2 model (the 'gpt2' checkpoint)
# note: only the model is created here; no tokenizer is loaded in this cell
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')

# Exercise 1: Distributions of token and position embeddings

In [None]:
# common bin edges for both embedding matrices (71 edges -> 70 bins)
bins = np.linspace(-.6,.6,71)

# density-normalized histograms over all weights in the token (wte) and
# position (wpe) embedding matrices
yE,xE = np.histogram(gpt2.transformer.wte.weight.detach().numpy(),bins,density=True)
yP,xP = np.histogram(gpt2.transformer.wpe.weight.detach().numpy(),bins,density=True)

# np.histogram returns one more edge than it returns heights; draw each height
# at the middle of its bin (not at the left edge) so the curves are not shifted
# by half a bin width. Both histograms share the same edges.
binCenters = (xE[:-1]+xE[1:])/2

_,ax = plt.subplots(figsize=(8,4))
ax.plot(binCenters,yE,'.-',linewidth=2,label='Token')
ax.plot(binCenters,yP,'.-',linewidth=2,label='Position')

ax.legend()
ax.set(xlabel='Parameter value',ylabel='Density',xlim=bins[[0,-1]],title='Embeddings weights')
plt.show()

# Exercise 2: Distributions of attention and MLP weights

In [None]:
_,axs = plt.subplots(1,2,figsize=(13,4))

# shared bin edges for every histogram, plus the bin midpoints used as
# x-coordinates (plotting at the left edges would shift every curve by
# half a bin width)
bins = np.linspace(-1,1,171)
binCenters = (bins[:-1]+bins[1:])/2

# one color per transformer block, sampled evenly along the coolwarm colormap
colors = plt.cm.coolwarm(np.linspace(0,1,len(gpt2.transformer.h)))

for hidx,block in enumerate(gpt2.transformer.h):

  # attention weights (the c_attn matrix of this block); raw counts per bin
  y,x = np.histogram(block.attn.c_attn.weight.detach().numpy(),bins=bins)
  axs[0].plot(binCenters,y,color=colors[hidx],label=f'h{hidx}')

  # MLP weights: pool the c_fc and c_proj matrices of this block into one vector
  # (torch.cat is the canonical name of the torch.concatenate alias)
  allMLP = torch.cat((block.mlp.c_fc.weight.flatten(),block.mlp.c_proj.weight.flatten()))
  y,x = np.histogram(allMLP.detach().numpy(),bins=bins)
  axs[1].plot(binCenters,y,color=colors[hidx],label=f'h{hidx}')


# identical cosmetics for both panels
for a in axs:
  a.legend(fontsize=10)
  a.set(xlim=bins[[0,-1]],xlabel='Weight value',ylabel='Counts')

axs[0].set_title('Attention weights')
axs[1].set_title('MLP weights')

plt.tight_layout()
plt.show()

# Exercise 3: Separately for Q, K, and V matrices

In [None]:
# q,k,v are each of size n_emb X n_emb, and are concatenated into c_attn;
# the cell output below is the shape of that concatenated matrix for the first block
gpt2.transformer.h[0].attn.c_attn.weight.shape  # shape[1]/3 is the width of each of the three matrices

In [None]:
# demo: cut the first block's c_attn matrix into its q, k, and v parts
# the embedding dimension is the number of columns of the token-embedding matrix
n_emb = gpt2.transformer.wte.weight.size(1)
# slice along dim 1 into consecutive chunks that are n_emb columns wide
q_weight, k_weight, v_weight = gpt2.transformer.h[0].attn.c_attn.weight.split(n_emb,dim=1)

In [None]:

# shared bin edges for the three histograms (151 edges -> 150 bins)
bins = np.linspace(-1,1,151)

# embedding dimension = width of each of the q, k, v chunks inside c_attn
n_emb = gpt2.transformer.wte.weight.shape[1]

# collect the per-block pieces in lists and concatenate ONCE afterwards;
# growing an array with np.concatenate inside the loop re-copies all of the
# previously gathered numbers on every iteration
qParts,kParts,vParts = [],[],[]

# loop over all transformer blocks and get their weights
for block in gpt2.transformer.h:

  # split c_attn along dim 1 into the q, k, and v matrices
  q,k,v = torch.split(block.attn.c_attn.weight.detach(),n_emb,dim=1)

  # add to the pile o' numbers
  qParts.append(q.flatten().numpy())
  kParts.append(k.flatten().numpy())
  vParts.append(v.flatten().numpy())

# one flat float64 vector per matrix type, pooled over all blocks
allq = np.concatenate(qParts).astype(np.float64)
allk = np.concatenate(kParts).astype(np.float64)
allv = np.concatenate(vParts).astype(np.float64)

# get density-normalized histograms
yq,xq = np.histogram(allq,bins=bins,density=True)
yk,xk = np.histogram(allk,bins=bins,density=True)
yv,xv = np.histogram(allv,bins=bins,density=True)

# x-coordinates at the bin midpoints (the returned edges are identical for
# q, k, and v); plotting at the left edges would shift the curves by half a bin
binCenters = (xq[:-1]+xq[1:])/2

# and plot
_,ax = plt.subplots(figsize=(10,4))
ax.plot(binCenters,100*yq,linewidth=2,label='Q')
ax.plot(binCenters,100*yk,linewidth=2,label='K')
ax.plot(binCenters,100*yv,linewidth=2,label='V')

ax.legend()
# the plotted values are 100 times the density, so the axis label says so
ax.set(xlim=bins[[0,-1]],xlabel='Weight value',ylabel='Density (x100)',
       title=f'Distributions of attention matrices (n={len(allq):,} each)')

plt.show()