|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Impact of freezing neo on fine-tuning</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">udemy.com/course/dulm_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM

import time
import requests

# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# run on the first GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# tokenizer that matches the GPT-neo checkpoint;
# the first token of an encoded space serves as the padding token
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

# two separate copies of the same pretrained model, both moved to the device
# (Freeze is loaded first, then Train)
modelFreeze,modelTrain = (
    AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)
    for _ in range(2) )

# Exercise 1: Find the most frequent 100 tokens

In [None]:
# Moby Dick (plain-text edition from Project Gutenberg)
response = requests.get('https://www.gutenberg.org/cache/epub/2701/pg2701.txt',timeout=60)
response.raise_for_status() # fail loudly on an HTTP error instead of tokenizing an error page
text = response.text

# tokenize the whole book; [0] drops the batch dimension added by return_tensors='pt'
tokens = tokenizer.encode(text,return_tensors='pt')[0]

# summary
print(f'Moby Dick has {len(tokens):,} tokens, of which {len(torch.unique(tokens)):,} are unique.')

In [None]:
# unique token indices in the book and how many times each one occurs
uniq,counts = np.unique(tokens,return_counts=True)

# order by descending count and keep the 100 most frequent tokens
freqidx = np.argsort(counts)[::-1]
top100 = uniq[freqidx[:100]]

# report each token with its count;
# the counts come straight from np.unique, so the full token tensor is not re-scanned per token
for t,c in zip(top100,counts[freqidx[:100]]):
  print(f'Token {t:5} appears {c:4} times and is "{tokenizer.decode(t)}"')

In [None]:
# generation settings
numtoks = 100 # number of new tokens per generated sequence
numreps =  10 # number of sequences (random starting tokens) per model

# percent of generated tokens that are in top100
# rows: pre/post fine-tuning, columns: Freeze/Train
tokenUsage = np.zeros((2,2))

# one random starting token per repetition, shared by both models
randstarts = torch.randint(tokenizer.vocab_size,(numreps,1)).to(device)


# FREEZE model: sample sequences of exactly numtoks+1 tokens (start token + numtoks new ones)
outFreeze = modelFreeze.generate(
      randstarts,
      do_sample    = True,
      min_length   = numtoks+1,
      max_length   = numtoks+1,
      pad_token_id = tokenizer.pad_token_id )
outFreeze = outFreeze.cpu()

# drop the starting token and pool all repetitions into one vector
genTokensFreeze = outFreeze[:,1:].reshape(-1)


# TRAIN model: identical procedure, same starting tokens
outTrain = modelTrain.generate(
      randstarts,
      do_sample    = True,
      min_length   = numtoks+1,
      max_length   = numtoks+1,
      pad_token_id = tokenizer.pad_token_id )
outTrain = outTrain.cpu()
genTokensTrain = outTrain[:,1:].reshape(-1)


# percentage of generated tokens that are among the book's top-100 tokens (pre-training row)
tokenUsage[0,0] = (100*np.isin(genTokensFreeze,top100)).mean()
tokenUsage[0,1] = (100*np.isin(genTokensTrain ,top100)).mean()

In [None]:
# display the token-usage matrix (rows: pre/post fine-tuning, columns: Freeze/Train);
# when the cells are run in order, only the first (pre-training) row has been filled in here
tokenUsage

# Exercise 2: Targeted training

In [None]:
# list every parameter name in the model, broken into its dot-separated parts
for name,param in modelFreeze.named_parameters():
  print(name.split('.'))

In [None]:
# TEST: identify QVK weights in layers >5
for name,param in modelFreeze.named_parameters():
  parts = name.split('.')

  # skip anything that is not a sufficiently deep name inside an attention sublayer
  if len(parts)<=5 or parts[3]!='attn':
    continue

  # transformer block index above 5, and a projection whose name starts with q, v, or k
  if int(parts[2])>5 and parts[5][0] in 'qvk':
    print(name)

In [None]:
# Make only the Q/V/K attention projections in transformer blocks >5 trainable,
# and freeze every other parameter of modelFreeze.
for name,param in modelFreeze.named_parameters():

  # split the name by .
  splitstr = name.split('.')

  # see if this fits our filter: an attention parameter, in a block with index >5,
  # whose projection name starts with q, v, or k.
  # All conditions are combined into ONE test so that attention parameters failing the
  # block/projection check (blocks 0-5, out_proj) fall through to the freeze branch
  # instead of being silently left trainable.
  trainable = (len(splitstr)>5) and (splitstr[3]=='attn') and \
              (int(splitstr[2])>5) and (splitstr[5][0] in 'qvk')

  param.requires_grad = trainable

  if trainable:
    print(f'+++ Layer {name} is trainable (.requires_grad = {param.requires_grad}).')

  # otherwise, the layer is frozen
  else:
    print(f'--- Layer {name} is frozen (.requires_grad = {param.requires_grad}).')

# Exercise 3: Fine-tune the models

In [None]:
# one AdamW optimizer per model (FREEZE first, then TRAIN), both with the same learning rate
optimizerFreeze,optimizerTrain = (
    torch.optim.AdamW(model.parameters(), lr=.0005)
    for model in (modelFreeze,modelTrain) )

In [None]:
# fine-tuning configuration
num_samples = 474 # number of training iterations (one random batch each)
batch_size  =  16 # sequences per batch
seq_len     = 256 # tokens per sequence

In [None]:
# initialize losses and per-step weight-change norms (columns: Freeze, Train)
losses = np.zeros((num_samples,2))
delta_norm_em = np.zeros((num_samples,2))

# and computation times (accumulated seconds per model)
timeTrain = 0
timeFreeze = 0


def _trackedWeights(model):
  '''Return a detached copy of the key-projection weights of the attention
  sublayer in transformer block index 6 (the matrix monitored during training).'''
  return model.transformer.h[6].attn.attention.k_proj.weight.detach().clone()


def _fineTuneStep(model,optimizer,X):
  '''Run one timed optimization step of `model` on the token batch `X`.

  The batch is used as both input and labels. Returns a tuple
  (loss as a Python float, elapsed time in seconds); the timer covers
  zero_grad, the forward pass, backprop, the optimizer step, and reading the loss.
  '''
  start_time = time.perf_counter() # start the timer

  # forward pass and get loss
  model.zero_grad()
  outputs = model(X,labels=X)

  # backprop and read out the loss
  outputs.loss.backward()
  optimizer.step()
  loss = outputs.loss.item()

  return loss, time.perf_counter()-start_time


# grab the initial attention (K-projection) weights for comparison
prev_emFreeze = _trackedWeights(modelFreeze)
prev_emTrain  = _trackedWeights(modelTrain)



# and run the training!
for sampli in range(num_samples):

  # get a batch of data: batch_size random windows of seq_len consecutive tokens
  ix = torch.randint(len(tokens)-seq_len,size=(batch_size,))
  X  = tokens[ix[:,None] + torch.arange(seq_len)].to(device)


  ### --- FREEZE fine-tuning
  losses[sampli,0],stepTime = _fineTuneStep(modelFreeze,optimizerFreeze,X)
  timeFreeze += stepTime

  ### --- TRAIN fine-tuning (same batch)
  losses[sampli,1],stepTime = _fineTuneStep(modelTrain,optimizerTrain,X)
  timeTrain += stepTime


  ### --- matrix norm to assess the change in the tracked attention weights since the previous step
  curFreeze = _trackedWeights(modelFreeze)
  delta_norm_em[sampli,0] = torch.norm(curFreeze - prev_emFreeze).item()
  prev_emFreeze = curFreeze

  curTrain = _trackedWeights(modelTrain)
  delta_norm_em[sampli,1] = torch.norm(curTrain - prev_emTrain).item()
  prev_emTrain = curTrain


  # update progress display
  if sampli%37==0:
    print(f'Sample {sampli:4}/{num_samples}, losses (Freeze/Train): {losses[sampli,0]:.2f}/{losses[sampli,1]:.2f}')

# Exercise 4: Visualize the results

In [None]:
# two panels: loss curves over training (left) and a FREEZE-vs-TRAIN loss scatter (right)
_,axs = plt.subplots(1,2,figsize=(10,4))
axCurves,axScatter = axs

# left panel: one loss curve per model
axCurves.plot(losses[:,0],'k',markersize=8,label='FREEZE loss')
axCurves.plot(losses[:,1],'b',markersize=8,label='TRAIN loss')
axCurves.set(xlabel='Data sample',ylabel='Loss',title='Losses over training',
             xlim=[0,num_samples],ylim=[0,5])
axCurves.legend()

# right panel: per-sample losses of the two models against each other, with the unity line behind
xylim = [losses.min()-.1,losses.max()+.1]
axScatter.plot(losses[:,0],losses[:,1],'ko',markerfacecolor=[.9,.7,.7],alpha=.4,label='Data')
axScatter.plot(xylim,xylim,'k--',zorder=-3,label='Unity')
axScatter.set(xlim=xylim,ylim=xylim,xlabel='FREEZE model',ylabel='TRAIN model',title='Losses')
axScatter.legend()

plt.tight_layout()
plt.show()

In [None]:
# fresh random starting tokens, one per repetition, shared by both models
randstarts = torch.randint(tokenizer.vocab_size,(numreps,1)).to(device)


# FREEZE model: sample sequences of exactly numtoks+1 tokens (start token + numtoks new ones)
outFreeze = modelFreeze.generate(
      randstarts,
      do_sample    = True,
      min_length   = numtoks+1,
      max_length   = numtoks+1,
      pad_token_id = tokenizer.pad_token_id )
outFreeze = outFreeze.cpu()

# drop the starting token and pool all repetitions into one vector
genTokensFreeze = outFreeze[:,1:].reshape(-1)


# TRAIN model: identical procedure, same starting tokens
outTrain = modelTrain.generate(
      randstarts,
      do_sample    = True,
      min_length   = numtoks+1,
      max_length   = numtoks+1,
      pad_token_id = tokenizer.pad_token_id )
outTrain = outTrain.cpu()
genTokensTrain = outTrain[:,1:].reshape(-1)


# percentage of generated tokens that are among the book's top-100 tokens (post-training row)
tokenUsage[1,0] = (100*np.isin(genTokensFreeze,top100)).mean()
tokenUsage[1,1] = (100*np.isin(genTokensTrain ,top100)).mean()

In [None]:
# decode the TRAIN model's generated tokens back to text
# (all repetitions were pooled into one vector, so they print as one continuous string)
generatedText = tokenizer.decode(genTokensTrain)
print(generatedText)

In [None]:
# visualize the results!
modelLabels = ['FREEZE model','TRAIN model']

_,axs = plt.subplots(1,2,figsize=(10,3))
axUsage,axChange = axs

# left panel: token usage before and after fine-tuning, side-by-side bars per model
axUsage.bar([.8,1.8],tokenUsage[0,:],width=.4,label='BEFORE')
axUsage.bar([1.2,2.2],tokenUsage[1,:],width=.4,label='AFTER')
axUsage.set(xticks=[1,2],xticklabels=modelLabels,xlim=[.3,2.6],
            ylim=[tokenUsage.min()-2,tokenUsage.max()+2],
            ylabel='Percent generated (%)',title='Common Moby Dick tokens generated')
axUsage.legend()

# right panel: change in usage per model (post-training row minus pre-training row)
axChange.bar([1,2],tokenUsage[1,:]-tokenUsage[0,:])
axChange.set(xticks=[1,2],xticklabels=modelLabels,xlim=[.3,2.6],
             ylabel='Change in generated tokens (%)',title='Post- minus pre-training')


plt.tight_layout()
plt.show()

In [None]:
# how much did the tracked weight matrix change from one training step to the next?
# (delta_norm_em holds the norm of the step-to-step difference; columns: Freeze, Train)
plt.figure(figsize=(8,3))
ax = plt.gca()

for col,lab in enumerate(['FREEZE','TRAIN']):
  ax.plot(delta_norm_em[:,col],linewidth=2,label=lab)

ax.legend()
ax.set(xlim=[0,num_samples],xlabel='Training sample',ylabel='Matrix difference norm')
plt.show()

In [None]:
# total time spent in the fine-tuning steps of each model
times = [timeFreeze,timeTrain]

plt.bar([1,2],times)

# y-axis spans 80% of the faster time to 120% of the slower time
ax = plt.gca()
ax.set(xticks=[1,2],xticklabels=['FREEZE','TRAIN'],
       ylim=[min(timeFreeze,timeTrain)*.8,max(timeFreeze,timeTrain)*1.2],
       ylabel='Computation time (s)',
       title=f'Computation time across {num_samples} training samples')
plt.show()