|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Quantify the Alice/Edgar fine-tuning</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">udemy.com/course/dulm_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

import requests

# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Exercise 1: Tokenize and find the most frequent 100 tokens

In [None]:
# compute device: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# EleutherAI's GPT-Neo (125m) tokenizer; its padding id is set to the
# first token id produced by encoding a single space
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

# two separate copies of the same pretrained GPT-Neo, each moved to the device
# (one is later fine-tuned on the Alice text, the other on the Edgar text)
modelAlice = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)
modelEdgar = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)

### Import texts

In [None]:
# Download both books from Project Gutenberg and tokenize each into a 1-D tensor of token ids.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the cell on an HTTP error instead of silently
# tokenizing an error page.

# Alice's Adventures in Wonderland
response = requests.get('https://www.gutenberg.org/cache/epub/11/pg11.txt',timeout=30)
response.raise_for_status()
text = response.text
aliceTokens = tokenizer.encode(text,return_tensors='pt')[0] # [0] drops the batch dimension

# Edgar Allan Poe
response = requests.get('https://www.gutenberg.org/cache/epub/2148/pg2148.txt',timeout=30)
response.raise_for_status()
text = response.text
edgarTokens = tokenizer.encode(text,return_tensors='pt')[0] # [0] drops the batch dimension

# summary
print(f'Alice in Wonderland has  {len(aliceTokens):7,} tokens.')
print(f'Edgar Allen Poe text has {len(edgarTokens):7,} tokens.')

### Find the most frequent tokens

In [None]:
def filterShortTokens(tokens,tokenizer,minChars=3):
  """Return a copy of `tokens` in which short tokens are replaced by -1.

  A token is kept only if its decoded string has at least `minChars`
  characters (the default of 3 means "more than 2 characters"). Every other
  position is set to the sentinel value -1, so the output has the same
  length and ordering as the input.

  Each *unique* token id is decoded only once, and the keep/drop decision is
  then broadcast back to every position where that id occurs.

  Parameters
  ----------
  tokens : 1-D torch tensor or array-like of integer token ids
  tokenizer : object with a `decode(token_id) -> str` method
  minChars : minimum decoded length for a token to be kept

  Returns
  -------
  numpy integer array, same length as `tokens`, with -1 marking dropped tokens
  """
  # work in numpy; torch tensors are detached and brought to the CPU first
  if torch.is_tensor(tokens):
    tokenIds = tokens.detach().cpu().numpy()
  else:
    tokenIds = np.asarray(tokens)

  # decide keep/drop once per unique token id
  uniqIds,inverse = np.unique(tokenIds,return_inverse=True)
  keepUniq = np.array([ len(tokenizer.decode(int(tok)))>=minChars for tok in uniqIds ],dtype=bool)

  # map the per-unique decision back onto every token position
  keep = keepUniq[inverse]
  return np.where(keep,tokenIds,-1).astype(int)


# filtered token vectors: original token id where the token has >2 characters, otherwise -1
aliceTokensFilt = filterShortTokens(aliceTokens,tokenizer)
edgarTokensFilt = filterShortTokens(edgarTokens,tokenizer)

In [None]:
# Report how many positions carry the -1 sentinel in each filtered token vector,
# i.e. how many tokens were dropped for having fewer than 3 characters.

# Edgar
edgarShort = (edgarTokensFilt==-1).sum()
edgarTotal = len(edgarTokensFilt)
print('EDGAR:')
print(f'  {edgarShort:,}/{edgarTotal:,} ({100*edgarShort/edgarTotal:.2f}%) tokens have <3 characters.')

# Alice
aliceShort = (aliceTokensFilt==-1).sum()
aliceTotal = len(aliceTokensFilt)
print('\n\nALICE:')
print(f'  {aliceShort:,}/{aliceTotal:,} ({100*aliceShort/aliceTotal:.2f}%) tokens have <3 characters.')

In [None]:
def mostFrequentTokens(tokensFilt,k=100):
  """Return the `k` most frequent token ids in a filtered token vector.

  Positions equal to -1 (the "token was filtered out" sentinel) are removed
  explicitly before counting, so the result never relies on the sentinel
  happening to be the single most frequent value.

  Ties in frequency are broken deterministically (stable sort on descending
  count, so the smaller token id comes first). The order among tied tokens
  may therefore differ from an unstable sort.

  Parameters
  ----------
  tokensFilt : 1-D numpy integer array of token ids, with -1 for dropped tokens
  k : number of tokens to return (fewer are returned if fewer unique tokens exist)

  Returns
  -------
  (tokenIds, counts) : two numpy arrays sorted from most to least frequent
  """
  kept = tokensFilt[tokensFilt!=-1]
  uniq,counts = np.unique(kept,return_counts=True)
  order = np.argsort(-counts,kind='stable')[:k]
  return uniq[order],counts[order]


# for Alice
top100Alice,top100AliceCounts = mostFrequentTokens(aliceTokensFilt)

# for Edgar
top100Edgar,top100EdgarCounts = mostFrequentTokens(edgarTokensFilt)

# list the Alice tokens: id, number of occurrences in the filtered text, decoded string
for t,n in zip(top100Alice,top100AliceCounts):
  print(f'Token {t:5} appears {n:4} times and is "{tokenizer.decode(t)}"')

In [None]:
# list the most frequent Edgar tokens: id, number of occurrences in the
# filtered Edgar text, and the decoded string
for tokenId in top100Edgar:
  nOccurrences = np.sum(edgarTokensFilt==tokenId)
  print(f'Token {tokenId:5} appears {nOccurrences:4} times and is "{tokenizer.decode(tokenId)}"')

# Exercise 2: Quantify common token usage before fine-tuning

In [None]:
numreps =  10 # number of random repetitions (sequences generated per model)
numtoks = 100 # output length (new tokens generated per sequence)

# initialize the results matrices:
#   rows    -> 0 = before fine-tuning, 1 = after fine-tuning
#   columns -> 0 = ALICE model,        1 = EDGAR model
tokenUsageAlice = np.zeros((2,2)) # % of generated tokens that are top-100 ALICE tokens
tokenUsageEdgar = np.zeros((2,2)) # % of generated tokens that are top-100 EDGAR tokens

# random starting tokens (one per repetition), shared by both models
randstarts = torch.randint(tokenizer.vocab_size,(numreps,1)).to(device)

# Explicit all-ones attention mask. The pad id used here is a regular
# vocabulary token, so without a mask generate() may infer one from
# pad_token_id and hide a start token that happens to equal the pad id.
startmask = torch.ones_like(randstarts)

# generation settings shared by both models; min_length==max_length forces
# exactly numtoks new tokens after the single start token
genSettings = dict(
      attention_mask = startmask,
      min_length     = numtoks+1,
      max_length     = numtoks+1,
      do_sample      = True,
      pad_token_id   = tokenizer.pad_token_id )


# ALICE: generate, drop the start token, and flatten all repetitions into one vector
outAlice = modelAlice.generate(randstarts,**genSettings).cpu()
genTokensAlice = outAlice[:,1:].reshape(-1)

# EDGAR: same as above
outEdgar = modelEdgar.generate(randstarts,**genSettings).cpu()
genTokensEdgar = outEdgar[:,1:].reshape(-1)

# calculate the percentage of generated tokens that belong to each top-100 list
tokenUsageAlice[0,0] = np.mean(100*np.isin(genTokensAlice,top100Alice)) # ALICE model, ALICE tokens
tokenUsageAlice[0,1] = np.mean(100*np.isin(genTokensEdgar,top100Alice)) # EDGAR model, ALICE tokens

tokenUsageEdgar[0,0] = np.mean(100*np.isin(genTokensAlice,top100Edgar)) # ALICE model, EDGAR tokens
tokenUsageEdgar[0,1] = np.mean(100*np.isin(genTokensEdgar,top100Edgar)) # EDGAR model, EDGAR tokens

tokenUsageEdgar # have a look

# Exercise 3: Fine-tune the models

In [None]:
# one AdamW optimizer per model (ALICE first, then EDGAR), both with learning rate 5e-5
optimizerAlice,optimizerEdgar = (
  torch.optim.AdamW(net.parameters(), lr=5e-5) for net in (modelAlice,modelEdgar)
)

In [None]:
# training parameters
num_samples = 476 # number of training iterations (one batch per model per iteration)
batch_size = 32 # number of token sequences per batch
seq_len = 256 # number of tokens in each training sequence

In [None]:
# one loss value per training iteration, for each model
lossAlice = np.zeros(num_samples)
lossEdgar = np.zeros(num_samples)

# Both models follow the same recipe, so describe each job as
# (model, optimizer, source tokens, loss vector) and loop over them.
# ALICE is always processed before EDGAR within an iteration.
finetuneJobs = (
  (modelAlice, optimizerAlice, aliceTokens, lossAlice),
  (modelEdgar, optimizerEdgar, edgarTokens, lossEdgar),
)

# offsets 0..seq_len-1, added to each random start index to build contiguous windows
windowOffsets = torch.arange(seq_len)

for sampli in range(num_samples):

  for model,optimizer,tokens,losses in finetuneJobs:

    # batch of data: batch_size random contiguous windows of seq_len tokens
    starts = torch.randint(len(tokens)-seq_len,size=(batch_size,))
    batch  = tokens[starts[:,None] + windowOffsets].to(device)

    # clear old gradients, then forward pass (the inputs double as the labels)
    model.zero_grad()
    result = model(batch,labels=batch)

    # backprop, update the weights, and store the loss
    result.loss.backward()
    optimizer.step()
    losses[sampli] = result.loss.item()

  # update progress display
  if sampli%77==0:
    print(f'Sample {sampli:4}/{num_samples}, losses (Alice/Edgar): {lossAlice[sampli]:.2f}/{lossEdgar[sampli]:.2f}')

In [None]:
# training-loss curves for both models on one wide axis
lossFig,lossAx = plt.subplots(figsize=(10,3))

lossAx.plot(lossAlice,'k',markersize=8,label='ALICE loss') # black line
lossAx.plot(lossEdgar,'b',markersize=8,label='EDGAR loss') # blue line

lossAx.set(xlabel='Data sample',ylabel='Loss',xlim=[0,num_samples])
lossAx.legend()
plt.show()

# Exercise 4: Evaluate the fine-tuning

In [None]:
# Repeat the token-usage measurement, now with the fine-tuned models.
# Uses numreps, numtoks, and the tokenUsage* matrices created in an earlier
# cell; the results go into row 1 (after fine-tuning).

# new random starting tokens (one per repetition), shared by both models
randstarts = torch.randint(tokenizer.vocab_size,(numreps,1)).to(device)

# Explicit all-ones attention mask. The pad id used here is a regular
# vocabulary token, so without a mask generate() may infer one from
# pad_token_id and hide a start token that happens to equal the pad id.
startmask = torch.ones_like(randstarts)

# generation settings shared by both models; min_length==max_length forces
# exactly numtoks new tokens after the single start token
genSettings = dict(
      attention_mask = startmask,
      min_length     = numtoks+1,
      max_length     = numtoks+1,
      do_sample      = True,
      pad_token_id   = tokenizer.pad_token_id )


# ALICE: generate, drop the start token, and flatten all repetitions into one vector
outAlice = modelAlice.generate(randstarts,**genSettings).cpu()
genTokensAlice = outAlice[:,1:].reshape(-1)

# EDGAR: same as above
outEdgar = modelEdgar.generate(randstarts,**genSettings).cpu()
genTokensEdgar = outEdgar[:,1:].reshape(-1)


# calculate the percentage of generated tokens that belong to each top-100 list
tokenUsageAlice[1,0] = np.mean(100*np.isin(genTokensAlice,top100Alice)) # ALICE model, ALICE tokens
tokenUsageAlice[1,1] = np.mean(100*np.isin(genTokensEdgar,top100Alice)) # EDGAR model, ALICE tokens

tokenUsageEdgar[1,0] = np.mean(100*np.isin(genTokensAlice,top100Edgar)) # ALICE model, EDGAR tokens
tokenUsageEdgar[1,1] = np.mean(100*np.isin(genTokensEdgar,top100Edgar)) # EDGAR model, EDGAR tokens

In [None]:
# visualize the results: one panel per stage (row 0 = before, row 1 = after),
# each with grouped bars per model (ALICE-token usage next to EDGAR-token usage)

_,axs = plt.subplots(1,2,figsize=(10,3.5))

for row,(ax,panelTitle) in enumerate(zip(axs,('BEFORE training','AFTER training'))):

  # token usage at this stage; entries are ordered [ALICE model, EDGAR model]
  aliceUsage = tokenUsageAlice[row,:]
  edgarUsage = tokenUsageEdgar[row,:]

  # side-by-side bars centered on x=1 (ALICE model) and x=2 (EDGAR model)
  ax.bar([.8,1.8],aliceUsage,width=.4,label='ALICE tokens')
  ax.bar([1.2,2.2],edgarUsage,width=.4,label='EDGAR tokens')

  # y-axis limits: 2 percentage points beyond the smallest and largest bar in this panel
  panelValues = np.concatenate((aliceUsage,edgarUsage))
  ax.set(ylim=[panelValues.min()-2,panelValues.max()+2],xticks=[1,2],xticklabels=['ALICE model','EDGAR model'],
         ylabel='Percent generated (%)',title=panelTitle)
  ax.legend()

plt.tight_layout()
plt.show()

## Qualitative assessment

In [None]:
# tokenize the prompt and move it to the models' device
prompt = 'What did the Red Queen say to Alice?'
x = tokenizer.encode(prompt, return_tensors='pt').to(device)

# sampling settings shared by both models: up to 120 new tokens, padding id 50256
genArgs = dict(max_new_tokens=120,do_sample=True,pad_token_id=50256)

# continue the prompt with each fine-tuned model (ALICE first, then EDGAR)
outAlice = modelAlice.generate(x,**genArgs)
outEdgar = modelEdgar.generate(x,**genArgs)

# print both models' outputs (prompt plus continuation), each under its own header
for header,generated in ( ('** Alice model says:',outAlice), ('\n\n** Edgar model says:',outEdgar) ):
  print(header)
  print(tokenizer.decode(generated[0].cpu()))