|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>Alice in Wonderland and Edgar Allan Poe (with GPT-neo)</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

!pip install torchinfo
from torchinfo import summary

import requests

In [None]:
# choose the compute device up front: first CUDA GPU if one is available, else the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# tokenizer belonging to EleutherAI's GPT-neo (125m) checkpoint;
# its padding token is set to the first token id of an encoded single space
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

# two separately loaded copies of the same pretrained model (one per text),
# each moved to the device right after loading
modelAlice = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)
modelEdgar = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)

# Inspect the model

In [None]:
# inspect the model
# (the bare expression on the last line of the cell is what the notebook displays,
#  so this shows the printed representation of modelAlice)
modelAlice

In [None]:
# accessing a particular weights matrix:
# drill down through the module hierarchy to the `k_proj` layer (presumably the
# key projection) in the attention module of the block at index 3, and show the
# shape of its weight matrix
modelAlice.transformer.h[3].attn.attention.k_proj.weight.shape

In [None]:
# tokenize an example prompt to use as the input for torchinfo's summary
x = tokenizer.encode('What did the Red Queen say to Alice?', return_tensors='pt')
x = x.to(device)

# layer-by-layer table with input sizes, output sizes, and parameter counts
summary(
  modelAlice,
  input_data = x,
  col_names  = ['input_size','output_size','num_params'],
)

In [None]:
# are the embeddings and unembeddings tied?
embWeights   = modelAlice.transformer.wte.weight.detach()
unembWeights = modelAlice.lm_head.weight.detach()

print('** Embedding:\n',embWeights)
print('\n** Unembedding:\n',unembWeights)

# The printouts above are truncated, so matching corners are only suggestive.
# Check explicitly:
#  - same memory address -> the two layers share one weights matrix (tied)
#  - same values only    -> two separate matrices that happen to be equal
sameMemory = embWeights.data_ptr() == unembWeights.data_ptr()
sameValues = torch.equal(embWeights,unembWeights)
print(f'\nShared memory (tied): {sameMemory} | identical values: {sameValues}')

# Explore the tokenizer

In [None]:
# Quick look at the tokenizer: vocabulary size, then a handful of randomly drawn tokens
print(f'Tokenizer has {tokenizer.vocab_size:,} tokens.\nA few random tokens:\n')

for _ in range(30):
  # draw one random token index from the vocabulary
  randtok = torch.randint(tokenizer.vocab_size,(1,))

  # convert the index back into text and report it
  tokenText = tokenizer.decode(randtok)
  print(f'Token {randtok[0]:5} is "{tokenText}"')

In [None]:
# check the raw output shape
into = tokenizer.encode('What did the Red Queen say to Alice?', return_tensors='pt').to(device)

# this forward pass is for inspection only (nothing is backpropagated),
# so disable gradient tracking to avoid building an unused autograd graph
with torch.no_grad():
  out = modelAlice(into)

# the last dimension is the vocabulary (one logit per possible next token),
# not the embedding dimension
print(out.logits.shape) # [batch, tokens, vocab]

In [None]:
# generate a sampled continuation of the prompt (at most 120 new tokens)
out = modelAlice.generate(
  into,
  max_new_tokens = 120,
  do_sample      = True,
  pad_token_id   = 50256,
)

# bring the first (only) sequence back to the CPU and convert it to text
generatedText = tokenizer.decode(out[0].cpu())
print(generatedText)

In [None]:
# is it different from GPT's tokenizer?
# load the tokenizer published under the name 'gpt2' so that the same token
# indices can be decoded by both tokenizers

from transformers import GPT2Tokenizer
gptTokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# decode the same random token indices with both tokenizers, side by side
for _ in range(30):
  # one random index (drawn from the GPT-neo tokenizer's vocabulary range)
  randtok = torch.randint(tokenizer.vocab_size,(1,))

  # text of that index according to each tokenizer
  e,g = tokenizer.decode(randtok), gptTokenizer.decode(randtok)
  print(f'Token {randtok[0]:5} is "{e}" and "{g}"')

# Import and process texts

In [None]:
def _download_text(url):
  """Download `url` and return the response body as a string.

  Raises a `requests` exception on an HTTP error status or if the server does
  not respond in time, so that an error page is never silently tokenized and
  used as training text.
  """
  response = requests.get(url,timeout=30)
  response.raise_for_status()
  return response.text


# Alice in Wonderland (Project Gutenberg ebook #11)
text = _download_text('https://www.gutenberg.org/cache/epub/11/pg11.txt')
aliceTokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )

# Edgar Allan Poe (Project Gutenberg ebook #2148)
text = _download_text('https://www.gutenberg.org/cache/epub/2148/pg2148.txt')
edgarTokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )

# summary
# NOTE: the full downloaded files are tokenized as-is (nothing is stripped from them)
print(f'Alice in Wonderland has  {len(aliceTokens):7,} tokens.')
print(f'Edgar Allen Poe text has {len(edgarTokens):7,} tokens.')

# Prepare for fine-tuning

In [None]:
# both models are fine-tuned with identical AdamW settings
adamwSettings = dict(lr=5e-5, weight_decay=.01)

# one optimizer per model, each over that model's own parameters
optimizerAlice = torch.optim.AdamW(modelAlice.parameters(), **adamwSettings)
optimizerEdgar = torch.optim.AdamW(modelEdgar.parameters(), **adamwSettings)

In [None]:
# hyperparameters for fine-tuning
num_samples = 476  # number of training iterations (one random batch per model per iteration)
batch_size  = 16   # number of token sequences in each batch
seq_len     = 256  # number of tokens in each training sequence

# Fine-tune the model

In [None]:
def _finetune_step(model, optimizer, tokens):
  """Run one optimization step of `model` on a random batch drawn from `tokens`.

  The batch consists of `batch_size` windows of `seq_len` consecutive tokens,
  each starting at a random position in `tokens` (a 1D tensor of token indices).
  The same window is passed as both the input and the labels.

  Uses the notebook-level `seq_len`, `batch_size`, and `device`.
  Returns the loss on this batch as a Python float.
  """
  # random start positions, each expanded into seq_len consecutive indices
  ix = torch.randint(len(tokens)-seq_len,size=(batch_size,))
  X  = tokens[ix[:,None] + torch.arange(seq_len)].to(device)

  # forward pass and get loss (gradients from the previous step are cleared first)
  model.zero_grad()
  outputs = model(X,labels=X)

  # backprop and update the weights
  outputs.loss.backward()
  optimizer.step()
  return outputs.loss.item()


# from_pretrained() returns models in evaluation mode, so explicitly switch to
# training mode before fine-tuning (this matters whenever the model has
# train-only layers such as dropout with a nonzero rate)
modelAlice.train()
modelEdgar.train()

# initialize losses
lossAlice = np.zeros(num_samples)
lossEdgar = np.zeros(num_samples)

# loop over training: each iteration takes one step for each model on its own text
for sampli in range(num_samples):

  lossAlice[sampli] = _finetune_step(modelAlice, optimizerAlice, aliceTokens)
  lossEdgar[sampli] = _finetune_step(modelEdgar, optimizerEdgar, edgarTokens)

  # update progress display
  if sampli%77==0:
    print(f'Sample {sampli:4}/{num_samples}, losses (Alice/Edgar): {lossAlice[sampli]:.2f}/{lossEdgar[sampli]:.2f}')

# back to evaluation mode for the text generation that follows
modelAlice.eval()
modelEdgar.eval()

In [None]:
# loss curves of the two fine-tuning runs on a shared axis
fig,ax = plt.subplots(figsize=(8,4))
ax.plot(lossAlice,'k',markersize=8,label='ALICE loss')
ax.plot(lossEdgar,'b',markersize=8,label='EDGAR loss')

# axis labels, x-range covering all training iterations, and the legend
ax.set(xlabel='Data sample',ylabel='Loss',xlim=[0,num_samples])
ax.legend()
plt.show()

# Qualitative assessment

In [None]:
# the same prompt is given to both fine-tuned models
prompt = 'What did the Red Queen say to Alice?'
x = tokenizer.encode(prompt, return_tensors='pt').to(device)

# identical sampling settings for the two models
generationSettings = dict(max_new_tokens=120, do_sample=True, pad_token_id=50256)
outAlice = modelAlice.generate(x, **generationSettings)
outEdgar = modelEdgar.generate(x, **generationSettings)

# decode the first (only) generated sequence of each model
textAlice = tokenizer.decode(outAlice[0].cpu())
textEdgar = tokenizer.decode(outEdgar[0].cpu())

# show the two continuations one after the other
print('** Alice model says:')
print(textAlice)

print('\n\n** Edgar model says:')
print(textEdgar)