|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: A chat between Alice and Edgar</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">udemy.com/course/dulm_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn

import numpy as np
import matplotlib.pyplot as plt

import requests

# Exercise 1: Copy and condense from the previous CodeChallenge

In [None]:
# run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Eleuther's tokenizer; the padding token is set to the first token of an encoded space
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

# two separate copies of the same pretrained GPT-Neo, each loaded and moved to the device
# (they start out identical and are fine-tuned on different texts)
modelAlice = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)
modelEdgar = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m').to(device)

In [None]:
def _download_text(url):
  """Download `url` and return the response body as a string.

  Raises a `requests` exception on a timeout or a non-2xx HTTP status, so that
  an error page is never silently tokenized and used as training data.
  """
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  return response.text


# Alice's Adventures in Wonderland
text = _download_text('https://www.gutenberg.org/cache/epub/11/pg11.txt')
aliceTokens = tokenizer.encode(text,return_tensors='pt')[0]  # [0] drops the batch dimension -> 1D token tensor

# Edgar Allan Poe
text = _download_text('https://www.gutenberg.org/cache/epub/2148/pg2148.txt')
edgarTokens = tokenizer.encode(text,return_tensors='pt')[0]

# Fine-tune the model

In [None]:
# one AdamW optimizer per model (ALICE first, then EDGAR); both are built from the
# same expression so the learning rate and weight decay are guaranteed to match
optimizerAlice, optimizerEdgar = (
    torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=.01)
    for model in (modelAlice, modelEdgar)
)

In [None]:
# training parameters
num_samples = 476  # number of training iterations (one random batch per model per iteration)
batch_size  = 16   # number of token sequences per batch
seq_len     = 256  # number of tokens in each training sequence

In [None]:
def _finetune_step(model, optimizer, tokens):
  """Run one optimization step of `model` on a random batch drawn from `tokens`.

  Draws `batch_size` random windows of `seq_len` consecutive tokens from the 1D
  tensor `tokens`, computes the language-modeling loss with the batch used as
  its own labels, backpropagates, and updates the weights with `optimizer`.

  Returns the loss of this batch as a Python float.
  """
  # random window start positions; broadcasting with arange gives a (batch_size, seq_len) index grid
  ix = torch.randint(len(tokens)-seq_len,size=(batch_size,))
  X  = tokens[ix[:,None] + torch.arange(seq_len)].to(device)

  # forward pass and get loss
  model.zero_grad()
  outputs = model(X,labels=X)

  # backprop and update the weights
  outputs.loss.backward()
  optimizer.step()
  return outputs.loss.item()


# initialize losses
lossAlice = np.zeros(num_samples)
lossEdgar = np.zeros(num_samples)

# from_pretrained() returns models in evaluation mode; switch to training mode for fine-tuning
modelAlice.train()
modelEdgar.train()

for sampli in range(num_samples):

  # one fine-tuning step per model (Alice first, then Edgar), storing each loss
  lossAlice[sampli] = _finetune_step(modelAlice, optimizerAlice, aliceTokens)
  lossEdgar[sampli] = _finetune_step(modelEdgar, optimizerEdgar, edgarTokens)

  # update progress display
  if sampli%77==0:
    print(f'Sample {sampli:4}/{num_samples}, losses (Alice/Edgar): {lossAlice[sampli]:.2f}/{lossEdgar[sampli]:.2f}')

# back to evaluation mode so that later text generation runs without dropout
modelAlice.eval()
modelEdgar.eval()

In [None]:
# plot the losses: one line per model on a shared axis
fig,ax = plt.subplots(figsize=(10,3))

for losses,color,label in ((lossAlice,'k','ALICE loss'),
                           (lossEdgar,'b','EDGAR loss')):
  ax.plot(losses,color,markersize=8,label=label)

ax.legend()
ax.set(xlabel='Data sample',ylabel='Loss',xlim=[0,num_samples])
plt.show()

# Exercise 2: Have the models chat with each other

In [None]:
def _chat_turn(model, context, max_new_tokens=50):
  """Let `model` continue the conversation in `context`.

  `context` is a (1, n_tokens) tensor holding the whole conversation so far.
  Returns a tensor containing `context` followed by up to `max_new_tokens`
  newly sampled tokens.
  """
  return model.generate(
      context,
      attention_mask=torch.ones_like(context),  # the context has no padding, so attend to every token
      max_new_tokens=max_new_tokens,
      do_sample=True,
      pad_token_id=tokenizer.eos_token_id)      # end-of-text token doubles as padding (instead of a hard-coded id)


# generation should run in evaluation mode (dropout disabled)
modelAlice.eval()
modelEdgar.eval()

# kick-off the convo
outAlice = tokenizer.encode('Hello, my name is Alice.', return_tensors='pt').to(device)
print('\n\n** Alice says:\n',tokenizer.decode(outAlice[0].cpu()))

for _ in range(5):

  # Edgar's turn: his prompt is everything said so far; only his new tokens are printed
  outEdgar = _chat_turn(modelEdgar,outAlice)
  print(f'\n\n** Edgar says (total token count: {len(outEdgar[0])}):\n',
        tokenizer.decode(outEdgar[0][len(outAlice[0]):].cpu()))

  # Alice's turn: same idea, continuing from Edgar's output
  outAlice = _chat_turn(modelAlice,outEdgar)
  print(f'\n\n** Alice says (total token count: {len(outAlice[0])}):\n',
        tokenizer.decode(outAlice[0][len(outEdgar[0]):].cpu()))