|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Maximize the "X" factor</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dulm_x/?couponCode=202508" target="_blank">udemy.com/course/dulm_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# pytorch stuff
import torch
import torch.nn as nn
from torch.nn import functional as F

from transformers import AutoModelForCausalLM,GPT2Tokenizer

# for text printing
import textwrap

# Exercise 1: Import and inspect GPT2-medium

In [None]:
# pick the compute device: GPU when one is visible, CPU otherwise
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# pretrained GPT-2 (medium) weights, placed on the selected device
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2-medium')
gpt2 = gpt2.to(device)

# matching GPT-2 tokenizer; reuse the end-of-sequence token as the pad token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# bare expression: the notebook shows the model's repr (its module tree) as the cell output
gpt2

In [None]:
# bare expression: the notebook shows the model's configuration object as the cell output
gpt2.config

In [None]:
# the number of transformer blocks is the length of the container gpt2.transformer.h
print(f'There are {len(gpt2.transformer.h)} transformer blocks.')

# Exercise 2: Create and test the loss function

In [None]:
class myLoss_x(nn.Module):
  """KL-divergence loss that pulls predictions toward tokens containing a target string.

  The target distribution is uniform over every vocabulary token whose decoded
  text contains `target`, and zero for all other tokens.

  Args:
    target: substring a decoded token must contain to belong to the target set.
    tok: tokenizer-like object exposing `vocab_size` and `decode(list_of_ids)`;
      when None, the notebook-level `tokenizer` is used.
    dev: device on which the mask is created; when None, the notebook-level
      `device` is used.

  Raises:
    ValueError: if no vocabulary token contains `target`, because the mask
      could not be normalized to a probability distribution.
  """
  def __init__(self, target='x', tok=None, dev=None):
    super().__init__()

    # fall back to the notebook globals so a plain myLoss_x() call keeps working
    tok = tokenizer if tok is None else tok
    dev = device if dev is None else dev

    # mask: 1 if token contains a target, 0 otherwise
    # (collected in a Python list first, to avoid one device write per vocab entry)
    hits = [float(target in tok.decode([t])) for t in range(tok.vocab_size)]
    mask = torch.tensor(hits, dtype=torch.float32, device=dev)

    # without any matching token the normalization below would be 0/0 -> NaN
    total = torch.sum(mask)
    if total == 0:
      raise ValueError(f'No token in the vocabulary contains {target!r}.')

    # normalize to pdist, and register as a buffer so that .to()/.cuda()
    # move it with the module and state_dict() includes it
    self.register_buffer('mask', mask/total)

  def forward(self, log_probs):
    """Return the batch-mean KL divergence between the mask and the predictions.

    Args:
      log_probs: log-softmax probabilities (NOT raw logits) whose last
        dimension matches the length of the mask.

    Returns:
      Scalar tensor from F.kl_div with reduction='batchmean'.
    """
    # assumes log-softmax-prob input!
    return F.kl_div(log_probs, self.mask, reduction='batchmean')

In [None]:
# create a loss function instance; the constructor scans the tokenizer's
# vocabulary to build the normalized target mask
loss_function = myLoss_x().to(device)

In [None]:
# dimensions of the random test batch
batch_size,seq_len = 4,64

# random token indices, drawn first and then transferred to the compute device
X = torch.randint(0,tokenizer.vocab_size,(batch_size,seq_len))
X = X.to(device)

# inference only, so no autograd bookkeeping is needed
with torch.no_grad():
  out = gpt2(X)

# the first element of the model output is what gets inspected below
print(f'Model input has size: {X.shape}')
print(f'Model output has size: {out[0].shape}')

In [None]:
# is this a probability distribution, a log-probdist, or neither?
# (inspect the output vector at the first position of the first sequence)
token_out = out[0][0,0,:]
print(f'Sum of outputs for one token: {token_out.sum()}')
print(f'Sum of exp(outputs) for one token: {torch.exp(token_out).sum()}')

In [None]:
# transform the model output to log-softmax over the last dimension
logprobs = F.log_softmax(out[0],dim=-1)

# collapse all leading dimensions: one row per token position
logprobs_reshape = logprobs.view(-1,tokenizer.vocab_size)

# report the shapes, one line each
for label,shape in (
    ('Shape of logprob(logits):   ',logprobs.shape),
    ('Shape of reshaped logprobs: ',logprobs_reshape.shape),
    ('Shape of loss function mask:',loss_function.mask.shape) ):
  print(label,shape)

In [None]:
# calculate the KL loss between the reshaped log-probabilities and the loss
# function's target mask (bare expression: the result is shown as the cell output)
loss_function(logprobs_reshape)

# Exercise 3: Train the model

In [None]:
# pre-fine-tuning evals: sample a continuation of a fixed prompt
X = tokenizer.encode('Why did the chicken cross the road?',return_tensors='pt')
X = X.to(device)
Y = gpt2.generate(X,do_sample=True,max_length=200)

# show the full text (prompt + continuation), wrapped for readability
generated_text = tokenizer.decode(Y[0].tolist())
print(textwrap.fill(generated_text, width=100))


# keep only the newly generated tokens (everything after the prompt)
new_tokens = Y[0][len(X[0]):]

# how many generated tokens contain a target letter?
hasTarget = sum(1 for t in new_tokens if 'x' in tokenizer.decode(t))

print('\n\n')
print(f'{hasTarget} of {len(new_tokens)} tokens have a target.')

In [None]:
# create the optimizer: AdamW over all model parameters (lower learning rate than before!)
optimizer = torch.optim.AdamW(gpt2.parameters(), lr=1e-6, weight_decay=.01)

In [None]:
# number of training iterations (each one uses a fresh random batch)
num_epochs = 300

# storage for the loss of every iteration
total_loss = np.zeros(num_epochs)


for epoch in range(num_epochs):

  # clear the gradients left over from the previous iteration
  optimizer.zero_grad()

  # new batch of random tokens, moved to the GPU
  X = torch.randint(0,tokenizer.vocab_size,(batch_size,seq_len))
  X = X.to(device)

  # forward pass
  logits = gpt2(X)[0]

  # flatten to (positions x vocab), convert to log-probs, and score against the mask
  logits_reshape = logits.view(-1,tokenizer.vocab_size)
  logprobs_reshape = F.log_softmax(logits_reshape,dim=-1)
  loss = loss_function(logprobs_reshape)

  # backprop and parameter update
  loss.backward()
  optimizer.step()

  # store the loss as a plain Python number
  total_loss[epoch] = loss.item()

  # update our progress :)
  if not epoch%37:
    print(f'Finished epoch {epoch:4} with loss {total_loss[epoch]:.4f}')

In [None]:
# plot the losses (one point per training epoch) on an explicit figure/axis pair
fig,ax = plt.subplots(figsize=(8,3))
ax.plot(total_loss,'k')
ax.set(xlabel='Epoch',ylabel='Loss')
plt.show()

In [None]:
# post-fine-tuning evals (same prompt and sampling settings as the pre-fine-tuning check)
X = tokenizer.encode('Why did the chicken cross the road?',return_tensors='pt').to(device)
Y = gpt2.generate(X,do_sample=True,max_length=200)
print(textwrap.fill(tokenizer.decode(Y[0].tolist()), width=100))


# how many generated tokens contain a target letter?
# (only tokens after the prompt are counted)
hasTarget = 0
for t in Y[0][len(X[0]):]:
  if 'x' in tokenizer.decode(t):
    hasTarget += 1

print('\n\n')
print(f'{hasTarget} of {len(Y[0][len(X[0]):])} tokens have a target.')