|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge: Train model 1 with GPT2's embeddings</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import requests
import matplotlib.pyplot as plt

# note the random_split
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# GPT-2's tokenizer (the pretrained-model loader is imported here for later use)
from transformers import AutoModelForCausalLM, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyperparameters

In [None]:
# --- data ---
seq_len = 8  # number of tokens per sequence (aka context length)
stride = 2   # step, in tokens, between the starts of consecutive sequences

# --- model ---
embed_dim = 768  # size of each token-embedding vector

# --- training ---
batch_size = 64  # sequences per batch drawn by the DataLoaders

# Exercise 1: Data and model

In [None]:
# import and tokenize the text
# (use a timeout so a stalled connection cannot hang the notebook, and fail
#  loudly on an HTTP error instead of tokenizing an error page as training text)
response = requests.get('https://www.gutenberg.org/files/35/35-0.txt', timeout=30)
response.raise_for_status()
text = response.text

# 1D tensor of token indices for the whole text
tmTokens = torch.tensor( tokenizer.encode(text) )

In [None]:
# dataset of overlapping token windows (note: batching is done by the DataLoader, not in the dataset)
class TokenDataset(Dataset):
  """Sliding-window (input, target) pairs cut from one token sequence.

  Each input is `seq_len` consecutive tokens; its target is the same window
  moved one token to the right. Consecutive windows start `stride` tokens apart.
  """

  def __init__(self, tokens, seq_len=8, stride=4):
    # start index of every window; stopping before len(tokens)-seq_len
    # leaves room for the extra token that the shifted target needs
    window_starts = range(0, len(tokens)-seq_len, stride)

    self.inputs  = [tokens[s   : s+seq_len]   for s in window_starts]
    self.targets = [tokens[s+1 : s+seq_len+1] for s in window_starts]

  def __len__(self):
    """Number of (input, target) pairs."""
    return len(self.inputs)

  def __getitem__(self, idx):
    """Return the (input, target) pair stored at position `idx`."""
    return self.inputs[idx], self.targets[idx]

In [None]:
# build the full dataset of (input, target) sequences
token_dataset = TokenDataset(tmTokens,seq_len,stride)

# 90% of the sequences go to training; the test set gets the remainder,
# so the two sizes always add up to the dataset length
train_ratio = .9
train_size = int(train_ratio * len(token_dataset))
test_size  = len(token_dataset) - train_size

# randomly assign sequences to the train/test subsets
split_sizes = [train_size, test_size]
train_dataset, test_dataset = random_split(token_dataset, split_sizes)

# DataLoaders: reshuffle the training batches every epoch, keep the test order fixed
loader_args = dict(batch_size=batch_size)
train_dataloader = DataLoader(train_dataset, shuffle=True,  **loader_args)
test_dataloader  = DataLoader(test_dataset,  shuffle=False, **loader_args)

print(f'Train set has {train_size:5} sequences.')
print(f'Test set has  {test_size:5} sequences.')

# The model

In [None]:
class Model(nn.Module):
  """Token embedding -> GELU -> linear projection back onto the vocabulary.

  Sizes come from the module-level `tokenizer` (vocabulary size) and
  `embed_dim` (embedding width).
  """

  def __init__(self):
    super().__init__()

    n_vocab = tokenizer.vocab_size

    # token index -> embedding vector
    self.embedding = nn.Embedding(n_vocab,embed_dim)

    # nonlinearity, then embedding vector -> one score per vocabulary entry (no bias)
    self.gelu = nn.GELU()
    self.finalLinear = nn.Linear(embed_dim,n_vocab,bias=False)

  def forward(self,tokx):
    """Return log-probabilities over the vocabulary for every input token."""

    # embed (batch, token, embedding dimension), apply GELU, project to the vocabulary
    logits = self.finalLinear( self.gelu( self.embedding(tokx) ) )

    # log-softmax over the last (vocabulary) axis
    return F.log_softmax(logits,dim=-1)

  def generate(self,tokx,n_new_tokens=30):
    """Append `n_new_tokens` sampled tokens to `tokx` and return the longer sequence."""
    for _ in range(n_new_tokens):

      # log-probabilities at the final token position only
      last_log_probs = self(tokx)[:,-1,:]

      # exponentiate to get probabilities, then sample one token per row
      sampled = torch.multinomial(torch.exp(last_log_probs),num_samples=1)

      # grow the sequence along the token axis
      tokx = torch.cat( (tokx,sampled),dim=1)

    return tokx


# Exercise 2: Copy GPT2's embeddings onto Model 1

In [None]:
# import the GPT2 small model
# (pretrained weights; its token-embedding matrix is read from gpt2.transformer.wte below)
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')

In [None]:
# create a fresh instance of the model; its embedding weights are replaced with GPT2's below
model = Model()

In [None]:
# check that the sizes match
# (the two embedding matrices need the same shape for the copy to be a one-to-one replacement)
print(f'My model embedding:   {model.embedding.weight.shape}')
print(f'GPT2 model embedding: {gpt2.transformer.wte.weight.shape}')

In [None]:
# and copy over
# Copy the values instead of re-pointing `.data`: assigning `.data` would make both
# models share one tensor, so any in-place update to this model's embeddings
# (e.g., an optimizer step when they are not frozen) would also change GPT2's weights.
# `copy_` additionally raises an error if the two shapes are incompatible.
with torch.no_grad():
  model.embedding.weight.copy_(gpt2.transformer.wte.weight)

In [None]:
# check for equality
# (difference between the first embedding vector of each model; all zeros means the rows match)
model.embedding.weight[0] - gpt2.transformer.wte.weight[0]

# Exercise 3: Train the model with frozen embeddings

In [None]:
# set this to True if you're working on exercise 3 :P
this_is_exercise3 = True

if this_is_exercise3:

  # show whether the embeddings are currently trainable
  print(model.embedding.weight.requires_grad)

  # freeze them: no gradients will be computed for the embedding matrix
  model.embedding.weight.requires_grad_(False)

  # show the flag again to confirm the change
  print(model.embedding.weight.requires_grad)

# Prepare for training

In [None]:
# push the model to the GPU
# (`device` is 'cuda' when available and 'cpu' otherwise, so this is also safe on CPU-only machines)
model = model.to(device)

In [None]:
# loss: negative log-likelihood, which pairs with the log-softmax the model returns
loss_function = nn.NLLLoss()
loss_function = loss_function.to(device)

# optimizer: AdamW over the model's parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-2,
)

# Now train the model!

In [None]:
num_epochs = 10

# per-epoch average losses (one entry appended per epoch)
train_loss = []
test_loss = []

for epoch in range(num_epochs):

  # running sum of the per-batch training losses
  epoch_loss = 0

  # loop over batches in the data loader
  for X,y in train_dataloader:

    # move data to GPU
    X,y = X.to(device), y.to(device)

    # clear previous gradients
    model.zero_grad()

    # forward pass
    log_probs = model(X)

    # calculate the loss over all positions.
    # The dataset already shifts the targets by one token (y[:,t] is the token that
    # follows X[:,t]), so predictions and targets line up position by position.
    # Shifting again here would train the model to predict the token two positions
    # ahead, which does not match how generate() uses the output.
    log_probs_flat = log_probs.reshape(-1,log_probs.shape[-1])
    y_flat = y.reshape(-1)
    loss = loss_function(log_probs_flat, y_flat)

    # backprop
    loss.backward()
    optimizer.step()

    # sum the per-batch losses
    epoch_loss += loss.item()
  #- loop over batches ends here


  # evaluate the model with the test set (no gradients needed)
  with torch.no_grad():
    testloss = 0 # reset the test loss
    for X,y in test_dataloader:
      X,y = X.to(device), y.to(device)  # push it to the GPU
      out = model(X)                    # forward pass
      out_flat = out.reshape(-1,out.shape[-1]) # flatten to (batch*tokens, vocab)
      y_flat = y.reshape(-1)            # flatten targets to match
      thisloss = loss_function(out_flat, y_flat) # calculate loss
      testloss += thisloss.item()


  # average the summed losses over the number of batches in each dataloader
  avg_train_loss = epoch_loss / len(train_dataloader)
  avg_test_loss  = testloss / len(test_dataloader)
  train_loss.append(avg_train_loss)
  test_loss.append(avg_test_loss)

  # update our progress :)
  print(f'Epoch {epoch+1:2}, train loss: {avg_train_loss:.4f}, test loss: {avg_test_loss:.4f}')

In [None]:
# plot the per-epoch train and test losses on the current axes
ax = plt.gca()

# marker styling shared by both curves
marker_style = dict(markerfacecolor='w',markersize=8)
ax.plot(train_loss,'ks-',label='Train loss',**marker_style)
ax.plot(test_loss,'ro-',label='Test loss',**marker_style)

ax.legend()
ax.set(xlabel='Epoch',ylabel='Loss',ylim=[3,9])
plt.show()