|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Pretrain LLMs</h1>|
|<h2>Lecture:</h2>|<h1><b>Train model 1</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import requests
import matplotlib.pyplot as plt

# pytorch stuff
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.nn import functional as F

# vector plots
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# GPT-2's tokenizer
# (its vocab_size is what sizes the model's embedding and output layers further down)
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# run on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# Hyperparameters

In [None]:
# data hyperparameters
seq_len = 8 # context length (number of tokens in each training sequence)
stride = 2 # step between the start indices of consecutive training sequences

# model hyperparameters
embed_dim = 2**6 # 64 (size of each token's embedding vector)

# training hyperparameters
batch_size = 64 # number of sequences the DataLoader puts in one batch

# Get data

In [None]:
# download the book text from Project Gutenberg
# (timeout so a stalled connection can't hang the notebook indefinitely)
response = requests.get('https://www.gutenberg.org/files/35/35-0.txt', timeout=30)

# stop here on an HTTP error rather than silently tokenizing an error page
response.raise_for_status()
text = response.text

# tokenize the text and make it a tensor (1-D vector of integer token ids)
tmTokens = torch.tensor( tokenizer.encode(text) )
len(tmTokens)

### DataLoader

In [None]:
# create a class for a dataset (note: batching is done by the DataLoader, not in the dataset)
class TokenDataset(Dataset):
  """Overlapping (input, target) windows cut from one long token sequence.

  Window k starts at index k*stride. Its input is `seq_len` consecutive
  tokens, and its target is the same window moved one token to the right,
  so targets[k][t] is the token that follows inputs[k][t] in the text.
  Batching is left to the DataLoader.

  Args:
    tokens:  indexable, sliceable sequence of tokens (e.g., a 1-D tensor or a list)
    seq_len: number of tokens per window (context length)
    stride:  distance between the starts of consecutive windows
  """

  def __init__(self, tokens, seq_len=8, stride=4):

    # every start index that still leaves room for the one-token-shifted target
    window_starts = range(0, len(tokens)-seq_len, stride)

    # the inputs, and the same windows shifted by one token
    self.inputs  = [ tokens[start   : start+seq_len  ] for start in window_starts ]
    self.targets = [ tokens[start+1 : start+seq_len+1] for start in window_starts ]

  def __len__(self):
    # number of windows
    return len(self.inputs)

  def __getitem__(self, idx):
    # one (input, target) pair
    return self.inputs[idx], self.targets[idx]


In [None]:
# create an instance!
token_dataset = TokenDataset(tmTokens,seq_len,stride)

# wrap the dataset in a DataLoader that serves shuffled batches of batch_size sequences
token_dataloader = DataLoader(token_dataset, batch_size=batch_size, shuffle=True)

# pull one batch to inspect it: an (inputs, targets) pair
next(iter(token_dataloader))

# The model

In [None]:
class Model(nn.Module):
  """Minimal next-token model: embedding -> GELU -> linear layer over the vocabulary.

  Every token is processed independently of the others (there is no attention
  and no position information), so each prediction depends only on the
  current token. Layer sizes are taken from the notebook-level
  `tokenizer.vocab_size` and `embed_dim`.
  """

  def __init__(self):
    super().__init__()

    # embedding matrix
    self.embedding = nn.Embedding(tokenizer.vocab_size,embed_dim)

    # nonlinearity, then the embedding-to-vocabulary (linear) output layer
    self.gelu = nn.GELU()
    self.finalLinear = nn.Linear(embed_dim,tokenizer.vocab_size)

  def forward(self,tokx):
    """Return log-probabilities over the vocabulary for every input token.

    Args:
      tokx: integer token indices, typically [batch, token]

    Returns:
      Log-softmax outputs with the same leading dimensions as `tokx` plus a
      final vocabulary dimension, e.g. [batch, token, vocab].
    """

    # forward pass
    x = self.embedding(tokx) # [batch, token, embeddings]
    x = self.gelu(x)
    x = self.finalLinear(x) # [batch, token, vocab]

    # return log-softmax (pairs with nn.NLLLoss)
    return F.log_softmax(x,dim=-1)

  def generate(self,tokx,n_new_tokens=30):
    """Append `n_new_tokens` sampled tokens to each sequence in `tokx`.

    Args:
      tokx:         integer token indices, [batch, token]
      n_new_tokens: how many tokens to sample and append

    Returns:
      The input tokens followed by the sampled ones, [batch, token+n_new_tokens].
    """

    # sampling needs no gradients, so don't build an autograd graph
    with torch.no_grad():
      for _ in range(n_new_tokens):

        # log-probabilities at the final position only
        x = self(tokx)
        x = x[:,-1,:]

        # undo the log but keep the softmax
        probs = torch.exp(x)

        # sample one token per sequence and append it
        nextToken = torch.multinomial(probs,num_samples=1)
        tokx = torch.cat( (tokx,nextToken),dim=1)

    return tokx



### Check the model output

In [None]:
# create an (untrained) model and push one un-batched sequence through it
model = Model()
X,y = token_dataset[4]
out = model(X)

print(X.shape)
print(y.shape)
print(out.shape) # log-probs per token: confirm torch.exp(out).sum(dim=-1) is 1 for each token (not for the whole tensor)

In [None]:
# inputs, targets, and the model's most likely next token at each position
print(X)
print(y)

# argmax over the vocab dimension; without dim, argmax returns a single
# index into the flattened [token, vocab] tensor, which is not a token id
print(torch.argmax(out,dim=-1))

# Prepare for training

In [None]:
# push the model to the GPU (stays on the CPU if `device` fell back to 'cpu')
model = model.to(device)

In [None]:
# create the loss and optimizer functions
# (NLLLoss takes log-probabilities, which is what the model's forward() returns)
loss_function = nn.NLLLoss().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=.001, weight_decay=.01)

In [None]:
# check loss function with sizes
X,y = next(iter(token_dataloader))
log_probs = model(X.to(device))

print(f'Model input is size:   {X.shape}')
print(f'Model output is size:  {log_probs.shape}')
print(f'Target tokens is size: {y.shape}')

# uh oh... this call is expected to raise an error: NLLLoss wants the
# class (vocab) dimension second, e.g., [batch*token, vocab] with targets
# of size [batch*token].
# Note: y[:,t] is already the token that follows X[:,t] (the dataset applies
# the one-token shift), so no extra slicing of log_probs or y is needed.
loss_function(log_probs,y.to(device))

In [None]:
# flatten to [batch*token, vocab]
# (y[:,t] is already the token that follows X[:,t], so predictions and
#  targets line up position-for-position without any extra offset)
log_probs_flat = log_probs.reshape(-1,log_probs.shape[-1])

# flatten to a vector
y_flat = y.reshape(-1)

print(f'Model output is size:  {log_probs_flat.shape}')
print(f'Target tokens is size: {y_flat.shape}')

# Now compute the loss
loss = loss_function(log_probs_flat, y_flat.to(device))
print('\nLoss:',loss)

In [None]:
# some model generated text (compare with post-training)

# needs to be a tensor with one batch: encode the prompt, then add a leading batch dimension
startToks = torch.tensor(tokenizer.encode('I thought the Eloi would be smarter than')).unsqueeze(0)

# text generation: sample the default number of new tokens after the prompt,
# then decode the prompt + continuation back into text
Y = model.generate(startToks.to(device))
print(tokenizer.decode(Y[0].tolist()))

# Now train the model!

In [None]:
num_epochs = 25

# initialize losses (one average loss per epoch)
total_loss = np.zeros(num_epochs)


# training loop
for epoch in range(num_epochs):

  # initialize batch losses to accumulate
  epoch_loss = 0

  # loop over batches in the data loader
  for X,y in token_dataloader:

    # move data to GPU
    X,y = X.to(device), y.to(device)

    # clear previous gradients
    model.zero_grad()

    # forward pass
    log_probs = model(X)

    # calculate the loss over all positions.
    # y[:,t] is already the token that follows X[:,t] (the dataset applies the
    # one-token shift), so predictions and targets are aligned as-is; slicing
    # them again would train the model to predict two tokens ahead.
    log_probs_flat = log_probs.reshape(-1,log_probs.shape[-1]) # [batch*token, vocab]
    y_flat = y.reshape(-1) # [batch*token]
    loss = loss_function(log_probs_flat, y_flat)

    # backprop
    loss.backward()
    optimizer.step()

    # sum the batch loss
    epoch_loss += loss.item()

  # average over the number of batches in this dataloader
  total_loss[epoch] = epoch_loss / len(token_dataloader)

  # update our progress :)
  if epoch%2==0:
    print(f'Finished epoch {epoch+1:2} with loss {total_loss[epoch]:.4f}')

In [None]:
# plot the losses (one point per epoch: the average batch loss stored in total_loss)
plt.plot(total_loss,'ks-',markerfacecolor='w',markersize=8)
plt.gca().set(xlabel='Epoch',ylabel='Loss')
plt.show()

In [None]:
# theoretical loss for untrained weights:
# the negative log-likelihood when every token in the vocabulary gets the same probability (1/vocab_size)
-np.log(1/tokenizer.vocab_size)

In [None]:
# decode one training input sequence and its target sequence back into text
# (X and y hold whatever batch was assigned last; after the training cell, that is the final batch)
# NOTE(review): index 6 assumes that batch contains at least 7 sequences — confirm
print(tokenizer.decode(X[6].tolist()))
print(tokenizer.decode(y[6].tolist()))

In [None]:
# and generate new topic-related data

# needs to be a tensor with one batch: encode the prompt, then add a leading batch dimension
startToks = torch.tensor(tokenizer.encode('I thought the Eloi would be smarter than')).unsqueeze(0)

# text generation: sample 80 new tokens after the prompt,
# then decode the prompt + continuation back into text
Y = model.generate(startToks.to(device),n_new_tokens=80)
print(tokenizer.decode(Y[0].tolist()))

In [None]:
# when the initial text doesn't repeat:
# (i.e., the printed output doesn't appear to start with the prompt)
# inspect the raw tokens; generate() appends to its input, so Y starts with the tokens in startToks
startToks,Y

In [None]:
# use this:
# (swap each carriage return for a newline before printing)
print(tokenizer.decode(Y[0].tolist()).replace('\r','\n'))

In [None]:
# b/c of the carriage return (go to the beginning of the line, not the next line)
# decode token 201 by itself to see that character
tokenizer.decode([201])