|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 2:</h2>|<h1>Large language models</h1>|
|<h2>Section:</h2>|<h1>Fine-tune pretrained models</h1>|
|<h2>Lecture:</h2>|<h1><b>BERT decides: Alice or Edgar?</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# typical python libraries
import numpy as np
import matplotlib.pyplot as plt
import requests

# pytorch libraries
import torch
import torch.nn as nn

# huggingface libraries
from transformers import BertModel, BertTokenizer

In [None]:
# pick the compute device: first GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# load the pretrained tokenizer (the BERT model itself is loaded inside the classifier class)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Import the two datasets

In [None]:
# Alice in Wonderland
response = requests.get('https://www.gutenberg.org/cache/epub/11/pg11.txt',timeout=60)
response.raise_for_status() # fail loudly instead of tokenizing an HTTP error page
text = response.text
aliceTokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )

# Edgar Allan Poe
response = requests.get('https://www.gutenberg.org/cache/epub/2148/pg2148.txt',timeout=60)
response.raise_for_status() # fail loudly instead of tokenizing an HTTP error page
text = response.text
edgarTokens = torch.tensor( tokenizer.encode(text),dtype=torch.long )

# Create an LLM model using pretrained BERT with a new head

In [None]:
class BertForBinaryClassification(nn.Module):
  """Pretrained BERT with a freshly initialized linear classification head.

  The head maps BERT's pooled output to `num_labels` logits (raw scores,
  no softmax applied).

  Args:
    num_labels: number of output classes produced by the head (default 2).
  """

  def __init__(self, num_labels=2):
    super().__init__()
    self.num_labels = num_labels

    # Load the pre-trained BERT model.
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # classification head that converts the pooled output into `num_labels` outputs.
    # The input width is read from the model config instead of hard-coding 768,
    # and the output width honors the `num_labels` argument.
    self.classifier = nn.Linear(self.bert.config.hidden_size,num_labels)
    self.dropout = nn.Dropout(.1) # hard-coded dropout at 10%

    # initialize the weights and biases of the new head
    nn.init.xavier_uniform_(self.classifier.weight)
    nn.init.zeros_(self.classifier.bias)


  def forward(self, input_ids, attention_mask=None, token_type_ids=None):
    """Return the classification logits for a batch of token ids.

    Args:
      input_ids: token ids, passed straight to BERT.
      attention_mask: optional attention mask, passed straight to BERT.
      token_type_ids: optional segment ids, passed straight to BERT.

    Returns:
      Output of the linear head applied to the (dropout-regularized)
      pooled BERT output.
    """

    # forward pass through the downloaded (pretrained) BERT
    outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids)

    # extract the pooled output and apply dropout
    pooled_output = self.dropout( outputs.pooler_output )

    # final push through the classification layer.
    logits = self.classifier(pooled_output)
    return logits

In [None]:
# build the classifier (default number of labels), then move it to the compute device
model = BertForBinaryClassification()
model = model.to(device)

# leave the model as the last expression so the notebook prints its architecture
model

# Prepare to fine-tune the model

In [None]:
# training hyperparameters:
#   num_training - number of training iterations
#   batch_size   - sequences per batch (half from each text)
#   seq_len      - tokens per sequence
num_training, batch_size, seq_len = 150, 64, 256

# AdamW over all model parameters (BERT and the new head alike)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-7)

# cross-entropy on the raw logits
loss_fun = nn.CrossEntropyLoss()

In [None]:
# create a batch of data: random windows of seq_len consecutive tokens,
# first half of the batch from Alice, second half from Edgar
offsets = torch.arange(seq_len)
ixA = torch.randint(len(aliceTokens)-seq_len,size=(batch_size//2,))
ixE = torch.randint(len(edgarTokens)-seq_len,size=(batch_size//2,))
X   = torch.cat(
    (aliceTokens[ixA[:,None] + offsets],
     edgarTokens[ixE[:,None] + offsets]), dim=0).to(device)

# and the labels (same for every batch): 0 = Alice, 1 = Edgar
labels = torch.cat((
            torch.zeros(batch_size//2,dtype=torch.long),
            torch.ones(batch_size//2,dtype=torch.long)),
                        dim=0).to(device)

print(f'Data batch shape: {X.shape}')
print(f'Labels batch shape: {labels.shape}')


# forward pass, get model predictions, and report the loss+accuracy.
# This is only a sanity check (no backward pass), so disable gradient tracking
# to avoid building and holding an autograd graph for the whole batch.
with torch.no_grad():
  logits = model(X)
  predLabels = torch.argmax(logits, dim=1)
  loss = loss_fun(logits,labels).item()

print('\nPredicted labels:\n',predLabels)
print('Actual labels:\n',labels)

print(f'\nLoss: {loss:.4f}')
print(f'\nAccuracy: {(predLabels == labels).sum().item()/batch_size}')

# Train the model

In [None]:
# pre-allocate the per-iteration performance metrics
losses = np.zeros(num_training)
accuracy = np.zeros(num_training)

# quantities that are identical for every batch
half_batch = batch_size//2
token_offsets = torch.arange(seq_len)


## loop over training iterations (one freshly sampled batch per iteration)
for sampli in range(num_training):

  # random start positions, then gather seq_len consecutive tokens from each;
  # Alice sequences come first, matching the 'labels' created in the previous cell
  ixA = torch.randint(len(aliceTokens)-seq_len,size=(half_batch,))
  ixE = torch.randint(len(edgarTokens)-seq_len,size=(half_batch,))
  alice_batch = aliceTokens[ixA[:,None] + token_offsets]
  edgar_batch = edgarTokens[ixE[:,None] + token_offsets]
  X = torch.cat((alice_batch,edgar_batch), dim=0).to(device)

  # reset gradients left over from the previous iteration
  optimizer.zero_grad()

  # forward pass: logits, then the predicted class per sequence
  logits = model(X)
  predLabels = logits.argmax(dim=1)

  # loss for this batch; record it together with the batch accuracy
  loss = loss_fun(logits,labels)
  num_correct = (predLabels == labels).sum().item()
  losses[sampli] = loss.item()
  accuracy[sampli] = num_correct/batch_size

  # backward pass, then update the weights
  loss.backward()
  optimizer.step()

  # progress report every 7th iteration
  if sampli%7 == 0:
    print(f'Sample {sampli:4}/{num_training}, losses: {losses[sampli]:.2f}, accuracy: {accuracy[sampli]:.2f}')

In [None]:
# mean-smoothing function
def meansmooth(x,k=3):
  """Smooth a sequence with a centered running mean.

  Each interior sample is replaced by the mean of the window of 2*w+1
  samples centered on it, where w = (k-1)//2 (so an even k behaves like
  k-1). The first and last w samples have no complete window and are
  copied unchanged.

  Args:
    x: array-like data to smooth (not modified).
    k: window length in samples; must be at least 1.

  Returns:
    A new float array with the same shape as x.

  Raises:
    ValueError: if k is smaller than 1.
  """
  if k < 1:
    raise ValueError(f'k must be at least 1, got {k}')

  x = np.asarray(x)
  y = np.array(x,dtype=float) # float copy, so means are not truncated for integer input
  w = (k-1)//2 # number of elements to average on either side

  # loop over the samples that have a full window
  for i in range(w,len(x)-w):
    y[i] = x[i-w:i+w+1].mean() # centered mean (+1: the slice end is exclusive)

  return y

# demo: smooth a short sequence with the default window and overlay it on the raw data
x = np.array([1,5,3,0,4,1,2,-2,-1])
y = meansmooth(x)

# squares = raw data, circles = smoothed data
for series,fmt,name in ((x,'s-','Original'),(y,'o-','Smoothed')):
  plt.plot(series,fmt,label=name)

plt.legend()
plt.show()

In [None]:
def _plot_metric(axis,series,color,label):
  # thin line for the raw values, default-width line for the 5-point smoothed values;
  # the y-axis label and ticks are drawn in the same color as the curves
  axis.plot(series,color,linewidth=.5)
  axis.plot(meansmooth(series,5),color)
  axis.set_ylabel(label,color=color)
  axis.tick_params(axis='y',color=color,labelcolor=color)


_,ax = plt.subplots(1,figsize=(8,4))

# losses on the left y-axis
_plot_metric(ax,losses,'C0','Loss')

# accuracy on a second y-axis that shares the same x-axis
axr = ax.twinx()
_plot_metric(axr,accuracy,'C1','Accuracy')

ax.set(xlabel='Training sample',xlim=[-1,num_training])
plt.show()

# Save the model

In [None]:
# write the model's state dict (not the whole module object) to a local file
trained_state = model.state_dict()
torch.save(trained_state,'bert_classifier_AliceVsEdgar.pt')
# don't forget to download it :D