|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>Perplexity</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# run this code, then restart the python session (and then comment it out)
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer
from datasets import load_dataset

# run on the first CUDA GPU when torch reports one is available, otherwise on the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# vector plots: ask the inline backend to render figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Import LLM and wikitext data

In [None]:
# tokenizer that matches the SMALL pretrained GPT-2 checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# the SMALL pretrained GPT-2 model itself, moved onto the selected device
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2')
gpt2 = gpt2.to(device)

# nothing is trained in this notebook, so switch to eval mode
# (as the last expression of the cell, this also displays the model)
gpt2.eval()

In [None]:
# raw WikiText-2, test split
text = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

# glue all rows into one long string (blank line between rows) and tokenize it in one go;
# later cells index the result as tokens[0], i.e. the token ids sit behind a leading dimension
full_text = '\n\n'.join(text['text'])
tokens = tokenizer.encode(full_text, return_tensors='pt')

# total number of token ids in the encoded text
num_tokens = tokens.numel()

In [None]:
# example data: display the raw string stored at row 1234 of the dataset's 'text' column
text['text'][1234]

# What does perplexity mean?

In [None]:
# Toy demonstration of how perplexity depends on the logits:
# each list item is one set of model outputs (logits) over four categories
situations = [
    [ 1,1,1,9 ],
    [ 1,1,1,2 ],
    [ 1,1,2,2 ],
    [ 3,1,1,2 ],
    [ 9,1,1,1 ] ]

y = 3 # target category index

# x-positions of the four bars inside one group (group i is centered on i+1)
bar_offsets = np.array([.7,.9,1.1,1.3])

# create a figure
plt.figure(figsize=(12,4))
xlabls = [] # x-axis tick labels

for i,sit in enumerate(situations):

  # raw model output (logits); the extra brackets add a leading dimension of size 1
  model_output = torch.tensor([sit],dtype=torch.float64)

  # softmax (probabilities), then log-softmax (log-probabilities)
  softmax_output = torch.exp(model_output) / torch.sum(torch.exp(model_output))
  log_softmax = torch.log(softmax_output)

  # negative log-likelihood loss of the target category
  loss = -log_softmax[0,y]

  # perplexity is exp(loss)
  ppl = torch.exp(loss)

  # draw the results; the 'Targ' label sits above the bar of the target category
  plt.bar(bar_offsets+i,model_output[0].detach(),width=.2,edgecolor='k')
  plt.text(bar_offsets[y]+i,model_output[0,y].detach()+.1,'Targ',font={'size':14},ha='center',va='bottom')

  # print the results
  # (the "Softmax" line shows the probabilities; the log-probabilities get their own line)
  print(f'Situation "{i}"')
  print(f'  Raw logits: {[f"{o}" for o in model_output[0]]}')
  print(f'  Softmax: {[f"{o:.4f}" for o in softmax_output[0]]}')
  print(f'  Log-softmax: {[f"{o:.4f}" for o in log_softmax[0]]}')
  print(f'  Loss: {loss.item():.4f}')
  print(f'  Perplexity: {ppl.item():.4f}\n')

  # x-axis tick label
  xlabls.append(f'"{i}"\nppl = {ppl.item():.3f}')


plt.gca().set(title='Model outputs (logits) and perplexity',ylabel='Logits',
              xticks=range(1,len(situations)+1),xticklabels=xlabls,ylim=[0,10])
plt.show()

# Perplexity in a small sample

In [None]:
# get a batch of data
seq_len = gpt2.config.n_positions # 1024
batch_size = 4

# random start positions, expanded into a (batch_size, seq_len) grid of token indices
ix = torch.randint(num_tokens-seq_len,size=(batch_size,))
X  = tokens[0][ix[:,None] + torch.arange(seq_len)].to(device)

# forward pass to get the loss (internally calculated);
# this is evaluation only, so no gradients need to be tracked
with torch.no_grad():
  outputs = gpt2(X,labels=X)

# perplexity is exp(loss)
print(f'Perplexity in this random batch is {torch.exp(outputs.loss).item():.3f}')

# Perplexity across many samples

In [None]:
# number of random batches to evaluate, and one loss slot per batch
nSamples = 300
losses = np.zeros(nSamples)

# within-sequence offsets 0..seq_len-1 (identical for every draw)
offsets = torch.arange(seq_len)

for i in range(nSamples):

  # draw one random start position per sequence in the batch
  ix = torch.randint(num_tokens-seq_len,size=(batch_size,))

  # start + offsets gives a (batch_size, seq_len) grid of token indices
  X = tokens[0][ix.unsqueeze(1) + offsets].to(device)

  # evaluate the model without tracking gradients
  with torch.no_grad():
    outputs = gpt2(X,labels=X)

  # keep the scalar loss of this batch
  losses[i] = float(outputs.loss)

# perplexity is exp(average loss)
perplexity = np.exp(np.mean(losses))

In [None]:
# one wide axis for the per-batch perplexities
fig,ax = plt.subplots(figsize=(10,4))

# a dot per random batch: exp(loss) of that batch
ax.plot(np.exp(losses),'k.',markersize=6,label='Random segments')

# horizontal reference line at the overall perplexity
ax.axhline(perplexity,color='r',linewidth=2,label='Average')

ax.legend()
ax.set(title='Perplexity in samples and on average',
       xlabel='Sample index (random positions)',ylabel='Perplexity',
       xlim=[-2,nSamples+2])
plt.show()

# Perplexity over the text (no batches)

In [None]:
nSamples = num_tokens//seq_len # how many samples fit into the data
perplexities = np.zeros(nSamples)

for i in range(nSamples):

  # start and end values of this non-overlapping segment
  start = i*seq_len
  end = start + seq_len

  # the data: slice along the token axis but keep the leading dimension,
  # so the model receives a (1, seq_len) input
  X = tokens[:,start:end].to(device)

  # test it (no gradient tracking needed)
  with torch.no_grad():
    outputs = gpt2(X,labels=X)

  # calculate and store perplexities
  perplexities[i] = torch.exp(outputs.loss).item()

# arithmetic mean of the per-segment perplexities
# (note: this is not the same quantity as exp of the mean loss)
ave_perplexity = perplexities.mean()

In [None]:
# one wide axis for the per-segment perplexities, in text order
fig,ax = plt.subplots(figsize=(10,4))

# connected markers: one point per consecutive segment
ax.plot(perplexities,'ko-',markersize=6,markerfacecolor=[.7,.7,.7],label='Segment')

# dashed reference line at the mean segment perplexity, drawn behind the data
ax.axhline(perplexities.mean(),linestyle='--',color='m',zorder=-4,label='Average')

ax.legend()
ax.set(xlim=[-2,nSamples+2],ylabel='Perplexity',xlabel='Sample position (ordered)')
plt.show()