|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>HellaSwag</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# run this code, then restart the python session (and then comment it out)
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

from transformers import GPT2LMHeadModel, GPT2TokenizerFast

from datasets import load_dataset

# vector plots: ask the inline matplotlib backend to emit figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# load the pretrained GPT-2 (medium) language model and move it to the chosen device
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2-medium')
gpt2 = gpt2.to(device)

# switch the model to evaluation mode (inference only; no training is done in this notebook)
gpt2.eval()

# tokenizer loaded from the 'gpt2' checkpoint
# NOTE(review): assumes 'gpt2' and 'gpt2-medium' share one vocabulary -- confirm
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

In [None]:
# load the validation split of HellaSwag
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading script run locally -- confirm this is still needed
dataset = load_dataset(
  'hellaswag',
  split = 'validation',
  trust_remote_code = True,
)

# display a summary of the loaded dataset
dataset

In [None]:
# display the first item to inspect its fields (later cells read 'ctx', 'endings' and 'label')
dataset[0]

# One small example to show how the eval works

In [None]:
# index of the validation item used for this walkthrough (hard-coded)
exampleNum = 224

# the context shared by every candidate ending, and its length in tokens
context = dataset[exampleNum]['ctx']
context_len = len( tokenizer.encode(context) )

# index of the true ending
answer = int(dataset[exampleNum]['label'])

# correct prompt: context + true ending (text, token tensor, number of tokens)
promptC = '{} {}'.format(context, dataset[exampleNum]['endings'][answer])
promptC_tox = tokenizer.encode(promptC,return_tensors='pt')
promptC_len = promptC_tox.shape[1]

# incorrect prompt: context + a different ending
# (3-answer can never equal answer; assumes the item has four endings -- TODO confirm)
promptI = '{} {}'.format(context, dataset[exampleNum]['endings'][3-answer])
promptI_tox = tokenizer.encode(promptI,return_tensors='pt')
promptI_len = promptI_tox.shape[1]

# print the context and the two full prompts
print(f'Context:\n   "{context}"')
print(f'Correct ending:\n   "{promptC}"')
print(f'Incorrect ending:\n   "{promptI}"')

In [None]:
# forward pass through the model (gradients are not needed for evaluation)
with torch.no_grad():
  logitsC = gpt2(promptC_tox.to(device)).logits
  logitsI = gpt2(promptI_tox.to(device)).logits

# log softmax (more numerically stable than prob values)
lsm_logitsC = F.log_softmax(logitsC,dim=-1)
lsm_logitsI = F.log_softmax(logitsI,dim=-1)


# Log-prob that position i assigns to the token actually observed at position i+1.
# One gather call picks all of those entries at once (instead of a Python loop
# over positions); the result is moved to the CPU explicitly before the NumPy
# conversion and cast to float64 to match the np.zeros array used previously.
lsmSeqC = lsm_logitsC[0,:-1].gather(-1,promptC_tox[0,1:,None].to(device)).squeeze(-1).cpu().numpy().astype(np.float64)

# repeat for the incorrect prompt
lsmSeqI = lsm_logitsI[0,:-1].gather(-1,promptI_tox[0,1:,None].to(device)).squeeze(-1).cpu().numpy().astype(np.float64)


# log-likelihood of each ending (sum of logs equals log of the product of probabilities)
# element context_len-1 is the prediction of the first ending token, so the
# slice keeps only the ending tokens and drops the shared context
probC = lsmSeqC[context_len-1:].sum()
probI = lsmSeqI[context_len-1:].sum()

# visualize the token log-probabilities
# (raw f-strings so the LaTeX backslashes are not read as string escape sequences)
plt.figure(figsize=(12,4))
plt.plot(lsmSeqC,'bo-',markersize=10,markerfacecolor='w',label=rf'Correct ending ($\sum\ln(p)$={probC:.3f})')
plt.plot(lsmSeqI,'rs-',label=rf'Incorrect ending ($\sum\ln(p)$={probI:.3f})')

# dashed line between the last context prediction and the first ending prediction
plt.axvline(context_len-1.5,linestyle='--',color='gray')

plt.gca().set(xlabel='Token position (index)',ylabel='Log-softmax probs from distribution',title='Token log-probabilities in HellaSwag evaluation')
plt.legend()
plt.show()

# Incorporating all endings

In [None]:
# the validation item to evaluate
example = dataset[42]


# context text, its length in tokens, and the index of the true ending
context = example['ctx']
context_len = len( tokenizer.encode(context) )
answer = int(example['label']) # the true answer

# one score per candidate ending
loglikelihoods = np.zeros(len(example['endings']))

# loop over candidate endings, create prompts, get logits, and sum log-prob scores
for opti in range(len(example['endings'])):

  # prompt = context + candidate ending, its tokens, and its length
  prompt = f'{context} {example["endings"][opti]}'
  prompt_tox = tokenizer.encode(prompt,return_tensors='pt')
  prompt_len = len( prompt_tox[0] )

  # forward pass through the model
  with torch.no_grad():
    logits = gpt2(prompt_tox.to(device)).logits

  # convert to log probabilities
  log_probs = F.log_softmax(logits,dim=-1)

  # log-prob that position i assigns to the token observed at position i+1,
  # gathered in one call and converted to a float64 NumPy vector
  smSeq = log_probs[0,:-1].gather(-1,prompt_tox[0,1:,None].to(device)).squeeze(-1).cpu().numpy().astype(np.float64)

  # score only the ending tokens: element context_len-1 is the prediction of the
  # first ending token. The context is the same for every candidate, so leaving
  # it out keeps the comparison on the endings themselves.
  loglikelihoods[opti] = np.sum(smSeq[context_len-1:])


# the model's choice is the ending with the highest log-likelihood
if np.argmax(loglikelihoods)==answer:
  print('Model was correct!')
else:
  print('Model needs more training ;)')