|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>CodeChallenge HELPER: HellaSwag evals in several models</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# run first to install and then restart
# !pip install bitsandbytes
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

from datasets import load_dataset
from tqdm import tqdm # progress bar for for-loops
from transformers import AutoTokenizer,AutoModelForCausalLM, BitsAndBytesConfig

# render inline matplotlib figures as SVG (vector graphics)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# compute device: the GPU when CUDA is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Exercise 1: Import and inspect the Zephyr model

In [None]:
# quantization settings (a BitsAndBytesConfig object) passed to the model loader
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,     # nested quantization of the quantization constants
    bnb_4bit_compute_dtype='float16',   # computations are carried out in float16
)

# Zephyr tokenizer
zephyr_tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-alpha')

# Zephyr model, quantized while loading according to the config above
zephyr_model = AutoModelForCausalLM.from_pretrained(
    'HuggingFaceH4/zephyr-7b-alpha',
    quantization_config=quantization_config,
)

In [None]:
# switch to eval and move to GPU

In [None]:
# counting parameters via numel
# NOTE: exercise scaffold -- both sums are intentionally unfinished and must be
# completed before this cell can run. The visible fragments show the intent:
# add up p.numel() over the parameters, once for all of them and once only
# for those with p.requires_grad.
param_total = sum(p.numel() for p in
param_trainable = sum( if p.requires_grad)

# and manually based on the model description
# NOTE: exercise scaffold -- fill in one count per component. The formula adds
# emb and unemb once and multiplies (attn+mlp) by 32 (presumably the number of
# transformer blocks -- confirm against the printed model).
emb   =
attn  =
mlp   =
unemb =
man_total = emb + 32*(attn+mlp) + unemb

# print the results
# NOTE: exercise scaffold -- the empty {} placeholder and the last two f-strings
# are unfinished and need to be completed.
print(f'Total parameters: {param_total:13,} ({}B)')
print(f'Trainable params: {param_trainable:13,}')
print(f'Non-trainable   :
print(f'Manual counting :

In [None]:
# example: display the query projection (q_proj) of the self-attention block in
# layer index 4; the commented-out tail is an optional follow-up that relates the
# first weight dimension to 4096*4096
zephyr_model.model.layers[4].self_attn.q_proj#.weight.shape[0]/(4096*4096)

# Exercise 2: A function to test one HellaSwag sample

In [None]:
# a function to calculate accuracy on one sample
#
# NOTE: exercise scaffold -- the lines ending in a bare `=`, the empty index in
# the prompt f-string, and the unfinished model(...) call are blanks to complete.
#
# Inputs, as used in the body:
#   sample    : indexable with 'ctx' (the context text) and 'endings' (the
#               candidate endings)
#   model     : called inside torch.no_grad() to obtain logits
#   tokenizer : must provide .encode(text)
#
# Output: smSeqs, a numpy array with one score per candidate ending.
# NOTE(review): as written only smSeqs is returned, but the return-line comment
# and the call sites (which unpack `loglikelihoods,answer`) expect the index of
# the correct answer as a second return value.
def oneHellaSample(sample,model,tokenizer):

  # find context length
  # (length of the encoded context alone; presumably used to separate context
  # tokens from ending tokens -- confirm when filling in the blanks)
  context = sample['ctx']
  context_len = len( tokenizer.encode(context) )

  # one score per candidate ending, filled in by the loop below
  smSeqs = np.zeros(len(sample['endings']))

  # loop over candidate endings, create prompts, get logits, and sum prob scores
  for opti in range(len(sample['endings'])):

    # prompts and their lengths
    # (prompt = context, a single space, then one candidate ending)
    prompt = f'{context} {sample["endings"][]}'
    prompt_tox = # tokenize
    prompt_len = # number of tokens

    # forward pass through the model
    # (no_grad: gradients are not tracked for this forward pass)
    with torch.no_grad():
      logits = model( # just get the logits

    # convert to log probabilities
    log_probs =

    # get the predicting log-probs for each token
    smSeq =

    # Sum log-probabilities to get the total log-likelihood
    smSeqs[opti] =

  return smSeqs # also return the index of the correct answer

In [None]:
# load the validation split of HellaSwag and display its summary
dataset = load_dataset(
    path='hellaswag',
    split='validation',
    trust_remote_code=True,
)
dataset

In [None]:
# test it with one sample
# NOTE: exercise scaffold -- the right-hand side is a placeholder. As written it
# binds the function object itself; replace it with a call to oneHellaSample(...)
# that yields the two values unpacked here.
loglikelihoods,answer = oneHellaSample

# the model's choice is the ending with the largest log-likelihood
if np.argmax(loglikelihoods)==answer:
  print('Model was correct!')
else:
  print('Model needs more training ;)')

# Exercise 3: Evaluate Zephyr and GPT2-small

In [None]:
# load GPT2-small, move it to the compute device, and switch it to evaluation mode
gpt2_model = AutoModelForCausalLM.from_pretrained('gpt2')
gpt2_model = gpt2_model.to(device)
gpt2_model.eval()

# matching GPT2 tokenizer
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# number of HellaSwag items to evaluate
num_samples = 500

# per-item correctness flags (1 = correct, 0 = not): row 0 is written for Zephyr
# in the loop below; row 1 is reported as GPT2 in the summary cell
accuracies = np.zeros((2,num_samples))


# loop over data samples with progress bar
for datai in tqdm(range(num_samples),desc='Evaluating on HellaSwag'):

  # extract one sample from the data
  # NOTE: exercise scaffold -- right-hand side intentionally left blank
  example =

  # ZEPHYR: calculate the loglikelihoods
  # NOTE: exercise scaffold -- right-hand side intentionally left blank
  loglikelihoods,answer =

  # the item counts as correct when the highest-scoring ending is the answer
  if np.argmax(loglikelihoods)==answer:
    accuracies[0,datai] = 1
  # -------------------------------------


  # repeat for GPT2



In [None]:
# report the average accuracy of each model as a percentage;
# row 0 of `accuracies` holds the Zephyr results, row 1 the GPT2 results
print(f'Zephyr had {np.mean(accuracies[0,:])*100:.1f}% accuracy.')
print(f'  GPT2 had {np.mean(accuracies[1,:])*100:.1f}% accuracy.')

In [None]:
# visualize
plt.figure(figsize=(12,3))

# NOTE: exercise scaffold -- `plt.plot...` is a placeholder for the plotting
# call(s). plt.legend() below needs labelled artists to show anything.
plt.plot...

plt.legend()
# axis setup: the x-axis spans the item indices; the y-tick at 0 is labelled
# 'Error' and the y-tick at .5 is labelled 'Correct'
plt.gca().set(ylim=[-.2,.75],xlim=[-2,num_samples+1],xlabel='Swag item (index)',
              yticks=[0,.5],yticklabels=['Error','Correct'])
plt.show()

In [None]:
# Example that Zephyr got and GPT2 missed