|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Quantitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>MAUVE</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# run this code, then restart the python session (and then comment it out)
# !pip install -U datasets huggingface_hub fsspec

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import requests

import torch
from transformers import AutoTokenizer,AutoModelForCausalLM

from datasets import load_dataset
import textwrap

# vector plots: ask the inline matplotlib backend to render figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
# run on the GPU when torch can see one, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
# load the pretrained GPT-2 (large) causal language model, move it to the
# selected device, and put it into evaluation (inference) mode
model = AutoModelForCausalLM.from_pretrained('gpt2-large').to(device)
model.eval()

# load the GPT-2 tokenizer and reuse its end-of-sequence id as the padding id
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token_id = tokenizer.eos_token_id

# Aggregate human data

In [None]:
# reference (human-written) corpus: training split of the raw WikiText-2 configuration
dataset = load_dataset(
    path  = 'wikitext',
    name  = 'wikitext-2-raw-v1',
    split = 'train',
)

# show the dataset summary
dataset

In [None]:
# Collect human-written reference texts: scan the dataset in order and keep the
# first `n_tokens` tokens of every entry that has more than `n_tokens` tokens,
# until `n_samples` texts have been gathered.
n_samples = 100 # number of texts to collect
n_tokens  = 200 # length (in tokens) of each collected text

human_data = []
i = 0

# also stop at the end of the dataset so the loop can never index past it
while len(human_data)<n_samples and i<len(dataset):

  # tokenize this text sample ([0] drops the batch dimension added by return_tensors='pt')
  toks = tokenizer.encode(dataset[i]['text'], return_tensors='pt')[0]

  # append the text if there are >n_tokens tokens (but only the first n_tokens)
  if len(toks)>n_tokens:
      human_data.append(tokenizer.decode(toks[:n_tokens]))
  i += 1 # increment counter

# fail with a clear message instead of silently continuing with too few texts
if len(human_data)<n_samples:
  raise RuntimeError(
      f'Only {len(human_data)} of {n_samples} texts with more than {n_tokens} tokens were found in the dataset.')

human_data

# Model data

In [None]:
# Sample unconditioned texts from the model. Each text has exactly `n_new_tokens`
# generated tokens, the same length as the 200-token human texts.
model_data = []

# number of tokens to generate per sample
n_new_tokens = 200

# the prompt is a single beginning-of-sequence token (built once, reused for every sample)
prompt = torch.tensor([[tokenizer.bos_token_id]]).to(device)
prompt_len = prompt.shape[1]

# min_length/max_length count the prompt as well, so add its length; otherwise
# stripping the prompt below would leave only n_new_tokens-1 tokens
total_length = prompt_len + n_new_tokens

# 100 samples of 200 tokens each
for _ in range(100):
  out = model.generate(
      prompt,
      attention_mask = torch.ones_like(prompt), # explicit mask: pad and eos share one id
      pad_token_id = tokenizer.eos_token_id,
      min_length = total_length, # min==max forces a fixed-length output
      max_length = total_length,
      do_sample  = True,
      top_k      = 50,
      top_p      = .95,
  )

  # drop the prompt token(s) and keep only the generated text
  model_data.append(tokenizer.decode(out[0][prompt_len:]))

In [None]:
# display the list of model-generated texts
model_data

# MAUVE

In [None]:
!pip install mauve-text
import mauve

In [None]:
# list the names available in the mauve module
dir(mauve)

In [None]:
# calculate MAUVE score
mauve_output  = mauve.compute_mauve(
    p_text    = model_data, # p is the model's output
    q_text    = human_data, # q is human-written text
    verbose   = True,
    device_id = 0
)
mauve_output.mauve

In [None]:
# list the attributes of the object returned by compute_mauve
dir(mauve_output)

In [None]:
# one row, two panels: quantized histograms (left) and divergence curve (right)
_,axs = plt.subplots(nrows=1,ncols=2,figsize=(10,4))
ax_hist,ax_curve = axs

# --- left panel: the two quantized histograms drawn as overlapping, offset bars
# NOTE(review): the legend treats p_hist as human text and q_hist as model text;
# confirm that this matches the p_text/q_text order given to compute_mauve
n_bins     = len(mauve_output.p_hist)
bin_idx    = np.arange(n_bins)
bar_offset = .15

ax_hist.bar(bin_idx-bar_offset,mauve_output.p_hist,width=.5,label='Human-generated text')
ax_hist.bar(bin_idx+bar_offset,mauve_output.q_hist,width=.5,label='Model-generated text')
ax_hist.set_xlabel('Value (au)')
ax_hist.set_ylabel('Proportion')
ax_hist.set_title('Histograms of quantized distributions')
ax_hist.set_xlim([-1,n_bins])
ax_hist.legend()


# --- right panel: the divergence curve, with the area under it shaded
# (the legend reports the MAUVE score as that area)
curve_x = mauve_output.divergence_curve[:,0]
curve_y = mauve_output.divergence_curve[:,1]

ax_curve.plot(curve_x,curve_y,'ko-',markersize=7,markerfacecolor=[.9,.7,.7])
ax_curve.fill_between(curve_x,curve_y,color=[.7,.7,.9,.5],label=f'AUC = {mauve_output.mauve:.2f}')
ax_curve.set_title('Divergence curve')
ax_curve.set_xlabel('Model | Human')
ax_curve.set_ylabel('Human | Model')
ax_curve.set_xlim([0,1.02])
ax_curve.set_ylim([0,1.02])
ax_curve.legend()

plt.tight_layout()
plt.show()