In [6]:
# Notebook setup: library imports plus mounting Google Drive in the Colab runtime.
# NOTE(review): `os`, `Trainer`, `TrainingArguments` and `DataCollatorForLanguageModeling`
# are not referenced in the cells visible here — confirm whether other cells need them.
import os
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from google.colab import drive
# Side effect: mounts Drive at /content/drive; paths under that directory only
# resolve after this call has succeeded.
drive.mount('/content/drive')
import torch

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def validate_gpt2_qa(model, tokenizer, samples, max_new_tokens=30):
    """Greedy-decode an answer for every QA sample and score exact matches.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer paired with ``model``; it is called on the prompt
            and used to decode the generated ids.
        samples: Iterable of dicts with ``"context"``, ``"question"`` and
            ``"expected_answer"`` string entries.
        max_new_tokens: Upper bound on the tokens generated per sample.

    Returns:
        ``(results, accuracy)``. ``results`` holds one dict per sample with the
        keys ``question``, ``generated_answer``, ``expected_answer`` and
        ``exact_match``. ``accuracy`` is the fraction of exact matches, or
        ``0.0`` when no samples were given.

    Notes:
        * The model is switched to eval mode and is left in that mode.
        * The match is strict: the *entire* stripped continuation (up to
          ``max_new_tokens`` tokens) must equal the expected answer, ignoring
          case. If generation does not stop at EOS right after the answer,
          the sample scores ``False`` even when the answer is a prefix of the
          output.
    """
    results = []
    exact_matches = 0

    # Pass the pad id explicitly: when none is configured, `generate` falls
    # back to EOS and logs a warning for every single call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    model.eval()
    with torch.no_grad():
        for sample in samples:
            prompt = (
                "Answer the question by extracting the exact answer from the context.\n\n"
                f"Context: {sample['context']}\n\n"
                f"Question: {sample['question']}\n\n"
                "Answer:"
            )
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            prompt_len = inputs["input_ids"].shape[-1]

            out = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=pad_token_id,
            )

            # Decode only the generated tokens, ignoring the prompt
            answer = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
            expected = sample['expected_answer']
            is_exact = answer.lower() == expected.lower()
            if is_exact:
                exact_matches += 1

            results.append({
                "question": sample['question'],
                "generated_answer": answer,
                "expected_answer": expected,
                "exact_match": is_exact
            })

    # Divide by the number of scored samples: this avoids ZeroDivisionError on
    # empty input and also works when `samples` is a generator without len().
    accuracy = exact_matches / len(results) if results else 0.0
    return results, accuracy

In [9]:
# Example usage with 5 QA samples
# Checkpoint directory on the mounted Drive; the tokenizer and the model weights
# are both loaded from this one folder.
MODEL_DIR = "/content/drive/MyDrive/Carbonemission/Version2/gpt2-squadv2-80"
# Fall back to CPU so the cell still runs on a runtime without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_DIR)
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR).to(DEVICE)

samples = [
    {"question": "Who wrote the 1813 novel Pride and Prejudice?",
     "context": "Pride and Prejudice is a novel of manners by the English author Jane Austen.",
     "expected_answer": "Jane Austen"},
    {"question": "What is the capital of France?",
     "context": "France is a country in Europe. Its capital is Paris.",
     "expected_answer": "Paris"},
    {"question": "Who painted the Mona Lisa?",
     "context": "The Mona Lisa is a portrait painted by Leonardo da Vinci in the early 16th century.",
     "expected_answer": "Leonardo da Vinci"},
    {"question": "What element does 'O' represent on the periodic table?",
     "context": "Oxygen is a chemical element with symbol O and atomic number 8.",
     "expected_answer": "Oxygen"},
    {"question": "Who developed the theory of relativity?",
     "context": "Albert Einstein was a theoretical physicist who developed the theory of relativity.",
     "expected_answer": "Albert Einstein"}
]

# Score the samples, then print one block per question followed by the overall accuracy.
results, accuracy = validate_gpt2_qa(model, tokenizer, samples)
for r in results:
    print(f"Q: {r['question']}\nPredicted: {r['generated_answer']}\nExpected: {r['expected_answer']}\nExact Match: {r['exact_match']}\n")
print("Validation Accuracy:", accuracy)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: Who wrote the 1813 novel Pride and Prejudice?
Predicted: Jane Austen. ~~~~~~~~~~~~~~~ Pride and Prejudice is a novel of manners by the English author Jane Austen. ~~~~~~~~~~~~
Expected: Jane Austen
Exact Match: False

Q: What is the capital of France?
Predicted: Paris. Its capital is Paris. Its capital is Paris. Its capital is Paris. Its capital is Paris. Its capital is Paris. Its capital is
Expected: Paris
Exact Match: False

Q: Who painted the Mona Lisa?
Predicted: Leonardo da Vinci in the early 16th century. The Mona Lisa is a portrait painted by Leonardo da Vinci in the early 16th century
Expected: Leonardo da Vinci
Exact Match: False

Q: What element does 'O' represent on the periodic table?
Predicted: Oxygen is a chemical element with symbol O and atomic number 8. Oxygen is a chemical element with symbol O and atomic number 8. Oxygen
Expected: Oxygen
Exact Match: False

Q: Who developed the theory of relativity?
Predicted: Albert Einstein was a theoretical physicist who develo