In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from peft import PeftModel
import evaluate
import numpy as np
import random
import os
from tqdm.auto import tqdm
import collections
import string
import re

# Confirmation message so a reader can see that the import cell ran to completion.
print("All libraries imported.")

All libraries imported.


In [2]:
# --- CONFIGURATION ---
# Device that the loaded models are moved to; pinned to CPU here.
DEVICE = torch.device("cpu")
# Base checkpoint name used to load both the tokenizer and the base QA models.
MODEL_CHECKPOINT = "distilbert-base-uncased"

# --- CRITICAL: CONFIRM THESE PATHS ---
# Each path must point to the checkpoint folder of a trained adapter; both are
# handed to PeftModel.from_pretrained when the models are loaded.
# NOTE(review): the paths are relative, so they presumably resolve against the
# directory the notebook kernel was started in — confirm before running.
LORA_MODEL_PATH = "./backend/results_lora_final/checkpoint-36183"  # LoRA adapter checkpoint
IA3_MODEL_PATH = "./backend/results_ia3_final/checkpoint-36183"   # (IA)3 adapter checkpoint
# ---

print("Configuration set.")

Configuration set.


In [9]:
# --- ALL HELPER FUNCTIONS ---

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    """Convert raw start/end logits into one predicted answer string per example.

    Args:
        examples: The untokenized examples. Must support ``examples["id"]`` and
            iteration over rows that expose "id", "context" and "answers".
        features: The tokenized features. Each row needs "example_id" and
            "offset_mapping"; an offset entry that is None marks a token that
            may not be part of an answer.
        raw_predictions: A ``(all_start_logits, all_end_logits)`` pair, indexed
            by feature position.
        n_best_size: Number of top-scoring start and end positions to pair up.
        max_answer_length: Maximum allowed answer length, counted in tokens.

    Returns:
        A ``(formatted_predictions, references)`` tuple. The first is a list of
        ``{"id", "prediction_text"}`` dicts, the second a list of
        ``{"id", "answers"}`` dicts, both in the order of ``examples``.
    """
    all_start_logits, all_end_logits = raw_predictions

    # One example can be split into several features; group the feature
    # positions by the example they were cut from.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()
    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        # Falls back to the empty string when no candidate span is valid.
        best_text, best_score = "", None

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Fetch the feature row only once per feature.
            offset_mapping = features[feature_index]["offset_mapping"]
            token_count = len(offset_mapping)

            # Indices of the n_best_size largest logits, highest first.
            start_indexes = np.argsort(start_logits)[-1:-n_best_size-1:-1].tolist()
            end_indexes = np.argsort(end_logits)[-1:-n_best_size-1:-1].tolist()

            for start_index in start_indexes:
                if start_index >= token_count or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= token_count or offset_mapping[end_index] is None:
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    score = start_logits[start_index] + end_logits[end_index]
                    # Strict '>' keeps the first candidate seen among equal scores.
                    if best_score is None or score > best_score:
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        best_score = score
                        best_text = context[start_char:end_char]

        predictions[example["id"]] = best_text

    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return formatted_predictions, references

def predict_custom(context, query, model, tokenizer):
    """Answer ``query`` from ``context`` with a single forward pass of ``model``.

    The answer span runs from the highest-scoring start token to the
    highest-scoring end token (inclusive). Inputs longer than 512 tokens are
    truncated.

    Args:
        context: Passage that should contain the answer.
        query: Question to answer.
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``. It is switched to eval mode as a side effect.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded answer text, or "[No answer found]" when the predicted
        span is empty (end before start) or consists of the CLS token only.
    """
    model.eval()
    # Send the inputs to wherever the model's weights actually live instead of
    # relying on a notebook-level device constant.
    device = next(model.parameters()).device
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    inputs = tokenizer(
        query,
        context,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # With a batch of one, the flat argmax is the token position.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits)) + 1  # exclusive bound

    # An end position before the start position yields no tokens at all.
    if answer_end <= answer_start:
        return "[No answer found]"

    token_ids = inputs['input_ids'][0][answer_start:answer_end].tolist()
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(token_ids))
    if answer == tokenizer.cls_token:
        return "[No answer found]"
    return answer

def normalize_text(s):
    """Normalize an answer string for SQuAD-style comparison.

    Steps, in order: lowercase, delete ASCII punctuation, replace the
    standalone words "a", "an" and "the" with a space, then collapse all
    whitespace runs into single spaces and trim the ends.
    """
    lowered = s.lower()
    # Delete every character listed in string.punctuation in a single pass.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after normalize_text, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are passed through normalize_text and split on whitespace.
    Overlap is counted as a multiset, so a token that occurs several times is
    matched as many times as it appears in both strings.

    Args:
        prediction: Predicted answer text.
        truth: Reference answer text.

    Returns:
        The harmonic mean of token precision and recall. When either side has
        no tokens the result is 1 if both are empty and 0 otherwise; with no
        shared token the result is 0.
    """
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)

    # Counter intersection keeps the minimum count per token; a plain set
    # intersection would count a repeated token only once and deflate the
    # score of answers that contain duplicates.
    overlap = collections.Counter(pred_tokens) & collections.Counter(truth_tokens)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)
    return 2 * (prec * rec) / (prec + rec)

def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs into evaluation features.

    Despite its name, this function prepares the validation set for
    prediction: it records which example every feature came from and keeps
    character offsets only for context tokens.

    Args:
        examples: A batch with "question", "context" and "id" columns.

    Returns:
        The tokenizer output with two changes: an added "example_id" list
        (one entry per feature) and an "offset_mapping" in which every token
        that does not belong to the context is replaced by None.
    """
    # Long contexts are cut into overlapping windows (max_length=256,
    # stride=64); only the context (second sequence) is ever truncated.
    tokenized_examples = tokenizer(
        examples["question"], examples["context"], truncation="only_second", max_length=256, stride=64,
        return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # The question is passed first, so context tokens have sequence id 1.
        # Question, special and padding tokens get a None offset so that
        # post-processing never slices an "answer" out of them.
        sequence_ids = tokenized_examples.sequence_ids(i)
        tokenized_examples["offset_mapping"][i] = [
            offsets if sequence_ids[token_index] == 1 else None
            for token_index, offsets in enumerate(tokenized_examples["offset_mapping"][i])
        ]
    return tokenized_examples

# Confirmation message so a reader can see that the helper cell ran to completion.
print("Helper functions defined.")

Helper functions defined.


In [10]:
# Fast tokenizer for the base checkpoint, plus the SQuAD dataset it is applied to
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
dataset = load_dataset("squad")

# Evaluation only touches the validation split, so it is the only split tokenized.
# The original columns are dropped; the features keep what the tokenizer returns.
print("Processing validation dataset...")
validation_features = dataset["validation"].map(
    function=prepare_train_features,
    desc="Tokenizing validation set",
    remove_columns=dataset["validation"].column_names,
    batched=True,
)
print("Validation data ready.")

Processing validation dataset...
Validation data ready.


In [11]:
# Rebuild both fine-tuned models by attaching each saved adapter to a base model
print("Loading fine-tuned models from disk...")

# NOTE(review): loading the base checkpoint reports a freshly initialised
# qa_outputs head; presumably the adapter checkpoints restore the trained head.
# Confirm, otherwise predictions come from random head weights.

# LoRA: base model -> attach adapter -> move to DEVICE
base_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)
lora_model = PeftModel.from_pretrained(base_model, LORA_MODEL_PATH).to(DEVICE)

# (IA)3: gets its own base model instance so the two adapters never share weights
base_model_2 = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)
ia3_model = PeftModel.from_pretrained(base_model_2, IA3_MODEL_PATH).to(DEVICE)

print("LoRA and (IA)3 models loaded successfully.")

Loading fine-tuned models from disk...


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA and (IA)3 models loaded successfully.


In [12]:
print("Custom Example Side-by-Side Analysis")

# Hand-written probes used to compare the two adapters. Each entry holds the
# passage ("context"), the "question" asked about it, and the expected "answer",
# which is only printed next to the predictions for comparison.
# The first two entries deliberately share the same context.
custom_examples = [
    {"context": "The formation of our Solar System began approximately 4.6 billion years ago with the gravitational collapse of a small part of a giant molecular cloud, often referred to as the Solar Nebula. Most of the collapsing mass collected in the center, forming the Sun, while the rest flattened into a spinning, swirling protoplanetary disk out of which the planets, moons, asteroids, and other small Solar System bodies formed. This model, known as the nebular hypothesis, is the most widely accepted scientific model for the formation of star systems. As the disk spun, particles of dust and gas began to clump together through a process called accretion. Close to the young, hot protostar, only materials with high melting points, such as metals and silicates, could survive. This led to the formation of the rocky terrestrial planets: Mercury, Venus, Earth, and Mars. Farther out in the disk, beyond the asteroid belt, the environment was cooler. This temperature boundary, known as the frost line, was located between the orbits of present-day Mars and Jupiter. Beyond this line, volatile icy compounds were able to condense, allowing the gas giants Jupiter and Saturn, and the ice giants Uranus and Neptune, to grow to enormous sizes by accreting vast amounts of hydrogen and helium gas.", "question": "What is the name of the boundary that separated the formation of rocky planets from gas giants?", "answer": "the frost line"},
    {"context": "The formation of our Solar System began approximately 4.6 billion years ago with the gravitational collapse of a small part of a giant molecular cloud, often referred to as the Solar Nebula. Most of the collapsing mass collected in the center, forming the Sun, while the rest flattened into a spinning, swirling protoplanetary disk out of which the planets, moons, asteroids, and other small Solar System bodies formed. This model, known as the nebular hypothesis, is the most widely accepted scientific model for the formation of star systems. As the disk spun, particles of dust and gas began to clump together through a process called accretion. Close to the young, hot protostar, only materials with high melting points, such as metals and silicates, could survive. This led to the formation of the rocky terrestrial planets: Mercury, Venus, Earth, and Mars. Farther out in the disk, beyond the asteroid belt, the environment was cooler. This temperature boundary, known as the frost line, was located between the orbits of present-day Mars and Jupiter. Beyond this line, volatile icy compounds were able to condense, allowing the gas giants Jupiter and Saturn, and the ice giants Uranus and Neptune, to grow to enormous sizes by accreting vast amounts of hydrogen and helium gas.", "question": "when did the solar system start", "answer": "4.6 billion years ago"},
    {"context": "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant indentation. This core philosophy is often summarized in the Zen of Python, which includes aphorisms like `Beautiful is better than ugly` and `Simple is better than complex.` The language's name is a tribute to the British comedy group Monty Python, as Rossum was a fan of their show, Monty Python's Flying Circus.", "question": "What was Guido van Rossum's primary goal when designing Python?", "answer": "code readability"},
    {"context": "Gardens by the Bay is a nature park spanning 101 hectares in the Central Region of Singapore, adjacent to the Marina Reservoir. The park consists of three waterfront gardens and is famous for its iconic Supertree Grove. It also features two large cooled conservatories for visitors. Inside the Cloud Forest conservatory, guests can experience a cool-moist climate and see a 35-metre-tall indoor waterfall, which is the world's second tallest. The other conservatory, the Flower Dome, holds the Guinness World Record as the world's largest glass greenhouse.", "question": "Which conservatory at Gardens by the Bay holds a Guinness World Record?", "answer": "the Flower Dome"},
    {"context": "The National University of Singapore (NUS) is the national research university of Singapore. Founded in 1905 as the Straits Settlements and Federated Malay States Government Medical School, it is the oldest autonomous university in the country. The institution's history is complex, reflecting the development of Singapore as a nation. The medical school was renamed the King Edward VII College of Medicine in 1921. A second institution, Raffles College, was established in 1928 to promote education in arts and social sciences. On 8 October 1949, a significant milestone was reached when these two colleges were merged to create the University of Malaya. Following the decolonization of Malaya, the university was split into two autonomous divisions in 1959: one in Kuala Lumpur and one in Singapore. The Singapore division, located in Bukit Timah, was later established as the independent University of Singapore in 1962. The modern NUS was finally formed in 1980 through a merger between the University of Singapore and Nanyang University.", "question": "What were the two colonial-era colleges that were merged to create the University of Malaya in 1949?", "answer": "the King Edward VII College of Medicine and Raffles College"},
]

# Query both adapters on every custom example and print the answers side by side
for i, example in enumerate(custom_examples):
    context, question, true_answer = (example[key] for key in ("context", "question", "answer"))

    # Both predictions are computed before anything is printed for this example
    lora_prediction = predict_custom(context, question, lora_model, tokenizer)
    ia3_prediction = predict_custom(context, question, ia3_model, tokenizer)

    # One print call; the trailing "\n" item leaves blank lines between examples
    print(
        f"--- Example {i+1} ---",
        f"Question: {question}",
        f"True Answer: '{true_answer}'",
        f"Prediction from LoRA: '{lora_prediction}'",
        f"Prediction from (IA)³: '{ia3_prediction}'",
        "\n",
        sep="\n",
    )

Custom Example Side-by-Side Analysis
--- Example 1 ---
Question: What is the name of the boundary that separated the formation of rocky planets from gas giants?
True Answer: 'the frost line'
Prediction from LoRA: 'the frost line'
Prediction from (IA)³: 'mercury, venus, earth, and mars'


--- Example 2 ---
Question: when did the solar system start
True Answer: '4.6 billion years ago'
Prediction from LoRA: '[No answer found]'
Prediction from (IA)³: '4. 6 billion years ago with the gravitational collapse of a small part of a giant molecular cloud, often referred to as the solar nebula'


--- Example 3 ---
Question: What was Guido van Rossum's primary goal when designing Python?
True Answer: 'code readability'
Prediction from LoRA: 'code readability'
Prediction from (IA)³: '1991, python ' s design philosophy emphasizes code readability'


--- Example 4 ---
Question: Which conservatory at Gardens by the Bay holds a Guinness World Record?
True Answer: 'the Flower Dome'
Prediction from LoRA: 