# **Task 6**

## **Import Libraries**

In [None]:
# Install the Hugging Face `evaluate` package, used below to load the SQuAD metric; -qq reduces pip's log output.
# TODO: pin a version (evaluate==X.Y.Z) so a fresh run installs the same release.
%pip install evaluate -qq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import kagglehub
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate # Import the evaluate library
from datasets import load_dataset

## **Load data and Exploration**

In [None]:
# Fetch the latest version of the SQuAD dataset from Kaggle.
# `path` is the local directory that holds the downloaded files.
dataset_handle = "stanfordu/stanford-question-answering-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/stanfordu/stanford-question-answering-dataset?dataset_version_number=2...


100%|██████████| 8.73M/8.73M [00:00<00:00, 136MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/stanfordu/stanford-question-answering-dataset/versions/2


In [None]:
# Locate the two SQuAD v1.1 JSON files (train and dev) inside the dataset directory.
train_file, dev_file = (
    os.path.join(path, filename)
    for filename in ("train-v1.1.json", "dev-v1.1.json")
)

In [None]:
# Load the SQuAD dev set and flatten its first articles into a list of QA examples.
# The file is opened explicitly as UTF-8 so that non-ASCII characters in the
# passages decode the same way regardless of the platform's default encoding.
with open(dev_file, "r", encoding="utf-8") as f:
    squad_dev = json.load(f)

# One dict per question. The dataset's own question id is kept alongside the text
# so each example carries a unique key (the question text itself may repeat).
examples = [
    {
        "id": qa["id"],
        "context": paragraph["context"],
        "question": qa["question"],
        "answers": qa["answers"],  # list of dicts with 'text' and 'answer_start'
    }
    for article in squad_dev["data"][:2]  # take first 2 articles for demo
    for paragraph in article["paragraphs"]
    for qa in paragraph["qas"]
]

In [None]:
# Sanity check: report how many question/answer pairs were collected into `examples`.
print(f"Loaded {len(examples)} QA pairs.")

Loaded 1057 QA pairs.


## **Load Pretrained QA Model**

In [None]:
# Baseline checkpoint: a small DistilBERT model that is already fine-tuned for SQuAD.
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Use the first GPU when CUDA is available, otherwise run on the CPU (device=-1).
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Device set to use cpu


## **Run QA on Sample**

In [None]:
# Print the predicted answer next to the reference answers for the first 10 examples.
separator = "-" * 60

print("\n--- Sample Predictions ---")
for ex in examples[:10]:
    result = qa_pipeline(
        question=ex["question"],
        context=ex["context"],
    )
    print(f"Q: {ex['question']}")
    print(f"A (predicted): {result['answer']}")
    print(f"A (true): {[a['text'] for a in ex['answers']]}")
    print(separator)


--- Sample Predictions ---
Q: Which NFL team represented the AFC at Super Bowl 50?
A (predicted): Denver Broncos
A (true): ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
------------------------------------------------------------
Q: Which NFL team represented the NFC at Super Bowl 50?
A (predicted): Carolina Panthers
A (true): ['Carolina Panthers', 'Carolina Panthers', 'Carolina Panthers']
------------------------------------------------------------
Q: Where did Super Bowl 50 take place?
A (predicted): Levi's Stadium
A (true): ['Santa Clara, California', "Levi's Stadium", "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."]
------------------------------------------------------------
Q: Which NFL team won Super Bowl 50?
A (predicted): Denver Broncos
A (true): ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
------------------------------------------------------------
Q: What color was used to emphasize the 50th anniversary of the Super Bowl?
A (pr

## **Evaluate with exact match and F1**

In [None]:
# Load the SQuAD metric and collect the model's predictions together with the
# matching references.
metric = evaluate.load("squad")

preds = []
refs = []

for ex in examples[:50]:  # evaluate on first 50 for speed
    result = qa_pipeline(question=ex["question"], context=ex["context"])
    # The question text doubles as the example id; a prediction and its reference
    # must carry the same id.
    preds.append({"id": ex["question"], "prediction_text": result["answer"]})
    # Store each reference in the layout the SQuAD metric accepts: one dict of
    # parallel lists ("text", "answer_start") rather than the raw list of answer dicts.
    refs.append({
        "id": ex["question"],
        "answers": {
            "text": [a["text"] for a in ex["answers"]],
            "answer_start": [a["answer_start"] for a in ex["answers"]],
        },
    })

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [None]:
# Build the references in the structure the SQuAD metric expects and score `preds`.
eval_examples = examples[:50]

# `preds` must hold exactly one prediction per evaluated example, in the same order.
assert len(preds) == len(eval_examples), (
    "preds is out of sync with examples[:50]; re-run the prediction cell first"
)

# Question text is not guaranteed to be unique, and the metric pairs predictions
# with references by id, so repeated ids would be scored against the wrong
# prediction. Re-key both sides by position to make every id unique.
scored_preds = [
    {"id": str(idx), "prediction_text": pred["prediction_text"]}
    for idx, pred in enumerate(preds)
]
formatted_refs = [
    {
        "id": str(idx),
        "answers": {
            # Extract text and answer_start into separate, parallel lists
            "text": [a["text"] for a in ex["answers"]],
            "answer_start": [a["answer_start"] for a in ex["answers"]],
        },
    }
    for idx, ex in enumerate(eval_examples)
]

eval_results = metric.compute(predictions=scored_preds, references=formatted_refs)

print("\n--- Evaluation ---")
print(f"Exact Match (EM): {eval_results['exact_match']:.2f}")
print(f"F1 Score: {eval_results['f1']:.2f}")


--- Evaluation ---
Exact Match (EM): 84.00
F1 Score: 85.60


# **Bonus Task 6**

## **Compare Different Models**

In [None]:
%%writefile app.py
import kagglehub
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate
# import streamlit as st


Writing app.py


In [None]:
# Checkpoints to compare; each one is loaded by name with `from_pretrained`.
model_names = [
    # small baseline
    "distilbert-base-uncased-distilled-squad",
    # BERT
    "bert-large-uncased-whole-word-masking-finetuned-squad",
    # RoBERTa
    "deepset/roberta-base-squad2",
    # ALBERT
    "twmkn9/albert-base-v2-squad2",
]

## **Evaluate Models**

In [None]:
def evaluate_qa_model(model_name, samples, squad_metric, device=-1):
    """Score one question-answering checkpoint on a list of examples.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained``.
        samples: Examples shaped like the entries of ``examples``: dicts with
            "context", "question" and "answers", where "answers" is a list of
            dicts holding "text" and "answer_start".
        squad_metric: Metric object whose ``compute(predictions=..., references=...)``
            returns a dict containing "exact_match" and "f1".
        device: Device index handed to ``pipeline`` (-1 selects the CPU).

    Returns:
        The dict returned by ``squad_metric.compute``.
    """
    # Local names only: the notebook-wide `tokenizer`, `model` and `qa_pipeline`
    # keep pointing at whatever earlier cells loaded.
    candidate_tokenizer = AutoTokenizer.from_pretrained(model_name)
    candidate_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    candidate_pipeline = pipeline(
        "question-answering",
        model=candidate_model,
        tokenizer=candidate_tokenizer,
        device=device,
    )

    predictions = []
    references = []
    for idx, sample in enumerate(samples):
        # Positional ids are unique even when two examples share the same question
        # text; the metric pairs predictions with references by id.
        sample_id = str(idx)
        output = candidate_pipeline(question=sample["question"], context=sample["context"])
        predictions.append({"id": sample_id, "prediction_text": output["answer"]})
        # format refs
        references.append({
            "id": sample_id,
            "answers": {
                "text": [a["text"] for a in sample["answers"]],
                "answer_start": [a["answer_start"] for a in sample["answers"]],
            },
        })

    return squad_metric.compute(predictions=predictions, references=references)


results_summary = []

# Evaluate on first 30 examples for speed
comparison_examples = examples[:30]
comparison_device = 0 if torch.cuda.is_available() else -1

for name in model_names:
    print(f"\nLoading model: {name}")
    eval_results = evaluate_qa_model(name, comparison_examples, metric, device=comparison_device)
    print(f"Exact Match: {eval_results['exact_match']:.2f}, F1: {eval_results['f1']:.2f}")

    results_summary.append({"model": name, "EM": eval_results['exact_match'], "F1": eval_results['f1']})


Loading model: distilbert-base-uncased-distilled-squad


Device set to use cpu


Exact Match: 90.00, F1: 92.67

Loading model: bert-large-uncased-whole-word-masking-finetuned-squad


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Exact Match: 86.67, F1: 96.10

Loading model: deepset/roberta-base-squad2


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Device set to use cpu


Exact Match: 93.33, F1: 97.43

Loading model: twmkn9/albert-base-v2-squad2


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at twmkn9/albert-base-v2-squad2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Exact Match: 76.67, F1: 79.33


In [None]:
# Recap the exact-match and F1 scores collected for every compared model.
print("\n--- Model Comparison Summary ---")
for r in results_summary:
    # Each entry is a dict with the keys "model", "EM" and "F1".
    print(f"{r['model']}: EM={r['EM']:.2f}, F1={r['F1']:.2f}")


--- Model Comparison Summary ---
distilbert-base-uncased-distilled-squad: EM=90.00, F1=92.67
bert-large-uncased-whole-word-masking-finetuned-squad: EM=86.67, F1=96.10
deepset/roberta-base-squad2: EM=93.33, F1=97.43
twmkn9/albert-base-v2-squad2: EM=76.67, F1=79.33


## **Simple Command-Line Interface**

In [None]:
# Command-line interface: answer questions about a passage typed in by the user.

# Use the best performing model (e.g., RoBERTa). It is loaded once, before the
# prompt loop, so that each question does not pay the model-loading cost again.
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

while True:
    context = input("\nEnter a passage (or 'quit' to stop): ")
    # Ignore surrounding whitespace and letter case when checking for the exit keyword.
    if context.strip().lower() == "quit":
        break
    question = input("Enter your question: ")
    # Allow leaving from the question prompt as well.
    if question.strip().lower() == "quit":
        break

    # The question-answering pipeline rejects an empty question or context,
    # so prompt again instead of letting the loop die with an exception.
    if not context.strip() or not question.strip():
        print("Please enter both a passage and a question.")
        continue

    result = qa_pipeline(question=question, context=context)
    print(f"Answer: {result['answer']}")



Enter a passage (or 'quit' to stop): Albert Einstein was a German-born theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics. His work is also known for its influence on the philosophy of science.
Enter your question: Who developed the theory of relativity?


Device set to use cpu


Answer: Albert Einstein

Enter a passage (or 'quit' to stop): Albert Einstein was a German-born theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics. His work is also known for its influence on the philosophy of science.
Enter your question: What was Einstein's contribution to physics?


Device set to use cpu


Answer: developed the theory of relativity

Enter a passage (or 'quit' to stop): stop
Enter your question: stop


Device set to use cpu


Answer: stop

Enter a passage (or 'quit' to stop): quit
