In [3]:
# ================================================
#  Question Answering on SQuAD v1.1
# Tools: HuggingFace Transformers, Tokenizers, Pandas, Streamlit
# ================================================
!pip install -q transformers datasets evaluate streamlit

import pandas as pd, json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate

# === Load Dataset ===
train_path = "/content/drive/MyDrive/Datasets/train-v1.1.json"
dev_path   = "/content/drive/MyDrive/Datasets/dev-v1.1.json"

def load_squad(file_path):
    with open(file_path) as f: data = json.load(f)
    rows = []
    for a in data['data']:
        for p in a['paragraphs']:
            context = p['context']
            for qa in p['qas']:
                question = qa['question']
                answer = qa['answers'][0]['text']
                rows.append({'context': context, 'question': question, 'answer': answer})
    return pd.DataFrame(rows)

train_df, dev_df = load_squad(train_path), load_squad(dev_path)

# === Evaluation Metric (Fixed) ===
metric = evaluate.load("squad")

def evaluate_model(model_name):
    print(f"\n=== Evaluating {model_name} ===")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    preds, refs = [], []
    for i in range(10):  # subset for quick demo
        q, c, a = dev_df.iloc[i][['question','context','answer']]
        result = qa_pipeline(question=q, context=c)
        preds.append({"id": str(i), "prediction_text": result['answer']})
        refs.append({"id": str(i), "answers": {"text": [a], "answer_start": [c.find(a)]}})

    scores = metric.compute(predictions=preds, references=refs)
    print(f"Exact Match: {scores['exact_match']:.2f}, F1: {scores['f1']:.2f}")

# === Compare Models ===
for model in ["distilbert-base-uncased", "roberta-base", "albert-base-v2"]:
    evaluate_model(model)

# === Bonus: Simple Streamlit Interface ===
with open("app.py","w") as f:
    f.write('''
import streamlit as st
from transformers import pipeline

st.title(" Question Answering System")
qa_model = st.selectbox("Choose a model", ["distilbert-base-uncased", "roberta-base", "albert-base-v2"])
nlp = pipeline("question-answering", model=qa_model)
context = st.text_area("Enter passage")
question = st.text_input("Enter your question")
if st.button("Get Answer"):
    if context and question:
        ans = nlp(question=question, context=context)
        st.success(f"Answer: {ans['answer']}")
''')
print("\n✅ All models evaluated.\nRun Streamlit app with:\n!streamlit run app.py --server.port 8080 --server.address 0.0.0.0")


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


=== Evaluating distilbert-base-uncased ===


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Exact Match: 0.00, F1: 0.00

=== Evaluating roberta-base ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Exact Match: 0.00, F1: 18.91

=== Evaluating albert-base-v2 ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Exact Match: 0.00, F1: 3.33

✅ All models evaluated.
Run Streamlit app with:
!streamlit run app.py --server.port 8080 --server.address 0.0.0.0
