In [1]:
# Python 3.10+ recommended
!pip install --upgrade pip
!pip install transformers[torch] sentence-transformers faiss-cpu accelerate datasets evaluate fastapi "uvicorn[standard]" gradio matplotlib seaborn torch
# Optional for ONNX/quantization: onnxruntime onnx transformers-onnx

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting httptools>=0.6.3 (from uvicorn[standard])
  Downloading httptools-0.6.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting uvloop>=0.15.1 (from uvicorn[standard])
  Downloading uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.

In [2]:
"""
qa_fullstack.py
Comprehensive Question Answering (QA) codebase — from basics to advanced (2025 best practices).

Structure:
- Section A: Configuration & utilities
- Section B: Simple modern QA via pipeline (recommended for most students)
- Section C: Under-the-hood: manual tokenization + logits -> answer span (learning)
- Section D: Long-context handling (sliding window)
- Section E: Retrieval-Augmented QA (embeddings + FAISS)
- Section F: Optimization notes (quantization, ONNX, batching)
- Section G: Serving (FastAPI + Gradio example)
- Section H: Evaluation utilities (EM / F1)
- Section I: Model selection notes & links (2025-relevant)
"""

'\nqa_fullstack.py\nComprehensive Question Answering (QA) codebase — from basics to advanced (2025 best practices).\n\nStructure:\n- Section A: Configuration & utilities\n- Section B: Simple modern QA via pipeline (recommended for most students)\n- Section C: Under-the-hood: manual tokenization + logits -> answer span (learning)\n- Section D: Long-context handling (sliding window)\n- Section E: Retrieval-Augmented QA (embeddings + FAISS)\n- Section F: Optimization notes (quantization, ONNX, batching)\n- Section G: Serving (FastAPI + Gradio example)\n- Section H: Evaluation utilities (EM / F1)\n- Section I: Model selection notes & links (2025-relevant)\n'

In [3]:
from typing import List, Dict, Tuple, Optional
import numpy as np
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModel,
    AutoConfig,
)
import textwrap
import math
import faiss
from sentence_transformers import SentenceTransformer
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# ---------------------------
# Section A: Config & Helpers
# ---------------------------

# Change these to switch models. Recommendations (2025):
# - deepset/deberta-v3-large-squad2 : strong extractive QA model (SQuAD2.0). Good balance.
# - models like E5 / ModernBERT / Mistral-based encoders are often preferred for retrieval & embeddings.
RECOMMENDED_QA_MODEL = "deepset/deberta-v3-large-squad2"
RECOMMENDED_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

DEVICE = 0 if torch.cuda.is_available() else -1  # pipeline uses -1 for CPU

def smart_print(title: str, text: str):
    print("\n" + "=" * 10 + f" {title} " + "=" * 10)
    print(textwrap.fill(text, 120))

def to_numpy(tensor: torch.Tensor) -> np.ndarray:
    return tensor.detach().cpu().numpy() if isinstance(tensor, torch.Tensor) else np.array(tensor)

In [5]:
# ---------------------------
# Section B: Modern simple QA
# ---------------------------

def create_qa_pipeline(model_name: str = RECOMMENDED_QA_MODEL, device: int = DEVICE):
    """
    Create a high-level Hugging Face pipeline for QA.
    This is the recommended entry point for learning & quick experiments.
    """
    return pipeline("question-answering", model=model_name, tokenizer=model_name, device=device)

def example_simple_pipeline():
    qa = create_qa_pipeline()
    question = "When was the first DVD released?"
    context = ("The first DVD (Digital Versatile Disc) was released on March 24, 1997. "
               "It was a movie titled 'Twister' and was released in Japan.")
    res = qa({"question": question, "context": context})
    # pipeline returns dict with 'answer', 'score', 'start', 'end'
    print("Pipeline result:", res)

In [6]:
# ---------------------------
# Section C: Under-the-hood QA (learning)
# ---------------------------

class ManualQAModel:
    """
    Show how a QA model works under the hood:
    - Tokenize question+context
    - Run model -> start_logits, end_logits
    - Convert logits -> token span -> detokenize
    """
    def __init__(self, model_name: str = RECOMMENDED_QA_MODEL, device: int = None):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        if device is not None and device >= 0:
            self.model.to(torch.device(f"cuda:{device}"))

    def answer(self, question: str, context: str, topk: int = 1) -> List[Dict]:
        # Tokenize with return tensors for PyTorch
        inputs = self.tokenizer(question, context, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)

        start_logits = outputs.start_logits[0]   # (seq_len,)
        end_logits = outputs.end_logits[0]       # (seq_len,)

        # Convert to numpy and compute softmax probabilities (for explanation)
        start_probs = torch.softmax(start_logits, dim=0).cpu().numpy()
        end_probs = torch.softmax(end_logits, dim=0).cpu().numpy()

        # Beam-style selection of top spans (simple approach)
        span_scores = []
        seq_len = start_logits.shape[1] if len(start_logits.shape) > 1 else start_logits.shape[0]
        # we iterate over top start & end candidates for small inputs (demo)
        start_top_idx = np.argsort(start_probs)[-topk:][::-1]
        end_top_idx = np.argsort(end_probs)[-topk:][::-1]
        for s in start_top_idx:
            for e in end_top_idx:
                if e >= s and (e - s) < 60:  # limit span length
                    score = start_probs[s] * end_probs[e]
                    span_scores.append((score, int(s), int(e)))

        if not span_scores:
            return [{"answer": "", "score": 0.0}]

        span_scores.sort(reverse=True)
        results = []
        for score, s, e in span_scores[:topk]:
            tokens = inputs["input_ids"][0][s:e+1]
            answer_text = self.tokenizer.decode(tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            results.append({"answer": answer_text, "score": float(score), "start": s, "end": e})
        return results

def example_manual_qa():
    m = ManualQAModel()
    print(m.answer("When was the first DVD released?",
                   "The first DVD (Digital Versatile Disc) was released on March 24, 1997."))


In [7]:
# ---------------------------
# Section D: Long-Context handling (sliding window)
# ---------------------------

def sliding_window_tokenize(tokenizer, question: str, context: str, max_length: int = 512, doc_stride: int = 128):
    """
    Create chunks (windows) of tokenized inputs for long contexts.
    Yields dicts that can be batched through the model.
    """
    # tokenizer.encode_plus with return_overflowing_tokens helps, but show manual pattern
    inputs = tokenizer(question, context, return_overflowing_tokens=True, max_length=max_length,
                       stride=doc_stride, truncation=True, return_tensors="pt")
    # Hugging Face returns 'overflow_to_sample_mapping' and multiple tensors; return them
    return inputs

def example_sliding_window():
    tokenizer = AutoTokenizer.from_pretrained(RECOMMENDED_QA_MODEL)
    large_context = " ".join(["Sentence about cars."] * 2000)  # fake long context
    inputs = sliding_window_tokenize(tokenizer, "What is the dealership name?", large_context, max_length=384, doc_stride=128)
    print("Generated windows:", inputs["input_ids"].shape)

In [8]:
# ---------------------------
# Section E: Retrieval-Augmented QA (RAG-lite with FAISS)
# ---------------------------

class RetrieverQA:
    """
    Simple retrieval-augmented QA example:
    - Build embeddings for documents (chunks)
    - Index with FAISS
    - Retrieve top-k passages, then run a QA model on the concatenated retrieved passages
    """
    def __init__(self, embedding_model_name: str = RECOMMENDED_EMBEDDING_MODEL,
                 qa_model_name: str = RECOMMENDED_QA_MODEL, faiss_index_factory: str = "Flat"):
        self.embedder = SentenceTransformer(embedding_model_name)
        self.qa_pipeline = create_qa_pipeline(qa_model_name, device=DEVICE)
        self.index = None
        self.documents = []  # original text chunks
        self.emb_dim = self.embedder.get_sentence_embedding_dimension()

    def build_index(self, documents: List[str]):
        # store docs and build embeddings -> FAISS index
        self.documents = documents
        embeddings = np.vstack([self.embedder.encode(d, convert_to_numpy=True) for d in documents]).astype("float32")
        self.index = faiss.index_factory(self.emb_dim, "Flat")
        self.index.add(embeddings)

    def retrieve(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
        q_emb = self.embedder.encode(query, convert_to_numpy=True).astype("float32")
        D, I = self.index.search(np.expand_dims(q_emb, axis=0), top_k)
        return [(int(i), float(d)) for i, d in zip(I[0], D[0])]

    def answer(self, question: str, top_k: int = 5):
        results = self.retrieve(question, top_k=top_k)
        retrieved = [self.documents[i] for i, _ in results]
        combined_context = "\n\n".join(retrieved)
        return self.qa_pipeline({"question": question, "context": combined_context})

def example_rag():
    docs = [
        "Sunset Motors is in Crestwood, California. Established in 1978.",
        "Sunset Motors has Ford, Toyota, Honda, Chevrolet, and BMW.",
        "Sunset Motors spans over 10 acres and has solar panels since 2010.",
    ]
    rag = RetrieverQA()
    rag.build_index(docs)
    print(rag.answer("Where is Sunset Motors located?"))

In [9]:
# ---------------------------
# Section F: Optimization notes (quantization, ONNX, batching)
# ---------------------------

"""
Optimization summary (not executed code):
1) Use Hugging Face 'pipeline' with device_map / accelerate for multi-GPU / CPU offloading.
2) For CPU inference, convert model to ONNX and run with onnxruntime (or ORT with quantization).
   - transformers-onnx can export, then use onnxruntime.quantization for INT8.
3) Use bitsandbytes 8-bit/4-bit quantization for GPU inference if memory constrained.
4) Use torch.compile (PyTorch 2.x) if available for speedups on supported hardware.
5) Use batching: build a batch of question+contexts and run them together for throughput.
6) For low latency, keep model loaded and reuse tokenizer; avoid reloading per request.
"""

"\nOptimization summary (not executed code):\n1) Use Hugging Face 'pipeline' with device_map / accelerate for multi-GPU / CPU offloading.\n2) For CPU inference, convert model to ONNX and run with onnxruntime (or ORT with quantization).\n   - transformers-onnx can export, then use onnxruntime.quantization for INT8.\n3) Use bitsandbytes 8-bit/4-bit quantization for GPU inference if memory constrained.\n4) Use torch.compile (PyTorch 2.x) if available for speedups on supported hardware.\n5) Use batching: build a batch of question+contexts and run them together for throughput.\n6) For low latency, keep model loaded and reuse tokenizer; avoid reloading per request.\n"

In [10]:
# ---------------------------
# Section G: Minimal serving example (FastAPI + Gradio)
# ---------------------------

def start_fastapi_server(qa_model_name: str = RECOMMENDED_QA_MODEL):
    """
    Minimal FastAPI server. Run with:
    uvicorn qa_fullstack:start_fastapi_server --reload
    (or copy this pattern into an ASGI entrypoint)
    """
    from fastapi import FastAPI
    from pydantic import BaseModel
    app = FastAPI()
    qa = create_qa_pipeline(qa_model_name, device=DEVICE)

    class QARequest(BaseModel):
        question: str
        context: str

    @app.post("/answer")
    def answer(req: QARequest):
        res = qa({"question": req.question, "context": req.context})
        return res

    return app  # uvicorn expects an ASGI app object

def gradio_demo():
    import gradio as gr
    qa = create_qa_pipeline()
    def predict(question, context):
        return qa({"question": question, "context": context})
    demo = gr.Interface(fn=predict, inputs=["text", "text"], outputs="json", title="QA Demo")
    demo.launch()

In [11]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
[33m  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2956f877e3b63c3200aca68a228dcffebfad28dbdf1b8d54a6f5f99da456c0e7
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfu

In [12]:
# ---------------------------
# Section H: Evaluation (EM / F1) utilities
# ---------------------------

rouge = evaluate.load("rouge")  # just example; exact EM/F1 below

def compute_em_f1(pred: str, gold: str) -> Dict[str, float]:
    """
    Compute Exact Match (EM) and token-level F1 as used in SQuAD evaluation.
    """
    def normalize(s):
        import re, string
        s = s.lower()
        s = re.sub(r"\b(a|an|the)\b", " ", s)
        s = ''.join(ch for ch in s if ch not in set(string.punctuation))
        s = " ".join(s.split())
        return s
    p = normalize(pred)
    g = normalize(gold)
    em = 1.0 if p == g else 0.0
    p_tokens = p.split()
    g_tokens = g.split()
    common = set(p_tokens) & set(g_tokens)
    # token overlap with counts
    from collections import Counter
    cp = Counter(p_tokens)
    cg = Counter(g_tokens)
    num_same = sum(min(cp[t], cg[t]) for t in cp)
    if num_same == 0:
        f1 = 0.0
    else:
        precision = num_same / len(p_tokens) if p_tokens else 0
        recall = num_same / len(g_tokens) if g_tokens else 0
        f1 = 2 * precision * recall / (precision + recall + 1e-12)
    return {"em": em, "f1": f1}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

In [13]:
# ---------------------------
# Section I: Model selection notes (2025)
# ---------------------------

MODEL_SELECTION_NOTES = """
2025 notes (high level):
- For extractive QA, use models fine-tuned on SQuAD/SQuAD2.0 or domain-specific corpora.
  DeBERTa-v3 variants like deepset/deberta-v3-large-squad2 are strong off-the-shelf choices. (See Hugging Face model page.)
- For retrieval/semantic search, use specialized embedding models (OpenAI/GPT embeddings, E5, or sentence-transformers miniLM variants).
- For production, prefer pipelines + accelerator/ORT or quantized models for CPU inference.
- For large documents, use sliding-window chunking + confidence & aggregation heuristics, or use a generative RAG (LLM) with a retriever if abstractive answers are acceptable.
Citations: Hugging Face pipelines & model hubs.
"""

# ---------------------------
# If run as script, show examples
# ---------------------------
if __name__ == "__main__":
    smart_print("Quick Pipeline Example", "Demonstrate the high level pipeline (recommended for quick experiments).")
    example_simple_pipeline()

    smart_print("Manual QA Example", "See QA under the hood (token logits -> answer span).")
    example_manual_qa()

    smart_print("RAG-lite Example", "Simple retriever + QA pipeline.")
    example_rag()

    smart_print("Model selection note", MODEL_SELECTION_NOTES)


Demonstrate the high level pipeline (recommended for quick experiments).


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Device set to use cpu


Pipeline result: {'score': 0.9754787068814039, 'start': 54, 'end': 70, 'answer': ' March 24, 1997.'}

See QA under the hood (token logits -> answer span).
[{'answer': 'March 24, 1997', 'score': 0.948204755783081, 'start': 20, 'end': 23}]

Simple retriever + QA pipeline.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Device set to use cpu


{'score': 0.9375135698501254, 'start': 19, 'end': 42, 'answer': ' Crestwood, California.'}

 2025 notes (high level): - For extractive QA, use models fine-tuned on SQuAD/SQuAD2.0 or domain-specific corpora.
DeBERTa-v3 variants like deepset/deberta-v3-large-squad2 are strong off-the-shelf choices. (See Hugging Face model
page.) - For retrieval/semantic search, use specialized embedding models (OpenAI/GPT embeddings, E5, or sentence-
transformers miniLM variants). - For production, prefer pipelines + accelerator/ORT or quantized models for CPU
inference. - For large documents, use sliding-window chunking + confidence & aggregation heuristics, or use a generative
RAG (LLM) with a retriever if abstractive answers are acceptable. Citations: Hugging Face pipelines & model hubs.
