In [None]:
!pip install transformers datasets accelerate -U

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting fsspec<=2025.10.0,>=2023.1.0 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-no

# Load the Dataset

We will use the SQuAD (Stanford Question Answering Dataset). It consists of context, question, and answers (text and starting position).

In [None]:
from datasets import load_dataset

# Download (or load from the local cache) the SQuAD dataset
dataset = load_dataset("squad")

# Look at the first training example: its context, question and answer
first_example = dataset["train"][0]
print(f"Context: {first_example['context']}")
print(f"Question: {first_example['question']}")
print(f"Answer: {first_example['answers']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer: {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


# Preprocessing
This is the most complex part of QA. We must:

Tokenize: Convert text to numbers.

Handle Long Contexts: BERT has a limit (usually 512 tokens). If a context is too long, we split it into chunks using a "sliding window" (stride).

Map Answers: Convert the answer's character start/end positions into token start/end positions.

In [None]:
from transformers import AutoTokenizer

# Base checkpoint to fine-tune; the same name is reused later when the QA model is loaded.
model_checkpoint = "distilbert-base-uncased"
# Tokenizer matching the checkpoint; used by preprocessing, inference and evaluation below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples into fixed-length QA features.

    Long contexts are split into overlapping chunks (features), so the output
    can contain more rows than the input batch.

    Args:
        examples: A batch (dict of lists) with "question", "context" and
            "answers" columns, where each answers entry has "text" and
            "answer_start" lists. If an "id" column is present it is carried
            over as "example_id".

    Returns:
        The tokenizer output with these extra columns, one entry per feature:
            start_positions / end_positions: token indices of the answer, or
                (0, 0) when the answer is not fully inside the chunk or the
                example has no answer.
            offset_mapping: (start_char, end_char) of every token, kept so
                that predictions can be mapped back to text at evaluation.
            example_id: id of the source example (only when "id" is present),
                used at evaluation to regroup chunks of the same example.
    """
    # truncation="only_second" truncates the context, never the question.
    # return_overflowing_tokens=True turns a long context into several features
    # that overlap by `stride` tokens.
    # return_offsets_mapping=True gives the character span of every token.
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # feature index -> index of the example it was cut from
    sample_mapping = tokenized_inputs.pop("overflow_to_sample_mapping")
    # Read but do NOT pop: the evaluation step needs the offsets as a column.
    offset_mapping = tokenized_inputs["offset_mapping"]

    has_ids = "id" in examples
    example_ids = []
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        if has_ids:
            example_ids.append(examples["id"][sample_index])

        answers = examples["answers"][sample_index]

        # Examples without any answer get the (0, 0) "no answer here" label
        # instead of raising IndexError.
        if len(answers["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Start/end character index of the answer in the context
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Sequence ids: 0 for question tokens, 1 for context tokens,
        # None for special and padding tokens.
        sequence_ids = tokenized_inputs.sequence_ids(i)

        # Find the first and last context token of this feature
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside this chunk, label it as (0, 0)
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last token whose start offset is <= the answer's first character
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First token (scanning from the right) whose end offset is >= the
            # answer's end character
            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    if has_ids:
        tokenized_inputs["example_id"] = example_ids

    return tokenized_inputs

# Load the Model
We use AutoModelForQuestionAnswering. This adds a linear layer on top of the DistilBERT encoder outputs to predict the start_logits and end_logits for every token.

In [None]:
# Run the preprocessing over every split in batches. The original columns are
# dropped because one example can expand into several tokenized features.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the checkpoint with a question-answering head. The qa_outputs layer is
# newly initialized (see the warning printed below), so the model must be
# fine-tuned before its predictions are useful.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train the Model
We use the Hugging Face Trainer.

In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator

# preprocess_function already pads every feature to max_length, so the default
# collator (which only stacks the features into tensors) is all that is needed.
# Defining it here keeps the cell runnable on a fresh kernel.
data_collator = default_data_collator

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    optim="adamw_torch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Only the first 100 validation features are used for the per-epoch
    # validation loss; the full validation set is scored in the Evaluation section.
    eval_dataset=tokenized_datasets["validation"].select(range(100)),
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,1.2188,1.384248
2,0.9481,1.288553
3,0.7617,1.315124




TrainOutput(global_step=16599, training_loss=1.1048732578082934, metrics={'train_runtime': 1954.1804, 'train_samples_per_second': 135.906, 'train_steps_per_second': 8.494, 'total_flos': 2.602335381127373e+16, 'train_loss': 1.1048732578082934, 'epoch': 3.0})

In [None]:
import torch

def ask_question(question, context):
    """Extract the answer to `question` from `context` with the fine-tuned model.

    The context is truncated to the same 384-token budget used in training, so
    an answer located beyond that point cannot be found. The returned span is
    the highest-scoring (start, end) pair that lies entirely inside the context,
    has end >= start, and is at most 30 tokens long (the same limit used by the
    evaluation function).

    Args:
        question: The question text.
        context: The passage to search for the answer.

    Returns:
        The decoded answer text, or "" if no valid span exists.
    """
    max_answer_tokens = 30

    qa_model = trainer.model
    qa_model.eval()  # make sure dropout is disabled for inference

    encoding = tokenizer(
        question,
        context,
        max_length=384,
        truncation="only_second",  # truncate the context, never the question
        return_tensors="pt",
    )
    # Must be read from the tokenizer output before it becomes a plain dict.
    sequence_ids = encoding.sequence_ids(0)
    inputs = {k: v.to(qa_model.device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = qa_model(**inputs)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    num_tokens = start_logits.shape[0]
    device = start_logits.device

    # True only for context tokens (sequence id 1); question, special and
    # padding tokens can never be part of the answer.
    in_context = torch.tensor(
        [seq_id == 1 for seq_id in sequence_ids], dtype=torch.bool, device=device
    )

    # span_length[i, j] is the token length of a span starting at i and ending at j.
    positions = torch.arange(num_tokens, device=device)
    span_length = positions[None, :] - positions[:, None] + 1
    valid = (
        in_context[:, None]
        & in_context[None, :]
        & (span_length >= 1)
        & (span_length <= max_answer_tokens)
    )
    if not bool(valid.any()):
        return ""

    # Score every (start, end) pair jointly instead of taking two independent
    # argmaxes, which could yield end < start.
    span_scores = start_logits[:, None] + end_logits[None, :]
    span_scores = span_scores.masked_fill(~valid, float("-inf"))
    best_start, best_end = divmod(int(span_scores.argmax()), num_tokens)

    predict_answer_tokens = inputs["input_ids"][0, best_start : best_end + 1]
    return tokenizer.decode(predict_answer_tokens)

# --- Test it out ---
# A short hand-written passage; the question below can be answered from it.
my_context = """
The Apollo program was composed of eleven total spaceflights.
Apollo 11 was the spaceflight that first landed humans on the Moon.
Commander Neil Armstrong and lunar module pilot Buzz Aldrin formed the American crew that landed the Apollo Lunar Module Eagle on July 20, 1969.
"""

my_question = "Who were the crew members of Apollo 11?"

# Run the fine-tuned model on the pair and show the extracted span.
answer = ask_question(my_question, my_context)
print(f"Question: {my_question}")
print(f"Answer: {answer}")

Question: Who were the crew members of Apollo 11?
Answer: commander neil armstrong and lunar module pilot buzz aldrin


# Evaluation

The two standard metrics used for the SQuAD dataset and similar extractive QA tasks are Exact Match (EM) and F1 Score.

Getting the Evaluation Script

The Hugging Face evaluate library provides the official SQuAD evaluation script. We load it as a metric:

In [None]:
!pip install evaluate
from evaluate import load

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# Load the SQuAD metric from the evaluate library; it reports exact_match and f1.
squad_metric = load("squad")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

# Define the Evaluation Function

Evaluation requires a slightly different process than training because a single original context might be split into multiple features (due to the stride and max_length in preprocessing). We need to map the predicted token indices back to the original text.

This function handles:

Getting the raw model predictions (logits).

Mapping the predicted start/end indices back to character spans in the original context.

Selecting the best answer span (handling cases where the predicted end index is before the start index, or where multiple context chunks exist).

In [None]:
import collections
import numpy as np
from tqdm.auto import tqdm
# compute_metrics relies on the globally loaded `tokenizer` and `squad_metric` objects.

def _context_token_span(input_ids, sep_token_id):
    """Return (first, last) token indices of the context segment, or None.

    Assumes the pair layout ends with `... <sep> context <sep> [padding]`, so
    the context lies strictly between the last two separator tokens. Padding
    and special tokens are therefore never counted as context.
    """
    sep_positions = [
        pos for pos, token_id in enumerate(list(input_ids)) if token_id == sep_token_id
    ]
    if len(sep_positions) < 2:
        return None
    first, last = sep_positions[-2] + 1, sep_positions[-1] - 1
    if first > last:
        return None
    return first, last


def compute_metrics(start_logits, end_logits, features, examples,
                    max_answer_length=30, n_best=20):
    """Turn raw start/end logits into text answers and score them with SQuAD metrics.

    Args:
        start_logits: Per-feature start logits, indexed like `features`.
        end_logits: Per-feature end logits, indexed like `features`.
        features: Tokenized features; each must provide "example_id",
            "input_ids" and "offset_mapping".
        examples: Original examples; each must provide "id", "context" and
            "answers".
        max_answer_length: Maximum answer length in tokens.
        n_best: Number of top start and top end indices to combine per feature.

    Returns:
        The result of `squad_metric.compute` for the predicted answers.

    Raises:
        ValueError: If the logits and the features do not have the same length,
            which would silently misalign predictions and features.
    """
    if len(start_logits) != len(features) or len(end_logits) != len(features):
        raise ValueError(
            f"Got {len(start_logits)} start logits and {len(end_logits)} end logits "
            f"for {len(features)} features; they must be aligned one-to-one."
        )

    # 1. Group feature indices by the example they were cut from
    example_to_features = collections.defaultdict(list)
    for feature_index, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(feature_index)

    # 2. Pick the best-scoring valid span for every example
    all_predictions = collections.OrderedDict()

    for example in tqdm(examples):
        context = example["context"]
        best_score = None
        best_text = ""  # stays empty when no valid span exists

        for feature_index in example_to_features.get(example["id"], []):
            feature = features[feature_index]

            span = _context_token_span(feature["input_ids"], tokenizer.sep_token_id)
            if span is None:
                # No context tokens in this feature; nothing can be extracted.
                continue
            context_start, context_end = span

            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offset_mapping = feature["offset_mapping"]

            # Indices of the n_best highest logits, best first
            starts = np.argsort(start_logit)[-1: -n_best - 1: -1].tolist()
            ends = np.argsort(end_logit)[-1: -n_best - 1: -1].tolist()

            for start in starts:
                if start < context_start or start > context_end:
                    continue
                for end in ends:
                    # Skip spans that leave the context, run backwards, or are too long
                    if end < start or end > context_end:
                        continue
                    if end - start + 1 > max_answer_length:
                        continue

                    score = start_logit[start] + end_logit[end]
                    if best_score is None or score > best_score:
                        best_score = score
                        start_char = offset_mapping[start][0]
                        end_char = offset_mapping[end][1]
                        best_text = context[start_char: end_char]

        all_predictions[example["id"]] = best_text

    # 3. Format predictions and references for the SQuAD metric
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in all_predictions.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # 4. Compute and return the final metrics
    return squad_metric.compute(predictions=formatted_predictions, references=references)

# Run the Evaluation

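Now we run `trainer.predict` on the validation features to get the raw logits, then pass them to this function together with the features and the original examples: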

In [None]:
# --- Evaluation Step ---

print("Starting evaluation on validation set...")

# Original examples (text) and their tokenized features
eval_examples = dataset["validation"]
eval_features = tokenized_datasets["validation"]

# Raw model outputs for every validation feature
predictions = trainer.predict(eval_features)
start_logits, end_logits = predictions.predictions[0], predictions.predictions[1]

# Convert logits to answer strings and score them against the references
final_metrics = compute_metrics(start_logits, end_logits, eval_features, eval_examples)

print("\n--- Final Model Evaluation Results ---")
print(final_metrics)

Starting evaluation on validation set...


  0%|          | 0/10570 [00:00<?, ?it/s]


--- Final Model Evaluation Results ---
{'exact_match': 77.32261116367077, 'f1': 85.4918379595579}
