# Question Answering

In [None]:
#!pip install transformers datasets scikit-learn

^C


Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhas


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import numpy as np
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer,
                          DataCollatorWithPadding)
from transformers import pipeline
from sklearn.metrics import accuracy_score

### Example of Question Answering

In [2]:
example_question = "Who founded OpenAI?"
example_context = "OpenAI was founded in December 2015 by Elon Musk, Sam Altman, Greg Brockman, and others."

qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

output = qa_pipeline(question=example_question, context=example_context)

print("\nQuestion Answering Output:")
print(output)

Device set to use cuda:0
  attn_output = torch.nn.functional.scaled_dot_product_attention(



Question Answering Output:
{'score': 0.5449853539466858, 'start': 39, 'end': 75, 'answer': 'Elon Musk, Sam Altman, Greg Brockman'}


### Full Question Answering Workflow

1- Dataset Preparation

In [3]:
dataset = load_dataset("squad_v2")
print("\nExample:", dataset["train"][0])


Example: {'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}


2- Tokenizer Initialization

In [4]:
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

3- Data Preprocessing

In [5]:
def preprocess_function(examples):
    """
    Tokenizes input text and aligns answer positions to tokenized input.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    # Tokenize with truncation and return offsets for aligning answers
    tokenized_inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        # Handle empty answers (for unanswerable questions in SQuAD v2)
        if len(answers[i]["text"]) == 0:
            start_positions.append(0)  # Default to CLS token for no-answer cases
            end_positions.append(0)
            continue

        # Get answer text and start position
        answer_text = answers[i]["text"][0]  # First answer in case of multiple answers
        answer_start = answers[i]["answer_start"][0]
        answer_end = answer_start + len(answer_text)

        # Find token positions corresponding to character positions
        token_start_index = 0
        token_end_index = 0
        for j, (start, end) in enumerate(offsets):
            if start <= answer_start < end:
                token_start_index = j
            if start < answer_end <= end:
                token_end_index = j

        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    # Assign new labels to tokenized inputs
    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    return tokenized_inputs



tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

4- Model Loading

In [6]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5- Data Collation & Training Configuration

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./qa_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

6- Evaluation Metrics

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    start_logits, end_logits = logits
    start_positions, end_positions = labels

    start_accuracy = accuracy_score(start_positions, np.argmax(start_logits, axis=1))
    end_accuracy = accuracy_score(end_positions, np.argmax(end_logits, axis=1))

    return {"start_accuracy": start_accuracy, "end_accuracy": end_accuracy}

7- Model Training & Evaluation

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Start Accuracy,End Accuracy
1,0.9971,1.056125,0.683484,0.689463
2,0.6666,1.184064,0.685505,0.695949


TrainOutput(global_step=32580, training_loss=0.9421648150795877, metrics={'train_runtime': 4188.7122, 'train_samples_per_second': 62.224, 'train_steps_per_second': 7.778, 'total_flos': 5.107789806161818e+16, 'train_loss': 0.9421648150795877, 'epoch': 2.0})

8- Model Saving & Inference

In [14]:
trainer.save_model("./fine_tuned_qa")
tokenizer.save_pretrained("./fine_tuned_qa")

qa_pipeline = pipeline("question-answering", model="./fine_tuned_qa")

question = "Who founded OpenAI?"
context = "OpenAI was founded in December 2015 by Elon Musk, Sam Altman, Greg Brockman, and others."

output = qa_pipeline(question=question, context=context)

print("\nFinal Inference Output:", output)

Device set to use cuda:0



Final Inference Output: {'score': 0.3498505651950836, 'start': 39, 'end': 87, 'answer': 'Elon Musk, Sam Altman, Greg Brockman, and others'}
