<a href="https://colab.research.google.com/github/rifadukrifat-0035/Test-class/blob/main/Fine_Tuning_Transformers_for_Question_Answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate accelerate -q


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.1/75.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.9/193.9 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Project: Fine-Tuning Transformers for Question Answering

## 1. Introduction
This project focuses on fine-tuning a pre-trained BERT model (`bert-base-uncased`) on the SQuAD v1.1 dataset for Extractive Question Answering. Unlike text classification, where the goal is to predict a single label for a sequence, Question Answering (QA) requires the model to predict the specific **start** and **end** token indices of the answer within a given context. This involves complex preprocessing strategies like "Sliding Windows" (doc_stride) to handle long documents that exceed the model's maximum sequence length.

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DefaultDataCollator

# 1. Load Dataset
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2) # Create train/val split

# 2. Tokenizer Setup
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 3. Preprocessing Function (Handling Sliding Windows)
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=128,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If answer is not fully inside context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise map character position to token position
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply Preprocessing
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
print("Data Preprocessing Complete!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Data Preprocessing Complete!


In [5]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load Pre-trained Model
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Training Arguments
training_args = TrainingArguments(
    output_dir="my_qa_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
    optim="adamw_torch", # Specify a non-fused optimizer for XLA compatibility
)

data_collator = DefaultDataCollator()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start Training
trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.200917




TrainOutput(global_step=255, training_loss=3.2835475547640933, metrics={'train_runtime': 142.093, 'train_samples_per_second': 28.714, 'train_steps_per_second': 1.795, 'total_flos': 798980157909504.0, 'train_loss': 3.2835475547640933, 'epoch': 1.0})

In [6]:
import torch

def answer_question(question, context):
    inputs = tokenizer(question, context, return_tensors="pt")
    inputs = {k: v.to(trainer.model.device) for k, v in inputs.items()} # Move to GPU if available

    with torch.no_grad():
        outputs = model(**inputs)

    # Get the highest probability start and end tokens
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs["input_ids"][0, answer_start_index : answer_end_index + 1]
    return tokenizer.decode(predict_answer_tokens)

# --- TEST 1 ---
context1 = "Pabna University of Science and Technology (PUST) is located in Pabna, Bangladesh. It was established in 2008."
question1 = "When was PUST established?"
print(f"Q: {question1}\nA: {answer_question(question1, context1)}\n")

# --- TEST 2 ---
context2 = "Machine learning is a field of inquiry devoted to understanding and building methods that 'learn', that is, methods that leverage data to improve performance on some set of tasks."
question2 = "What is machine learning devoted to?"
print(f"Q: {question2}\nA: {answer_question(question2, context2)}\n")

Q: When was PUST established?
A: pabna university of science and technology ( pust ) is located in pabna, bangladesh. it was established in 2008

Q: What is machine learning devoted to?
A: inquiry

