<a href="https://colab.research.google.com/github/shubhangkhare/Transformers/blob/main/10.%20Fine%20Tuning%20BERT%20for%20Question%20Answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [12]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25h

# Download Data

In [3]:
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

mkdir: cannot create directory ‘squad’: File exists
--2023-09-25 16:01:40--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2023-09-25 16:01:46 (300 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2023-09-25 16:01:46--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2023-09-25 16:01:46 (65.3 MB/s

# Process Data

## Convert to list format

In [4]:
import json
from pathlib import Path

def read_squad(path):
    '''Convert JSON data into list format'''
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [5]:
type(train_contexts), type(train_questions), type(train_answers)

(list, list, list)

In [6]:
len(train_contexts), len(train_questions), len(train_answers)

(86821, 86821, 86821)

In [7]:
# List of strings
train_contexts[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [8]:
# List of strings
train_questions[0]

'When did Beyonce start becoming popular?'

In [9]:
# List of dictionaries
train_answers[0]

{'text': 'in the late 1990s', 'answer_start': 269}

**NOTE:** The context, question and answers needs to be converted into list format before proceeding with the next steps

## Add end character index

In [10]:
def add_end_idx(answers, contexts):
    '''Add index of character at which the answer ends in the passage'''
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

## Tokenize the question and context

In [11]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [12]:
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

## Convert character start/end position to token start/end position




In [13]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

## Convert to Torch Dataset

In [14]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = Dataset(train_encodings)
val_dataset = Dataset(val_encodings)

# Fine Tuning BERT

## Load Model

In [15]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.2.attention.self.key.bias', 'encoder.layer.5.attention.output.dense.weight', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.10.attention.self.value.bias', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.9.attention.output.LayerNorm.weight', 'encoder.layer.11.attention.self.key.bias', 'encoder.layer.10.attention.output.dense.bias', 'encoder.layer.1.attention.self.query.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.6.attention.self.key.weight', 'encoder.layer.8.attention.output.LayerNorm.weight', 'encoder.layer.8.attention.self.value

## Function for evaluation

In [None]:
# Define a function to compute metrics for question answering
def compute_metrics(p):
    return {"f1": p["f1"], "exact_match": p["exact_match"]}

## Train

In [24]:
!pip install accelerate -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/258.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m256.0/258.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [31]:
!pip install transformers[torch] -q

In [27]:
!pip show accelerate

Name: accelerate
Version: 0.23.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, torch
Required-by: 


In [1]:
import accelerate

In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./output",  # Directory to save the model checkpoints and results
    evaluation_strategy="steps",  # Evaluate every 'n' steps
    save_total_limit=2,  # Maximum number of checkpoints to save
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=16,  # Batch size per GPU
    per_device_eval_batch_size=16,  # Batch size for evaluation per GPU
    warmup_steps=500,  # Number of warmup steps
    weight_decay=0.01,  # Weight decay for regularization
    #logging_dir="./logs",  # Directory for storing logs
    #logging_steps=100,  # Log training progress every 'n' steps
    #do_train=True,  # Perform training
    #do_eval=True,  # Perform evaluation
    #evaluation_steps=500,  # Evaluate every 'n' steps
    save_steps=500,  # Save model checkpoints every 'n' steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    gradient_accumulation_steps=1  # Number of gradient accumulation steps
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=None,  # You can use custom data collators if needed
    train_dataset= train_dataset,  # Training dataset
    eval_dataset= val_dataset,  # Validation dataset
)

# Start training
trainer.train()

Step,Training Loss,Validation Loss
500,3.9756,4.727454
1000,4.4975,4.480231
1500,4.3622,4.45849
2000,4.2927,4.377884


Step,Training Loss,Validation Loss
500,3.9756,4.727454
1000,4.4975,4.480231
1500,4.3622,4.45849
2000,4.2927,4.377884


# Evaluation Metrics

In [None]:
trainer.evaluate()

# Save Model

In [None]:
trainer.save_model('CustomModel')

# Inference Model

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline

# Load the trained model
model_path = "./bert_qa_model"  # Replace with the path to your trained model
model = BertForQuestionAnswering.from_pretrained(model_path)

# Load the BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Instantiate the pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

In [None]:
answer = qa_pipeline(question=question, context=context)

# Print the answer
print("Question:", question)
print("Answer:", answer["answer"])

# Reference

1. [Question Answering with SQuAD 2.0
](https://huggingface.co/transformers/v3.4.0/custom_datasets.html#question-answering-with-squad-2-0)
2. [Fine Tuning Roberta QA blog](https://github.com/skandavivek/transformerQA-finetuning/blob/main/fine_tuning_roberta_QA_blog.ipynb)

