# Fine Tune T5 Model for Question Answering

In [1]:
# Please install other necessary imports needed

# Import the necessary libraries
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer 
#Sequence to Sequence is inherited from TrainingArguments. 
#It is specifically tailored for models that deals with tasks like summarization, translation, or question answering with text generation.

from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import datasets

  from .autonotebook import tqdm as notebook_tqdm


Step 1: Prepare the Dataset

Dataset is from https://huggingface.co/datasets/Aashi/All_About_Apple_Devices


In [2]:
# Load the dataset
dataset = load_dataset("Aashi/All_About_Apple_Devices", data_files={"train": "QandA.csv"})
dataset = dataset["train"].train_test_split(test_size=0.3) #training split is 7:3

# Data Cleaning
# Missing values (None) are found in the 'Answer' column
# Missing values (None) replaced with a specific string
train_dataset = dataset["train"].to_pandas()
train_dataset.loc[train_dataset['Answer'].isna(), 'Answer'] = 'No question asked. Please provide more information.'
test_dataset = dataset["test"].to_pandas()

train_dataset = Dataset.from_dict(train_dataset)
test_dataset = Dataset.from_dict(test_dataset)
df = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})


Step 2: Choose the model variant

In [3]:
# Load the tokenizer, model, and data collator
# Model used is t5-small. Given the number of rows/ parameters in the dataset, the small variant should be sufficient.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Step 3: Fine-tuning

In [4]:
# We prefix our tasks with "answer the question"
prefix = "question: "

# Define our preprocessing function
def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""
    # The "inputs" are the tokenized answer:
    inputs = [prefix + doc for doc in examples["Question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    
    # The "labels" are the tokenized outputs:
    labels = tokenizer(text_target=examples["Answer"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map the preprocessing function across our dataset
tokenized_dataset = df.map(preprocess_function, batched=True)

Map: 100%|██████████| 703/703 [00:00<00:00, 5834.18 examples/s]
Map: 100%|██████████| 302/302 [00:00<00:00, 5876.26 examples/s]


In [5]:
# Set up Rouge score for evaluation
# Rouge Score is used as evaluation for each checkpoint
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

Step 4: Train the model

In [6]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 16
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 2
NUM_EPOCHS = 4

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results-t5small-1",
   evaluation_strategy="epoch",
   save_strategy = "epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False,
   load_best_model_at_end = True
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                                
 25%|██▌       | 44/176 [04:29<09:23,  4.27s/it]

{'eval_loss': 2.3671135902404785, 'eval_rouge1': 0.4055520115160618, 'eval_rouge2': 0.23284759220853263, 'eval_rougeL': 0.3442626659574591, 'eval_rougeLsum': 0.35363735228795623, 'eval_runtime': 62.6332, 'eval_samples_per_second': 4.822, 'eval_steps_per_second': 1.213, 'epoch': 1.0}


                                                
 50%|█████     | 88/176 [09:12<05:55,  4.04s/it]

{'eval_loss': 2.158734083175659, 'eval_rouge1': 0.3847353579008105, 'eval_rouge2': 0.21707462350919127, 'eval_rougeL': 0.3426791450797543, 'eval_rougeLsum': 0.350863030329464, 'eval_runtime': 71.4746, 'eval_samples_per_second': 4.225, 'eval_steps_per_second': 1.063, 'epoch': 2.0}


                                                 
 75%|███████▌  | 132/176 [13:50<03:08,  4.29s/it]

{'eval_loss': 2.0756192207336426, 'eval_rouge1': 0.3946039215886532, 'eval_rouge2': 0.2236089184235806, 'eval_rougeL': 0.35151363861428053, 'eval_rougeLsum': 0.3592645081859433, 'eval_runtime': 65.4363, 'eval_samples_per_second': 4.615, 'eval_steps_per_second': 1.161, 'epoch': 3.0}


                                                 
100%|██████████| 176/176 [18:26<00:00,  4.76s/it]

{'eval_loss': 2.046698808670044, 'eval_rouge1': 0.4016228284873723, 'eval_rouge2': 0.23147898714292364, 'eval_rougeL': 0.3576088857194156, 'eval_rougeLsum': 0.3640750858297943, 'eval_runtime': 64.3844, 'eval_samples_per_second': 4.691, 'eval_steps_per_second': 1.18, 'epoch': 4.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 176/176 [18:27<00:00,  6.29s/it]

{'train_runtime': 1107.0266, 'train_samples_per_second': 2.54, 'train_steps_per_second': 0.159, 'train_loss': 2.437184073708274, 'epoch': 4.0}





TrainOutput(global_step=176, training_loss=2.437184073708274, metrics={'train_runtime': 1107.0266, 'train_samples_per_second': 2.54, 'train_steps_per_second': 0.159, 'train_loss': 2.437184073708274, 'epoch': 4.0})

Step 5: Save best model

In [7]:
best_checkpoint = "./results-t5small-1/checkpoint-best"


model.save_pretrained(best_checkpoint)
tokenizer.save_pretrained(best_checkpoint)	

('./results-t5small-1/checkpoint-best\\tokenizer_config.json',
 './results-t5small-1/checkpoint-best\\special_tokens_map.json',
 './results-t5small-1/checkpoint-best\\spiece.model',
 './results-t5small-1/checkpoint-best\\added_tokens.json')

Step 6: Testing/Inference

Using Pipeline

In [54]:
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained(best_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(best_checkpoint)

q = "How do I take a screenshot on an iPhone?"
t = "To take a screenshot on an iPhone, press and hold the Side button and the Volume Up button simultaneously. The screen will flash briefly, and a thumbnail of the screenshot will appear in the bottom-left corner of the screen. Tap the thumbnail to view or edit the screenshot."

question_answerer = pipeline("question-answering", model=model, tokenizer = tokenizer)
prediction = question_answerer(question=q, context = t)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at ./results-t5small-1/checkpoint-best and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

f1_score = max((compute_f1(prediction['answer'], answer)) for answer in t)
f1_score

0

Using PyTorch

In [73]:
from transformers import AutoTokenizer

q = "How do I take a screenshot on an iPhone?"
t = "To take a screenshot on an iPhone, press and hold the Side button and the Volume Up button simultaneously. The screen will flash briefly, and a thumbnail of the screenshot will appear in the bottom-left corner of the screen. Tap the thumbnail to view or edit the screenshot."

tokenizer = AutoTokenizer.from_pretrained(best_checkpoint)
inputs = tokenizer(q, t, return_tensors="pt")

import torch
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(best_checkpoint)
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at ./results-t5small-1/checkpoint-best and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'How do I take a'

# Results

- For Step 6: Testing/Inference, result is not what is expected.
- Either the fine tuning of the model is insufficient or the implementation of inference is wrong.
- I had added a 'context' for a closed book question answering model. 
- Every run of the pipeline results to a differect answer. 

Not yet sure what to do about this. Needs more research. 


Step 7: Evaluation Metrics

- Best Evaluation metric besides exact match should be F1 score which takes into account both precision and recall.

In [None]:
#Codes below are from a site. 

def get_gold_answers(example):
    """helper function that retrieves all possible true answers from a squad2.0 example"""
    
    gold_answers = [answer["text"] for answer in example.answers if answer["text"]]

    # if gold_answers doesn't exist it's because this is a negative example - 
    # the only correct answer is an empty string
    if not gold_answers:
        gold_answers = [""]
        
    return gold_answers

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [None]:
em_score = max((compute_exact_match(prediction, answer)) for answer in gold_answers)
f1_score = max((compute_f1(prediction, answer)) for answer in gold_answers)

# References

Codes are from the listed references:

https://www.datacamp.com/tutorial/flan-t5-tutorial

https://www.toughdata.net/blog/post/finetune-flan-t5-question-answer-quora-dataset

https://learnopencv.com/fine-tuning-t5/

https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html