In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
import torch
import random

dataset_name = 'aqua_rat'
model_name = 'google-t5/t5-base'

dataset = load_dataset(dataset_name, split='train')
reduced_dataset = dataset.shuffle(seed=42).select(range(200))

tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)
model.save_pretrained('./before_finetuning')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
def preprocess_aquarat(examples):
    # Concatenate question and options into a single string.
    # Format could be: "question: <question> options: A: <optionA> B: <optionB> C: <optionC> D: <optionD> E: <optionE>"
    questions_and_options = [
        #f"{body}, {question}" for body, question in zip(examples['Body'], examples['Question'])
        f"question: {q} options: {opts[0]} {opts[1]} {opts[2]} {opts[3]} {opts[4]}."
        for q, opts in zip(examples["question"], examples["options"])
    ]
    
    # The correct answer is mapped to its full text option.
    correct_answers = [opts[ord(examples["correct"][i]) - ord('A')] for i, opts in enumerate(examples["options"])]
    #correct_answers = [str(answer) for answer in examples['Answer']]
    
    # Tokenize inputs and labels
    input_encodings = tokenizer(questions_and_options, padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(correct_answers, padding="max_length", truncation=True, max_length=128)
    
    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": target_encodings.input_ids,
        "input_text": questions_and_options,
        "labels_text": correct_answers
    }

def ask_math_question(input_ids, model, tokenizer):
    # Generate the output ids with the model
    output_ids = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)[0]
    # Decode the generated ids to get the answer
    answer = tokenizer.decode(output_ids, skip_special_tokens=True)
    return answer

# Apply preprocessing
processed_dataset = reduced_dataset.map(preprocess_aquarat, batched=True)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [3]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    # eval_dataset=processed_eval_dataset, # If you have an evaluation dataset
)

trainer.train()

model.save_pretrained('./after_finetuning')

  0%|          | 0/75 [00:00<?, ?it/s]

{'train_runtime': 842.2716, 'train_samples_per_second': 0.712, 'train_steps_per_second': 0.089, 'train_loss': 13.30950927734375, 'epoch': 3.0}


In [4]:
# Select a random question from the training dataset
sample_index = random.randint(0, len(processed_dataset) - 1)
sample_question = processed_dataset[sample_index]

# Load models for comparison
model_before_finetuning = T5ForConditionalGeneration.from_pretrained('./before_finetuning')
model_after_finetuning = T5ForConditionalGeneration.from_pretrained('./after_finetuning')

# Generate answers to the selected question
input_ids = torch.tensor(sample_question['input_ids']).unsqueeze(0)  # Add batch dimension
pre_finetuning_answer = ask_math_question(input_ids, model_before_finetuning, tokenizer)
post_finetuning_answer = ask_math_question(input_ids, model_after_finetuning, tokenizer)

# Display the question and compare answers
print("Question and options:", tokenizer.decode(input_ids[0], skip_special_tokens=True))
print("Answer before fine-tuning:", pre_finetuning_answer)
print("Answer after fine-tuning:", post_finetuning_answer)

Question and options: question: Let q represent the integer length of a side of a triangle. If r represents the number of distinct values for q such that we can create a triangle with lengths q, 12, and 19, what is the value of r? options: A)15 B)22 C)23 D)25 E)31.
Answer before fine-tuning: True
Answer after fine-tuning: ................................................ the
