In [22]:
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    Trainer, 
    TrainingArguments, 
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    RobertaForMultipleChoice,
    RobertaForQuestionAnswering,
    RobertaTokenizer
)
from datasets import load_dataset
import torch
import random
import sympy as sp

dataset_name = 'aqua_rat'
model_name = 'google-t5/t5-small'

dataset = load_dataset(dataset_name, split='train')
reduced_dataset = dataset.shuffle(seed=42).select(range(200))

dataset2_name = 'ChilleD/SVAMP'
dataset2 = load_dataset(dataset2_name, split='train')
dataset2 = dataset2.shuffle(seed=42).select(range(400))

tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)
model.save_pretrained('./before_finetuning')

flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
flan_t5 = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

roberta = RobertaForQuestionAnswering.from_pretrained("FacebookAI/roberta-base")
roberta_tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
print(reduced_dataset)
print(dataset2)

Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 200
})
Dataset({
    features: ['ID', 'Question', 'Type', 'Answer', 'Body', 'Equation'],
    num_rows: 400
})


In [5]:
def preprocess_aquarat(examples):
    questions_and_options = [
        f"question: {q} options: {opts[0]} {opts[1]} {opts[2]} {opts[3]} {opts[4]}." 
        for q, opts in zip(examples["question"], examples["options"])]

    correct_answers = [opts[ord(examples["correct"][i]) - ord('A')] for i, opts in enumerate(examples["options"])]

    input_encodings = tokenizer(questions_and_options, padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(correct_answers, padding="max_length", truncation=True, max_length=128)
    
    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": target_encodings.input_ids,
        "input_text": questions_and_options,
        "labels_text": correct_answers
    }

def preprocess_svamp(examples):
    questions_and_options = [
        f"question: {q} context: {bod}" 
        for q, bod in zip(examples["Question"], examples["Body"])]

    correct_answers = [str(ans) for ans in examples["Answer"]]

    input_encodings = tokenizer(questions_and_options, padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(correct_answers, padding="max_length", truncation=True, max_length=128)
    
    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": target_encodings.input_ids
    }

def ask_math_question(input_ids, model, tokenizer):
    # Generate the output ids with the model
    output_ids = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)[0]
    # Decode the generated ids to get the answer
    answer = tokenizer.decode(output_ids, skip_special_tokens=True)
    return answer



In [6]:
# Apply preprocessing
processed_dataset = dataset2.map(preprocess_svamp, batched=True)

print(processed_dataset)

Dataset({
    features: ['ID', 'Question', 'Type', 'Answer', 'Body', 'Equation', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})


In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    #load_best_model_at_end=True,
    gradient_accumulation_steps=2,  # Accumulate gradients for 2 steps
    max_grad_norm=1.0,  # Clip gradients to have a maximum norm of 1.0

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    # eval_dataset=processed_eval_dataset, # If you have an evaluation dataset
)

trainer.train()

model.save_pretrained('./after_finetuning')

  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 25.6577, 'learning_rate': 5e-06, 'epoch': 2.0}
{'loss': 19.2122, 'learning_rate': 1e-05, 'epoch': 4.0}
{'loss': 7.1767, 'learning_rate': 1.5e-05, 'epoch': 6.0}
{'loss': 0.9383, 'learning_rate': 2e-05, 'epoch': 8.0}
{'train_runtime': 227.1911, 'train_samples_per_second': 14.085, 'train_steps_per_second': 0.88, 'train_loss': 13.246222667694092, 'epoch': 8.0}


In [7]:
# Select a random question from the training dataset
sample_index = random.randint(0, len(processed_dataset) - 1)
sample_question = processed_dataset[sample_index]

# Load models for comparison
model_before_finetuning = T5ForConditionalGeneration.from_pretrained('./before_finetuning')
model_after_finetuning = T5ForConditionalGeneration.from_pretrained('./after_finetuning')

# Generate answers to the selected question
input_ids = torch.tensor(sample_question['input_ids']).unsqueeze(0)  # Add batch dimension
pre_finetuning_answer = ask_math_question(input_ids, model_before_finetuning, tokenizer)
post_finetuning_answer = ask_math_question(input_ids, model_after_finetuning, tokenizer)

In [8]:
# Display the question and compare answers
print("Question and options:", tokenizer.decode(input_ids[0], skip_special_tokens=True))
print("Answer before fine-tuning:", pre_finetuning_answer)
print("Answer after fine-tuning:", post_finetuning_answer)

Question and options: question: How many new games do they have together? context: Katie had 78 new games and 86 old games. Her friends had 48 new games.
Answer before fine-tuning: 48
Answer after fine-tuning: 


In [9]:
def generate_answer(model, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output

print(generate_answer(model_before_finetuning, tokenizer, 'question: what is x + y? context: x = 4, y = 3.'))

3.


In [10]:
dataset2_name = 'ChilleD/SVAMP'
dataset2 = load_dataset(dataset2_name, split='train')
questions2 = dataset2['Question']
body2 = dataset2['Body']

print(dataset2)

Dataset({
    features: ['ID', 'Question', 'Type', 'Answer', 'Body', 'Equation'],
    num_rows: 700
})


In [31]:
question_rob = "How many days did he take to finish the book?"
context_rob = "Frank was reading through his favorite book, the book had 392 pages and he read 14 pages per day."

# Encode the inputs
inputs = roberta_tokenizer(question_rob, context_rob, return_tensors='pt')
input_ids = inputs['input_ids']

# Get model's answer span predictions
outputs = roberta(**inputs)
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits

# Determine the start and end positions of the answer
answer_start = torch.argmax(answer_start_scores)
answer_end = torch.argmax(answer_end_scores) + 1

# Decode the predicted answer
answer_tokens = input_ids[0, answer_start:answer_end]
answer = roberta_tokenizer.decode(answer_tokens)



print(f"Model's extracted answer: {answer}")


Model's extracted answer: <s>How many days did he take to finish the book?</s></s>Frank was reading through his favorite book, the book had 392 pages and he read 14 pages per


In [36]:
prompt_list = []
flan_t5_list = []
t5_list = []
correct_list = []

for qid in range(80):
    prompt = f'question: What is x+y? context: x=2 y=3.'
    prompt2 = tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)
    answer2 = tokenizer.decode(processed_dataset['labels'][qid], skip_special_tokens=True)
    promptflan = flan_tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)
    promptroberta = roberta_tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)

    prompt_list.append(prompt2)
    flan_t5_list.append(generate_answer(flan_t5, flan_tokenizer, promptflan))
    t5_list.append(generate_answer(model_before_finetuning, tokenizer, prompt2))
    correct_list.append(answer2)

#print(prompt2)
#print(f'Flan-T5: {generate_answer(flan_t5, flan_tokenizer, promptflan)}')
#print(f'RoBERTa: {generate_answer(roberta, roberta_tokenizer, promptroberta)}')
#print(f'T5 before Fine-tuning: {generate_answer(model_before_finetuning, tokenizer, prompt2)}')
#print(f'T5 fter Finetuning: {generate_answer(model_after_finetuning, tokenizer, prompt2)}')
#print(f'Correct Answer: {answer2}')

In [41]:
for i in range(len(prompt_list)):
    print(prompt_list[i])
    print(f'Correct Answer: {correct_list[i]}')
    print(f'Flan-T5: {flan_t5_list[i]}')
    print(f'T5 before Fine-tuning: {t5_list[i]}\n')

question: How many bottle caps did danny have at first? context: Danny collects bottle caps. He found 63 bottle caps at the park while he threw away 51 old ones. Now he has 33 bottle caps in his collection.
Correct Answer: 21.0
Flan-T5: 33
T5 before Fine-tuning: 33

question: How many more pages of math homework than biology homework did she have? context: Rachel had to complete 11 pages of math homework, 2 pages of reading homework and 3 more pages of biology homework.
Correct Answer: 8.0
Flan-T5: 3
T5 before Fine-tuning: 3

question: How many more movies than books are there in the'crazy silly school'series? context: There are 17 different movies and 11 different books in the'crazy silly school'series. If you read 13 of the books and watched 63 of the movies
Correct Answer: 6.0
Flan-T5: 10
T5 before Fine-tuning: 11

question: How many action figures did he add to the shelf? context: Jerry had 4 action figures on a shelf in his room. Later he added some more action figures to the shel