In [1]:
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    Trainer, 
    TrainingArguments, 
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    RobertaForMultipleChoice,
    RobertaForQuestionAnswering,
    RobertaTokenizer
)
from datasets import load_dataset
import torch
import random
import sympy as sp
import sqlite3
import pandas as pd

In [16]:
dataset_name = 'aqua_rat'
model_name = 'google-t5/t5-small'

dataset = load_dataset(dataset_name, split='train')
reduced_dataset = dataset.shuffle(seed=42).select(range(2000))
dataset_test = load_dataset(dataset_name, split='test')

dataset2_name = 'ChilleD/SVAMP'
dataset2i = load_dataset(dataset2_name, split='train')
dataset2 = dataset2i.shuffle(seed=42).select(range(400))

tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)
model.save_pretrained('./before_finetuning')

flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
flan_t5 = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

roberta = RobertaForQuestionAnswering.from_pretrained("FacebookAI/roberta-base")
roberta_tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
database_name = '../teacher_llm_dataset.bd'
conn = sqlite3.connect(database_name)
query = "SELECT * FROM LLM_results"
df = pd.read_sql_query(query, conn)
conn.close()

DatabaseError: Execution failed on sql 'SELECT * FROM LLM_results': no such table: LLM_results

In [17]:
print(reduced_dataset)
print(dataset2)
print(dataset2i)
print(dataset_test)

Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 2000
})
Dataset({
    features: ['ID', 'Question', 'Type', 'Answer', 'Body', 'Equation'],
    num_rows: 400
})
Dataset({
    features: ['ID', 'Question', 'Type', 'Answer', 'Body', 'Equation'],
    num_rows: 700
})
Dataset({
    features: ['question', 'options', 'rationale', 'correct'],
    num_rows: 254
})


In [8]:
def preprocess_aquarat(examples):
    questions_and_options = [
        f"question: {q} options: {opts[0]} {opts[1]} {opts[2]} {opts[3]} {opts[4]}." 
        for q, opts in zip(examples["question"], examples["options"])]

    correct_answers = [opts[ord(examples["correct"][i]) - ord('A')] for i, opts in enumerate(examples["options"])]

    input_encodings = tokenizer(questions_and_options, padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(correct_answers, padding="max_length", truncation=True, max_length=128)
    
    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": target_encodings.input_ids
    }

def preprocess_svamp(examples):
    questions_and_options = [
        f"question: {q} context: {bod}" 
        for q, bod in zip(examples["Question"], examples["Body"])]

    correct_answers = [str(ans) for ans in examples["Answer"]]

    input_encodings = tokenizer(questions_and_options, padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(correct_answers, padding="max_length", truncation=True, max_length=128)
    
    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": target_encodings.input_ids
    }

def ask_math_question(input_ids, model, tokenizer):
    # Generate the output ids with the model
    output_ids = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)[0]
    # Decode the generated ids to get the answer
    answer = tokenizer.decode(output_ids, skip_special_tokens=True)
    return answer



In [19]:
# Apply preprocessing
processed_dataset = reduced_dataset.map(preprocess_aquarat, batched=True)
process_dataset_test = dataset_test.map(preprocess_aquarat, batched=True)

print(processed_dataset)
print(process_dataset_test)

Dataset({
    features: ['question', 'options', 'rationale', 'correct', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})
Dataset({
    features: ['question', 'options', 'rationale', 'correct', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 254
})


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,
    #load_best_model_at_end=True,
    gradient_accumulation_steps=2,  # Accumulate gradients for 2 steps
    max_grad_norm=1.0,  # Clip gradients to have a maximum norm of 1.0

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    # eval_dataset=processed_eval_dataset, # If you have an evaluation dataset
)

trainer.train()

model.save_pretrained('./after_finetuning')

  0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 15.3583, 'learning_rate': 5e-06, 'epoch': 0.4}
{'loss': 9.9169, 'learning_rate': 1e-05, 'epoch': 0.8}
{'loss': 2.9047, 'learning_rate': 1.5e-05, 'epoch': 1.2}
{'loss': 0.6761, 'learning_rate': 2e-05, 'epoch': 1.6}
{'loss': 0.2116, 'learning_rate': 2.5e-05, 'epoch': 2.0}
{'loss': 0.1383, 'learning_rate': 3e-05, 'epoch': 2.4}
{'loss': 0.0628, 'learning_rate': 3.5e-05, 'epoch': 2.8}
{'loss': 0.0312, 'learning_rate': 4e-05, 'epoch': 3.2}
{'loss': 0.0232, 'learning_rate': 4.5e-05, 'epoch': 3.6}
{'loss': 0.0212, 'learning_rate': 5e-05, 'epoch': 4.0}
{'loss': 0.0194, 'learning_rate': 4.5e-05, 'epoch': 4.4}
{'loss': 0.0184, 'learning_rate': 4e-05, 'epoch': 4.8}
{'loss': 0.0179, 'learning_rate': 3.5e-05, 'epoch': 5.2}
{'loss': 0.0165, 'learning_rate': 3e-05, 'epoch': 5.6}
{'loss': 0.0166, 'learning_rate': 2.5e-05, 'epoch': 6.0}
{'loss': 0.017, 'learning_rate': 2e-05, 'epoch': 6.4}
{'loss': 0.0166, 'learning_rate': 1.5e-05, 'epoch': 6.8}
{'loss': 0.0163, 'learning_rate': 1e-05, 'epoch':

In [11]:
# Select a random question from the training dataset
sample_index = random.randint(0, len(processed_dataset) - 1)
sample_question = processed_dataset[sample_index]

# Load models for comparison
model_before_finetuning = T5ForConditionalGeneration.from_pretrained('./before_finetuning')
model_after_finetuning = T5ForConditionalGeneration.from_pretrained('./after_finetuning')

# Generate answers to the selected question
input_ids = torch.tensor(sample_question['input_ids']).unsqueeze(0)  # Add batch dimension
pre_finetuning_answer = ask_math_question(input_ids, model_before_finetuning, tokenizer)
post_finetuning_answer = ask_math_question(input_ids, model_after_finetuning, tokenizer)

In [50]:
# Display the question and compare answers
print("Question and options:", tokenizer.decode(input_ids[0], skip_special_tokens=True))
print("Answer before fine-tuning:", pre_finetuning_answer)
print("Answer after fine-tuning:", post_finetuning_answer)
print(f'Correct answer: {j}')

Question and options: question: How many campers went rowing in the afternoon? context: 46 campers went rowing on a day. 43 campers went rowing in the morning and some more campers went rowing in the afternoon.
Answer before fine-tuning: 43
Answer after fine-tuning: 3.0


In [12]:
def generate_answer(model, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output

print(generate_answer(model_before_finetuning, tokenizer, 'question: what is x + y? context: x = 4, y = 3.'))

3.


In [10]:
dataset2_name = 'ChilleD/SVAMP'
dataset2 = load_dataset(dataset2_name, split='train')
questions2 = dataset2['Question']
body2 = dataset2['Body']

print(dataset2)

Dataset({
    features: ['ID', 'Question', 'Type', 'Answer', 'Body', 'Equation'],
    num_rows: 700
})


In [31]:
question_rob = "How many days did he take to finish the book?"
context_rob = "Frank was reading through his favorite book, the book had 392 pages and he read 14 pages per day."

# Encode the inputs
inputs = roberta_tokenizer(question_rob, context_rob, return_tensors='pt')
input_ids = inputs['input_ids']

# Get model's answer span predictions
outputs = roberta(**inputs)
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits

# Determine the start and end positions of the answer
answer_start = torch.argmax(answer_start_scores)
answer_end = torch.argmax(answer_end_scores) + 1

# Decode the predicted answer
answer_tokens = input_ids[0, answer_start:answer_end]
answer = roberta_tokenizer.decode(answer_tokens)



print(f"Model's extracted answer: {answer}")


Model's extracted answer: <s>How many days did he take to finish the book?</s></s>Frank was reading through his favorite book, the book had 392 pages and he read 14 pages per


In [14]:
prompt_list = []
flan_t5_list = []
t5_list = []
correct_list = []
t5_tuned = []

for qid in range(10):
    prompt = f'question: What is x+y? context: x=2 y=3.'
    prompt2 = tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)
    answer2 = tokenizer.decode(processed_dataset['labels'][qid], skip_special_tokens=True)
    promptflan = flan_tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)
    #promptroberta = roberta_tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)

    prompt_list.append(prompt2)
    flan_t5_list.append(generate_answer(flan_t5, flan_tokenizer, promptflan))
    t5_list.append(generate_answer(model_before_finetuning, tokenizer, prompt2))
    t5_tuned.append(generate_answer(model_after_finetuning, tokenizer, prompt2))
    correct_list.append(answer2)

#print(prompt2)
#print(f'Flan-T5: {generate_answer(flan_t5, flan_tokenizer, promptflan)}')
#print(f'RoBERTa: {generate_answer(roberta, roberta_tokenizer, promptroberta)}')
#print(f'T5 before Fine-tuning: {generate_answer(model_before_finetuning, tokenizer, prompt2)}')
#print(f'T5 fter Finetuning: {generate_answer(model_after_finetuning, tokenizer, prompt2)}')
#print(f'Correct Answer: {answer2}')

In [20]:
for i in range(10):
    print(prompt_list[i])
    print(f'Correct Answer: {correct_list[i]}')
    print(f'Flan-T5: {flan_t5_list[i]}')
    print(f'T5 before Fine-tuning: {t5_list[i]}')
    print(f'T5 after Fine-tuning: {t5_tuned[i]}\n')

question: There are r red ball, b blue ball and w white ball in a bag. What is the ratio of the number of blue ball to the total no. of ball in terms of r, b and w.? options: A)r / (r + b + w) B)r * (r + b + w) C)(r + b + w) D)r / (r + b ) E)r / (b + w).
Correct Answer: A)r / (r + b + w)
Flan-T5: D
T5 before Fine-tuning: r, b and w
T5 after Fine-tuning: A)r / (r + b + w)

question: From (1, 2, 3, 4, 5, 6), one number is picked out and replaced and one number is picked out again. If the sum of the 2 numbers is 9, what is the probability that the 2 numbers included the number 5? options: A)1/2 B)2/5 C)3/10 D)3/5 E)1/4.
Correct Answer: A)1/2
Flan-T5: C
T5 before Fine-tuning: one number is picked out and replaced
T5 after Fine-tuning: B)2/5

question: Suresh started a business, investing Rs.18000. After 3 months and 4 months respectively, Rohan and Sudhir joined him with capitals of 12000 and 9000. At the end of the year the total profit was Rs.4048. What is the difference between Rohan’s 

In [23]:
#TEST

prompt_list_test = []
flan_t5_list_test = []
t5_list_test = []
correct_list_test = []
t5_tuned_test = []

for qid in range(30):
    prompt2 = tokenizer.decode(process_dataset_test['input_ids'][qid], skip_special_tokens=True)
    answer2 = tokenizer.decode(process_dataset_test['labels'][qid], skip_special_tokens=True)
    promptflan = flan_tokenizer.decode(process_dataset_test['input_ids'][qid], skip_special_tokens=True)
    #promptroberta = roberta_tokenizer.decode(processed_dataset['input_ids'][qid], skip_special_tokens=True)

    prompt_list_test.append(prompt2)
    flan_t5_list_test.append(generate_answer(flan_t5, flan_tokenizer, promptflan))
    t5_list_test.append(generate_answer(model_before_finetuning, tokenizer, prompt2))
    t5_tuned_test.append(generate_answer(model_after_finetuning, tokenizer, prompt2))
    correct_list_test.append(answer2)

In [25]:
for i in range(30):
    print(prompt_list_test[i])
    print(f'Correct Answer: {correct_list_test[i]}')
    print(f'Flan-T5: {flan_t5_list_test[i]}')
    print(f'T5 before Fine-tuning: {t5_list_test[i]}')
    print(f'T5 after Fine-tuning: {t5_tuned_test[i]}\n')


question: A car is being driven, in a straight line and at a uniform speed, towards the base of a vertical tower. The top of the tower is observed from the car and, in the process, it takes 10 minutes for the angle of elevation to change from 45° to 60°. After how much more time will this car reach the base of the tower? options: A)5(3 + 1) B)6(3 + 2) C)7(3 – 1) D)8(3 – 2) E)None of these.
Correct Answer: A)5(3 + 1)
Flan-T5: C
T5 before Fine-tuning: a uniform speed
T5 after Fine-tuning: C)7(3 – 1)

question: The original price of an item is discounted 22%. A customer buys the item at this discounted price using a $20-off coupon. There is no tax on the item, and this was the only item the customer bought. If the customer paid $1.90 more than half the original price of the item, what was the original price of the item? options: A)$61 B)$65 C)$67.40 D)$70 E)$78.20.
Correct Answer: E)$78.20
Flan-T5: C
T5 before Fine-tuning: 78.20
T5 after Fine-tuning: C)$67.40

question: Find out which of 

Negative results can be good for the result 
Probaly train on more samples than 200 to 400

multiple choice answer from a model:
BERT instead of seq2seq --> Score for every answer
T5 (seq2seq) --> prompt to gen multiple outputs and select most freq answer
Could ask model: How much from 0 to 100 do you believe this answer is correct?

