In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Load GSM8K Dataset
dataset = load_dataset("gsm8k", "main")
train = dataset["train"]
eval = dataset["test"]
print(f"Training set size: {len(train)}")
print(f"Evaluation set size: {len(eval)}")
display(train[3])

Training set size: 7473
Evaluation set size: 1319


{'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
 'answer': 'Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}

In [4]:
# Load the T5-small model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [5]:
# Define LoRA Configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"],
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [None]:
# lora_config = LoraConfig(
#     r=16, 
#     lora_alpha=32,
#     lora_dropout=0.05, 
#     target_modules=["q", "v", "k", "o", "wi", "wo"],
#     task_type=TaskType.QUESTION_ANS,
#     bias="none" 
# )

In [6]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


In [17]:
def preprocess_function(examples):
    # Tokenize the inputs and targets
    inputs = examples["question"]
    targets = examples["answer"]
    
    # Tokenize inputs and targets separately, keeping all relevant fields
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    
    # Include attention_mask in model_inputs and set labels
    model_inputs["labels"] = labels["input_ids"]
    
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": model_inputs["labels"]
    }

In [18]:
# Tokenize and remove unnecessary columns
train_data = train.map(preprocess_function, batched=True, remove_columns=["question", "answer"])
eval_data = eval.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [19]:
# Define the data collator
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [14]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-small-gsm8k-lora",
    eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    remove_unused_columns=False,
)

In [None]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./t5-small-gsm8k-lora",
#     evaluation_strategy="steps",  # Evaluate more frequently
#     eval_steps=100,  # Evaluate every 100 steps
#     learning_rate=1e-4,  # Lower learning rate for stability
#     per_device_train_batch_size=8,  # Increase if memory allows
#     per_device_eval_batch_size=8,
#     num_train_epochs=5,  # Train for more epochs
#     weight_decay=0.01,
#     save_strategy="steps",
#     save_steps=100,
#     logging_dir="./logs",
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",  # Choose appropriate metric
#     greater_is_better=False,
#     remove_unused_columns=False,
#     fp16=True,  # Enable mixed precision training if GPU supports it
#     gradient_accumulation_steps=4,  # Simulate larger batch sizes
# )

In [15]:
# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=data_collator,
)
model.config.use_cache = False

In [24]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=5607, training_loss=1.4946347966814117, metrics={'train_runtime': 217.7338, 'train_samples_per_second': 102.965, 'train_steps_per_second': 25.752, 'total_flos': 3054538781097984.0, 'train_loss': 1.4946347966814117, 'epoch': 3.0})

In [25]:
model.save_pretrained("./t5-small-gsm8k-lora")
tokenizer.save_pretrained("./t5-small-gsm8k-lora")

('./t5-small-gsm8k-lora/tokenizer_config.json',
 './t5-small-gsm8k-lora/special_tokens_map.json',
 './t5-small-gsm8k-lora/spiece.model',
 './t5-small-gsm8k-lora/added_tokens.json',
 './t5-small-gsm8k-lora/tokenizer.json')

Inference

In [None]:
from transformers import pipeline

# Load the fine-tuned model for inference
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Generate predictions for each sample
for i in range(num_samples):
    question = eval[i]["question"]
    actual_answer = eval[i]["answer"]
    
    # Generate prediction
    prediction = qa_pipeline(question, max_length=256)[0]["generated_text"]

    # Print the question, actual answer, and model's prediction
    print(f"Sample {i + 1}")
    print("Question:", question)
    print("Actual Answer:", actual_answer)
    print("Predicted Answer:", prediction)
    print("-" * 40)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'Qwen2AudioForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformer

Sample 1
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Actual Answer: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
Predicted Answer: Janet eats 16 eggs a day and bakes muffins for her friends every morning. She sells the remainder at the farmers' market for $2 / day = $2/2=4>>4 #### 4
----------------------------------------
Sample 2
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
Actual Answer: It takes 2/2=<<2/2=1>>1 bolt of white fiber
So the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric
#### 3
Predicted Answer: The robe takes 2*2=2*2=4>>4 bolts of blue fiber. The robe takes 4*2=4*

Evaluation

In [26]:
# Evaluation
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

Perplexity: 3.263364315032959


In [7]:
# Load the model and tokenizer from the saved directory
model_path = tokenizer_path = "./t5-small-gsm8k-lora"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Accuracy

In [8]:
import re
from transformers import pipeline

In [9]:
# Load the T5-small model and tokenizer
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [10]:
lora_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
lora_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [11]:
from evaluation import *
evaluator = Evaluation(lora_model, lora_tokenizer, device=device)

In [None]:
# Evaluate on ROUGE metrics
rouge_results = evaluator.evaluate_rouge(eval)
print("ROUGE results:", rouge_results)

ROUGE results: {'rouge1': np.float64(0.25420457286348735), 'rouge2': np.float64(0.09781329534548126), 'rougeL': np.float64(0.20162815980831045), 'rougeLsum': np.float64(0.23487983490648348)}


In [21]:
accuracy_base = evaluator.evaluate_accuracy(eval, base_model, base_tokenizer)
print(f"Accuracy: {accuracy_base:.2f}%")

Accuracy: 0.00%
Accuracy: 2.20%


In [20]:
accuracy_lora = evaluator.evaluate_accuracy(eval, lora_model, lora_tokenizer)
print(f"Accuracy: {accuracy_lora:.2f}%")

Accuracy: 2.20%
Accuracy: 2.20%


Using Chain-Of-Thoughts

In [14]:
# Evaluate using Chain-of-Thought (CoT) and few-shot learning
accuracy_cot_few_shot = evaluator.evaluate_with_cot(eval)
print(f"Accuracy: {accuracy_cot_few_shot:.2f}%")

Accuracy: 1.97%
1.9711902956785443
