In [None]:
# https://www.philschmid.de/fine-tune-flan-t5-peft#1-setup-development-environment

In [7]:
# !pip install -q "peft==0.2.0"
# !pip install -q "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# !pip install -q rouge-score tensorboard py7zr

In [1]:
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    Trainer, 
    TrainingArguments, 
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    RobertaForMultipleChoice,
    RobertaForQuestionAnswering,
    RobertaTokenizer
)
from datasets import load_dataset
import torch
import sympy as sp
import sqlite3
import pandas as pd
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifiers of the dataset and of the base checkpoint used throughout the notebook.
dataset_name, model_name = 'aqua_rat', 'google-t5/t5-small'

# Full train split, plus a fixed-seed shuffled subset of 2000 rows used for fine-tuning.
dataset = load_dataset(dataset_name, split='train')
reduced_dataset = dataset.shuffle(seed=42).select(range(2000))

# Test split, kept at full size.
dataset_test = load_dataset(dataset_name, split='test')

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Write the freshly loaded model to disk before any fine-tuning takes place.
model.save_pretrained('./before_finetuning')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def preprocess_aquarat(examples):
    """Tokenize a batch of AQuA-RAT examples for seq2seq fine-tuning.

    Args:
        examples: Batch dict with parallel lists under "question", "options"
            and "correct". Each "correct" entry is a letter ("A", "B", ...)
            that indexes into the matching "options" list.

    Returns:
        A dict with:
          * "input_ids" / "attention_mask": the prompt
            ``question: <q> options: <opt> <opt> ... .`` padded/truncated to
            512 tokens.
          * "labels": the text of the correct option padded/truncated to 128
            tokens, with every padding position replaced by -100 so that
            padding does not contribute to the training loss.
    """
    # Join every option instead of indexing 0..4 so the prompt is built the
    # same way regardless of how many options an example carries.
    questions_and_options = [
        f"question: {q} options: {' '.join(opts)}."
        for q, opts in zip(examples["question"], examples["options"])
    ]

    # Map the answer letter to its position in the options list ("A" -> 0).
    correct_answers = [
        opts[ord(letter) - ord('A')]
        for letter, opts in zip(examples["correct"], examples["options"])
    ]

    input_encodings = tokenizer(questions_and_options, padding="max_length", truncation=True, max_length=512)
    target_encodings = tokenizer(correct_answers, padding="max_length", truncation=True, max_length=128)

    # The targets are already padded to a fixed length here, so nothing later
    # in the pipeline gets a chance to pad (and mask) them. Mask the padding
    # explicitly: label positions equal to -100 are ignored by the loss.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encodings.input_ids
    ]

    return {
        "input_ids": input_encodings.input_ids,
        "attention_mask": input_encodings.attention_mask,
        "labels": labels
    }

def ask_math_question(input_ids, model, tokenizer):
    """Generate an answer for already-tokenized input and return it as text.

    Args:
        input_ids: Token ids handed straight to ``model.generate``.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``decode``.

    Returns:
        The decoded text of the first sequence returned by ``generate``,
        with special tokens stripped.
    """
    # Beam search with 5 beams, capped at 50 tokens, stopping early.
    generated = model.generate(
        input_ids,
        max_length=50,
        num_beams=5,
        early_stopping=True,
    )
    # Only the first returned sequence is turned back into text.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [4]:
# Run the batched preprocessing over the training subset and the test split.
processed_dataset, process_dataset_test = (
    split.map(preprocess_aquarat, batched=True)
    for split in (reduced_dataset, dataset_test)
)

# Show the resulting columns and row counts of both datasets, one per line.
print(processed_dataset, process_dataset_test, sep="\n")

Dataset({
    features: ['question', 'options', 'rationale', 'correct', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2000
})
Dataset({
    features: ['question', 'options', 'rationale', 'correct', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 254
})


In [5]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias terms,
# applied to the modules named "q" and "v" for a seq2seq LM task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],
    bias="none",
)

# Run PEFT's int8-training preparation on the base model, then wrap the
# result with the LoRA adapter.
# NOTE(review): the model was loaded with a plain from_pretrained call (no
# 8-bit loading option is visible in this notebook) -- confirm the int8
# preparation step is actually wanted here.
model = get_peft_model(prepare_model_for_int8_training(model), lora_config)

# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()



trainable params: 589,824 || all params: 61,096,448 || trainable%: 0.9653981848502878


In [6]:
from transformers import DataCollatorForSeq2Seq

# Fill value the collator uses for label positions that it pads itself;
# the intent is for padded label positions to be ignored in the loss.
label_pad_token_id = -100

# Seq2seq collator: batches are padded to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [7]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Everything the run writes (including the TensorBoard logs) goes under here.
output_dir = "./after_finetuning"

# Hyper-parameters and logging configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=1e-3,  # higher learning rate
    auto_find_batch_size=True,
    save_strategy="no",
    report_to="tensorboard",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=50,
)

# Trainer wired to the model, the collator and the preprocessed training subset.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=processed_dataset,
    data_collator=data_collator,
)

# Turn the cache off while training to silence the warnings.
# Please re-enable it for inference!
model.config.use_cache = False

In [None]:
# Train the model with the trainer configured above.
# The call is left as the cell's last expression so its return value is
# displayed as the cell output.
trainer.train()

In [None]:
# Save the trained LoRA model and the tokenizer side by side in one directory.
peft_model_id="results"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# To save the base model as well, uncomment the following call:
# trainer.model.base_model.save_pretrained(peft_model_id)