In [1]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import bitsandbytes as bnb

%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Load GSM8K Dataset
dataset = load_dataset("gsm8k", "socratic")
train = dataset["train"]
eval = dataset["test"]
print(f"Training set size: {len(train)}")
print(f"Evaluation set size: {len(eval)}")
display(train[3])

Training set size: 7473
Evaluation set size: 1319


{'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
 'answer': 'How many pages did Maila read today? ** Maila read 12 x 2 = <<12*2=24>>24 pages today.\nHow many pages did Maila read since yesterday? ** So she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nHow many pages are left to be read? ** There are 120 - 36 = <<120-36=84>>84 pages left to be read.\nHow many pages should she read tomorrow? ** Since she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}

In [4]:
# Define BitsAndBytesConfig for 4-bit quantization
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
# Load the T5-base model and tokenizer
model_name = "t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
# Define QLoRA Configuration
qlora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"],
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [16]:
model = prepare_model_for_kbit_training(model)

In [17]:
# Apply the QLoRA configuration to the model
model = get_peft_model(model, qlora_config)
model.print_trainable_parameters()

trainable params: 884,736 || all params: 223,788,288 || trainable%: 0.3953


In [18]:
def preprocess_function(examples):
    # Tokenize the inputs and targets
    inputs = examples["question"]
    targets = examples["answer"]
    
    # Tokenize inputs and targets separately, keeping all relevant fields
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    
    # Include attention_mask in model_inputs and set labels
    model_inputs["labels"] = labels["input_ids"]
    
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": model_inputs["labels"]
    }

In [19]:
# Tokenize and remove unnecessary columns
train_data = train.map(preprocess_function, batched=True, remove_columns=["question", "answer"])
eval_data = eval.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [20]:
# Define the data collator
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-small-gsm8k-socratic-qlora",
    eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=2,  # reduce batch size
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    remove_unused_columns=False,
    fp16=True  # enable mixed precision if available
)

In [22]:
# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=data_collator,
)
model.config.use_cache = False

In [23]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=11211, training_loss=1.648648853848976, metrics={'train_runtime': 2242.8066, 'train_samples_per_second': 9.996, 'train_steps_per_second': 4.999, 'total_flos': 1.3713159750156288e+16, 'train_loss': 1.648648853848976, 'epoch': 3.0})

In [24]:
model.save_pretrained("./t5-base-gsm8k-socratic-qlora")
tokenizer.save_pretrained("./t5-base-gsm8k-socratic-qlora")

('./t5-base-gsm8k-socratic-qlora/tokenizer_config.json',
 './t5-base-gsm8k-socratic-qlora/special_tokens_map.json',
 './t5-base-gsm8k-socratic-qlora/spiece.model',
 './t5-base-gsm8k-socratic-qlora/added_tokens.json',
 './t5-base-gsm8k-socratic-qlora/tokenizer.json')

Inference

Evaluation

In [25]:
# Evaluation
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

Perplexity: 2.375446081161499


In [6]:
# Load the model and tokenizer from the saved directory
model_path = tokenizer_path = "./t5-base-gsm8k-socratic-qlora"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Accuracy

In [7]:
import re
from transformers import pipeline

In [8]:
# Load the T5-small model and tokenizer
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [9]:
qlora_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
qlora_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [10]:
from evaluation import *
evaluator = Evaluation(qlora_model, qlora_tokenizer, device=device)

In [36]:
# Evaluate on ROUGE metrics
rouge_results = evaluator.evaluate_rouge(eval)
print("ROUGE results:", rouge_results)

ROUGE results: {'rouge1': np.float64(0.23618406633267777), 'rouge2': np.float64(0.14452999341987302), 'rougeL': np.float64(0.21073281428230745), 'rougeLsum': np.float64(0.22943474210107465)}


In [11]:
accuracy_base = evaluator.evaluate_accuracy(eval, base_model, base_tokenizer)
print(f"Accuracy: {accuracy_base:.2f}%")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Accuracy: 1.06%


In [12]:
accuracy = evaluator.evaluate_accuracy(eval, qlora_model, qlora_tokenizer)
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 0.99%


Using Chain-Of-Thoughts

In [None]:
# Evaluate using Chain-of-Thought (CoT) and few-shot learning
# accuracy_cot_few_shot = evaluator.evaluate_with_cot(eval)
# print(f"Accuracy: {accuracy_cot_few_shot:.2f}%")

Semantic Similarity

In [38]:
res = evaluator.evaluate_semantic_similarity(eval)
print(f"Average semantic similarity: {(res*100):.2f}%")

Average semantic similarity: 75.96%
