In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

%load_ext autoreload
%autoreload 2

In [None]:
import os
import warnings
# Disable tokenizers parallelism through its environment variable and
# suppress every Python warning for the rest of the session.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
# Load the "main" configuration of GSM8K and keep a handle on each split.
# NOTE: `eval` shadows Python's built-in `eval`; the name is kept because
# later cells refer to it.
dataset = load_dataset("gsm8k", "main")
train, eval = dataset["train"], dataset["test"]
print(f"Training set size: {len(train)}")
print(f"Evaluation set size: {len(eval)}")
# Show one raw training example.
display(train[3])

In [None]:
# Checkpoint name shared by the seq2seq model and its tokenizer.
model_name = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# LoRA adapter settings for a seq2seq LM: rank 8, alpha 32, dropout 0.1,
# applied to the modules named "q" and "v".
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [None]:
# lora_config = LoraConfig(
#     r=16, 
#     lora_alpha=32,
#     lora_dropout=0.05, 
#     target_modules=["q", "v", "k", "o", "wi", "wo"],
#     task_type=TaskType.QUESTION_ANS,
#     bias="none" 
# )

In [None]:
# Wrap the loaded model with the LoRA configuration and print its trainable-parameter summary.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell passes the
# already-wrapped model back into `get_peft_model`.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of GSM8K examples into model inputs and loss-ready labels.

    Args:
        examples: Batch mapping with "question" and "answer" columns, as passed
            by `Dataset.map(..., batched=True)`.

    Returns:
        dict with:
            "input_ids" / "attention_mask": questions, truncated/padded to 512 tokens.
            "labels": answers, truncated/padded to 128 tokens, with every
                padding position set to -100.
    """
    inputs = examples["question"]
    targets = examples["answer"]

    # Tokenize inputs and targets separately; both are padded to a fixed length.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

    # Padding positions must not contribute to the loss. Without this step the
    # labels contain the tokenizer's pad id and the model is trained to emit
    # padding. -100 is the value the data collator also uses for label padding.
    label_ids = labels["input_ids"].masked_fill(labels["attention_mask"] == 0, -100)

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": label_ids,
    }

In [None]:
# Tokenize and remove unnecessary columns
# Both GSM8K splits go through `preprocess_function` in batches; the raw text
# columns are dropped so only the tokenized fields remain.
train_data = train.map(preprocess_function, batched=True, remove_columns=["question", "answer"])
eval_data = eval.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

In [None]:
# Batch collator for seq2seq training. Label padding uses -100 and batch
# lengths are padded to a multiple of 8.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
# Training arguments for the GSM8K LoRA run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-small-gsm8k-lora",
    # Optimisation
    learning_rate=5e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    logging_dir="./logs",
    logging_steps=10,
    # The datasets were already reduced to the tokenized columns, so nothing is dropped here
    remove_unused_columns=False,
)

In [None]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="./t5-small-gsm8k-lora",
#     evaluation_strategy="steps",  # Evaluate more frequently
#     eval_steps=100,  # Evaluate every 100 steps
#     learning_rate=1e-4,  # Lower learning rate for stability
#     per_device_train_batch_size=8,  # Increase if memory allows
#     per_device_eval_batch_size=8,
#     num_train_epochs=5,  # Train for more epochs
#     weight_decay=0.01,
#     save_strategy="steps",
#     save_steps=100,
#     logging_dir="./logs",
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_loss",  # Choose appropriate metric
#     greater_is_better=False,
#     remove_unused_columns=False,
#     fp16=True,  # Enable mixed precision training if GPU supports it
#     gradient_accumulation_steps=4,  # Simulate larger batch sizes
# )

In [None]:
# Trainer for the GSM8K run: LoRA-wrapped model, tokenized splits and the seq2seq collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=eval_data,
)
# Switch off `use_cache` on the model config before training starts.
model.config.use_cache = False

In [None]:
# Fine-tune the model
# Runs the loop configured by `training_args`: 3 epochs, with evaluation and a
# checkpoint at the end of each epoch.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer into the same directory so both
# can be reloaded from one path later in the notebook.
model.save_pretrained("./t5-small-gsm8k-lora")
tokenizer.save_pretrained("./t5-small-gsm8k-lora")

Evaluation

In [None]:
# Evaluation
# Run the trainer's evaluation pass and report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

In [None]:
# Paths used to reload the saved model and tokenizer (both live in the same directory).
# Nothing is loaded in this cell; it only sets the paths and the device.
model_path = tokenizer_path = "./t5-small-gsm8k-lora"
# NOTE: this rebinds `device` from a `torch.device` object to a plain string.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Accuracy

In [None]:
import re
from transformers import pipeline

In [None]:
# Fresh, un-fine-tuned copy of the base checkpoint, used as the comparison baseline.
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
base_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Reload the fine-tuned tokenizer and model from the directory written after training.
# NOTE(review): that directory was produced by `save_pretrained` on the PEFT-wrapped
# model — confirm the installed transformers/peft versions load it through
# AutoModelForSeq2SeqLM with the adapter applied.
lora_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
lora_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Import only the evaluator class rather than star-importing the module, so
# every name in the notebook namespace has a visible origin.
from evaluation import Evaluation

# Evaluator bound to the LoRA fine-tuned model/tokenizer and the selected device.
evaluator = Evaluation(lora_model, lora_tokenizer, device=device)

In [None]:
# Evaluate on ROUGE metrics
# Uses the current `evaluator` on the raw (untokenized) GSM8K test split.
rouge_results = evaluator.evaluate_rouge(eval)
print("ROUGE results:", rouge_results)

In [None]:
# Baseline: score the un-fine-tuned base checkpoint on the GSM8K test split.
# The result is printed as a percentage with two decimals.
accuracy_base = evaluator.evaluate_accuracy(eval, base_model, base_tokenizer)
print(f"Accuracy: {accuracy_base:.2f}%")

In [None]:
# Score the LoRA fine-tuned checkpoint on the same split for comparison with the baseline.
accuracy_lora = evaluator.evaluate_accuracy(eval, lora_model, lora_tokenizer)
print(f"Accuracy: {accuracy_lora:.2f}%")

Using Chain-of-Thought

In [None]:
# Evaluate using Chain-of-Thought (CoT) and few-shot learning
# Runs the evaluator's CoT routine on the GSM8K test split; the result is printed as a percentage.
accuracy_cot_few_shot = evaluator.evaluate_with_cot(eval)
print(f"Accuracy: {accuracy_cot_few_shot:.2f}%")

Stacked LoRA

In [None]:
def preprocess_function_stacked(examples):
    """Tokenize a batch of instruction examples into model inputs and loss-ready labels.

    Args:
        examples: Batch mapping with "query" and "answer" columns, as passed
            by `Dataset.map(..., batched=True)`.

    Returns:
        dict with:
            "input_ids" / "attention_mask": queries, truncated/padded to 512 tokens.
            "labels": answers, truncated/padded to 128 tokens, with every
                padding position set to -100.
    """
    inputs = examples["query"]
    targets = examples["answer"]

    # Tokenize inputs and targets separately; both are padded to a fixed length.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

    # Padding positions must not contribute to the loss. Without this step the
    # labels contain the tokenizer's pad id and the model is trained to emit
    # padding. -100 is the value the data collator also uses for label padding.
    label_ids = labels["input_ids"].masked_fill(labels["attention_mask"] == 0, -100)

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": label_ids,
    }

In [None]:
from datasets import load_dataset

# Load the CodeFeedback instruction dataset from the Hub.
ds = load_dataset("m-a-p/CodeFeedback-Filtered-Instruction")

# Keep only the first 6000 rows of the 'train' split.
ds_train = ds['train'].select(range(6000))  # Assuming you want 6000 from the 'train' split

# Print the dataset summary to confirm the size of the subset.
print(ds_train)

# NOTE(review): `mode` is not referenced by any other cell shown in this notebook —
# possibly a leftover or a typo for another name; confirm before relying on it.
mode = "./t5-small-gsm8k-lora"


In [None]:
# Tokenize and remove unnecessary columns
# Training data is now the CodeFeedback subset; evaluation data is still the GSM8K test split.
# NOTE: this rebinds `train_data` and `eval_data` from the first training run.
train_data = ds_train.map(preprocess_function_stacked, batched=True, remove_columns=["query", "answer","resource", "lang"])
eval_data = eval.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

In [None]:
# Training arguments for the second (stacked) run; same hyperparameters as the
# first run, written to a separate output directory.
training_args_stacked_lora = Seq2SeqTrainingArguments(
    output_dir="./t5-small-gsm8k-lora-stacked",
    # Optimisation
    learning_rate=5e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Logging
    logging_dir="./logs",
    logging_steps=10,
    # The datasets were already reduced to the tokenized columns, so nothing is dropped here
    remove_unused_columns=False,
)

In [None]:
# Rebuild the seq2seq collator for the stacked run, with the same settings as the first run.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
# Trainer for the stacked run. `model` is the same object that was trained in
# the first run, so training continues from its current weights.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args_stacked_lora,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

In [None]:
# Fine-tune the model
# Runs the stacked training loop configured by `training_args_stacked_lora`.
trainer.train()

In [None]:
# Persist the model and tokenizer from the stacked run into their own directory.
model.save_pretrained("./t5-small-gsm8k-lora-stacked")
tokenizer.save_pretrained("./t5-small-gsm8k-lora-stacked")

In [None]:
# Evaluation
# Run the stacked trainer's evaluation pass and report perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

In [None]:
# Point the reload paths at the stacked checkpoint directory (model and tokenizer share it).
# Nothing is loaded in this cell; it only sets the paths and the device.
model_path = tokenizer_path = "./t5-small-gsm8k-lora-stacked"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the T5-small model and tokenizer
# Reloads a fresh copy of the base checkpoint for the baseline comparison in this section.
base_tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Reload the tokenizer and model saved by the stacked run.
lora_stacked_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
lora_stacked_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Import only the evaluator class rather than star-importing the module.
from evaluation import Evaluation

# Build the evaluator around the stacked checkpoint that was just reloaded.
# The previous version passed `lora_model` / `lora_tokenizer` from the first
# run, so this section re-evaluated the old model instead of the stacked one.
evaluator = Evaluation(lora_stacked_model, lora_stacked_tokenizer, device=device)

In [None]:
# Evaluate on ROUGE metrics
# Uses the current `evaluator` on the raw (untokenized) GSM8K test split.
# NOTE: this overwrites `rouge_results` from the first evaluation section.
rouge_results = evaluator.evaluate_rouge(eval)
print("ROUGE results:", rouge_results)

In [None]:
# Baseline: score the un-fine-tuned base checkpoint on the GSM8K test split.
accuracy_base = evaluator.evaluate_accuracy(eval, base_model, base_tokenizer)
print(f"Accuracy: {accuracy_base:.2f}%")

In [None]:
# Score the stacked checkpoint on the GSM8K test split. The previous version
# passed `lora_model` / `lora_tokenizer`, which repeated the first run's
# measurement and left the stacked model unevaluated.
accuracy_lora = evaluator.evaluate_accuracy(eval, lora_stacked_model, lora_stacked_tokenizer)
print(f"Accuracy: {accuracy_lora:.2f}%")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Function to extract embeddings from T5
def extract_t5_embeddings(text, model, tokenizer):
    """
    Extract mean-pooled encoder embeddings from the T5 model.

    Args:
        text: A string or a list of strings to embed.
        model: Model exposing `.encoder` and `.device`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        Tensor with one row per input text: the average of the last encoder
        layer's hidden states over the real (non-padding) tokens.
    """
    # Tokenize input text (truncated to 512 tokens, padded to the longest item in the batch)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    with torch.no_grad():
        # Pass the mask so the encoder does not attend to padding positions.
        outputs = model.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        # Masked mean: padding positions are zeroed out and excluded from the
        # token count. For a single unpadded text this equals a plain mean.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against an all-padding row
        embeddings = (hidden * mask).sum(dim=1) / token_counts

    return embeddings

# Function to calculate semantic similarity using T5 embeddings
def calculate_semantic_similarity_t5(predicted_steps, ground_truth_steps, model, tokenizer):
    """
    Cosine similarity between the T5 embeddings of a prediction and its reference.

    Args:
        predicted_steps: Generated text.
        ground_truth_steps: Reference text.
        model: Model handed to `extract_t5_embeddings`.
        tokenizer: Tokenizer handed to `extract_t5_embeddings`.

    Returns:
        The single similarity score for the pair (entry [0][0] of the
        pairwise cosine-similarity matrix).
    """
    # Embed both texts and move them to NumPy for scikit-learn.
    vectors = [
        extract_t5_embeddings(text, model, tokenizer).cpu().numpy()
        for text in (predicted_steps, ground_truth_steps)
    ]
    pairwise = cosine_similarity(vectors[0], vectors[1])
    return pairwise[0][0]

# Function to evaluate semantic similarity on the entire dataset
def evaluate_semantic_similarity(model, tokenizer, eval_data, max_new_tokens=128):
    """
    Evaluate semantic similarity between generated texts and reference answers for the entire dataset.

    Args:
        model (T5 model): Your fine-tuned T5 model.
        tokenizer (T5 tokenizer): The tokenizer for your fine-tuned T5 model.
        eval_data (iterable): Examples with 'question' and 'answer' keys.
        max_new_tokens (int): Upper bound on generated tokens per example.
            Without an explicit limit `generate` falls back to the model's
            default generation length, which cuts multi-step answers short.
            128 matches the label length used in preprocessing.

    Returns:
        dict: A dictionary containing average semantic similarity score.

    Raises:
        ValueError: If `eval_data` contains no examples.
    """
    similarities = []

    for example in eval_data:
        question = example['question']
        reference_answer = example["answer"]

        # Tokenize and generate text using the fine-tuned model
        inputs = tokenizer(question, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        similarities.append(
            calculate_semantic_similarity_t5(generated_text, reference_answer, model, tokenizer)
        )

    # An empty dataset has no average; fail with a clear message instead of a
    # bare ZeroDivisionError.
    if not similarities:
        raise ValueError("eval_data is empty; cannot compute average semantic similarity")

    # Calculate the overall semantic similarity metric (average in this case)
    overall_semantic_similarity = sum(similarities) / len(similarities)

    return {'average_semantic_similarity': overall_semantic_similarity}

In [None]:
# `fine_tuned_model` / `fine_tuned_tokenizer` are never defined in this notebook,
# so the original call raised NameError on a fresh kernel. Score the stacked
# checkpoint that this section reloaded from disk instead.
result = evaluate_semantic_similarity(lora_stacked_model, lora_stacked_tokenizer, eval)
print(result)