In [None]:
%pip install SentencePiece
%pip install datasets
%pip install transformers
%pip install peft
%pip install --upgrade transformers

In [31]:
import time
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig
import os

In [32]:
from datasets import DatasetDict, Dataset

# Load your CSV files into Pandas DataFrames
train_df = pd.read_csv("../../resources/summary/train.csv")
test_df = pd.read_csv("../../resources/summary/test.csv")
validation_df = pd.read_csv("../../resources/summary/validation.csv")

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)

# Create a DatasetDict
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset
})

# Print dataset information
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})


In [33]:
import torch
# Load pre-trained model and tokenizer
model_name = 'google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [34]:
#tokenize the dataset for Peft-LoRA finetuning
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map: 100%|██████████| 12460/12460 [00:08<00:00, 1397.58 examples/s]
Map: 100%|██████████| 1500/1500 [00:01<00:00, 1499.87 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 1438.83 examples/s]


In [35]:
#To save some time, we will subsample the dataset:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [36]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
})


In [37]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"   #environt variable must be put to false

In [38]:
#intializing configuration for LoRA
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [39]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"



In [40]:
#final model= original model + LoRA config 
peft_model = get_peft_model(original_model,
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [42]:
#setting training arguments for PEFT-LoRA and instantiating the trainer class
output_dir = "../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1 ,
    dataloader_num_workers=2,  # Set this value to a number greater than 0
    dataloader_prefetch_factor=2
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [43]:
#traing PEFT-LoRA and saving the model
peft_trainer.train()

peft_model_path = "../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)


100%|██████████| 1/1 [48:39<00:00, 2919.42s/it]

{'loss': 47.25, 'grad_norm': 7.875, 'learning_rate': 0.0, 'epoch': 0.0}


100%|██████████| 1/1 [48:39<00:00, 2919.99s/it]


{'train_runtime': 2919.9842, 'train_samples_per_second': 0.003, 'train_steps_per_second': 0.0, 'train_loss': 47.25, 'epoch': 0.0}


('../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT\\tokenizer_config.json',
 '../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT\\special_tokens_map.json',
 '../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT\\spiece.model',
 '../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT\\added_tokens.json',
 '../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT\\tokenizer.json')

In [44]:
#PEFT_VALIDATION_LORA
import torch

# Load the fine-tuned PEFT model and tokenizer
peft_model_path = "../../resources/summary/FINE_TUNNED_MODEL_WITH_PEFT"
peft_model = AutoModelForSeq2SeqLM.from_pretrained(peft_model_path)
peft_tokenizer = AutoTokenizer.from_pretrained(peft_model_path)


In [45]:
index = 300
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']
prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = peft_tokenizer(prompt, return_tensors='pt')




# Generate summary using PEFT model
output_peft = peft_tokenizer.decode(
    peft_model.generate(
        inputs["input_ids"],
        max_new_tokens=400,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')

print(dash_line)
print(f'MODEL GENERATION - PEFT:\n{output_peft}')  # <-- Added for PEFT summary

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: I cannot imagine if Trump were to be our President again.
#Person2#: I am proud to say that he is our President, and I will be really happy if he could be re-elected.
#Person1#: You voted for him, right?
#Person2#: Did you vote for him, because I know that I did.
#Person1#: I am not sure about this.
#Person2#: I have nothing but faith in Trump.
#Person1#: What?
#Person2#: I am pretty sure he will make America great again!
#Person1#: Well, though we do need some change in this country, I don't think he is the right person.
#Person2#: Our country is already changing as it is.
#Person1#: You are right about this.
#Person2#: I trust that he will take good care of our country.
#Person1#: Well, I don't think so. I will vote for Biden anyway.

Summary:

------------------------------------------------------------------------------