# Finetuning T5 using LoRA on GoEmotions and DialogSum Dataset

In [4]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

#load datasets
goemotions = load_dataset("google-research-datasets/go_emotions", "simplified")
dialogsum = load_dataset("knkarthick/dialogsum")


#preprocess GoEmotions dataset
def preprocess_goemotions(batch):
  inputs = [f"Emotion: {label} Context: {text}" for text, label in zip(batch["text"], batch["labels"])]
  targets = batch["text"] #use the same text as target for simplicity
  return {"input_text": inputs, "target_text": targets}

processed_goemotions = goemotions.map(
    preprocess_goemotions, batched=True, remove_columns=goemotions["train"].column_names)


#preprocess DialogSum dataset
def preprocess_dialogsum(batch):
    inputs = [f"summarize: {dialogue}" for dialogue in batch["dialogue"]]
    targets = batch["summary"]
    return {"input_text": inputs, "target_text": targets}

processed_dialogsum = dialogsum.map(
    preprocess_dialogsum, batched=True, remove_columns=dialogsum["train"].column_names)

Generating train split: 100%|██████████| 43410/43410 [00:00<00:00, 1303811.99 examples/s]
Generating validation split: 100%|██████████| 5426/5426 [00:00<00:00, 843962.53 examples/s]
Generating test split: 100%|██████████| 5427/5427 [00:00<00:00, 874404.11 examples/s]
Generating train split: 100%|██████████| 12460/12460 [00:00<00:00, 78968.72 examples/s]
Generating validation split: 100%|██████████| 500/500 [00:00<00:00, 65683.79 examples/s]
Generating test split: 100%|██████████| 1500/1500 [00:00<00:00, 126767.20 examples/s]
Map: 100%|██████████| 43410/43410 [00:00<00:00, 227634.11 examples/s]
Map: 100%|██████████| 5426/5426 [00:00<00:00, 316654.75 examples/s]
Map: 100%|██████████| 5427/5427 [00:00<00:00, 324950.93 examples/s]
Map: 100%|██████████| 12460/12460 [00:00<00:00, 301598.73 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 149433.66 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 250147.35 examples/s]


In [2]:
#combining train and test splits because we're not interested in inferencing and metrics on test split
train_set = concatenate_datasets([processed_goemotions['train'], processed_goemotions['test'],
                                  processed_dialogsum['train'], processed_dialogsum['test']])
val_set = concatenate_datasets([processed_goemotions['validation'], processed_dialogsum['validation']])

#combine into one DatasetDict
dataset = DatasetDict({"train": train_set, "validation": val_set})
dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 62797
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 5926
    })
})

In [3]:
from transformers import T5Tokenizer

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

#tokenize the datasets
def tokenize_function(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        padding=True,
        truncation=True,
        max_length=1024
    )
    labels = tokenizer(
        batch["target_text"],
        padding=True,
        truncation=True,
        max_length=256
        )["input_ids"]
    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_seq]
        for label_seq in labels
    ]
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 62797/62797 [00:28<00:00, 2167.52 examples/s]
Map: 100%|██████████| 5926/5926 [00:02<00:00, 2828.22 examples/s]


In [5]:
from transformers import T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model, TaskType

#define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence task
    r=32,                            # LoRA rank
    lora_alpha=32,                   # Scaling factor
    lora_dropout=0.1,                # Regularization
)

#load pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained(model_name)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 226,442,496 || trainable%: 1.5628


In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

training_args = TrainingArguments(
    output_dir="./t5-lora",
    label_names=["labels"],
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-4, #higher lr for LoRA
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True, #mixed precision
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    processing_class=tokenizer,
)

trainer.train()

model.save_pretrained("./t5-lora")
tokenizer.save_pretrained("./t5-lora")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.4701,0.092581
2,0.3866,0.088932
3,0.3705,0.087933


('./t5-lora/tokenizer_config.json',
 './t5-lora/special_tokens_map.json',
 './t5-lora/spiece.model',
 './t5-lora/added_tokens.json')

# Finetuning T5 using LoRA again on MEMO Dataset

In [1]:
!pip install git+https://github.com/google-research/bleurt.git -q
!pip install bert_score rouge-score evaluate hf_xet -q
!pip install -U peft -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [15]:
import os
import pandas as pd
import torch
import gc
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import PeftModel

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Preprcoessing dataset to find metrics

In [3]:
def preprocess_dataset(path):
    """
    Preprocesses the dataset by reading all the csv files in the given path and
    converts it into a list of dictionaries with the input text and the summary text.
    """
    csv_files = [os.path.join(path, file) for file in os.listdir(path) if file.endswith(".csv")]
    data = []

    for file in csv_files:
        df = pd.read_csv(file)

        #extract the summary
        df['Utterance_cleaned'] = df['Utterance'].str.lower().str.strip() # to handle "summary " and "Summary"
        summary_row = df[df["Utterance_cleaned"] == "summary"]
        summary_text = summary_row.iloc[0, 1] if not summary_row.empty else ""

        #filter out rows that are not actual utterances
        dialogue_df = df[~df["Utterance_cleaned"].isin(["summary", "primary_topic", "secondary_topic"])]

        #drop inactive utterances
        dialogue_df = dialogue_df[dialogue_df['Sub topic'] != 'inactive']

        #concatenate utterances and format input
        full_dialogue = " ".join(dialogue_df.dropna(subset=["Utterance"]).apply(
            lambda row: f"{'Therapist' if row['Type'] == 'T' else 'Patient'}: {row['Utterance']}", axis=1
        ))

        input_text = f"summarize: {full_dialogue}"
        data.append({"input": input_text, "summary": summary_text})

    return data

In [4]:
train_data = preprocess_dataset("/kaggle/input/nlp-dataset/dataset/Train")
val_data = preprocess_dataset("/kaggle/input/nlp-dataset/dataset/Validation")
test_data = preprocess_dataset("/kaggle/input/nlp-dataset/dataset/Test")

## Fine-tuning the model

In [5]:
#initializing model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("/kaggle/input/nlp-dataset/t5-lora")
base_model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = PeftModel.from_pretrained(base_model, "/kaggle/input/nlp-dataset/t5-lora")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
from datasets import Dataset, DatasetDict

#making hugging face dataset instance to fine tune with trainer api
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

#create a dataset dictionary
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


#tokenize the dataset for finetuning
def preprocess_function(examples):
    """
    Preprocesses the dataset for fine tuning the model.
    """
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True, max_length=1024)
    labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=256)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [7]:
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # Sequence-to-sequence task
    r=16,                            # LoRA rank
    lora_alpha=32,                   # Scaling factor
    lora_dropout=0.1,                # Regularization
)

#add new LoRA adapter and freeze the old one
model.add_adapter("memo_lora", lora_config)
model.set_adapter("memo_lora")

model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 228,211,968 || trainable%: 0.7754


In [8]:
from transformers import TrainingArguments, Trainer
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

#fine tuning the model with trainer api and save the model
training_args = TrainingArguments(
    output_dir="./t5-final",
    label_names=["labels"],
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    num_train_epochs=15,
    warmup_steps=200,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True, #mixed precision
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"]
)

trainer.train()

model.save_pretrained("./t5-final")
tokenizer.save_pretrained("./t5-final")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,12.4969,12.343877
2,12.4922,12.008297
3,11.5419,11.103907
4,10.3874,9.108187
5,7.8667,5.333262
6,3.9675,2.615454
7,2.7666,2.306018
8,2.2351,2.18644
9,2.0432,2.123861
10,2.2012,2.081446


('./t5-final/tokenizer_config.json',
 './t5-final/special_tokens_map.json',
 './t5-final/spiece.model',
 './t5-final/added_tokens.json')

## Summarizing the texts

In [9]:
tokenizer = T5Tokenizer.from_pretrained("./t5-final")
model = T5ForConditionalGeneration.from_pretrained("./t5-final")
model.to(device)
model.eval()

for item in test_data:
    input_text = item["input"]

    #tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)

    #generate summary
    summary_ids = model.generate(input_ids, max_length=256, num_beams=8, repetition_penalty=5.0, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    #store the generated summary
    item["generated_summary"] = generated_summary

del model
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

## Calculating BLEU score and BERT score on test set

In [10]:
references = [] #list to store target summaries
predictions = [] #list to store generated summaries

for item in test_data:
    references.append(item["summary"])  #ground truth summaries
    predictions.append(item["generated_summary"])

In [11]:
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate
from rouge_score import rouge_scorer


#Rouge score
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores['rouge1'].append(score['rouge1'].fmeasure)
        scores['rouge2'].append(score['rouge2'].fmeasure)
        scores['rougeL'].append(score['rougeL'].fmeasure)
    return {key: sum(val)/len(val) for key, val in scores.items()}  # Averaging scores

rouge_scores = compute_rouge(predictions, references)

# BLEURT score
bleurt = evaluate.load("bleurt", module_type="metric", config_name="bleurt-base-128")
results = bleurt.compute(predictions=predictions, references=references)
avg_bleurt = sum(results["scores"]) / len(results["scores"])

#BLEU score
smoothie = SmoothingFunction().method4
bleu_scores = [sentence_bleu(ref, pred.split(), smoothing_function=smoothie) for ref, pred in zip(references, predictions)]
avg_bleu = sum(bleu_scores) / len(bleu_scores)

#BERT score
P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en")

Downloading builder script:   0%|          | 0.00/5.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

I0000 00:00:1744716679.270072      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 12552 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1744716679.270645      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13800 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
print(f"Rogue-1 Score: {rouge_scores['rouge1'] * 100:.2f}")
print(f"Rogue-2 Score: {rouge_scores['rouge2'] * 100:.2f}")
print(f"Rogue-L Score: {rouge_scores['rougeL'] * 100:.2f}")
print(f"BLEURT Score: {avg_bleurt:.4f}")
print()
print(f"BLEU score: {avg_bleu * 100:.2f}")
print(f"BERT score F1: {F1.mean().item() * 100:.2f}")
print(f"BERT score Precision: {P.mean().item() * 100:.2f}")
print(f"BERT score Recall: {R.mean().item() * 100:.2f}")

Rogue-1 Score: 30.85
Rogue-2 Score: 7.41
Rogue-L Score: 18.60
BLEURT Score: -0.7016

BLEU score: 0.33
BERT score F1: 79.78
BERT score Precision: 77.79
BERT score Recall: 81.88


In [13]:
print('Original summary:', references[10])
print()
print('Generated summary:', predictions[10])

Original summary: The patient has not been coping well since they have just broken up with their boyfriend. The patient feels there is something wrong with them hence they get rejected. They are not able to sleep or eat. The therapist assures it is gonna be fine in the long term as the patient heeds for assurance.

Generated summary: Therapist tells Patient about the problems her boyfriend broke up with and she's hurt. She feels like there's something really wrong with her because people don't just reject her all the time. Therapist also thinks it's important to talk about other stuff that went on during the week other than the breakup.


In [14]:
for i in range(len(predictions)):
    print('Original summary:', references[i])
    print()
    print('Generated summary:', predictions[i])
    print('\n-----------------------------------------\n')

Original summary: The therapist conducts a test to measure thinking and memory of the patient. The patient answers the date as 15, year as 15, month as June, day as Thursday, seasona as summer. The patient is not able to answer the building's name. The patient responds floor as hard, city as New york, state as Pennsylvania. The patient incorrectly repeats the objects names. The patient counts number 100 backwards by 7 as 93, 7, 14, 21, 32. The patient spells world backwards as world as it is. The patient is not able to recall three items mentioned to them. The patient idenitifes objects as watch and pencil. The patient repeats the phrase "No if's ands, or but's" correctly.  The patient is right handed. The therapist asks the patient to fold a paper, pick a clipboard and copy an image. 

Generated summary: George asks for a test that measures his thinking and memory. He tells him the date, month, season, name of the building, city, county, state, number 100, count backward by seven, spe