# <span style="color: blue"> Fine-Tune FLAN-T5 Base for Dialogue Summarization </span>

## Installation

In [1]:
%pip install -U datasets==2.17.0 # installs the datasets library from Hugging Face
%pip install --upgrade pip
%pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1 --quiet
%pip install transformers==4.27.2 evaluate==0.4.0 peft==0.3.0 --quiet # install libraries from Hugging Face
%pip install rouge_score==0.1.2 loralib==0.1.1 --quiet

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM # loads a pre-trained sequence-to-sequence model
from transformers import AutoTokenizer # loads the appropriate tokenizer for any given model
from transformers import GenerationConfig # allows you to configure generation parameters for text generation tasks, such as max length, temperature, top-k sampling, and others
from transformers import TrainingArguments # sets up model training configurations, such as learning rate, batch size, and logging
from transformers import Trainer # a high-level API to streamline model training, evaluation, and fine-tuning tasks

from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig

import torch
import evaluate
import pandas as pd
import numpy as np

2024-11-13 09:13:32.477237: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# <span style="color: green"> Dataset & Model </span>

In [3]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

## Inspecting the dataset

In [4]:
print(f"type(dataset): {type(dataset)}")
print(f"len(dataset) = {len(dataset)}")
print(f"dataset.keys(): {dataset.keys()}")
print(f"type(dataset['test']): {type(dataset['test'])}")
print(f"testdata[0].keys(): {dataset['test'][0].keys()}")
print(f"number of examples in train data      : {len(dataset['train'])}")
print(f"number of examples in validation data : {len(dataset['validation'])}")
print(f"number of examples in test data       : {len(dataset['test'])}")

type(dataset): <class 'datasets.dataset_dict.DatasetDict'>
len(dataset) = 3
dataset.keys(): dict_keys(['train', 'validation', 'test'])
type(dataset['test']): <class 'datasets.arrow_dataset.Dataset'>
testdata[0].keys(): dict_keys(['id', 'dialogue', 'summary', 'topic'])
number of examples in train data      : 12460
number of examples in validation data : 500
number of examples in test data       : 1500


**Note:**
- `dataset` is **DatasetDict** dictionary with keys *train*, *validation*, and *test*.
- Each member of the dataset, e.g., `dataset['test']` includes `examples` that can be accessed by indexing: `dataset['test'][index]`
- Each `example` is a dictionary with keys *id*, *dialogue*, *summary*, and *topic*

In [5]:
testdata = dataset['test']
example_indices = [1, 123]

hbar = '_'*40

for i, index in enumerate(example_indices):
    print(hbar)
    print(f"Example {i}")
    print('\nINPUT DIALOGUE:')
    print(testdata[index]['dialogue'])
    print('\nBASELINE HUMAN SUMMARY:')
    print(testdata[index]['summary'])

________________________________________
Example 0

INPUT DIALOGUE:
#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?
#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.
#Person2#: But sir, many employees use Instant Messaging to communicate with their clients.
#Person1#: They will just have to change their communication methods. I don't want any - one using Instant Messaging in this off

## Pretrained Model: FLAN-T5

In [6]:
model_name='google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) # creates an instance of AutoModelForSeq2SeqLM class with .from_pretrained() method



In [7]:
print(original_model.named_parameters)

<bound method Module.named_parameters of T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_featu

In [8]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_model_params/all_model_params*100:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


## Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name) # tokenizer for the FLAN-T5 model using `AutoTokenizer.from_pretrained()` method

In [10]:
sentence = "How is it going Fakhreddin?"
sentence_encoded = tokenizer(sentence, return_tensors='pt') # return tensors in PyTorch ('pt') format
sentence_decoded = tokenizer.decode(sentence_encoded["input_ids"][0], skip_special_tokens=True)

print(f"ENCODED SENTENCE: {sentence_encoded['input_ids'][0]}")
print(f"SENTENCE:         {sentence}")
print(f"DECODED SENTENCE: {sentence_decoded}")

ENCODED SENTENCE: tensor([  571,    19,    34,   352,   377, 18965,  1271,  2644,    58,     1])
SENTENCE:         How is it going Fakhreddin?
DECODED SENTENCE: How is it going Fakhreddin?


**Note:**
- Other options for `return_tensors` are `'tf'` and `'np'` for tensorflow and numpy, respectively.
- `sentence_encoded` is a dictionary-like object with key `'input_ids'`.
- `input_ids`: After tokenization, each token is mapped to a unique number called an "ID" from a predefined vocabulary (which is part of the model's tokenizer). The `input_ids` are the list of these numbers corresponding to the tokens. These IDs are then passed to the model as input. For example, if "Summarize" corresponds to ID 123, the `input_ids` for the phrase "Summarize the following conversation" would be [123, X, Y, Z, ...].
- `sentence_encoded['input_ids']` is a 2D tensor with shape `(1,N)`, where `N` denotes the number of tokens in the sequence.
- `decoder` accepts 1D input with shape (N,): `sentence_encoded['input_ids'][0]`.

The `skip_special_tokens=True` parameter is used in the decode method to remove special tokens that were added by the tokenizer. Special tokens: In many NLP models, special tokens are added to the input for specific purposes. Setting `skip_special_tokens=True` tells the tokenizer's decode function to exclude these tokens from the output, so the decoded sentence appears clean and natural without extra symbols.

Example: Without `skip_special_tokens=True`, you might see output like this:

`How is it going Fakhreddin?</s>`

## Test the Model with Zero-Shot Inference

In [11]:
def make_zero_shot_prompt(dialogue):
    return dialogue

In [12]:
testdata = dataset['test']
test_indices = [200] # working with one single example

def generate(make_zero_shot_prompt):
    for i, index in enumerate(test_indices):
        dialogue = testdata[index]['dialogue']
        summary  = testdata[index]['summary']
    
        prompt = make_zero_shot_prompt(dialogue)
        inputs = tokenizer(prompt, return_tensors='pt')
    
        model_tokenized_output = original_model.generate(inputs["input_ids"], max_new_tokens=50)
        output = tokenizer.decode(model_tokenized_output[0], skip_special_tokens=True)
    
        print(hbar)
        print(f"Example {i}")
        print('\nINPUT DIALOGUE:')
        print(testdata[index]['dialogue'])
        print('\nBASELINE HUMAN SUMMARY:')
        print(testdata[index]['summary'])
        print('\nMODEL GENERATION:')
        print(output)
        print(f"\ninputs['input_ids'].size() = {inputs['input_ids'].size()}")
        print(f"\nmodel_tokenized_output.size() = {model_tokenized_output.size()}")
        
generate(make_zero_shot_prompt)

________________________________________
Example 0

INPUT DIALOGUE:
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

MODEL GENERATION:
#Person1#: I'm thinking of upgra

# <span style="color:green"> Fine-Tuning </span>

## Preparing the Dialogue-Summary Dataset

Here we prepare dialog-summary pairs in the correct format for fine-tuning a FLAN-T5 model. To adapt dialog and summary pairs for training, each dialog instance should be explicitly structured as a prompt-completion pair to help the language model understand the task. We add an instruction, "Summarize the following conversation," before each dialog and "Summary:" before each summary (completion), making the task requirement clear. We finally tokenize the dialogue-summary pairs by applying to be used for training the model.

**Note:** The original datset is a dictionary with `"train"`, `"validation"`, and `"test"` keys. Each category (e.g., `"train"` has examples accessible by indexing (e.g., `dataset["train"][index]`). Each example is itself a dictionary with important keys `"dialogue"` and `"summary"`.

In [13]:
def create_prompt(dialogue):
    return f"Summarize the following conversation\n\n{dialogue}\n\nSummary: "
def tokenize_function(example):
    # a functional tranformation to be applied on the dataset across all examples "train", `validation`, and "test"
    prompt = [create_prompt(dialogue) for dialogue in example["dialogue"]]
    example["input_ids"] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")["input_ids"]
    example["labels"] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt")["input_ids"]    
    return example


tokenized_datasets = dataset.map(tokenize_function, batched=True) # functional transformation
tokenized_datasets = tokenized_datasets.remove_columns(["id", "topic", "dialogue", "summary"])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

**Note:**
- The column names `input_ids` and `labels` are crucial.
- `padding="max_length"`: This argument specifies that all tokenized sequences should be padded to a fixed maximum length. Padding is done by adding special padding tokens to sequences shorter than the max length, making all sequences in the batch the same length. This is important for batch processing, as uniform sequence lengths enable efficient computation on GPUs.

- `truncation=True:` This option instructs the tokenizer to truncate any sequence that exceeds the maximum length, keeping only the initial tokens up to the defined max length. This prevents sequences from exceeding the model's input limits, which would otherwise cause errors.

Let's see an example of the tokenized_datasets below.

In [14]:
index = test_indices[0]
print(tokenized_datasets["test"][index])

{'input_ids': [12198, 1635, 1737, 8, 826, 3634, 1713, 345, 13515, 536, 4663, 10, 2114, 25, 1702, 21066, 39, 358, 58, 1713, 345, 13515, 357, 4663, 10, 2163, 6, 68, 27, 31, 51, 59, 417, 125, 1776, 27, 133, 174, 5, 1713, 345, 13515, 536, 4663, 10, 148, 228, 1099, 2651, 3, 9, 3924, 478, 12, 39, 889, 5, 94, 133, 995, 25, 12, 143, 95, 39, 293, 3971, 277, 11, 11662, 7, 21, 3662, 5, 1713, 345, 13515, 357, 4663, 10, 466, 133, 36, 3, 9, 3, 14339, 4023, 5, 1713, 345, 13515, 536, 4663, 10, 148, 429, 92, 241, 12, 5941, 39, 4214, 250, 34, 19, 1134, 21643, 230, 5, 1713, 345, 13515, 357, 4663, 10, 571, 54, 62, 103, 24, 58, 1713, 345, 13515, 536, 4663, 10, 148, 31, 26, 1077, 174, 3, 9, 3627, 7502, 6, 12, 1731, 28, 5, 275, 25, 92, 174, 3, 9, 72, 2021, 614, 5025, 6, 72, 2594, 11, 3, 9, 3627, 2175, 51, 5, 531, 25, 43, 3, 9, 3190, 18, 13103, 1262, 58, 1713, 345, 13515, 357, 4663, 10, 465, 5, 1713, 345, 13515, 536, 4663, 10, 37, 29, 25, 429, 241, 12, 617, 3, 9, 3190, 18, 13103, 1262, 396, 6, 250, 167, 126, 

In [15]:
# **Practice:** Let's also write a functional trnsformation without the tokenization step, i.e., prepare the dialogue-summary pairs in text format.
# def tokenize_function(example):
#     example["prompt"] = [f"Summarize the following conversation\n\n{dialogue}\n\nSummary: " for dialogue in example["dialogue"]]
#     example["completion"] = example["summary"]
#     return example


# textual_datasets = dataset.map(tokenize_function, batched=True) # functional transformation
# textual_datasets = textual_datasets.remove_columns(["id", "topic", "dialogue", "summary"])
# textual_datasets

In [16]:
# print(textual_datasets["test"][index])

## Perform Parameter Efficient Fine-Tuning (PEFT)
Here we use **PEFT** with **Low-Rank Adaptation (LoRA)** technique. After fine-tuning for a specific task, use case, or tenant with LoRA, the result is that the original LLM remains unchanged and a newly-trained *LoRA adapter* emerges. This LoRA adapter is much, much smaller than the original LLM (MBs vs GBs).

## Setup the PEFT/LoRA model for Fine-Tuning

In [17]:
lora_config = LoraConfig(r=32,
                         lora_alpha=32,
                         target_modules=["q", "v"],
                         lora_dropout=0.05,
                         bias="none",
                         task_type=TaskType.SEQ_2_SEQ_LM)

**Notes:**
- `r=32`: This sets the rank of the low-rank matrices in the LoRA technique. The rank controls how many parameters will be modified in the adaptation process. A higher rank increases the number of parameters updated, potentially leading to better performance but also higher computational cost.
- `lora_alpha=32`: This parameter scales the learned low-rank matrices. It controls how much influence the low-rank adaptations should have relative to the original model parameters.
- `target_modules=["q", "v"]`: These are the target modules in the model where LoRA will be applied. For example, "q" and "v" refer to the query and value matrices in attention layers of a transformer model like FLAN-T5. By targeting these layers, LoRA modifies only specific components rather than the entire model, which saves memory and computation.
- `lora_dropout=0.05`: This is the dropout rate applied to the low-rank matrices. It helps prevent overfitting during fine-tuning by randomly dropping some weights during training.
- `bias="none"`: This specifies that no biases should be added to the LoRA layers. This can be adjusted based on the model architecture and task.
- `task_type=TaskType.SEQ_2_SEQ_LM`: This defines the task type, which in this case is sequence-to-sequence language modeling. The task type helps the model understand the nature of the fine-tuning task. FLAN-T5 is often used for tasks like summarization, translation, and other sequence-to-sequence tasks.

See the corresponding [Hugging Face web page for LoRA](https://huggingface.co/docs/peft/en/package_reference/lora) for more details.

In [18]:
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


## Train PEFT Adapter

In [19]:
output_dir = f'./model-checkpoints/'

peft_training_args = TrainingArguments(output_dir=output_dir,
                                       auto_find_batch_size=True,
                                       learning_rate=1e-3,
                                       num_train_epochs=3)
    
peft_trainer = Trainer(model=peft_model,
                       args=peft_training_args,
                       train_dataset=tokenized_datasets["train"])

# add eval_dataset=tokenized_datasets["validation"] to the trainer as well if needed

In [20]:
peft_trainer.train()
peft_model_path="./model-trained/"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



Step,Training Loss
500,1.2886
1000,0.1283
1500,0.1231
2000,0.1192
2500,0.1152
3000,0.1128
3500,0.1114
4000,0.1064
4500,0.107
5000,0.1075


('./model-trained/tokenizer_config.json',
 './model-trained/special_tokens_map.json',
 './model-trained/tokenizer.json')

In [21]:
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
import copy
original_model = copy.deepcopy(peft_model_base).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(peft_model_path)
peft_model = PeftModel.from_pretrained(peft_model_base, peft_model_path, torch_dtype=torch.bfloat16, is_trainable=False).to("cuda")



In [22]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.00%


## Evaluate the Model Qualitatively (Human Evaluation)

In [23]:
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = create_prompt(dialogue)
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(hbar)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(hbar)
print(f'PEFT MODEL:\n{peft_model_text_output}')

BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
________________________________________
ORIGINAL MODEL:
#Person1#: I'm thinking of upgrading my computer.
________________________________________
PEFT MODEL:
#Person2# wants to upgrade #Person2#'s system and #Person1# recommends adding a painting program to the software. #Person2# also wants to upgrade the hardware because it's outdated now.


## Evaluate the Model Quantitatively (with ROUGE Metric)

In [29]:
N = 30
test_cases = np.random.randint(0, len(tokenized_datasets["test"]), N)
dialogues = dataset['test'][test_cases]['dialogue']
human_baseline_summaries = dataset['test'][test_cases]['summary']

original_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = create_prompt(dialogue)
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

    human_baseline_text_output = human_baseline_summaries[idx]
    
    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)
    
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(predictions=original_model_summaries,
                                       references=human_baseline_summaries[0:len(original_model_summaries)],
                                       use_aggregator=True,
                                       use_stemmer=True)

peft_model_results = rouge.compute(predictions=peft_model_summaries,
                                   references=human_baseline_summaries[0:len(peft_model_summaries)],
                                   use_aggregator=True,
                                   use_stemmer=True,)

print('ORIGINAL MODEL:')
print(original_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': 0.26365087541568394, 'rouge2': 0.07914687096969314, 'rougeL': 0.22289375217994295, 'rougeLsum': 0.22338093151654878}
PEFT MODEL:
{'rouge1': 0.42470858226171526, 'rouge2': 0.1616871110768635, 'rougeL': 0.32928426369153757, 'rougeLsum': 0.32936830213851975}
