# **Shorya Sethia**
[GitHub](https://github.com/shoryasethia)

## Install required libraries

In [None]:
%pip install torch
%pip install torchdata
%pip install numpy
%pip intsall transformers
%pip install datasets
%pip install rouge_score
%pip install evaluate
%pip install loralib
%pip install peft

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

## Loading the required libraries

In [None]:
import copy
import time

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

##  Loading dataset

Numerous datasets are available for fine-tuning the model. In this instance, I will utilize the DialogSum DataSet from HuggingFace for the fine-tuning process.
DialogSum is an extensive dialogue summarization dataset, featuring 13,460 dialogues along with manually labeled summaries and topics.

It contains the below fields.

* dialogue: text of the dialogue.
* summary: human-written summary of the dialogue.
* topic: human written topic/one-liner of the dialogue.
* id: unique file id of an example.


In [None]:
# Hub identifier of the DialogSum dataset used for fine-tuning.
huggingface_dataset_name = "knkarthick/dialogsum"

# No split is requested, so every split the dataset provides is loaded.
dataset = load_dataset(path=huggingface_dataset_name)

# Bare expression so the notebook renders the loaded object.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

## Loading the Pre-Trained model

FLAN-T5 has been fine-tuned on more than 1,000 additional tasks compared with T5, and it also covers more languages.

In [None]:
# Checkpoint shared by the tokenizer and the model.
model_name = "google/flan-t5-base"

# Tokenization: load the tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Base sequence-to-sequence model, requested in bfloat16.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
def get_num_trainable_params(model):
  """Summarise how many of a model's parameters are trainable.

  Args:
    model: Object exposing ``named_parameters()`` that yields
      ``(name, parameter)`` pairs; each parameter must provide ``numel()``
      and a ``requires_grad`` attribute.

  Returns:
    A three-line string with the trainable parameter count, the total
    parameter count and the trainable share in percent. The share is reported
    as ``0.0`` for a model without any parameters instead of raising
    ``ZeroDivisionError``.
  """
  total_params = 0
  total_trainable_params = 0

  for _, param in model.named_parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
      total_trainable_params += count

  # Guard the ratio: a parameter-free model would otherwise divide by zero.
  if total_params:
    trainable_pct = 100*(total_trainable_params/total_params)
  else:
    trainable_pct = 0.0

  return f"Trainable Param = {total_trainable_params}\nTotal Params = {total_params}\n% of trainable params = {trainable_pct}"

# Print the trainable/total parameter summary for `original_model`.
print(get_num_trainable_params(original_model))

Trainable Param = 3538944
Total Params = 251116800
% of trainable params = 1.4092820552029972


## Test the Model with Zero Shot Inferencing

In [None]:
# Test-set rows to summarise with the base model before any fine-tuning.
target_index = [200]

for index in target_index:

  dialogue = dataset['test'][index]['dialogue']
  human_summary = dataset['test'][index]['summary']

  print("INPUT DIALOGUE :")
  print(dialogue)
  print("-"*50)
  print("BASELINE HUMAN SUMMARY :")
  print(human_summary)
  print("-"*50)
  print("SUMMARY WITH ZERO SHOT :")

  prompt = f"""
  Summarize following conversations :

  {dialogue}

  Summary :

  """

  # Named `model_inputs` rather than `input` so the builtin `input()` is not
  # shadowed for the rest of the kernel session.
  model_inputs = tokenizer(prompt, return_tensors='pt')
  # Send the ids to wherever the model lives so the call also works off-CPU.
  output = original_model.generate(model_inputs["input_ids"].to(original_model.device),
                                   max_new_tokens = 50)
  flant5_summary = tokenizer.decode(output[0], skip_special_tokens = True)

  print(flant5_summary)

INPUT DIALOGUE :
#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
--------------------------------------------------
BASELINE HUMAN SUMMARY :
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
--------------------------------------------------


From the observation above, it's evident that the model faces challenges in summarizing the dialogue compared to the baseline summary. However, it manages to extract essential information from the text, suggesting the potential for fine-tuning the model for the specific task at hand.

##  Pre-processing dataset

The dataset cannot be directly employed for fine-tuning. It is essential to format the prompt in a way that the model can comprehend.

I will create some helper functions to format our input dataset, ensuring its suitability for the fine-tuning process. Here, I need to convert the dialog-summary (prompt-response) pairs into explicit instructions for the LLM.

In [None]:
def tokenize_function(example):
  """Tokenize a batch of dialogue/summary rows into model inputs and labels.

  Args:
    example: Batched mapping with a 'dialogue' list and a 'summary' list of
      equal length (the function is applied with ``batched=True``).

  Returns:
    The same mapping with two added entries: 'input_ids' (the tokenized
    instruction prompts) and 'labels' (the tokenized summaries with padding
    positions set to -100).
  """
  start_prompt = "Summarize following conversations:\n"
  end_prompt = "\nSummary:\n"
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
  example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids

  labels = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
  # -100 is the index the loss ignores; without it the padded label positions
  # would be included in the training loss.
  example['labels'] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

  return example


# Apply the tokenization to every split, one batch of rows at a time.
tokenized_dataset = dataset.map(tokenize_function, batched = True)
tokenized_dataset


Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 1500
    })
})

The above function can be used to convert our input into prompt format.

Now, I will use our model tokenizer to process these prompts into tokenized ones.

My aim here is to generate input sequences with consistent lengths, which is beneficial for fine-tuning the language model by optimizing efficiency and minimizing computational overhead. It is essential to ensure that these sequences do not surpass the model's maximum token limit.

In [None]:
# Columns that are not 'input_ids' or 'labels'; they are dropped from every split.
raw_text_columns = ['id', 'dialogue', 'summary', 'topic']

tokenized_dataset = tokenized_dataset.remove_columns(raw_text_columns)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [None]:
# Print the (rows, columns) shape of each split; labels keep their original casing.
for label, split in (("Train", "train"), ("validation", "validation"), ("Test", "test")):
    print(f"{label} data shape = {tokenized_dataset[split].shape}")

Train data shape = (12460, 2)
validation data shape = (500, 2)
Test data shape = (1500, 2)


In [None]:
def _keep_every_tenth(example, index):
    """Return True for rows whose position in the split is a multiple of 10."""
    return index % 10 == 0


# Keep one row in ten from every split.
tokenized_dataset = tokenized_dataset.filter(_keep_every_tenth, with_indices=True)

for label, split in (("Train", "train"), ("validation", "validation"), ("Test", "test")):
    print(f"{label} data shape = {tokenized_dataset[split].shape}")
print(tokenized_dataset)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Train data shape = (1246, 2)
validation data shape = (50, 2)
Test data shape = (150, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1246
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 150
    })
})


## Setup PEFT for Fine-Tuning

Define the LoRA config for Fine-tuning the base model.

Note the rank (r) hyper-parameter, which defines the rank/dimension of the adapter to be trained. r is the rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained. A higher rank will allow for more expressivity, but there is a compute tradeoff.

alpha here is the scaling factor for the learned weights. The weight matrix is scaled by alpha/r, and thus a higher value for alpha assigns more weight to the LoRA activations.

Once everything is set up and the PEFT is prepared, we can use the print_trainable_parameters() helper function to see how many trainable parameters are in the model.

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the FLAN-T5 sequence-to-sequence model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    r=32,                             # rank of the adapter matrices
    lora_alpha=32,                    # scaling factor for the learned weights
    target_modules=['q', 'v'],        # module names that receive adapters
    lora_dropout=0.05,                # conventional value
    bias="none",
)


In [None]:
# get_peft_model modifies the model it is given in place (it injects the LoRA
# layers into it). Wrapping a deep copy keeps `original_model` an untouched
# baseline for the comparisons later in the notebook.
peft_model = get_peft_model(copy.deepcopy(original_model), lora_config)

print(get_num_trainable_params(peft_model))

Trainable Param = 3538944
Total Params = 251116800
% of trainable params = 1.4092820552029972


## Train PEFT Adapter

Define training arguments and create Trainer instance.


In [None]:
# Directory passed to TrainingArguments as `output_dir`.
output_dir = './flan-t5-peft-lora-dialogsum'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    num_train_epochs=5,
    auto_find_batch_size=True,
    logging_steps=1,
)

# Only a training split is supplied; no evaluation dataset is configured here.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset['train'],
)

The hyperparameters used above might vary depending on the dataset/model we are trying to fine-tune. This is just to show the capability of fine-tuning.

In [None]:
# Run the fine-tuning loop configured on `peft_trainer`.
peft_trainer.train()

# Directory that receives both the trained model files and the tokenizer files.
peft_model_path =  "./flan-t5-peft-lora-dialogsum-checkpoint"

# Save the trainer's model, then the tokenizer, into the same directory.
# The tokenizer call is the cell's last expression, so its return value is displayed.
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,48.5
2,45.75
3,42.25
4,37.0
5,33.75
6,30.875
7,27.625
8,25.0
9,22.75
10,19.75




('./flan-t5-peft-lora-dialogsum-checkpoint/tokenizer_config.json',
 './flan-t5-peft-lora-dialogsum-checkpoint/special_tokens_map.json',
 './flan-t5-peft-lora-dialogsum-checkpoint/spiece.model',
 './flan-t5-peft-lora-dialogsum-checkpoint/added_tokens.json',
 './flan-t5-peft-lora-dialogsum-checkpoint/tokenizer.json')

Here, I am setting **is_trainable=False** because the plan is only to perform inference with this PEFT model.

In [None]:
from peft import PeftModel, PeftConfig

# Base checkpoint and the directory holding the saved adapter.
base_checkpoint = "google/flan-t5-base"
adapter_dir = "./flan-t5-peft-lora-dialogsum-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint, torch_dtype=torch.bfloat16)

# is_trainable=False: the adapter is loaded for inference only.
peft_model = PeftModel.from_pretrained(
    peft_model_base,
    adapter_dir,
    torch_dtype=torch.bfloat16,
    is_trainable=False,
)

In [None]:
# Print the trainable/total parameter summary for the reloaded `peft_model`.
print(get_num_trainable_params(peft_model))

Trainable Param = 0
Total Params = 251116800
% of trainable params = 0.0


Fine-tuning is often an iterative process. Based on the validation and test sets results, one may need to make further adjustments to the model's architecture, hyperparameters, or training data to improve its performance.

In [None]:
# Test-set row used for the side-by-side comparison.
index = 200
device = original_model.device

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversations :

{dialogue}

Summary :
"""

input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids

# Each model receives the ids on its own device; the two models are not
# guaranteed to live on the same one.
original_model_output = original_model.generate(input_ids = input_ids.to(device),
                                                generation_config = GenerationConfig(max_new_tokens = 50, num_beams = 1))
original_model_text_output = tokenizer.decode(original_model_output[0], skip_special_tokens = True)

peft_model_output = peft_model.generate(input_ids = input_ids.to(peft_model.device),
                                        generation_config = GenerationConfig(max_new_tokens = 50, num_beams = 1))
peft_model_text_output = tokenizer.decode(peft_model_output[0], skip_special_tokens = True)

print("-"*50)
print(f'Baseline Human Summary :\n{summary}')
print("-"*50)
print(f'Original Model :\n{original_model_text_output}')
print("-"*50)
print(f'PEFT Model :\n{peft_model_text_output}')

--------------------------------------------------
Baseline Human Summary :
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
--------------------------------------------------
Original Model :
The #Person2# wants to upgrade his system and upgrade his hardware.
--------------------------------------------------
PEFT Model :
#Person2# wants to upgrade his computer and his hardware. #Person2# suggests adding a painting program to his software. #Person2# suggests adding a CD-ROM drive.


In [None]:
# First ten test dialogues and their human-written reference summaries.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []  # never filled in this cell; kept so the name stays defined
peft_model_summaries = []

for dialogue in dialogues:
    prompt = f"""
    Summarize the following conversations :

    {dialogue}

    Summary :
    """
    input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids

    # Read each model's device directly so this cell does not depend on a
    # `device` variable left behind by an earlier cell.
    original_model_output = original_model.generate(input_ids = input_ids.to(original_model.device),
                                                    generation_config = GenerationConfig(max_new_tokens = 50))
    original_model_text_output = tokenizer.decode(original_model_output[0], skip_special_tokens = True)

    peft_model_output = peft_model.generate(input_ids = input_ids.to(peft_model.device),
                                            generation_config = GenerationConfig(max_new_tokens = 50))
    peft_model_text_output = tokenizer.decode(peft_model_output[0], skip_special_tokens = True)

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

# One row per dialogue: human reference, base-model output, PEFT-model output.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])

df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,The new policy restricts employees' access to ...,#Person1# asks Ms. Dawson to take a dictation ...
1,In order to prevent employees from wasting tim...,#Person1# wants to change their communication ...,#Person1# asks Ms. Dawson to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1# asks Ms. Dawson to take a dictation ...,#Person1# asks Ms. Dawson to take a dictation ...
3,#Person2# arrives late because of traffic jam....,#Person2# is having a traffic jam near the Car...,#Person2# got stuck in traffic and got stuck i...
4,#Person2# decides to follow #Person1#'s sugges...,#Person1# is driving to work and is worried ab...,#Person2# got stuck in traffic and got stuck i...
5,#Person2# complains to #Person1# about the tra...,#Person1# is stuck in traffic and gets stuck i...,#Person2# got stuck in traffic and got stuck i...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced. #Person1#...,Masha and Hero are getting divorced. They are ...
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are getting divorced. They are ...,Masha and Hero are getting divorced. They are ...
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting a divorced couple.p...,Masha and Hero are getting divorced. They are ...
9,#Person1# and Brian are at the birthday party ...,Brian's birthday party is very good.MO.,Brian's birthday is coming soon. Brian will da...


## Evaluate the Model Quantitatively (with ROUGE Metric)

ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for evaluating automatic summarization and machine translation software in natural language processing. The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.

The ROUGE metric quantifies the validity of the summarizations produced by models. It compares each summarization to a “baseline” summary, which is usually created by a human. While it's not a perfect metric, it does indicate the overall increase in summarization effectiveness that we have accomplished by fine-tuning.

To demonstrate the capability of ROUGE Metric Evaluation I will use some sample inputs to evaluate.

In [None]:
rouge = evaluate.load('rouge')

# Options shared by both ROUGE computations.
rouge_options = dict(use_aggregator=True, use_stemmer=True)

# Each prediction list is scored against equally many human references.
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[:len(original_model_summaries)],
    **rouge_options,
)
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[:len(peft_model_summaries)],
    **rouge_options,
)

for heading, results in (('Original Model:', original_model_results),
                         ('PEFT Model:', peft_model_results)):
    print(heading)
    print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Original Model:
{'rouge1': 0.37651698036006545, 'rouge2': 0.1294083244083244, 'rougeL': 0.27954620138270747, 'rougeLsum': 0.27982735297964856}
PEFT Model:
{'rouge1': 0.42925850100353405, 'rouge2': 0.14080638230538671, 'rougeL': 0.2958583466677569, 'rougeLsum': 0.305648133766927}


In [None]:
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

# Pair the two result dicts by metric name instead of by position, so the
# subtraction cannot silently mix metrics if the key order ever differs.
metric_names = list(peft_model_results.keys())
improvement = np.array([peft_model_results[name] - original_model_results[name] for name in metric_names])
for key, value in zip(metric_names, improvement):
    # Scores are fractions; multiplying the difference by 100 prints it in percentage points.
    print(f'{key}: {value*100}%')

Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 4.72584793565314%
rouge2: 1.13980578970623214%
rougeL: 1.631214528504942%
rougeLsum: 2.5820780787278435%


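As the results above show, the PEFT model scores higher than the original model on every ROUGE metric; the values are absolute differences expressed in percentage points. Note that these scores are computed on only 10 test dialogues, so they indicate a trend rather than a statistically robust improvement.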

In [None]:
# Install sacrebleu, which the next cell imports to compute BLEU.
%pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.2


In [None]:
import sacrebleu

# Human references trimmed to the number of predictions from each model.
original_model_references = human_baseline_summaries[0:len(original_model_summaries)]
peft_model_references = human_baseline_summaries[0:len(peft_model_summaries)]

# corpus_bleu takes a list of reference streams, each stream holding one
# reference per hypothesis. With a single reference per summary the list must
# be wrapped once more; a flat list of strings is misread as many streams.
original_model_bleu = sacrebleu.corpus_bleu(original_model_summaries, [original_model_references])
peft_model_bleu = sacrebleu.corpus_bleu(peft_model_summaries, [peft_model_references])

print('Original Model BLEU:', original_model_bleu.score)
print('PEFT Model BLEU:', peft_model_bleu.score)


Original Model BLEU: 0.1550404559570942
PEFT Model BLEU: 0.19056854347856564


In [None]:
# Recursively zip the /content/flan-t5-peft-lora-dialogsum directory into a single archive.
!zip -r /content/flan-t5-peft-lora-dialogsum.zip /content/flan-t5-peft-lora-dialogsum


  adding: content/flan-t5-peft-lora-dialogsum/ (stored 0%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/ (stored 0%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/trainer_state.json (deflated 89%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/optimizer.pt (deflated 23%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/README.md (deflated 66%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/adapter_config.json (deflated 52%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/rng_state.pth (deflated 25%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/adapter_model.safetensors (deflated 22%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/training_args.bin (deflated 51%)
  adding: content/flan-t5-peft-lora-dialogsum/checkpoint-500/scheduler.pt (deflated 55%)
  adding: content/flan-t5-peft-lora-dialogsum/runs/ (stored 0%)
  adding: content/flan-t5-peft-lora-dialogsum/runs/May28_22-55

In [None]:
# `google.colab` is only importable inside Google Colab.
from google.colab import files
# Pass the zip archive to Colab's download helper.
files.download('/content/flan-t5-peft-lora-dialogsum.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>