Finetune T5 for summarization task

In [1]:
# #install libs

# %pip install --upgrade pip
# %pip install --disable-pip-version-check \
#     torch==1.13.1 \
#     torchdata==0.5.1 --quiet

# %pip install \
#     transformers==4.27.2 \
#     datasets==2.11.0 \
#     evaluate==0.4.0 \
#     rouge_score==0.1.2 \
#     loralib==0.1.1 \
#     peft==0.3.0 --quiet



In [2]:
# import libs
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

TIME=str(int(time.time()))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#add wandb logging
# %pip install wandb
import wandb
wandb.init(project="T5-peft-finetuning")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshubsoni[0m. Use [1m`wandb login --relogin`[0m to force relogin


Load dataset and LLM

In [4]:
Hugging_face_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(Hugging_face_dataset_name)
dataset

Found cached dataset csv (/home/azureuser/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [5]:
#load the pretrained FLAN-T5 model and tokenizer

model_name = "google/flan-t5-base"

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
# utility functions

def print_number_of_trainable_parameters(model):
    trainable_params =0
    all_model_params =0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_params+=param.numel()
    return f"trainable model parameters: {trainable_params}\nall model parameters: {all_model_params} \n. percentage trainable parameters: {trainable_params*(100)/all_model_params:.2f}%"

In [7]:
print_number_of_trainable_parameters(original_model)

'trainable model parameters: 247577856\nall model parameters: 247577856 \n. percentage trainable parameters: 100.00%'

Test the Model for zero shot inferencing

In [8]:
index =200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
)[0],
skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f"INTIAL PROMPT:\n:{prompt}")
print(dash_line)
print(f"BASELINE HUMMAN SUMMARY:\n {summary}\n")
print(dash_line)
print(f"MODEL GENERATION- ZERO SHOT:\n {output}")


---------------------------------------------------------------------------------------------------
INTIAL PROMPT:
:
Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-----------------------------------------------------------------

Preprocess the dataset to add instruction prompt

In [9]:
def tokenize_function(example):
    system_prompt = 'Summarize the following conversation.\n\n'
    trigger_prompt = '\n\nSummary: '
    prompt = [system_prompt + dialogue + trigger_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length",truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example


tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenize_dataset = tokenized_dataset.remove_columns(['id', 'topic', 'dialogue', 'summary'])

Loading cached processed dataset at /home/azureuser/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-17819e0af33e0661.arrow
Loading cached processed dataset at /home/azureuser/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-42fdf05daa718908.arrow
Loading cached processed dataset at /home/azureuser/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3d8db1d9c7b40260.arrow


In [10]:
tokenizer.decode(tokenize_dataset['train'][0]['input_ids'], skip_special_tokens=True)

"Summarize the following conversation. #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today? #Person2#: I found it would be a good idea to get a check-up. #Person1#: Yes, well, you haven't had one for 5 years. You should have one every year. #Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor? #Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good. #Person2#: Ok. #Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith? #Person2#: Yes. #Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit. #Person2#: I've tried hundreds of times, but I just can't seem to kick the habit. #Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave. #Person2#: Ok, thanks doctor. Summary: "

In [11]:
# # Todo: Rmove, take a sample
# #
# tokenized_dataset = tokenized_dataset.filter(lambda example, index: index % 5==0, with_indices=True)

In [12]:
# check the size of the datasets

print("shape of the datasets\n")
print(f"Training: {tokenized_dataset['train'].shape}")
print(f"Validation: {tokenized_dataset['validation'].shape}")
print(f"Test: {tokenized_dataset['test'].shape}")

print(tokenized_dataset)

shape of the datasets

Training: (12460, 6)
Validation: (500, 6)
Test: (1500, 6)
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
        num_rows: 500
    })
})


Train the PEFT adapter

In [76]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=128,
    # target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM #T5

)

In [77]:
# Add lora layers to the original LLM

peft_model = get_peft_model(original_model, lora_config)

print_number_of_trainable_parameters(peft_model)


'trainable model parameters: 884736\nall model parameters: 248462592 \n. percentage trainable parameters: 0.36%'

In [78]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir = output_dir,
    auto_find_batch_size=True,
    # we can use higher learning rate for peft adapter in comparison to full finetuning
    learning_rate = 1e-03,
    num_train_epochs=1,
    logging_steps=100,
    evaluation_strategy="steps",
    logging_strategy="steps",
    # max_steps =100
)

peft_trainer = Trainer(
    model = peft_model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset= tokenized_dataset['validation']
)

In [79]:
peft_trainer.train()
peft_model_path = f"./peft-dialogue-summary-checkpoint-{TIME}"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



Step,Training Loss,Validation Loss
100,2.7289,0.111352
200,0.1389,0.101086


In [None]:
# Add the adapter to the original T-5 Model and set trainable as False, as we will only use it for inference

from peft import PeftModel, PeftConfig
# we can switch different adapter with base model
instruct_model = PeftModel.from_pretrained(original_model,
                              peft_model_path,
                              torch_dtype= torch.bfloat16,
                              is_trainable=False)


print_number_of_trainable_parameters(instruct_model)

'trainable model parameters: 0\nall model parameters: 248462592 \n. percentage trainable parameters: 0.00%'

In [None]:
# load the model in GPU

instruct_model= instruct_model.to("cuda")
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16)
base_model = base_model.to("cuda")

Evaluate Model Quality [Humman Evaluation]

In [None]:
index =100
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
instruct_model_outputs = instruct_model.generate(input_ids=input_ids, max_new_tokens =200)
instruct_model_decoded_outputs = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

base_model_ouputs = base_model.generate(input_ids=input_ids, max_new_tokens =200)
base_model_decoded_output = tokenizer.decode(base_model_ouputs[0],skip_special_tokens=True)



print(dash_line)
print(f"Base line humman summary:\n {human_baseline_summary}")
print(dash_line)
print(f"base_model: \n {base_model_decoded_output}")
print(dash_line)
print(f"instruct_model:\n {instruct_model_decoded_outputs}")


---------------------------------------------------------------------------------------------------
Base line humman summary:
 #Person1# and Mike have a disagreement on how to act out a scene. #Person1# proposes that Mike can try to act in #Person1#'s way.
---------------------------------------------------------------------------------------------------
base_model: 
 The two of them will try to figure out how to express their feelings.
---------------------------------------------------------------------------------------------------
instruct_model:
 #Person1# tells Mike that she doesn't want to see him anymore, but he wants to get more anger from Mike.


Let's evaluate the model quantitatively

In [None]:
rouge = evaluate.load('rouge')

dialogues = dataset['test'][0:5]['dialogue']
human_baseline_summaries = dataset['test'][0:5]['summary']

base_model_summaries =[]
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):


    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config = GenerationConfig(max_new_tokens=200, num_beams=1))
    instruct_model_decoded_outputs = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_decoded_outputs)

    base_model_ouputs = base_model.generate(input_ids=input_ids, generation_config= GenerationConfig(max_new_tokens=200,num_beams =1))
    base_model_decoded_output = tokenizer.decode(base_model_ouputs[0],skip_special_tokens=True)
    base_model_summaries.append(base_model_decoded_output)


zipped_summaries = list(zip(human_baseline_summaries, base_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'base_model_summaries', 'instruct_model_summaries'])
df


Unnamed: 0,human_baseline_summaries,base_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1#: I need to take a dictation for you.,#Person1# wants to take a dictation for Ms. Da...
1,In order to prevent employees from wasting tim...,#Person1#: I need to take a dictation for you.,#Person1# wants to take a dictation for Ms. Da...
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1#: I need to take a dictation for you.,#Person1# wants to take a dictation for Ms. Da...
3,#Person2# arrives late because of traffic jam....,The traffic jam at the Carrefour intersection ...,#Person2# thinks it's better for the environme...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam at the Carrefour intersection ...,#Person2# thinks it's better for the environme...


In [None]:
print(dash_line)
print(f"Base line humman summary:\n {human_baseline_summaries}")
print(dash_line)
print(f"base_model: \n {base_model_summaries}")
print(dash_line)
print(f"instruct_model:\n {instruct_model_summaries}")

---------------------------------------------------------------------------------------------------
Base line humman summary:
 ['Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.', 'In order to prevent employees from wasting time on Instant Message programs, #Person1# decides to terminate the use of those programs and asks Ms. Dawson to send out a memo to all employees by the afternoon.', 'Ms. Dawson takes a dictation for #Person1# about prohibiting the use of Instant Message programs in the office. They argue about its reasonability but #Person1# still insists.', '#Person2# arrives late because of traffic jam. #Person1# persuades #Person2# to use public transportations to keep healthy and to protect the environment.', "#Person2# decides to follow #Person1#'s suggestions on quitting driving to work and will try to use public transportations."]
------------------------------

In [None]:
# Evaluate the rouge on both model summaries to compare them side by side

base_model_results = rouge.compute(
    predictions = base_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer = True
)

instruct_model_results = rouge.compute(
    predictions = instruct_model_summaries,
    references=human_baseline_summaries,
    use_aggregator=True,
    use_stemmer = True
)

print('ORIGINAL MODEL:')
print(base_model_results)
print('INSTRUCT MODEL')
print(instruct_model_results)


ORIGINAL MODEL:
{'rouge1': 0.15305555555555556, 'rouge2': 0.04862745098039215, 'rougeL': 0.14194444444444443, 'rougeLsum': 0.1422222222222222}
INSTRUCT MODEL
{'rouge1': 0.2438242722230418, 'rouge2': 0.06498599439775911, 'rougeL': 0.20532150776053215, 'rougeLsum': 0.20679544738933892}


In [None]:
print("Absolute percentage improvement of INSTRUCT MODEL over Humman Baseline")

improvement = (np.array(list(instruct_model_results.values()))- np.array(list(base_model_results.values())))

for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over Humman Baseline
rouge1: 9.08%
rouge2: 1.64%
rougeL: 6.34%
rougeLsum: 6.46%
