In [3]:
!pip install torch torchdata transformers datasets evaluate rouge_score loralib peft

Collecting torchdata
  Downloading torchdata-0.11.0-py3-none-any.whl.metadata (6.3 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_

In [4]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [5]:
dataset = load_dataset("knkarthick/dialogsum")
dataset

README.md:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/442k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
model_name = 'google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype = torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [8]:
def print_number_of_traiable_model_parameters(model):
    trainable_model_parameters = 0
    all_model_params = 0

    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_parameters += param.numel()
    return f'Trainable model parameters: {trainable_model_parameters}\nall model parameters:{all_model_params}\npercentage of trainable model parameters:{100*trainable_model_parameters/all_model_params:.2f}%'

print(print_number_of_traiable_model_parameters(original_model))

Trainable model parameters: 247577856
all model parameters:247577856
percentage of trainable model parameters:100.00%


In [9]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors = 'pt').to(device)
output = tokenizer.decode(original_model.generate(
                            inputs["input_ids"],
                            max_new_tokens = 200,
                        )[0],
                         skip_special_tokens = True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'Input prompt:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
Input prompt:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [10]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['input_ids'] = example['input_ids'].tolist()
    example['labels'] = example['labels'].tolist()
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [11]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index:index%100 == 0, with_indices = True)

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [12]:
print(f'Shape of the dataset:')
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shape of the dataset:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [13]:
import wandb

# Initialize a new W&B run
wandb.init(project="flanT5_summarization")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshivenaggarwal[0m ([33mshivenaggarwal0[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
import os
os.environ["WANDB_DISABLED"] = "false"

output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
                output_dir = output_dir,
                learning_rate = 1e-5,
                num_train_epochs = 1,
                weight_decay = 0.01,
                logging_steps = 1,
                max_steps = 1,
                report_to="wandb")

trainer = Trainer(
        model = original_model,
        args = training_args,
        train_dataset = tokenized_datasets['train'],
        eval_dataset = tokenized_datasets['validation']
)

In [15]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,49.0


TrainOutput(global_step=1, training_loss=49.0, metrics={'train_runtime': 15.0706, 'train_samples_per_second': 0.531, 'train_steps_per_second': 0.066, 'total_flos': 5478058819584.0, 'train_loss': 49.0, 'epoch': 0.0625})

In [16]:
instruct_model_name= 'truocpham/flan-dialogue-summary-checkpoint'

instruct_model = AutoModelForSeq2SeqLM.from_pretrained(instruct_model_name, torch_dtype=torch.bfloat16).to(device)

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [17]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary  = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.
{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors = "pt").input_ids.to(device)

original_model_outputs = original_model.generate(
    input_ids = input_ids,
    generation_config = GenerationConfig(max_new_tokens = 200, num_beams = 1)
)

original_model_text_output = tokenizer.decode(
                                                original_model_outputs[0],
                                                skip_special_tokens = True
                                            )



instruct_model_outputs = instruct_model.generate(
    input_ids = input_ids,
    generation_config = GenerationConfig(max_new_tokens = 200, num_beams = 1))
instruct_model_text_output = tokenizer.decode(
                                                instruct_model_outputs[0],
                                                skip_special_tokens = True
                                             )

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
##Person1: Have you considered upgrading your system? ##Person2: Yes, but you might want to add a painting program to your software. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and banners. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and banners. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and banners. ##Person1: I'd like to make up my own flyers and banners. ##Person2: I'd like to make up my own flyers and
------------------------------------------------------------------------------------

In [18]:
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [19]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = f"""
    Summarize the following conversation.

    {dialogue}

    Summary:"""
    input_ids = tokenizer(prompt, return_tensors = "pt").input_ids.to(device)

    original_model_outputs = original_model.generate(input_ids = input_ids, generation_config=GenerationConfig(max_new_tokens = 200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens = True)
    original_model_summaries.append(original_model_text_output)


    instruct_model_outputs = instruct_model.generate(input_ids = input_ids, generation_config = GenerationConfig(max_new_tokens = 200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens = True)
    instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,The memo should go out as an intra-office memo...,#Person1# asks Ms. Dawson to take a dictation ...
1,In order to prevent employees from wasting tim...,This memo should be distributed to all employe...,#Person1# asks Ms. Dawson to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,The new policy of instant messaging is in effe...,#Person1# asks Ms. Dawson to take a dictation ...
3,#Person2# arrives late because of traffic jam....,The public transport system is good for the en...,#Person2# got stuck in traffic again. #Person1...
4,#Person2# decides to follow #Person1#'s sugges...,#Person1: You're here!,#Person2# got stuck in traffic again. #Person1...
5,#Person2# complains to #Person1# about the tra...,"The traffic was bad, but the public transport ...",#Person2# got stuck in traffic again. #Person1...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can'...
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are having a divorce.,Masha and Hero are getting divorced. Kate can'...
8,#Person1# and Kate talk about the divorce betw...,#Person1: Masha and Hero are getting divorced....,Masha and Hero are getting divorced. Kate can'...
9,#Person1# and Brian are at the birthday party ...,Brian's birthday is tomorrow.,Brian's birthday is coming. #Person1# invites ...


In [20]:
original_model_results = rouge.compute(
            predictions = original_model_summaries,
            references = human_baseline_summaries[0:len(original_model_summaries)],
            use_aggregator = True,
            use_stemmer = True)

instruct_model_results = rouge.compute(
            predictions = original_model_summaries,
            references = human_baseline_summaries[0:len(instruct_model_summaries)],
            use_aggregator = True,
            use_stemmer = True)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.2776202081884728), 'rouge2': np.float64(0.11949077451937487), 'rougeL': np.float64(0.25426037029939386), 'rougeLsum': np.float64(0.2566018646637783)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.2776202081884728), 'rouge2': np.float64(0.11949077451937487), 'rougeL': np.float64(0.25426037029939386), 'rougeLsum': np.float64(0.2566018646637783)}


In [21]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [22]:
peft_model = get_peft_model(original_model,
                            lora_config)
print(print_number_of_traiable_model_parameters(peft_model))

Trainable model parameters: 3538944
all model parameters:251116800
percentage of trainable model parameters:1.41%


In [23]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
    )
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    )

In [24]:
peft_trainer.train()
peft_model_path="./peft-dialogue-summary-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,49.25


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/spiece.model',
 './peft-dialogue-summary-checkpoint-local/added_tokens.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [25]:
from peft import PeftModel, PeftConfig
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [26]:
print(print_number_of_traiable_model_parameters(peft_model))

Trainable model parameters: 0
all model parameters:251116800
percentage of trainable model parameters:0.00%


In [30]:
peft_model = peft_model.to(device)
instruct_model = instruct_model.to(device)
original_model = original_model.to(device)

In [33]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:"""

input_ids = tokenizer(prompt, return_tensors = "pt").input_ids.to(device)


original_model_outputs = original_model.generate(input_ids = input_ids, generation_config = GenerationConfig(max_new_tokens = 200, num_beams = 1))
original_mdoel_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens = True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL: {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
The discussion is about upgrading your system.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.
---------------------------------------------------------------------------------------------------
PEFT MODEL: #Person1#: I'm thinking of upgrading my computer.


In [34]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    human_baseline_text_output = human_baseline_summaries[idx]

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,The memo is being distributed to all employees...,#Person1# asks Ms. Dawson to take a dictation ...,The memo is to be distributed to all employees...
1,In order to prevent employees from wasting tim...,The memo is a memo to all employees.,#Person1# asks Ms. Dawson to take a dictation ...,The memo is to be distributed to all employees...
2,Ms. Dawson takes a dictation for #Person1# abo...,Employees will be asked to use a dictation mod...,#Person1# asks Ms. Dawson to take a dictation ...,The memo is to be distributed to all employees...
3,#Person2# arrives late because of traffic jam....,I'm sorry to hear that.,#Person2# got stuck in traffic again. #Person1...,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,The person who asked me to take public transpo...,#Person2# got stuck in traffic again. #Person1...,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,#Person1: I'm finally here. I'm stuck in traff...,#Person2# got stuck in traffic again. #Person1...,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and her boyfriend are getting divorced.,Masha and Hero are getting divorced. Kate can'...,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,#Person1: Masha and Hero are getting a divorce...,Masha and Hero are getting divorced. Kate can'...,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are getting divorced.,Masha and Hero are getting divorced. Kate can'...,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,People are coming to celebrate Brian's birthday.,Brian's birthday is coming. #Person1# invites ...,Brian's birthday is coming up.


In [35]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

ORIGINAL MODEL:
{'rouge1': np.float64(0.27043647095797285), 'rouge2': np.float64(0.10238386041439476), 'rougeL': np.float64(0.21012046012046012), 'rougeLsum': np.float64(0.2139655264053531)}
INSTRUCT MODEL:
{'rouge1': np.float64(0.41026607717457186), 'rouge2': np.float64(0.17840645241958838), 'rougeL': np.float64(0.2977022096267017), 'rougeLsum': np.float64(0.2987374187518165)}
PEFT MODEL:
{'rouge1': np.float64(0.26109650997150996), 'rouge2': np.float64(0.11055072463768116), 'rougeL': np.float64(0.2302777777777778), 'rougeLsum': np.float64(0.2339245014245014)}


In [36]:
print("Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over HUMAN BASELINE
rouge1: -0.93%
rouge2: 0.82%
rougeL: 2.02%
rougeLsum: 2.00%


In [37]:
print("Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(instruct_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over INSTRUCT MODEL
rouge1: -14.92%
rouge2: -6.79%
rougeL: -6.74%
rougeLsum: -6.48%
