In [122]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [123]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [124]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [125]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [126]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [127]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

In [128]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [129]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

max_steps is given, it will override any value given in num_train_epochs


In [130]:
trainer.train()

100%|██████████| 1/1 [00:03<00:00,  3.46s/it]

{'loss': 46.75, 'grad_norm': 430.0, 'learning_rate': 0.0, 'epoch': 0.06}


100%|██████████| 1/1 [00:05<00:00,  5.49s/it]

{'train_runtime': 5.4887, 'train_samples_per_second': 1.458, 'train_steps_per_second': 0.182, 'train_loss': 46.75, 'epoch': 0.06}





TrainOutput(global_step=1, training_loss=46.75, metrics={'train_runtime': 5.4887, 'train_samples_per_second': 1.458, 'train_steps_per_second': 0.182, 'total_flos': 5478058819584.0, 'train_loss': 46.75, 'epoch': 0.0625})

In [131]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
original_model = original_model.cpu()
input_ids = input_ids.cpu()

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

print(f"Human baseline summary: {human_baseline_summary}")
print(f"Original model summary: {original_model_text_output}")

Human baseline summary: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
Original model summary: - You can add a program to your software. - You can add a program to your computer. - You can add a program to your computer. - You can add a program to your computer. - You can add a program to your computer. - You can add a program to your computer. - You can add a program to your computer.


In [132]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, 
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM 
)

In [133]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [134]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, 
    num_train_epochs=10,
    logging_steps=1,
    max_steps=10    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

max_steps is given, it will override any value given in num_train_epochs


In [135]:
peft_trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

 10%|█         | 1/10 [00:01<00:17,  1.97s/it]

{'loss': 47.25, 'grad_norm': 8.88475513458252, 'learning_rate': 0.0009000000000000001, 'epoch': 0.06}


 20%|██        | 2/10 [00:03<00:14,  1.82s/it]

{'loss': 46.0, 'grad_norm': 8.70297908782959, 'learning_rate': 0.0008, 'epoch': 0.12}


 30%|███       | 3/10 [00:05<00:12,  1.77s/it]

{'loss': 43.25, 'grad_norm': 10.226207733154297, 'learning_rate': 0.0007, 'epoch': 0.19}


 40%|████      | 4/10 [00:07<00:10,  1.75s/it]

{'loss': 39.75, 'grad_norm': 11.318265914916992, 'learning_rate': 0.0006, 'epoch': 0.25}


 50%|█████     | 5/10 [00:08<00:08,  1.74s/it]

{'loss': 35.0, 'grad_norm': 10.3463134765625, 'learning_rate': 0.0005, 'epoch': 0.31}


 60%|██████    | 6/10 [00:10<00:06,  1.73s/it]

{'loss': 32.0, 'grad_norm': 11.644235610961914, 'learning_rate': 0.0004, 'epoch': 0.38}


 70%|███████   | 7/10 [00:12<00:05,  1.72s/it]

{'loss': 29.875, 'grad_norm': 8.221043586730957, 'learning_rate': 0.0003, 'epoch': 0.44}


 80%|████████  | 8/10 [00:13<00:03,  1.72s/it]

{'loss': 29.75, 'grad_norm': 7.388314247131348, 'learning_rate': 0.0002, 'epoch': 0.5}


 90%|█████████ | 9/10 [00:15<00:01,  1.72s/it]

{'loss': 28.5, 'grad_norm': 7.936407566070557, 'learning_rate': 0.0001, 'epoch': 0.56}


100%|██████████| 10/10 [00:17<00:00,  1.72s/it]

{'loss': 27.5, 'grad_norm': 6.4275641441345215, 'learning_rate': 0.0, 'epoch': 0.62}


100%|██████████| 10/10 [00:17<00:00,  1.78s/it]


{'train_runtime': 17.8022, 'train_samples_per_second': 4.494, 'train_steps_per_second': 0.562, 'train_loss': 35.8875, 'epoch': 0.62}


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [136]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                       './peft-dialogue-summary-checkpoint-local', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [137]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.00%


In [138]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
original_model = original_model.cpu()
peft_model = peft_model.cpu()
input_ids = input_ids.cpu()

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(f'PEFT MODEL: {peft_model_text_output}')

BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
ORIGINAL MODEL:
You might want to upgrade your system.
PEFT MODEL: You might want to upgrade your computer.


In [139]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

instruct_model_name='truocpham/flan-dialogue-summary-checkpoint'

instruct_model = AutoModelForSeq2SeqLM.from_pretrained( instruct_model_name, torch_dtype=torch.bfloat16)

In [140]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.cpu()
instruct_model = instruct_model.cpu()

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
You can upgrade your computer to a newer computer.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person1# suggests #Person2# upgrading #Person2#'s system, hardware, and CD-ROM drive. #Person2# thinks it's great.
