# Setup

In [1]:
# %load_ext lab_black

In [2]:
# ! pip install torch==1.13.1 --quiet
# ! pip install torchdata==0.5.1 --quiet
# ! pip install transformers==4.27.2 datasets==2.11.0 --quiet
# ! pip install py7zr --quiet
# ! pip3 install evaluate==0.4.0 rouge_score==0.1.2 loralib==0.1.2 --quiet
# ! pip3 install peft --quiet
# ! pip3 install numerize --quiet

In [3]:
import os
import textwrap
import pandas as pd
import numpy as np
from pprint import pprint
import random
from numerize import numerize
from tqdm.auto import tqdm
import time

import torch

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments
import evaluate

In [4]:
# some util functions
print_dashes = lambda: print("-" * 80)
txtwrap = lambda text: textwrap.fill(text, width=80)
print_dialogue = lambda x: [print(txtwrap(text=i)) for i in x.split("\n")]

# data load

In [5]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# single data point
pprint(dataset["train"][13])

{'dialogue': '#Person1#: How old is Keith?\n'
             "#Person2#: He's 21. how old is James?\n"
             "#Person1#: He's a year older than Keith, but he looks younger.\n"
             "#Person2#: How's your father?\n"
             "#Person1#: He's fine. He retired last week. It's turning going "
             'in his life. Now he can relax and enjoy his retirement.\n'
             '#Person2#: He can spend more time with his grandchildren.\n'
             "#Person1#: Oh, I don't think he wants to. He wants to travel to "
             'several different countries around the world.\n'
             '#Person2#: So, he wants to have a more active retirement. Good '
             'idea!\n'
             '#Person1#: How do you want to spend your old age?\n'
             '#Person2#: In the same way, probably.',
 'id': 'train_13',
 'summary': "#Person1# and #Person2# are talking about their families' ages. "
            "#Person2#'s father wants to travel around the world after "
        

# load model

- Load model with special dtype that optimizes for memory.
- bfloat16 is a dtype that was developed by google brain
- it is 16 bit representation of 32 bit floating point
- [more about blfoat16](https://cloud.google.com/tpu/docs/bfloat16)

In [7]:
model_id = "google/flan-t5-base"

# Load tokenizer of FLAN-t5-small
tokenizer = AutoTokenizer.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# prepare dataset

Here we convert data points in model consumable format

```python

{
 # raw data   
 'id': 'train_13',
 'dialogue': "#Person1#: How old is Keith?\n#Person2#: He's 21. how old is James?\n#Person1#: He's a year older than Keith, but he looks younger.\n#Person2#: How's your father?\n#Person1#: He's fine. He retired last week. It's turning going in his life. Now he can relax and enjoy his retirement.\n#Person2#: He can spend more time with his grandchildren.\n#Person1#: Oh, I don't think he wants to. He wants to travel to several different countries around the world.\n#Person2#: So, he wants to have a more active retirement. Good idea!\n#Person1#: How do you want to spend your old age?\n#Person2#: In the same way, probably.",
 'summary': "#Person1# and #Person2# are talking about their families' ages. #Person2#'s father wants to travel around the world after retirement.",
 'topic': 'age',

 # transformed data
 # - these are token ids of each token of DIALOGUE as generated by tokenizer
 'input_ids': tensor([[ 8779,   140,   125,  2817,    16,    48,  3582,     5,  1713,   345,
          13515,   536,  4663,    10,   571,   625,    19, 17017,    58,  1713,
            345, 13515,   357,  4663,    10,   216,    31,     7,  1401,     5,
            149,   625,    19,  2549,    58,  1713,   345, 13515,   536,  4663,
              ...
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]),

 # - these are token ids of each token of SUMMARY as generated by tokenizer
 'labels': tensor([[ 1713,   345, 13515,   536,  4663,    11,  1713,   345, 13515,   357,
           4663,    33,  2508,    81,    70,  1791,    31,     3,  2568,     5,
           1713,   345, 13515,   357,  4663,    31,     7,  2353,  2746,    12,
           1111,   300,     8,   296,   227,  6576,     5,     1,     0,     0,
              ...

              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]])}
```

- Each model has its maximum sequence length
- We need to handle this while tokenizing the text sequences for fine tuning
- We get error of this kind if we pass a sequence longer than that required by the model

```Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512).
Running this sequence through the model will result in indexing errors```

- Special methods like padding and truncation exist to handle cases of sequence length

In [8]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["dialogue"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda x: tokenizer(x["summary"], truncation=True),
    batched=True,
    remove_columns=["dialogue", "summary"],
)
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")



Max source length: 512
Max target length: 277


In [9]:
def summary_task_tokenize_util(sample, tokenizer, padding="max_length"):
    print("batch_size", len(sample["dialogue"]))

    # get each dialog in a batch and add instruction
    inputs = [f"Summarize:\n" + item for item in sample["dialogue"]]

    # tokenize each input in a batch
    model_inputs = tokenizer(
        inputs, max_length=max_source_length, padding=padding, truncation=True
    )

    # labels
    labels = tokenizer(
        text_target=sample["summary"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    # batch label cleaning
    # remove tokenizer pad_token and default to -100
    # if valid label i.e if not tokenizer pad_token keep token as it is
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]

    ############################################################
    # ^^^^^ setup inputs and outputs for the model ^^^^^
    # tokenize prompt for model fine tuning
    # for parallelization we need to make sure that model receives same sized vectors
    # due to varying sizes of text sequences the sequence can be shorter or longer
    # - padding if input length is shorter than that required by the model
    # - truncation if input length is greater than that required by model

    # more about padding and truncation here
    # https://huggingface.co/docs/transformers/pad_truncation
    ############################################################

    ############################################################
    # more optimized way for padding
    # transformers.DataCollatorForSeq2Seq
    # it dynamically pads sequences as it receives labels
    ############################################################

    return model_inputs


# sample test
# t = summary_task_tokenize_util(sample=dataset["train"][1000], tokenizer=tokenizer)
# t["input_ids"].shape

# mapping this function across the dataset
# - todo batched = True not working figure this out later
tokenized_datasets = dataset.map(
    summary_task_tokenize_util,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=["id", "topic", "dialogue", "summary"],
)



In [10]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [11]:
# reducing size of dataset by filtering for finetuning on small set
tokenized_datasets = tokenized_datasets.filter(
    lambda example, index: index % 100 == 0, with_indices=True
)



In [12]:
len(tokenized_datasets["train"])

125

# total parameters

In [13]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# numel returns total number of elements in the tensor
# summing number of elements in each set of parameters will give us total parameters

print(f"""total number of parameters : {numerize.numerize(total_params)}""")

total number of parameters : 247.58M


# total parameters using PEFT - LoRA
reference : https://huggingface.co/docs/peft/conceptual_guides/lora


In [14]:
from peft import LoraConfig, get_peft_model, TaskType

target_modules = ["q", "v"]  # using LoRA for attention modules only
lora_config = LoraConfig(
    r=16,
    target_modules=target_modules,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # specify task
)
lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>, inference_mode=False, r=16, target_modules=['q', 'v'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [15]:
peft_model = get_peft_model(model, lora_config)

total_params_peft = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(
    f"""total number of parameters in PEFT model: {numerize.numerize(total_params_peft)}"""
)

total number of parameters in PEFT model: 1.77M


In [16]:
print(
    f"reduction in number of paramters : {round((total_params - total_params_peft)*100/total_params,2)}"
)

reduction in number of paramters : 99.29


In [17]:
peft_model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096414524241463


# fine tuning LLM

# rouge_score

In [18]:
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """
    Reference : https://huggingface.co/docs/transformers/tasks/summarization
    """

    # get predictions and ground truth
    predictions, labels = eval_pred

    # decode prediction token ids
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # decode ground truth token ids
    # -100 is pad token id in NLP tasks
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # use decoded_preds, decoded_labels to get rouge score
    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [19]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8
)

# init PEFT trainer

In [20]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
# repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

output_dir = f"./models-peft"

# Define training args
peft_training_args = Seq2SeqTrainingArguments(
    # learning
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-3,
    num_train_epochs=10,
    fp16=False,  # Overflows with fp16
    # logging
    logging_dir=f"./logs",
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    # evaluation
    predict_with_generate=True,
    evaluation_strategy="epoch",
    # checkpointing
    output_dir=output_dir,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

peft_trainer = Seq2SeqTrainer(
    model=peft_model,
    args=peft_training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [21]:
peft_trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.35,0.2835,0.0637,0.2326,0.2324,19.0
2,No log,1.264063,0.3397,0.0985,0.2619,0.2615,18.6667
3,No log,1.260417,0.3644,0.1213,0.2845,0.2835,18.6667
4,No log,1.260937,0.3533,0.1195,0.2803,0.2802,18.9333
5,No log,1.248958,0.3664,0.1178,0.2699,0.2687,18.9333
6,No log,1.260937,0.3854,0.1285,0.2954,0.295,18.9333
7,1.292800,1.245833,0.3523,0.1057,0.2773,0.2775,18.9333
8,1.292800,1.261458,0.3749,0.1233,0.2908,0.2903,19.0
9,1.292800,1.253646,0.3887,0.1307,0.3018,0.3014,19.0
10,1.292800,1.257812,0.3695,0.1121,0.2909,0.2906,18.9333


TrainOutput(global_step=160, training_loss=1.2084716796875, metrics={'train_runtime': 308.8197, 'train_samples_per_second': 4.048, 'train_steps_per_second': 0.518, 'total_flos': 862741463040000.0, 'train_loss': 1.2084716796875, 'epoch': 10.0})

In [22]:
import locale


def getpreferredencoding(do_setlocale=True):
    return "UTF-8"


locale.getpreferredencoding = getpreferredencoding

In [23]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /roo

In [24]:
peft_trainer.model.push_to_hub(repo_id="flan-t5-dialogsum-peft")

adapter_model.bin:   0%|          | 0.00/7.13M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sagarshf/flan-t5-dialogsum-peft/commit/c34eb0f5681e05e142586f4a9643a793d857763d', commit_message='Upload model', commit_description='', oid='c34eb0f5681e05e142586f4a9643a793d857763d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# tokenizer.push_to_hub(repo_id="https://huggingface.co/sagarshf/flan-t5-dialogsum")

# load trained model

In [25]:
from transformers import AutoModel

# original model (NOT fine tuned)
model_id = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16
)

# Load tokenizer of FLAN-t5-base
tokenizer = AutoTokenizer.from_pretrained(model_id, torch_dtype=torch.bfloat16)

In [34]:
# load fully fine tuned model

ft_model_id = "sagarshf/flan-t5-dialogsum"
ft_model = AutoModelForSeq2SeqLM.from_pretrained(
    ft_model_id, torch_dtype=torch.bfloat16
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/495M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [26]:
# load peft model

from peft import PeftModel, PeftConfig

peft_model_base_id = "google/flan-t5-base"
peft_model_id = "sagarshf/flan-t5-dialogsum-peft"

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(
    peft_model_base_id, torch_dtype=torch.bfloat16
)
peft_model = PeftModel.from_pretrained(
    peft_model_base, peft_model_id, torch_dtype=torch.bfloat16, is_trainable=False
)

Downloading adapter_model.bin:   0%|          | 0.00/7.13M [00:00<?, ?B/s]

# Qualitative evaluation

In [27]:
original_model.to("cuda:0")
ft_model.to("cuda:0")
peft_model.to("cuda:0")
0

0

In [64]:
def inference_summarize(model, dialogue, ref_summary, tokenizer):
    prompt = f"""Summarize:\n{dialogue}"""

    inputs = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    model_out = model.generate(input_ids=inputs, max_new_tokens=100)[0]

    output = tokenizer.decode(model_out, skip_special_tokens=True)

    return (prompt, ref_summary, output)


def print_inference_summary(
    prompt, summary, output, which_model="MODEL", summary_only=False
):
    dash_line = "-".join("" for x in range(100))
    if not summary_only:
        print(dash_line)
        print(f"INPUT PROMPT:\n{prompt}")
        print(dash_line)
        pprint(f"BASELINE HUMAN SUMMARY:\n{summary}\n")
        print(dash_line)
        print()
        pprint(f"{which_model} GENERATION - ZERO SHOT:\n{output}")
    else:
        print()
        print(dash_line)
        pprint(f"{which_model} GENERATION - ZERO SHOT:\n{output}")


check_index = int(np.random.randint(0, 100, 1)[0])
print(check_index)

dialogue = dataset["test"][check_index]["dialogue"]
summary = dataset["test"][check_index]["summary"]

print_inference_summary(
    *inference_summarize(
        peft_model, dialogue, ref_summary=summary, tokenizer=tokenizer
    ),
    which_model="LoRA fine tuned MODEL",
)
print_inference_summary(
    *inference_summarize(ft_model, dialogue, ref_summary=summary, tokenizer=tokenizer),
    which_model="Full fine tuned MODEL",
    summary_only=True,
)
print_inference_summary(
    *inference_summarize(
        original_model, dialogue, ref_summary=summary, tokenizer=tokenizer
    ),
    which_model="NON fine tuned MODEL",
    summary_only=True,
)

23
---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Summarize:
#Person1#: Good coming. What can I do for you?
#Person2#: I'm in Room 309. I'm checking out today. Can I have my bill now?
#Person1#: Certainly. Please wait a moment. Here you are.
#Person2#: Thanks. Wait... What's this? The 30 dollar for?
#Person1#: Excuse me... The charge for your laundry service on Nov. 20th.
#Person2#: But I did't take any laundry service during my stay here. I think you have added someone else's.
#Person1#: Ummmm...Sorry, would you mind waiting a moment? We check it with the department concerned.
#Person2#: No. As long as we get this straightened out.
#Person1#: I'm very sorry. There has been a mistake. We'll correct the bill. Please take a look.
#Person2#: Okay, here you are.
#Person1#: Goodbye.
---------------------------------------------------------------------------------------------------
('BASELINE HUMAN SUMMARY:\n'
 '#Person2# fin

# Quantitative evaluation

In [33]:
len(dataset["test"])

1500

In [39]:
eval_res = dict(
    prompt=[],
    reference_summary=[],
    original_model_summary=[],
    full_finetuned_model_summary=[],
    lora_finetuned_model_summary=[],
)

for i in tqdm(range(300)):
    dialogue = dataset["test"][i]["dialogue"]
    summary = dataset["test"][i]["summary"]
    p, r, o_ft = inference_summarize(
        ft_model, dialogue, ref_summary=summary, tokenizer=tokenizer
    )
    _, _, o_peft = inference_summarize(
        peft_model, dialogue, ref_summary=summary, tokenizer=tokenizer
    )
    _, _, o_orig = inference_summarize(
        original_model, dialogue, ref_summary=summary, tokenizer=tokenizer
    )

    eval_res["prompt"].append(p)
    eval_res["reference_summary"].append(r)
    eval_res["original_model_summary"].append(o_orig)
    eval_res["lora_finetuned_model_summary"].append(o_peft)
    eval_res["full_finetuned_model_summary"].append(o_ft)

  0%|          | 0/300 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1022 > 512). Running this sequence through the model will result in indexing errors


In [48]:
eval_res["full_finetuned_model_summary"] = eval_res.pop("finetuned_model_summary")

In [51]:
df_eval = pd.DataFrame(eval_res)

In [43]:
df_eval.rename(
    columns={"finetuned_model_summary": "full_finetuned_model_summary"}
).to_csv("./eval_res_all.csv", index=False)

In [44]:
rouge = evaluate.load("rouge")

In [45]:
original_metrics = rouge.compute(
    predictions=df_eval["original_model_summary"],
    references=df_eval["reference_summary"],
)
original_metrics

{'rouge1': 0.2270195753743356,
 'rouge2': 0.07081646152587748,
 'rougeL': 0.19283084038525689,
 'rougeLsum': 0.19302823606298322}

In [52]:
full_finetuned_metrics = rouge.compute(
    predictions=df_eval["full_finetuned_model_summary"],
    references=df_eval["reference_summary"],
)
full_finetuned_metrics

{'rouge1': 0.3535591664452966,
 'rouge2': 0.12308752487663749,
 'rougeL': 0.29275391565223374,
 'rougeLsum': 0.29286722576524893}

In [53]:
lora_finetuned_metrics = rouge.compute(
    predictions=df_eval["lora_finetuned_model_summary"],
    references=df_eval["reference_summary"],
)
lora_finetuned_metrics

{'rouge1': 0.3767057630892099,
 'rouge2': 0.13020658145858943,
 'rougeL': 0.3023473962740625,
 'rougeLsum': 0.30242823055986623}

In [54]:
perc_increase = {}
for i in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
    orig_score, fine_score = original_metrics[i], lora_finetuned_metrics[i]
    inc = f"{round((fine_score-orig_score)*100/orig_score)}%"
    perc_increase[i] = inc

perc_increase

{'rouge1': '66%', 'rouge2': '84%', 'rougeL': '57%', 'rougeLsum': '57%'}

We can see that we get similar ROUGE scores to fully fine tuned model upon fine tuning with LoRA, which is a more efficient way of training using less compute and trainable parameters.