# Setup

In [1]:
# %load_ext lab_black

In [2]:
import os
import textwrap
import pandas as pd
import numpy as np
import random
from numerize import numerize


import torch

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments
import evaluate

In [3]:
# some util functions
print_dashes = lambda: print("-" * 80)
txtwrap = lambda text: textwrap.fill(text, width=80)
print_dialogue = lambda x: [print(txtwrap(text=i)) for i in x.split("\n")]

# data load

In [4]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name)

Found cached dataset csv (/Users/sagar/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# single data point
print(dataset["train"][13])

{'id': 'train_13', 'dialogue': "#Person1#: How old is Keith?\n#Person2#: He's 21. how old is James?\n#Person1#: He's a year older than Keith, but he looks younger.\n#Person2#: How's your father?\n#Person1#: He's fine. He retired last week. It's turning going in his life. Now he can relax and enjoy his retirement.\n#Person2#: He can spend more time with his grandchildren.\n#Person1#: Oh, I don't think he wants to. He wants to travel to several different countries around the world.\n#Person2#: So, he wants to have a more active retirement. Good idea!\n#Person1#: How do you want to spend your old age?\n#Person2#: In the same way, probably.", 'summary': "#Person1# and #Person2# are talking about their families' ages. #Person2#'s father wants to travel around the world after retirement.", 'topic': 'age'}


In [6]:
print(dataset["train"][13]["dialogue"])

#Person1#: How old is Keith?
#Person2#: He's 21. how old is James?
#Person1#: He's a year older than Keith, but he looks younger.
#Person2#: How's your father?
#Person1#: He's fine. He retired last week. It's turning going in his life. Now he can relax and enjoy his retirement.
#Person2#: He can spend more time with his grandchildren.
#Person1#: Oh, I don't think he wants to. He wants to travel to several different countries around the world.
#Person2#: So, he wants to have a more active retirement. Good idea!
#Person1#: How do you want to spend your old age?
#Person2#: In the same way, probably.


In [7]:
print(dataset["train"][13]["summary"])

#Person1# and #Person2# are talking about their families' ages. #Person2#'s father wants to travel around the world after retirement.


# load model

- Load model with special dtype that optimizes for memory.
- bfloat16 is a dtype that was developed by google brain
- it is 16 bit representation of 32 bit floating point
- [more about blfoat16](https://cloud.google.com/tpu/docs/bfloat16)

In [8]:
model_id = "google/flan-t5-base"

# Load tokenizer of FLAN-t5-small
tokenizer = AutoTokenizer.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# prepare dataset

Here we convert data points in model consumable format

```python

{
 # raw data   
 'id': 'train_13',
 'dialogue': "#Person1#: How old is Keith?\n#Person2#: He's 21. how old is James?\n#Person1#: He's a year older than Keith, but he looks younger.\n#Person2#: How's your father?\n#Person1#: He's fine. He retired last week. It's turning going in his life. Now he can relax and enjoy his retirement.\n#Person2#: He can spend more time with his grandchildren.\n#Person1#: Oh, I don't think he wants to. He wants to travel to several different countries around the world.\n#Person2#: So, he wants to have a more active retirement. Good idea!\n#Person1#: How do you want to spend your old age?\n#Person2#: In the same way, probably.",
 'summary': "#Person1# and #Person2# are talking about their families' ages. #Person2#'s father wants to travel around the world after retirement.",
 'topic': 'age',

 # transformed data
 # - these are token ids of each token of DIALOGUE as generated by tokenizer
 'input_ids': tensor([[ 8779,   140,   125,  2817,    16,    48,  3582,     5,  1713,   345,
          13515,   536,  4663,    10,   571,   625,    19, 17017,    58,  1713,
            345, 13515,   357,  4663,    10,   216,    31,     7,  1401,     5,
            149,   625,    19,  2549,    58,  1713,   345, 13515,   536,  4663,
              ...
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]),

 # - these are token ids of each token of SUMMARY as generated by tokenizer
 'labels': tensor([[ 1713,   345, 13515,   536,  4663,    11,  1713,   345, 13515,   357,
           4663,    33,  2508,    81,    70,  1791,    31,     3,  2568,     5,
           1713,   345, 13515,   357,  4663,    31,     7,  2353,  2746,    12,
           1111,   300,     8,   296,   227,  6576,     5,     1,     0,     0,
              ...

              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]])}
```

- Each model has its maximum sequence length
- We need to handle this while tokenizing the text sequences for fine tuning
- We get error of this kind if we pass a sequence longer than that required by the model

```Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512).
Running this sequence through the model will result in indexing errors```

- Special methods like padding and truncation exist to handle cases of sequence length

In [9]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")


Loading cached processed dataset at /Users/sagar/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-bc3bc43d36d7b29f.arrow


Max source length: 512


Map:   0%|          | 0/13960 [00:00<?, ? examples/s]

Max target length: 277


In [10]:
def summary_task_tokenize_util(sample, tokenizer, padding='max_length'):

    print('batch_size',len(sample['dialogue']))
    start_list = [
            "Summarize.",
            "Tell me what happened in this chat.",
            "Explain this conversation briefly.",
            "Give me a short version of this talk.",
            "What's the quick version of this discussion?",
            "Can you make this conversation shorter?",
            "In simple terms, what was discussed here?",
            "Give me the main points of this chat.",
            "Sum up this talk in a few sentences.",
            "Make this conversation easy to understand.",
            "Tell me the important parts of this discussion.",
        ]

    # get each dialog in a batch and add instruction
    inputs = [f"{random.sample(start_list,1)[0].replace('.','')}:\n" + item for item in sample["dialogue"]]

    # tokenize each input in a batch
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # labels
    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)

    # batch label cleaning
    # remove tokenizer pad_token and default to -100
    # if valid label i.e if not tokenizer pad_token keep token as it is
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]

    ############################################################
    # ^^^^^ setup inputs and outputs for the model ^^^^^
    # tokenize prompt for model fine tuning
    # for parallelization we need to make sure that model receives same sized vectors
    # due to varying sizes of text sequences the sequence can be shorter or longer
    # - padding if input length is shorter than that required by the model
    # - truncation if input length is greater than that required by model

    # more about padding and truncation here
    # https://huggingface.co/docs/transformers/pad_truncation
    ############################################################

    ############################################################
    # more optimized way for padding
    # transformers.DataCollatorForSeq2Seq
    # it dynamically pads sequences as it receives labels
    ############################################################

    return model_inputs


# sample test
# t = summary_task_tokenize_util(sample=dataset["train"][1000], tokenizer=tokenizer)
# t["input_ids"].shape

# mapping this function across the dataset
# - todo batched = True not working figure this out later
tokenized_datasets = dataset.map(
    summary_task_tokenize_util,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=["id", "topic", "dialogue", "summary"],
)

Loading cached processed dataset at /Users/sagar/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-5af2fcb4ae96b8cf.arrow


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

batch_size 1000


Loading cached processed dataset at /Users/sagar/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-22f982e0539b2401.arrow


batch_size 500


In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [12]:
# reducing size of dataset by filtering
tokenized_datasets = tokenized_datasets.filter(
    lambda example, index: index % 10 == 0, with_indices=True
)

Loading cached processed dataset at /Users/sagar/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-5c176eb5740ac400.arrow


Filter:   0%|          | 0/1500 [00:00<?, ? examples/s]

Loading cached processed dataset at /Users/sagar/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-a6b3309590f05055.arrow


In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1246
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 150
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
})

# total parameters

In [14]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# numel returns total number of elements in the tensor
# summing number of elements in each set of parameters will give us total parameters

print(f"""total number of parameters : {numerize.numerize(total_params)}""")

total number of parameters : 247.58M


# fine tuning LLM

## rouge_score

In [15]:
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """
    Reference : https://huggingface.co/docs/transformers/tasks/summarization
    """

    # get predictions and ground truth
    predictions, labels = eval_pred

    # decode prediction token ids
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # decode ground truth token ids
    # -100 is pad token id in NLP tasks
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # use decoded_preds, decoded_labels to get rouge score
    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [17]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hugging Face repository id
# repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir='./models',
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=5,
    # logging & evaluation strategies
    logging_dir=f"./logs",
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    # hub_strategy="every_save",
    # hub_model_id=repository_id,
    # hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


## complete fine tuning of model