# BART-Large-CNN Fine-Tuning with LoRA for Chat & Dialogue Summarization

Adapted from the FLAN-T5 PEFT fine-tuning tutorial: https://www.philschmid.de/fine-tune-flan-t5-peft

In [None]:
# Print the NVIDIA GPU/driver status of this runtime.
!nvidia-smi

## Dependencies

In [1]:
# install Hugging Face Libraries
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install -q rouge-score tensorboard py7zr


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; later cells push the model and tokenizer to the Hub.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Dataset Preprocessing

### Load Dataset SAMSum

In [3]:
from datasets import load_dataset

# Pull the SAMSum dataset from the Hugging Face Hub.
dataset = load_dataset("samsum")

# Report the number of examples in the two splits used by this notebook.
train_size = len(dataset["train"])
test_size = len(dataset["test"])
print(f"Train dataset size: {train_size}")
print(f"Test dataset size: {test_size}")

# Recorded run:
# Train dataset size: 14732
# Test dataset size: 819



  0%|          | 0/3 [00:00<?, ?it/s]

Train dataset size: 14732
Test dataset size: 819


### Initiate Tokenizer

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub checkpoint id; reused when the base model is loaded in a later cell.
model_id = "facebook/bart-large-cnn"

# Load the tokenizer that belongs to the BART checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Preprocess

In [5]:
from datasets import concatenate_datasets
import numpy as np


def _length_percentile(examples, column, percentile):
    """Tokenize one text column and summarize its token-length distribution.

    Args:
        examples: Dataset exposing "dialogue" and "summary" text columns.
        column: Name of the text column to tokenize.
        percentile: Percentile (0-100) of the token lengths to report.

    Returns:
        Tuple ``(lengths, cutoff)``: the per-example token counts and the requested
        percentile of those counts, truncated to an int.
    """
    tokenized = examples.map(
        lambda batch: tokenizer(batch[column], truncation=True),
        batched=True,
        remove_columns=["dialogue", "summary"],
    )
    lengths = [len(ids) for ids in tokenized["input_ids"]]
    return lengths, int(np.percentile(lengths, percentile))


# Pool train and test once so both statistics are computed over the same examples.
all_examples = concatenate_datasets([dataset["train"], dataset["test"]])

# Maximum source length after tokenization: the 85th percentile of the dialogue
# token lengths. Longer inputs are truncated and shorter ones padded in preprocessing.
input_lenghts, max_source_length = _length_percentile(all_examples, "dialogue", 85)

print(f"Max source length: {max_source_length}")

# Maximum target length after tokenization: the 90th percentile of the summary
# token lengths. Longer targets are truncated and shorter ones padded in preprocessing.
target_lenghts, max_target_length = _length_percentile(all_examples, "summary", 90)

print(f"Max target length: {max_target_length}")

  0%|          | 0/16 [00:00<?, ?ba/s]

Max source length: 270


  0%|          | 0/16 [00:00<?, ?ba/s]

Max target length: 47


In [6]:
def preprocess_function(sample, padding="max_length", prefix="summarize: "):
    """Tokenize a batch of SAMSum examples into model inputs and labels.

    Args:
        sample: Batch mapping with "dialogue" and "summary" lists of strings.
        padding: Padding strategy forwarded to the tokenizer for inputs and targets.
        prefix: Text prepended to every dialogue before tokenization. The default is
            the instruction prefix inherited from the T5 recipe; whatever value is
            used for training must also be prepended to inputs at inference time.

    Returns:
        The tokenizer output for the prefixed dialogues with an added "labels" entry
        holding the target token ids. When ``padding == "max_length"``, pad tokens in
        the labels are replaced by -100 so they are ignored by the loss.
    """
    inputs = [prefix + dialogue for dialogue in sample["dialogue"]]

    # Tokenize the (prefixed) source dialogues.
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize the targets via the `text_target` keyword argument.
    labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = labels["input_ids"]

    # Padding is only present in the labels when we pad to max length here; mask it
    # with -100 so it does not contribute to the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split in batches and drop the raw text/id columns.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["dialogue", "summary", "id"],
)
tokenized_columns = list(tokenized_dataset["train"].features)
print(f"Keys of tokenized dataset: {tokenized_columns}")

# Persist the tokenized splits so the evaluation cell can reload them from disk.
save_targets = {"train": "bart-sam-data/train", "test": "bart-sam-data/eval"}
for split_name, target_dir in save_targets.items():
    tokenized_dataset[split_name].save_to_disk(target_dir)


  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

## Fine-Tune the Model with LoRA using PEFT

In [7]:
from transformers import AutoModelForSeq2SeqLM

# Load the base seq2seq checkpoint from the Hub and let `device_map="auto"` place it.
# 8-bit loading (`load_in_8bit=True`) is left disabled.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    device_map="auto",
)


Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [8]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=8, # LoRA rank; 16 is the noted alternative
 lora_alpha=32,
 target_modules=None, # None lets peft pick the default target modules for this model type
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# int8 preparation is skipped: the base model is not loaded in 8-bit, and the call
# below is noted as not working for BART.
# model = prepare_model_for_int8_training(model) # --> Doesn't work for BART.

# Wrap the base model with the LoRA adapter and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Recorded output for this run:
# trainable params: 1179648 || all params: 510410752 || trainable%: 0.23111738837351137



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.8/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)


trainable params: 1179648 || all params: 510410752 || trainable%: 0.23111738837351137


### Create a DataCollator that will take care of padding inputs and labels. Use the DataCollatorForSeq2Seq from the HF Transformers library.

In [9]:
from transformers import DataCollatorForSeq2Seq

# Label positions holding this id are ignored by the loss; preprocessing already
# replaced pad tokens in the labels with the same value.
label_pad_token_id = -100

# Collator that batches (and pads, to a multiple of 8) inputs and labels.
collator_options = {
    "model": model,
    "label_pad_token_id": label_pad_token_id,
    "pad_to_multiple_of": 8,
}
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)


### Define the hyperparameters (TrainingArguments) for training

In [10]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Local output directory; the recorded run also used this name for the Hub repository.
output_dir = "bart-large-cnn-finetuned-samsum-lora"

# Training hyperparameters.
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,
    learning_rate=5e-4,  # 1e-3 is the noted higher alternative
    auto_find_batch_size=True,
    # Logging: TensorBoard events under the output directory, every 500 steps.
    report_to="tensorboard",
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    # Checkpoint saving is disabled; the result is pushed to the Hub instead.
    save_strategy="no",
    push_to_hub=True,
)

# Trainer wired to the LoRA-wrapped model, the tokenized train split and the collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Silence the cache warnings during training. Please re-enable for inference!
model.config.use_cache = False


Cloning https://huggingface.co/sooolee/bart-large-cnn-finetuned-samsum-lora into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/1.90G [00:00<?, ?B/s]

Download file logs/events.out.tfevents.1682217341.5cfe7eb1d133.1082.2: 100%|##########| 8.08k/8.08k [00:00<?, …

Download file training_args.bin: 100%|##########| 3.62k/3.62k [00:00<?, ?B/s]

Clean file logs/events.out.tfevents.1682217341.5cfe7eb1d133.1082.2:  12%|#2        | 1.00k/8.08k [00:00<?, ?B/…

Download file logs/1682217341.1537435/events.out.tfevents.1682217341.5cfe7eb1d133.1082.3: 100%|##########| 5.9…

Clean file training_args.bin:  28%|##7       | 1.00k/3.62k [00:00<?, ?B/s]

Clean file logs/1682217341.1537435/events.out.tfevents.1682217341.5cfe7eb1d133.1082.3:  17%|#6        | 1.00k/…

Clean file pytorch_model.bin:   0%|          | 1.00k/1.90G [00:00<?, ?B/s]

### Train

In [11]:
# Run the LoRA fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.5317
1000,1.4631
1500,1.4399
2000,1.4041
2500,1.3293
3000,1.3402
3500,1.3514
4000,1.2905
4500,1.247
5000,1.266


TrainOutput(global_step=9210, training_loss=1.2776606109320923, metrics={'train_runtime': 2202.1521, 'train_samples_per_second': 33.449, 'train_steps_per_second': 4.182, 'total_flos': 4.254324236746752e+16, 'train_loss': 1.2776606109320923, 'epoch': 5.0})

In [None]:
# Push the trainer's output to the Hugging Face Hub (this cell has no recorded execution).
trainer.push_to_hub()

In [14]:
# Push the LoRA-wrapped model and the tokenizer to the same Hub repository.
hub_repo_id = "bart-large-cnn-finetuned-samsum-lora"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

adapter_model.bin:   0%|          | 0.00/4.77M [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/sooolee/bart-large-cnn-finetuned-samsum-lora/commit/025bd6a492e258dc88c43e53e1f191d444d0e563', commit_message='Upload tokenizer', commit_description='', oid='025bd6a492e258dc88c43e53e1f191d444d0e563', pr_url=None, pr_revision=None, pr_num=None)

In [15]:
# Save the LoRA model and the tokenizer to a local directory; the inference
# section below reloads both from this path.
peft_model_id="bart-results"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
# To also save the wrapped base model, uncomment:
# trainer.model.base_model.save_pretrained(peft_model_id)


('bart-results/tokenizer_config.json',
 'bart-results/special_tokens_map.json',
 'bart-results/vocab.json',
 'bart-results/merges.txt',
 'bart-results/added_tokens.json',
 'bart-results/tokenizer.json')

## Evaluate & Run Inference


In [16]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Read the adapter config from the local save directory; it records which base
# checkpoint the adapter was trained on.
peft_model_id = "bart-results"
config = PeftConfig.from_pretrained(peft_model_id)
base_checkpoint = config.base_model_name_or_path

# Rebuild the base model and tokenizer from that checkpoint (8-bit loading stays disabled).
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Attach the saved LoRA weights and switch to evaluation mode.
model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto")
model.eval()

print("Peft model loaded")


Peft model loaded


In [22]:
# Load the dataset again and summarize one randomly chosen test dialogue.

from datasets import load_dataset
from random import randrange


# Load dataset from the hub and get a sample
dataset = load_dataset("samsum")
sample = dataset["test"][randrange(len(dataset["test"]))]

# The adapter was fine-tuned on dialogues prefixed with "summarize: "
# (see preprocess_function), so the same prefix is applied at inference time.
encoded = tokenizer("summarize: " + sample["dialogue"], return_tensors="pt", truncation=True).to("cuda")

# No trailing comma after the call: `outputs` is the generated tensor itself, not a
# 1-tuple wrapping it. The attention mask is passed explicitly instead of being inferred.
outputs = model.generate(
    input_ids=encoded["input_ids"],
    attention_mask=encoded["attention_mask"],
    do_sample=True,
    top_p=0.9,
)
print(f"input sentence: {sample['dialogue']}\n{'---'* 20}")

print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")




  0%|          | 0/3 [00:00<?, ?it/s]

input sentence: Robin: Btw, do you know anything about that poker platform?
Jacks: Which one? The Bulls Eye one?
Robin: Yeah
Jacks: I played on it several times, it's good!
Robin: I'm trying to read up on the game
Jacks: It takes time, but once you get into it it's fun
Jacks: I made a few bucks on it
Robin: Nice
Robin: Any well worth tricks to share? 😂
Jacks: Well it's more about strategy mate
Jacks: I can send you a few links to some good sites
Jacks: Check this out
Jacks: <file_other>
Jacks: <file_other>
Jacks: <file_other>
Robin: Cheers! I'll take a look
------------------------------------------------------------
summary:
Jacks played poker on the Bulls Eye platform several times and he made a few bucks on it. Robin is trying to read up on the game. He will take a look at the links to some good poker sites.    Jack can send  


In [None]:
# contents = "blah blah blah"
# inputs = tokenizer("summarize: " + contents, return_tensors="pt")

# with torch.no_grad():
#     outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=10)
#     print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

In [38]:
text = "okay so hi everyone so this is going to  be the repeat stream of uh my deep speed  tutorial which is a distributed systems  library for uh making training larger  models a lot easier  and so a lot of this comes down to like  a whole bunch of bag of tricks that i  thought this library does a really good  job of covering and discussing like what  they are  like how they work and why they're  important and so i'm just going to give  you like all the background you'll need  to pretty much understand all of these  tricks and hopefully sort of train  larger models yourself and go beyond  like just like a single  v100 or a single a single machine setup  um so yeah so i guess like  without further ado we can just get  started right so really like if you come  here like the first claim that they're  making is like look we can train 10  times larger models 10 times faster with  minimal code change  and the idea behind the minimal code  change is really really important  because ultimately you just want to be  able to say like have your model  definition for example in a high level  framework like pytorch or tensorflow  so you'll have your init function your  void function and then when you say  model.fit or model.train or whatever  uh these things should just work across  multiple machines  and it's as far as you're concerned like  this is totally  uh you know like happening without your  knowledge uh but of course making it  debuggable and stuff is a lot easier  it's just that it's possible for you to  start doing this stuff  uh without spending too much time  becoming an expert in distributed  systems  uh so really this so so there's a couple  of a couple of important ideas here  uh the first one is that like okay well  we can they can train over larger models  making these things really memory  efficient so there's a couple of tricks  related to  another people called zero which  essentially gives you different ways of  sharding  uh models across multiple devices 
and  across the cpu and i'll talk about this  stuff in a second  it also talks about uh scaling to  extremely long sequence links so when  you think of something like transformers  well for the most part the default ones  scale to up to 128 sequences you can do  more by let's say if you have uh like  you want to work over a  like sequence of lengths 256 you could  concatenate two 128 representations and  you know just use that as a  representation for something that's a  sequence length 356."
# The adapter was fine-tuned on inputs prefixed with "summarize: ", so the same
# prefix is applied here; the attention mask is passed explicitly to generate().
inputs = tokenizer("summarize: " + text, return_tensors="pt", truncation=True).to("cuda")
with torch.no_grad():
    # Nucleus sampling, capped at 57 newly generated tokens.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=57,
        do_sample=True,
        top_p=0.9,
    )
    # Print the decoded string itself (index 0), consistent with the other inference cells,
    # instead of the repr of a one-element list.
    print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")

summary:
['This is a repeat stream of my deep speed tutorial on a distributed systems library for making training larger models easier. There are some tricks to make it easier to train larger models with minimal code changes. Zero gives you ways of …. The library also.  ']


In [35]:
text = "okay so hi everyone so this is going to  be the repeat stream of uh my deep speed  tutorial which is a distributed systems  library for uh making training larger  models a lot easier  and so a lot of this comes down to like  a whole bunch of bag of tricks that i  thought this library does a really good  job of covering and discussing like what  they are  like how they work and why they're  important and so i'm just going to give  you like all the background you'll need  to pretty much understand all of these  tricks and hopefully sort of train  larger models yourself and go beyond  like just like a single  v100 or a single a single machine setup  um so yeah so i guess like  without further ado we can just get  started right so really like if you come  here like the first claim that they're  making is like look we can train 10  times larger models 10 times faster with  minimal code change  and the idea behind the minimal code  change is really really important  because ultimately you just want to be  able to say like have your model  definition for example in a high level  framework like pytorch or tensorflow  so you'll have your init function your  void function and then when you say  model.fit or model.train or whatever  uh these things should just work across  multiple machines  and it's as far as you're concerned like  this is totally  uh you know like happening without your  knowledge uh but of course making it  debuggable and stuff is a lot easier  it's just that it's possible for you to  start doing this stuff  uh without spending too much time  becoming an expert in distributed  systems  uh so really this so so there's a couple  of a couple of important ideas here  uh the first one is that like okay well  we can they can train over larger models  making these things really memory  efficient so there's a couple of tricks  related to  another people called zero which  essentially gives you different ways of  sharding  uh models across multiple devices 
and  across the cpu and i'll talk about this  stuff in a second  it also talks about uh scaling to  extremely long sequence links so when  you think of something like transformers  well for the most part the default ones  scale to up to 128 sequences you can do  more by let's say if you have uh like  you want to work over a  like sequence of lengths 256 you could  concatenate two 128 representations and  you know just use that as a  representation for something that's a  sequence length 356."
# The adapter was fine-tuned on inputs prefixed with "summarize: ", so the same
# prefix is applied here; the attention mask is passed explicitly to generate().
inputs = tokenizer("summarize: " + text, return_tensors="pt", truncation=True).to("cuda")
with torch.no_grad():
    # No sampling arguments: generation uses the model's default generation settings.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
    print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")

summary:
This is a repeat stream of my deep speed tutorial on a distributed systems library for making training larger models easier. The library makes it possible to train 10 times larger models 10 times faster with minimal code change. Zero gives the …    etc.  


Next, evaluate the model against the preprocessed SAMSum test set. To do that, we need a few utilities that generate the summaries and collect them together with their references. The most commonly used metric for evaluating summarization is ROUGE (short for Recall-Oriented Understudy for Gisting Evaluation). This metric does not behave like standard accuracy: it compares a generated summary against a set of reference summaries.

In [26]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# Metric: load ROUGE through the `evaluate` library; it scores the generated
# summaries against the references further down in this cell.
metric = evaluate.load("rouge")

def evaluate_peft_model(sample, max_target_length=55):
    """Generate a summary for one tokenized example and decode its reference.

    Args:
        sample: Mapping with 1-D "input_ids" and "labels" tensors, and optionally an
            "attention_mask" tensor, as produced by the saved tokenized dataset.
        max_target_length: Maximum number of new tokens to generate.

    Returns:
        Tuple ``(prediction, reference)`` of decoded strings.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    generate_kwargs = {"input_ids": sample["input_ids"].unsqueeze(0).to(device)}

    # The stored inputs are padded to a fixed length, so pass the mask explicitly
    # rather than relying on generate() to infer it from the pad token.
    if "attention_mask" in sample:
        generate_kwargs["attention_mask"] = sample["attention_mask"].unsqueeze(0).to(device)

    # Generate the summary with nucleus sampling.
    outputs = model.generate(do_sample=True, top_p=0.9, max_new_tokens=max_target_length, **generate_kwargs)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # Decode the reference: -100 marks ignored label positions and cannot be decoded,
    # so map it back to the pad token, which is then skipped as a special token.
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)

    return prediction, labels

# Load the tokenized test split from disk as torch tensors.
test_dataset = load_from_disk("bart-sam-data/eval/").with_format("torch")

# Run predictions over the whole test split.
# The recorded run took ~28 minutes for 819 examples.
predictions, references = [], []
for sample in tqdm(test_dataset):
    prediction, reference = evaluate_peft_model(sample)
    predictions.append(prediction)
    references.append(reference)

# Compute ROUGE over all prediction/reference pairs.
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# Print results as percentages with two decimals
# (`.2f`; the former `2f` was a field width, not a precision).
for rouge_name in ("rouge1", "rouge2", "rougeL", "rougeLsum"):
    print(f"{rouge_name}: {rogue[rouge_name] * 100:.2f}%")

# Recorded run (generation uses sampling, so scores vary slightly between runs):
# rouge1: 43.12%
# rouge2: 21.56%
# rougeL: 33.41%
# rougeLsum: 33.41%


100%|██████████| 819/819 [28:03<00:00,  2.06s/it]


Rogue1: 43.115465%
rouge2: 21.563061%
rougeL: 33.409979%
rougeLsum: 33.414162%
