# Parameter-Efficient Finetuning (PEFT) with Low-Level Adaptation (LORA) using HuggingFace PEFT on a single GPU

In [None]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [None]:
# input constants
import os
import dotenv

dotenv.load_dotenv()

HF_PRETRAINED_MODEL_NAME = "google/flan-t5-base" # "distilbert/distilbert-base-uncased"
HF_DATASET_NAME = "knkarthick/dialogsum"

TRAINING_EPOCHS = int(os.getenv('TRAINING_EPOCHS'))
TRAINING_BATCH_SIZE = int(os.getenv('TRAINING_BATCH_SIZE'))
TRAINING_LEARNING_RATE = float(os.getenv('TRAINING_LEARNING_RATE'))
TRAINING_DEVICE = 'gpu' # one of ['cpu', 'gpu', 'mps']

LORA_TARGET_MODULES=[
    "q", 
    "v"
]
LORA_R = int(os.getenv('LORA_R'))
LORA_ALPHA = int(os.getenv('LORA_ALPHA'))
LORA_DROPOUT = float(os.getenv('LORA_DROPOUT'))

OUTPUT_DIRECTORY = os.path.join('trained', HF_PRETRAINED_MODEL_NAME)
HUGGINGFACE_REPO_ID = os.getenv('HUGGINGFACE_REPO_ID')

if TRAINING_DEVICE == 'gpu':
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
print(f"HF pretrained model name: {HF_PRETRAINED_MODEL_NAME}")
print(f"HF dataset name: {HF_DATASET_NAME}")

print(f"epochs: {TRAINING_EPOCHS}")
print(f"batch_size: {TRAINING_BATCH_SIZE}")
print(f"learning rate (lr): {TRAINING_LEARNING_RATE}")

print(f"LORA r: {LORA_R}")
print(f"LORA alpha: {LORA_ALPHA}")
print(f"LORA droupout: {LORA_DROPOUT}")

print(f"Using {TRAINING_DEVICE} device")

HF pretrained model name: google/flan-t5-base
HF datasets name: knkarthick/dialogsum
LORA r: 8
LORA alpha: 32
LORA droupout: 0.1
epochs: 5
batch_size: 64
learning rate (lr): 0.001
Using gpu device


# Download Training Data

In [None]:
# download datasets: train, validation, test
from datasets import load_dataset

dataset = load_dataset(HF_DATASET_NAME)  # doctest: +IGNORE_RESULT

In [None]:
import json
print(f"dataset: {[k for k in dataset]}")
topics = set()
for dataset_key in dataset:
    print(f"len({dataset_key}): {len(dataset[dataset_key])}")
    [topics.add(t) for t in dataset[dataset_key]['topic']]
print(f"train dataset: {dataset['train']}")
print(f"train dataset features: {dataset['train'].features}")
print(f"topics ({len(topics)} unique), first 10: {[topics.pop() for i in range(10)]}")
for i in range(3):
    print(f"Example ({i}): {json.dumps(dataset['train'][i], indent=2)}")

datasets: ['train', 'validation', 'test']
len(train): 12460
len(validation): 500
len(test): 1500
train dataset: Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})
train dataset features: {'id': Value(dtype='string', id=None), 'dialogue': Value(dtype='string', id=None), 'summary': Value(dtype='string', id=None), 'topic': Value(dtype='string', id=None)}
topics (8521 unique), first 10: ['phone numbers', 'having breakfast', 'married life', 'receipt', ' Biography of Dean', 'cultural shock', 'fashion awards', 'concentration problems', 'independent life', 'A fishy website']
Example (0): {
  "id": "train_0",
  "dialogue": "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to

# Model and Tokenizer

In [11]:
# download tokenizer
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(HF_PRETRAINED_MODEL_NAME)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
# download model
from transformers import T5ForConditionalGeneration

base_model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=HF_PRETRAINED_MODEL_NAME
)
base_model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [None]:
# test inference
input_text = dataset['train'][0]['dialogue']
print(f"==INPUT TEXT==:\n{input_text}")
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = base_model.generate(inputs=input_ids, max_length=4000)
print(f"==OUTPUT==:\n{tokenizer.decode(outputs[0])}")
print(f"==EXPECTED==:\n{dataset['train'][0]['summary']}")

==INPUT TEXT==:
#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, thanks doctor.
==OUTPUT==:
<pad> Dr. Hawkins is here to help.</s>
=

# Fine-tuning configuration

In [None]:
# tokenize the dataset
# Hugging Face Transformers models expect tokenized input, 
# rather than the text in the downloaded data.
def tokenize_dataset(dataset):
    prompt = [f"Summarize the following dialogue:\n\n{dialogue}\n\nSummary:" 
              for dialogue in dataset["dialogue"]]
    dataset['input_ids'] = tokenizer(
        prompt,
        padding='max_length', 
        truncation=True, 
        return_tensors='pt').input_ids
    dataset['labels'] = tokenizer(
        dataset['summary'], 
        padding='max_length', 
        truncation=True, 
        return_tensors='pt').input_ids
    return dataset

encoded_dataset = dataset.map(
    tokenize_dataset, 
    batched=True,
    remove_columns=['id', 'topic', 'dialogue', 'summary'])

In [None]:
import json
print(f"encoded dataset: {[k for k in encoded_dataset]}")
for dataset_key in encoded_dataset:
    print(f"len({dataset_key}): {len(encoded_dataset[dataset_key])}")
print(f"train dataset: {encoded_dataset['train']}")
print(f"train dataset features: {encoded_dataset['train'].features}")
for i in range(3):
    print(f"Example ({i}): {json.dumps(encoded_dataset['train'][i], indent=2)}")

datasets: ['train', 'validation', 'test']
len(train): 12460
len(validation): 500
len(test): 1500
train dataset: Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 12460
})
train dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Example (0): {
  "input_ids": [
    12198,
    1635,
    1737,
    8,
    826,
    7478,
    10,
    1713,
    345,
    13515,
    536,
    4663,
    10,
    2018,
    6,
    1363,
    5,
    3931,
    5,
    27,
    31,
    51,
    7582,
    12833,
    77,
    7,
    5,
    1615,
    33,
    25,
    270,
    469,
    58,
    1713,
    345,
    13515,
    357,
    4663,
    10,
    27,
    435,
    34,
    133,
    36,
    3,
    9,
    207,
    800,
    12,
    129,
    3,
    9,
    691,
    18,
    413,
    5,
    1713,
    345,
    13515,
    536,
    4663,
    10,
    2163,
    6,
    168,
    6,
    25,
    43,
    2

In [None]:
# configure LoRA
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # defines the expected fields of the tokenized dataset
    target_modules=LORA_TARGET_MODULES,  # model modules to apply LoRA to
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)

In [None]:
# wrap model with PEFT config
from peft import get_peft_model

peft_wrapped_model = get_peft_model(base_model, peft_config)
peft_wrapped_model.print_trainable_parameters()

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


# Training Job

## Training with Transformers for Pytorch

In [18]:
# data loader/collator to batch input in training and evaluation datasets
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# configure evaluation metrics 
# in addition to the default `loss` metric that the `Trainer` computes
import numpy as np
import evaluate

evaluation_module = evaluate.load("accuracy")

def compute_metrics(eval_pred, evaluation_module=evaluation_module):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return evaluation_module.compute(predictions=predictions, references=labels)

In [None]:
# [OPTIONAL] clean up the GPU memory
if TRAINING_DEVICE == 'gpu':
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()

In [None]:
# train job config
# Hugging Face training configuration tools can be used to configure a Trainer.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=OUTPUT_DIRECTORY,
    
    #do_train=True,
    #do_eval=True,

    num_train_epochs=TRAINING_EPOCHS,
    per_device_train_batch_size=TRAINING_BATCH_SIZE,
    per_device_eval_batch_size=TRAINING_BATCH_SIZE,
    learning_rate=TRAINING_LEARNING_RATE,
    
    weight_decay=0.01,
    #gradient_accumulation_steps=2,  # default 1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # metric_for_best_model="f1"
    
    #fp16=True,  # lower precision
    # use_ipex=True if DEVICE == 'cpu' else False,  # use Intel extension for PyTorch
    use_cpu=True if TRAINING_DEVICE == 'cpu' else False  # False will use CUDA or MPS if available
)

In [None]:
# The Trainer classes require the user to provide: 1) Metrics 2) A base model 3) A training configuration
from transformers import Trainer

trainer = Trainer(
    model=peft_wrapped_model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

In [None]:
trainer.train()

# Store Model

In [None]:
# save model
import os

os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
peft_wrapped_model.save_pretrained(OUTPUT_DIRECTORY)
tokenizer.save_pretrained(OUTPUT_DIRECTORY)

In [None]:
# save on Huggingface
from huggingface_hub import notebook_login

notebook_login()
peft_wrapped_model.push_to_hub(HUGGINGFACE_REPO_ID)