# Parameter-Efficient Finetuning (PEFT) with Low-Level Adaptation (LORA) using HuggingFace PEFT

Sequence-to-Sequence model fine-tuning

In [2]:
# input constants
import os
import dotenv

dotenv.load_dotenv()

HF_DATASETS_NAME = "knkarthick/dialogsum"
HF_PRETRAINED_MODEL_NAME = "google/flan-t5-base"
DEVICE = 'cpu'

LORA_R = int(os.getenv('LORA_R'))
LORA_ALPHA = int(os.getenv('LORA_ALPHA'))
LORA_DROPOUT = float(os.getenv('LORA_DROPOUT'))

EPOCHS = int(os.getenv('EPOCHS'))
BATCH_SIZE = int(os.getenv('BATCH_SIZE'))
LEARNING_RATE = float(os.getenv('LEARNING_RATE'))

OUTPUT_DIR = os.path.join('trained', HF_PRETRAINED_MODEL_NAME)
HUGGINGFACE_REPO_ID = os.getenv('HUGGINGFACE_REPO_ID')

In [5]:
print(f"HF pretrained model name: {HF_PRETRAINED_MODEL_NAME}")
print(f"HF datasets name: {HF_DATASETS_NAME}")

print(f"LORA r: {LORA_R}")
print(f"LORA alpha: {LORA_ALPHA}")
print(f"LORA droupout: {LORA_DROPOUT}")

print(f"epochs: {EPOCHS}")
print(f"batch_size: {BATCH_SIZE}")
print(f"learning rate (lr): {LEARNING_RATE}")

print(f"Using {DEVICE} device")

HF pretrained model name: google/flan-t5-base
HF datasets name: knkarthick/dialogsum
LORA r: 8
LORA alpha: 32
LORA droupout: 0.1
epochs: 5
batch_size: 64
learning rate (lr): 0.001
Using cpu device


# EDA

In [6]:
# download datasets: train, validation, test
from datasets import load_dataset

datasets = load_dataset(HF_DATASETS_NAME)  # doctest: +IGNORE_RESULT

In [7]:
import json
print(f"datasets: {[k for k in datasets]}")
topics = set()
for dataset_key in datasets:
    print(f"len({dataset_key}): {len(datasets[dataset_key])}")
    [topics.add(t) for t in datasets[dataset_key]['topic']]
print(f"train dataset: {datasets['train']}")
print(f"train dataset features: {datasets['train'].features}")
print(f"topics ({len(topics)} unique), first 10: {[topics.pop() for i in range(10)]}")
for i in range(3):
    print(f"Example ({i}): {json.dumps(datasets['train'][i], indent=2)}")

datasets: ['train', 'validation', 'test']
len(train): 12460
len(validation): 500
len(test): 1500
train dataset: Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})
train dataset features: {'id': Value(dtype='string', id=None), 'dialogue': Value(dtype='string', id=None), 'summary': Value(dtype='string', id=None), 'topic': Value(dtype='string', id=None)}
topics (8521 unique), first 10: ['aging problem', ' indemnity', 'plans for Saturday', 'vacation in Thailand', 'family', 'explanation', 'different apples', 'delays on flights', 'husband hunting', 'check-up']
Example (0): {
  "id": "train_0",
  "dialogue": "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid seriou

# Model and Tokenizer

In [10]:
# download tokenizer
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained(HF_PRETRAINED_MODEL_NAME)

In [11]:
# download model
from transformers import T5ForConditionalGeneration

base_model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=HF_PRETRAINED_MODEL_NAME
)
base_model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [12]:
# test inference
input_text = datasets['train'][0]['dialogue']
print(f"==INPUT TEXT==:\n{input_text}")
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
outputs = base_model.generate(inputs=input_ids, max_length=4000)
print(f"==OUTPUT==:\n{tokenizer.decode(outputs[0])}")
print(f"==EXPECTED==:\n{datasets['train'][0]['summary']}")

==INPUT TEXT==:
#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, thanks doctor.
==OUTPUT==:
<pad> Dr. Hawkins is here to help.</s>
=

# Finetuning configuration

In [13]:
# tokenize the dataset
def tokenize_dataset(dataset):
    prompt = [f"Summarize the following dialogue:\n\n{dialogue}\n\nSummary:" 
              for dialogue in dataset["dialogue"]]
    dataset['input_ids'] = tokenizer(
        prompt,
        padding='max_length', 
        truncation=True, 
        return_tensors='pt').input_ids
    dataset['labels'] = tokenizer(
        dataset['summary'], 
        padding='max_length', 
        truncation=True, 
        return_tensors='pt').input_ids
    return dataset

encoded_datasets = datasets.map(
    tokenize_dataset, 
    batched=True,
    remove_columns=['id', 'topic', 'dialogue', 'summary'])

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [14]:
import json
print(f"datasets: {[k for k in encoded_datasets]}")
for dataset_key in encoded_datasets:
    print(f"len({dataset_key}): {len(encoded_datasets[dataset_key])}")
print(f"train dataset: {encoded_datasets['train']}")
print(f"train dataset features: {encoded_datasets['train'].features}")
for i in range(3):
    print(f"Example ({i}): {json.dumps(encoded_datasets['train'][i], indent=2)}")

# # test inference
# input_ids = encoded_datasets['train'][0]['input_ids']
# outputs = base_model.generate(inputs=input_ids, max_length=4000)
# print(f"==OUTPUT==:\n{tokenizer.decode(outputs[0])}")
# print(f"==EXPECTED==:\n{datasets['train'][0]['summary']}")

datasets: ['train', 'validation', 'test']
len(train): 12460
len(validation): 500
len(test): 1500
train dataset: Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 12460
})
train dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}
Example (0): {
  "input_ids": [
    12198,
    1635,
    1737,
    8,
    826,
    7478,
    10,
    1713,
    345,
    13515,
    536,
    4663,
    10,
    2018,
    6,
    1363,
    5,
    3931,
    5,
    27,
    31,
    51,
    7582,
    12833,
    77,
    7,
    5,
    1615,
    33,
    25,
    270,
    469,
    58,
    1713,
    345,
    13515,
    357,
    4663,
    10,
    27,
    435,
    34,
    133,
    36,
    3,
    9,
    207,
    800,
    12,
    129,
    3,
    9,
    691,
    18,
    413,
    5,
    1713,
    345,
    13515,
    536,
    4663,
    10,
    2163,
    6,
    168,
    6,
    25,
    43,
    2

In [15]:
# configure PEFT
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # defines the expected fields of the tokenized dataset
    target_modules=["q", "v"],  # model modules to apply LoRA to
    r=LORA_R, 
    lora_alpha=LORA_ALPHA, 
    lora_dropout=LORA_DROPOUT,
)

In [16]:
# wrap model with PEFT config
from peft import get_peft_model

peft_wrapped_model = get_peft_model(base_model, peft_config)
peft_wrapped_model.print_trainable_parameters()

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


# Training Job

## Training with Transformers for Pytorch

In [17]:
# create batch data loader/collator
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# train job config 
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
        
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # metric_for_best_model="f1"
    
    bf16=True if DEVICE == 'cpu' else False,
    # use_ipex=True if DEVICE == 'cpu' else False,
    use_cpu=True if DEVICE == 'cpu' else False
)

In [22]:
from transformers import Trainer

trainer = Trainer(
    model=peft_wrapped_model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [23]:
trainer.train()

  0%|          | 0/975 [00:00<?, ?it/s]

  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


: 

# Store Model

In [None]:
peft_wrapped_model.save_pretrained(OUTPUT_DIR)

In [1]:
# save on Huggingface
from huggingface_hub import notebook_login

notebook_login()
peft_wrapped_model.push_to_hub("HUGGINGFACE_REPO_ID")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

NameError: name 'peft_wrapped_model' is not defined