# Parameter-Efficient Fine-tuning (PEFT) with Low-Level Adaptation (LORA) of a sequence classification base model to perform sequence classification task using HuggingFace PEFT on a single GPU

base model task: Sequence-to-Label

new model task: Sequence-to-Label

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# input constants
import os
import dotenv
import torch

dotenv.load_dotenv()

HF_DATASETS_NAME = "google-research-datasets/go_emotions"
HF_PRETRAINED_MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
DEVICE = 'cpu' # 'cuda' if torch.cuda.is_available() else 'cpu'  # when debugging use 'cpu' for better error messages 

LORA_R = int(os.getenv('LORA_R'))
LORA_ALPHA = int(os.getenv('LORA_ALPHA'))
LORA_DROPOUT = float(os.getenv('LORA_DROPOUT'))

EPOCHS = int(os.getenv('EPOCHS'))
BATCH_SIZE = int(os.getenv('BATCH_SIZE'))
LEARNING_RATE = float(os.getenv('LEARNING_RATE'))

OUTPUT_DIR = os.path.join('trained', HF_PRETRAINED_MODEL_NAME)
HUGGINGFACE_REPO_ID = os.getenv('HUGGINGFACE_REPO_ID')

if DEVICE == 'gpu':
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
print(f"HF pretrained model name: {HF_PRETRAINED_MODEL_NAME}")
print(f"HF datasets name: {HF_DATASETS_NAME}")

print(f"LORA r: {LORA_R}")
print(f"LORA alpha: {LORA_ALPHA}")
print(f"LORA droupout: {LORA_DROPOUT}")

print(f"epochs: {EPOCHS}")
print(f"batch_size: {BATCH_SIZE}")
print(f"learning rate (lr): {LEARNING_RATE}")

print(f"Using {DEVICE} device")

HF pretrained model name: cardiffnlp/twitter-roberta-base-sentiment-latest
HF datasets name: google-research-datasets/go_emotions
LORA r: 8
LORA alpha: 32
LORA droupout: 0.1
epochs: 5
batch_size: 64
learning rate (lr): 0.001
Using cpu device


# Data Downloading and Loading

In [4]:
# download datasets: train, validation, test
from datasets import load_dataset

datasets = load_dataset(HF_DATASETS_NAME)  # doctest: +IGNORE_RESULT

In [5]:
import json
print(f"datasets: {[k for k in datasets]}")
labelNames = datasets['train'].features['labels'].feature.names
labelIds = []
for dataset_key in datasets:
    print(f"len({dataset_key}): {len(datasets[dataset_key])}")
    [labelIds.append(l) for ls in datasets[dataset_key]['labels'] for l in ls]
labelIds = list(set(labelIds))
assert len(labelIds) == len(labelNames)
labelIds.sort()
print(f"train dataset: {datasets['train']}")
print(f"train dataset features: {datasets['train'].features}")
print(f"labelIds ({len(labelIds)} unique), first 10: {labelIds[:10]}")
print(f"labelNames ({len(labelNames)} unique), first 10: {labelNames[:10]}")
for i in range(3):
    print(f"Example ({i}): {json.dumps(datasets['train'][i], indent=2)}")

datasets: ['train', 'validation', 'test']
len(train): 43410
len(validation): 5426
len(test): 5427
train dataset: Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 43410
})
train dataset features: {'text': Value(dtype='string', id=None), 'labels': Sequence(feature=ClassLabel(names=['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'], id=None), length=-1, id=None), 'id': Value(dtype='string', id=None)}
labelIds (28 unique), first 10: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
labelNames (28 unique), first 10: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment']
Example (0): {
  "text": "My favourite food is anything I didn't have to cook 

In [None]:
# validate data
for key, dataset in datasets.items():
    print(key)
    for idx, record in enumerate(dataset):
        assert len(record['text']) > 0, f"{key}:{idx} - Expected text, received '{record['text']}'"
        assert len(record['labels']) > 0, f"{key}:{idx} - Expected labels, received '{record['labels']}'"
        for label in record['labels']:
            assert isinstance(label, int), f"{key}:{idx} - Expected int label, received '{label}'"

# Model and Tokenizer

In [6]:
# using pipelines
from transformers import pipeline
sentiment_task = pipeline("sentiment-analysis", model=HF_PRETRAINED_MODEL_NAME, device=DEVICE)
sentiment_task("Covid cases are increasing fast!")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'negative', 'score': 0.7235766649246216}]

In [7]:
# download tokenizer
# use_fast: False for Python-based algo when encoding is non-trivial (default), True for Rust-base algo with trivial encoding
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(HF_PRETRAINED_MODEL_NAME, use_fast=False, device=DEVICE)

In [8]:
from transformers import AutoConfig
base_config = AutoConfig.from_pretrained(HF_PRETRAINED_MODEL_NAME)
base_config

RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment-latest",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [9]:
# download model
from transformers import AutoModelForSequenceClassification

base_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=HF_PRETRAINED_MODEL_NAME
)
base_model.to(device=DEVICE)
base_model

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [10]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=base_model, tokenizer=tokenizer, device=DEVICE)
classifier(inputs="This is super cool!")

[{'label': 'positive', 'score': 0.9798469543457031}]

In [11]:
# test inference
from torch import nn
from transformers import AutoConfig
from scipy.special import softmax

input_text = datasets['train'][0]['text']
print(f"==INPUT TEXT==:\n{input_text}")
expected_labels = datasets['train'][0]['labels']
print(f"==EXPECTED==:\n{[f'{labelNames[l]} ({l})' for l in expected_labels]}")
encoded_inputs = tokenizer(input_text, return_tensors='pt').to(device=DEVICE)
outputs = base_model(**encoded_inputs)
logits = outputs[0][0].detach()
scores = softmax(logits.to('cpu'))
config = AutoConfig.from_pretrained(HF_PRETRAINED_MODEL_NAME)
print(f"==OUTPUT==:\n{[{config.id2label[i]: scores[i]} for i in range(len(logits))]}")


==INPUT TEXT==:
My favourite food is anything I didn't have to cook myself.
==EXPECTED==:
['neutral (27)']
==OUTPUT==:
[{'negative': 0.012084135}, {'neutral': 0.06326519}, {'positive': 0.9246506}]


# Finetuning configuration

In [14]:
# download model with target number of labels
from transformers import AutoModelForSequenceClassification

base_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=HF_PRETRAINED_MODEL_NAME,
    num_labels=len(labelIds),
    ignore_mismatched_sizes=True  # because original model's num_labels < expected model's num_labels 
)
base_model.to(device=DEVICE)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [13]:
print(f"Number of (optionally, trainable or non-embeddings) parameters: {base_model.num_parameters():,}")

Number of (optionally, trainable or non-embeddings) parameters: 124,667,164


In [15]:
# tokenize the dataset
# Hugging Face Transformers models expect tokenized input, rather than a string text.
def tokenize_dataset(dataset):
    # encode text to input_ids and attention_mask
    encoded_text = tokenizer(
        text=dataset["text"],
        padding='max_length',  # add special padding token to create uniform-length inputs of 'max_length'
        truncation=True,  # truncate to 'max_length'
        max_length=base_config.max_position_embeddings,
        return_tensors='pt')
    dataset['input_ids'] = encoded_text.input_ids
    dataset['attention_mask'] = encoded_text.attention_mask
    # encode labels from List[List[int]] to List[int]
    first_labels = []
    for labels in dataset['labels']:
        first_label = labels[0]
        first_labels.append(first_label)
    dataset['labels'] = first_labels
    return dataset

encoded_datasets = datasets.map(
    tokenize_dataset, 
    batched=True,
    remove_columns=['id', 'text'])

In [16]:
import json
print(f"datasets: {[k for k in encoded_datasets]}")
for dataset_key in encoded_datasets:
    print(f"len({dataset_key}): {len(encoded_datasets[dataset_key])}")
print(f"train dataset: {encoded_datasets['train']}")
print(f"train dataset features: {encoded_datasets['train'].features}")
for i in range(3):
    print(f"Example ({i}): {json.dumps(encoded_datasets['train'][i])}")

datasets: ['train', 'validation', 'test']
len(train): 43410
len(validation): 5426
len(test): 5427
train dataset: Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 43410
})
train dataset features: {'labels': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
Example (0): {"labels": 27, "input_ids": [0, 2387, 5548, 689, 16, 932, 38, 399, 75, 33, 7, 7142, 2185, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [17]:
# configure PEFT
# LORA can only target the following module types: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`.
# see LoRA: Low-Rank Adaptation of Large Language Models, Hu et al, 2021: https://arxiv.org/abs/2106.09685
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # defines the expected fields of the tokenized dataset
    target_modules=['query', 'key', 'value'],  # model modules to apply LoRA to
    r=LORA_R, 
    lora_alpha=LORA_ALPHA, 
    lora_dropout=LORA_DROPOUT,
)

In [18]:
# wrap model with PEFT config
from peft import get_peft_model

peft_wrapped_model = get_peft_model(base_model, peft_config)
peft_wrapped_model.print_trainable_parameters()

trainable params: 1,054,492 || all params: 125,721,656 || trainable%: 0.8388


# Training Job

## Training with Transformers for Pytorch

In [19]:
# data loader/collator to batch input in training and evaluation datasets
# DataCollatorWithPadding pads dynamically your text to the length of the longest element in its batch, 
# so they are a uniform length
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [20]:
# configure evaluation metrics in addition to the default `loss` metric that the `Trainer` computes
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [21]:
# clean up the GPU memory
if DEVICE == 'gpu':
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()

In [22]:
# [OPTIONAL] TROUBLESHOOTING
# huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
# To disable this warning, you can either:
#	- Avoid using `tokenizers` before the fork if possible
#	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [23]:
# train job config
# Hugging Face training configuration tools can be used to configure a <T>Trainer.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    
    #do_train=True,
    #do_eval=True,

    num_train_epochs=1,    
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=LEARNING_RATE,
    
    weight_decay=0.01,
    #gradient_accumulation_steps=2,  # default 1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # metric_for_best_model="f1"
    
    fp16=True,  # lower precision
    # use_ipex=True if DEVICE == 'cpu' else False,  # use Intel extension for PyTorch
    use_cpu=True if DEVICE == 'cpu' else False  # False will use CUDA or MPS if available
)

In [24]:
# [OPTIONAL] TROUBLESHOOTHING
# IF
# RuntimeError: CUDA error: device-side assert triggered
# CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
# For debugging consider passing CUDA_LAUNCH_BLOCKING=1
# Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"
# IF still errors out try DEVICE = 'cpu' to see error message

In [25]:
# The <T>Trainer classes require the user to provide: 1) Metrics 2) A base model 3) A training configuration
from transformers import Trainer

trainer = Trainer(
    model=peft_wrapped_model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

In [26]:
# if using GPU, then during training job monitor compute instance in terminal with cli command `nvidia-smi`
trainer.train()

Epoch,Training Loss,Validation Loss


# Store Model

In [None]:
peft_wrapped_model.save_pretrained(OUTPUT_DIR)

In [None]:
# save on Huggingface
from huggingface_hub import notebook_login

notebook_login()
peft_wrapped_model.push_to_hub("HUGGINGFACE_REPO_ID")