# Apply Lightweight Fine-Tuning to a Foundation Model


## In this project, we will bring together all of the essential components of a PyTorch + Hugging Face training and inference process. Specifically, we will:

-  Load a pre-trained model and evaluate its performance
-  Perform parameter-efficient fine tuning using the pre-trained model
-  Perform inference using the fine-tuned model and compare its performance to the original model


## Dataset
The dataset selected is `dair-ai/emotion` from Hugging Face. Emotion is a dataset of English Twitter messages labeled with six basic emotions: anger, fear, joy, love, sadness, and surprise. More details [here](https://huggingface.co/datasets/dair-ai/emotion).

In [2]:
# Load the dair-ai/emotion dataset.

from datasets import load_dataset, Dataset

# Only the published "train" split is loaded; 20% of it is then held out as the
# test set. The fixed seed makes the shuffled split reproducible.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository
# run its own loading script locally — confirm it is still required.
dataset = load_dataset("dair-ai/emotion",split="train", trust_remote_code=True).train_test_split(
    test_size=0.2, shuffle=True, seed=23
)
# Split names iterated over when tokenizing below.
splits = ["train", "test"]

# View the dataset characteristics
print(dataset["train"])
print(dataset["test"])

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'label'],
    num_rows: 12800
})
Dataset({
    features: ['text', 'label'],
    num_rows: 3200
})


In [3]:
# Inspect the first training example: the raw message text and its integer class label
dataset["train"][0]

{'text': 'i am feeling hopeful excited and very much being made new',
 'label': 1}

## Pre-process Dataset

### Convert all text into tokens for our model, using the tokenizer that matches the selected model (in this case, GPT2)

In [8]:
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so batches can be padded
# (presumably the GPT-2 tokenizer has no dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token
# Padding tokens are appended on the right-hand side of each sequence.
tokenizer.padding_side = "right" 

# Tokenize every split in batches. No padding is requested here; padding is
# left to the DataCollatorWithPadding handed to the Trainer later on.
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["text"], truncation=True, ), batched=True
    )

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 12800
})

In [9]:
# The same first example after tokenization: input_ids and attention_mask have been added
tokenized_dataset["train"][0]

{'text': 'i am feeling hopeful excited and very much being made new',
 'label': 1,
 'input_ids': [72, 716, 4203, 17836, 6568, 290, 845, 881, 852, 925, 649],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Load and set up the foundation model

In [42]:
from transformers import AutoModelForSequenceClassification

# GPT-2 backbone with a six-way classification head. The two label maps let the
# model config translate between class ids and emotion names in both directions.
model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=6,
    # Position in the tuple is the class id: 0 -> "sadness", ..., 5 -> "surprise"
    id2label=dict(enumerate(("sadness", "joy", "love", "anger", "fear", "surprise"))),
    label2id=dict(sadness=0, joy=1, love=2, anger=3, fear=4, surprise=5),
)
# Padding is done with the end-of-sequence token (see tokenizer.pad_token), so
# the model config must be given the same id.
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Print the module tree to see the layer names and types (e.g. the Conv1D attention projections and the `score` head)
print(model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=6, bias=False)
)


In [25]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#def compute_metrics(eval_pred):
#    predictions, labels = eval_pred
#    predictions = np.argmax(predictions, axis=1)
#    return {"accuracy": (predictions == labels).mean()}

def compute_metrics(pred):
    """Compute classification metrics for a ``Trainer`` evaluation pass.

    Args:
        pred: Evaluation output exposing ``label_ids`` (the true class ids)
            and ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use the ``'weighted'`` average over classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # A class that is never predicted (or never present in the labels) has an
    # undefined per-class score. zero_division=0 keeps the value scikit-learn
    # already falls back to (0) but stops it from emitting an undefined-metric
    # warning on every single evaluation.
    weighted = {"average": "weighted", "zero_division": 0}

    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, **weighted),
        'recall': recall_score(labels, preds, **weighted),
        'f1': f1_score(labels, preds, **weighted),
    }

In [43]:
# Evaluate model using the Trainer library
from transformers import DataCollatorWithPadding, DataCollator, Trainer, TrainingArguments

foundation_model_trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

## Evaluate foundation GPT2 model

In [44]:
# Baseline: score the untuned GPT-2 (its `score` head is freshly initialized) on the test split
foundation_model_trainer.evaluate()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 4.2808918952941895,
 'eval_accuracy': 0.306875,
 'eval_precision': 0.21980305483006937,
 'eval_recall': 0.306875,
 'eval_f1': 0.1698034250640395,
 'eval_runtime': 19.607,
 'eval_samples_per_second': 163.207,
 'eval_steps_per_second': 20.401}

## Fine-tune the GPT2 model using PEFT - LoRA

In [31]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for a sequence-classification task.
# fan_in_fan_out=True: the GPT-2 layers LoRA attaches to are Conv1D modules (see
# the printed model), which store their weight as (fan_in, fan_out). Declaring
# that here avoids relying on PEFT to detect and correct it with a warning.
config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,  # adapters are going to be trained, not just loaded
    r=8,                   # rank of the low-rank update matrices
    lora_alpha=16,         # scaling factor applied to the LoRA update
    lora_dropout=0.1,
    fan_in_fan_out=True,
)

# Wrap the foundation model so that only the adapter parameters are trainable.
lora_model = get_peft_model(model, config)

lora_model.print_trainable_parameters()

trainable params: 299,520 || all params: 124,743,936 || trainable%: 0.24010786384037136




#### We see that with PEFT we train only 0.24% of the parameters in the model, drastically reducing training time

In [35]:
# Trainer for the LoRA-wrapped model; same data, collator and metrics as the baseline Trainer.
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        # Directory that receives the per-epoch checkpoints
        output_dir="./lora",
        # Set the learning rate
        learning_rate = 2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        # Evaluate and save the model after each epoch
        # NOTE(review): newer transformers releases appear to rename
        # evaluation_strategy to eval_strategy — check the installed version.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=10,
        weight_decay=0.01,
        # Reload the best checkpoint when training finishes, which is why the
        # model is evaluated and saved on the same (per-epoch) schedule.
        load_best_model_at_end=True
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    # Pads each batch when it is assembled, since tokenization did not pad.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

In [36]:
# Run the fine-tuning loop (10 epochs, evaluated and checkpointed after each one)
trainer.train()
# Persist the fine-tuned PEFT model to the local "lora_model" directory so it can be reloaded below
lora_model.save_pretrained("lora_model")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.67076,0.334688,0.256958,0.334688,0.275154
2,3.675200,1.541964,0.4075,0.279814,0.4075,0.32224
3,1.573100,1.426909,0.499688,0.34797,0.499688,0.391169
4,1.428600,1.197828,0.570625,0.552126,0.570625,0.467414
5,1.179900,1.031275,0.62,0.61351,0.62,0.535246
6,1.179900,0.933653,0.67125,0.661906,0.67125,0.612027
7,1.039100,0.863824,0.696562,0.679079,0.696562,0.647004
8,0.951900,0.823895,0.7025,0.676838,0.7025,0.658148
9,0.909800,0.800431,0.710938,0.678923,0.710938,0.669753
10,0.889000,0.793518,0.713125,0.681736,0.713125,0.673468


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Load the PEFT model

In [45]:
from peft import AutoPeftModelForSequenceClassification, PeftModel, PeftConfig

# Directory written by lora_model.save_pretrained(...) after training
peft_model_id = "lora_model"
# NOTE(review): this rebinds `config` (previously the LoraConfig); the loaded
# value is not used again in the cells visible here.
config = PeftConfig.from_pretrained(peft_model_id)

# Reload the fine-tuned model from disk, passing the same pad token id and
# label maps that the foundation model was configured with.
inference_model = AutoPeftModelForSequenceClassification.from_pretrained(peft_model_id,pad_token_id=tokenizer.eos_token_id,id2label={0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"},  # id2label: for converting predictions to strings
    label2id={"sadness": 0, "joy": 1, "love": 2, "anger": 3, "fear": 4, "surprise": 5})

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
# Wrap the reloaded PEFT model in a Trainer (default arguments) so it is scored
# with the same test split, collator and metrics as the baseline model.
loaded_lora_model_trainer = Trainer(
    model=inference_model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

In [41]:
# Score the reloaded fine-tuned model on the test split
loaded_lora_model_trainer.evaluate()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.79351806640625,
 'eval_accuracy': 0.713125,
 'eval_precision': 0.6817358319177603,
 'eval_recall': 0.713125,
 'eval_f1': 0.6734675567185461,
 'eval_runtime': 32.1116,
 'eval_samples_per_second': 99.653,
 'eval_steps_per_second': 12.457}

#### Compare results between GPT2 and fine-tuned GPT2:

| Metric    | GPT2 | GPT2 PEFT |
|-----------|------|-----------|
| loss      | 4.28 | 0.79      |
| accuracy  | 0.31 | 0.71      |
| precision | 0.22 | 0.68      |
| recall    | 0.31 | 0.71      |
| f1        | 0.17 | 0.67      |

##### As we can see, fine-tuning achieved much better results while training only a fraction of the foundation model's parameters

### Inference

In [56]:
# Class id -> emotion name, the same mapping that was passed to the model as id2label
id2label={0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}

import torch

def classify_text(text):
    """Return the predicted emotion name for a single piece of text.

    Args:
        text: The string to classify.

    Returns:
        The ``id2label`` entry for the class with the highest logit.
    """
    # Use the device the model's weights actually live on instead of guessing
    # from CUDA availability, so inputs and model can never be on different devices.
    device = next(inference_model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt").to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = inference_model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
    # .item() copies the scalar to the host itself, so no extra device move is needed.
    return id2label[outputs.logits.argmax(dim=-1).item()]

'sadness'

In [60]:
sample_text_to_classify = "I feel so sad"
# Show the input next to its predicted emotion
print(f"{sample_text_to_classify} -> {classify_text(sample_text_to_classify)}")

I feel so sad -> sadness


In [61]:
sample_text_to_classify = "I am so happy"
# Show the input next to its predicted emotion
print(f"{sample_text_to_classify} -> {classify_text(sample_text_to_classify)}")

I am so happy -> joy
