# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: LoRa Adapter
* Model: DistilBert
* Evaluation approach: Accuracy
* Fine-tuning dataset: SetFit/bbc-news

In [1]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [2]:
!pip install accelerate -U
!pip install transformers[torch]

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [3]:
!pip install peft

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.11.1


##Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [64]:
from datasets import load_dataset
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import torchvision.models as models
from torch import nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

splits = ["train", "test"]
dataset = load_dataset("SetFit/bbc-news", split=splits) 
ds = {split: ds for split, ds in zip(splits, dataset)}

#smaller set
#for split in splits:
 #   ds[split] = ds[split].shuffle(seed=42).select(range(500))

ds["train"].column_names

Downloading readme:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.87M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

['text', 'label', 'label_text']

In [50]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [65]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    #inputs['labels'] = examples['label'] may need to convert to labels
    return inputs


tokenized_ds = {}
for split in splits:
    tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)
    tokenized_ds[split] = tokenized_ds[split].remove_columns(["text",'label_text' ]) #model can not take str, give error

print(len(set(tokenized_ds["train"]["label"])))
print(tokenized_ds)

Map:   0%|          | 0/1225 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

5
{'train': Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 1225
}), 'test': Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})}


In [66]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
    num_labels=5 )

# freeze all the model parameters.
for param in model.base_model.parameters():
    param.requires_grad = False

print(model.classifier)
print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Linear(in_features=768, out_features=5, bias=True)
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
   

In [67]:
def compute_metrics(eval_pred):
    try:
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        average_type = 'macro' if len(np.unique(labels)) > 2 else 'binary' #macro: each class independently

        accuracy = (predictions == labels).mean()
        precision = precision_score(labels, predictions, average=average_type, zero_division=0)
        recall = recall_score(labels, predictions, average=average_type, zero_division=0)
        f1 = f1_score(labels, predictions, average=average_type,zero_division=0)

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1
        }
    except Exception as e:
        print(f"Error computing metrics: {e}")
        return {}

In [54]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [68]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",
        per_device_eval_batch_size=4,
        eval_strategy="epoch",
        save_strategy="epoch",
    ),
    train_dataset=None,  # No training for evaluation
    eval_dataset=tokenized_ds['test'] ,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
no_fine_results = trainer.evaluate()
print(no_fine_results)

{'eval_loss': 1.6090095043182373, 'eval_accuracy': 0.209, 'eval_precision': 0.18215478147008154, 'eval_recall': 0.20000738824891365, 'eval_f1': 0.18866106257092183, 'eval_runtime': 20.2703, 'eval_samples_per_second': 49.333, 'eval_steps_per_second': 12.333}


##Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [69]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules= ['q_lin', 'v_lin', 'k_lin', 'out_lin'], #attention mechanism, Q-V emphasizing certain keywords are critical for classification
    #modules_to_save=["classifier"], #also train classifier
)

# PEFT model
lora_model = get_peft_model(model, config)

lora_model.print_trainable_parameters()

for name, param in lora_model.named_parameters():
    if 'lora' not in name:
        param.requires_grad = False

for name, param in lora_model.named_parameters():
    if param.requires_grad:
        print(name, param.shape)

trainable params: 1,184,261 || all params: 68,141,578 || trainable%: 1.7379
base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_A.default.weight torch.Size([16, 768])
base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_B.default.weight torch.Size([768, 16])
base_model.model.distilbert.transformer.layer.0.attention.k_lin.lora_A.default.weight torch.Size([16, 768])
base_model.model.distilbert.transformer.layer.0.attention.k_lin.lora_B.default.weight torch.Size([768, 16])
base_model.model.distilbert.transformer.layer.0.attention.v_lin.lora_A.default.weight torch.Size([16, 768])
base_model.model.distilbert.transformer.layer.0.attention.v_lin.lora_B.default.weight torch.Size([768, 16])
base_model.model.distilbert.transformer.layer.0.attention.out_lin.lora_A.default.weight torch.Size([16, 768])
base_model.model.distilbert.transformer.layer.0.attention.out_lin.lora_B.default.weight torch.Size([768, 16])
base_model.model.distilbert.transformer.layer.1.attentio

In [70]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./lora/results",
        learning_rate=2e-5,
        # Set the per device train batch size and eval batch size
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        # Evaluate and save the model after each epoch
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        remove_unused_columns=False, #trainer does drop all the dataset columns that are not actual input to the models for training
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.5377,1.549106,0.463,0.544293,0.41044,0.325195
2,1.5004,1.468474,0.576,0.607495,0.538239,0.450205




TrainOutput(global_step=614, training_loss=1.5434655521902276, metrics={'train_runtime': 161.5896, 'train_samples_per_second': 15.162, 'train_steps_per_second': 3.8, 'total_flos': 333475712102400.0, 'train_loss': 1.5434655521902276, 'epoch': 2.0})

In [71]:
train_results = trainer.evaluate()
print(train_results)

{'eval_loss': 1.4684736728668213, 'eval_accuracy': 0.576, 'eval_precision': 0.6074946637004494, 'eval_recall': 0.538239434294519, 'eval_f1': 0.4502049195243469, 'eval_runtime': 20.7772, 'eval_samples_per_second': 48.13, 'eval_steps_per_second': 12.032, 'epoch': 2.0}


In [72]:
lora_model.save_pretrained("gpt-lora") # only saves the adapter weights and not the weights of the original model



## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [73]:
from peft import AutoPeftModelForSequenceClassification

lora_model = AutoPeftModelForSequenceClassification.from_pretrained("gpt-lora",num_labels=5)
print(lora_model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [74]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./lora/results/test",
        per_device_eval_batch_size=4,
        eval_strategy="epoch",
        save_strategy="epoch",
    ),
    train_dataset=None,  # No training for evaluation
    eval_dataset=tokenized_ds['test'] ,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
fine_results = trainer.evaluate()
print(fine_results)

{'eval_loss': 1.4684736728668213, 'eval_accuracy': 0.576, 'eval_precision': 0.6074946637004494, 'eval_recall': 0.538239434294519, 'eval_f1': 0.4502049195243469, 'eval_runtime': 21.0845, 'eval_samples_per_second': 47.428, 'eval_steps_per_second': 11.857}


In [75]:
import pandas as pd

results_df = pd.DataFrame({
    "No Fine-Tuning": no_fine_results,
    "Fine-Tuning": fine_results
})

results_df.T

Unnamed: 0,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second
No Fine-Tuning,1.60901,0.209,0.182155,0.200007,0.188661,20.2703,49.333,12.333
Fine-Tuning,1.468474,0.576,0.607495,0.538239,0.450205,21.0845,47.428,11.857
