# Fine-Tuning LLMs with LoRA (Low-Rank Adaptation)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from datasets import load_dataset, Dataset as HFDataset
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel,
    PeftConfig
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
import evaluate
from tqdm.auto import tqdm
import warnings
import random

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


# Part 1: Introduction to LoRA (Low-Rank Adaptation)

# 1. Introduction to LoRA

LoRA (Low-Rank Adaptation) is a technique for efficiently fine-tuning large language models (LLMs)
by freezing the pre-trained model weights and injecting trainable low-rank matrices into each layer
of the Transformer architecture, drastically reducing the number of trainable parameters for fine-tuning.

Key advantages of LoRA:
- Memory efficiency: Only updates a small number of parameters, reducing GPU memory requirements
- Storage efficiency: Only need to store small adapter weights instead of full model copies
- Computational efficiency: Faster training and inference compared to full fine-tuning
- Adaptability: Multiple LoRA adapters can be trained for different tasks on the same base model

How LoRA works:
1. Freezes the pre-trained weights of the LLM
2. For specific weight matrices (typically query and value matrices in attention layers):
   - Approximates weight updates using low-rank decomposition: ΔW = A × B
   - where A is a matrix of shape (d × r) and B is a matrix of shape (r × k)
   - r is the rank, typically much smaller than d and k
3. During inference: equivalent to W + ΔW, but more efficient

LoRA can be applied to various model architectures and tasks, including:
- Text classification
- Question answering
- Text generation
- Summarization
- Translation


# 2. Why Use LoRA for Fine-Tuning

Traditional fine-tuning of LLMs has several challenges:

1. Memory requirements: Full fine-tuning of large models (billions of parameters) requires 
   significant GPU memory
2. Computational cost: Training all parameters is expensive and time-consuming
3. Catastrophic forgetting: Full fine-tuning can cause the model to forget general capabilities
4. Storage overhead: Each fine-tuned model copy requires gigabytes of storage

LoRA addresses these issues by:
- Reducing trainable parameters by 99%+ in many cases
- Requiring a fraction of the GPU memory
- Preserving most of the base model's capabilities
- Enabling small, swappable adapters (typically a few MB) instead of full model copies

In this notebook, we'll demonstrate how to use LoRA to fine-tune a smaller model on a text 
classification task, but the same principles apply to larger models up to 1.5B parameters.


# Part 2: Setting Up LoRA Fine-Tuning Environment

In [3]:
# We'll use a smaller model for demonstration
BASE_MODEL = "distilbert-base-uncased"  # ~66M parameters
# Other options within 1.5B parameters:
# - "roberta-base" (~125M parameters)
# - "EleutherAI/pythia-410m" (~410M parameters)
# - "facebook/opt-350m" (~350M parameters)
# - "google/flan-t5-base" (~250M parameters)

# PEFT (Parameter-Efficient Fine-Tuning) configuration for LoRA
lora_config = LoraConfig(
    r=8,                # Rank of the low-rank matrices (typically 4-32)
    lora_alpha=16,      # Alpha parameter for LoRA scaling
    target_modules=["q_lin", "v_lin"],  # For DistilBERT, we target query and value matrices
    lora_dropout=0.1,   # Dropout probability for LoRA layers
    bias="none",        # Whether to train bias parameters
    task_type=TaskType.SEQ_CLS  # Task type (sequence classification in this case)
)

print(f"Base model: {BASE_MODEL}")
print(f"LoRA configuration: {lora_config}")

Base model: distilbert-base-uncased
LoRA configuration: LoraConfig(task_type=<TaskType.SEQ_CLS: 'SEQ_CLS'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'v_lin', 'q_lin'}, exclude_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)


# Part 3: Preparing a Dataset for LoRA Fine-Tuning

In [8]:
print("\n# 4. Preparing a Dataset for Fine-Tuning")

# For this example, we'll use the SST-2 dataset (Stanford Sentiment Treebank)
# It's a binary sentiment classification dataset (positive/negative movie reviews)
print("Loading SST-2 dataset...")
dataset = load_dataset("sst2")
print(f"Dataset loaded: {dataset}")

# Examine the dataset
print("\nDataset structure:")
print(f"Train set: {len(dataset['train'])} examples")
print(f"Validation set: {len(dataset['validation'])} examples")
print(f"Test set: {len(dataset['test']) if 'test' in dataset else 'Not available'} examples")

# Show a few examples - with debug info to understand the structure
print("\nSample examples:")
print("First example type:", type(dataset["train"][0]))
print("First example content:", dataset["train"][0])

# Now iterate with better error handling
for i in range(5):
    example = dataset["train"][i]
    print(f"Example {i+1}:")
    try:
        print(f"  Text: {example['sentence']}")
        print(f"  Label: {example['label']} ({'Positive' if example['label'] == 1 else 'Negative'})")
    except TypeError as e:
        print(f"  Error accessing example: {e}")
        print(f"  Example type: {type(example)}")
        print(f"  Example content: {example}")

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize_function(examples):
    """Tokenize examples with padding and truncation."""
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

print("\nTokenizing the dataset...")
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence", "idx"]
)

# Convert to PyTorch format
tokenized_datasets.set_format("torch")
print(f"Tokenized train set: {len(tokenized_datasets['train'])} examples")
print(f"Tokenized validation set: {len(tokenized_datasets['validation'])} examples")


# 4. Preparing a Dataset for Fine-Tuning
Loading SST-2 dataset...
Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

Dataset structure:
Train set: 67349 examples
Validation set: 872 examples
Test set: 1821 examples

Sample examples:
First example type: <class 'dict'>
First example content: {'idx': 0, 'sentence': 'hide new secretions from the parental units ', 'label': 0}
Example 1:
  Text: hide new secretions from the parental units 
  Label: 0 (Negative)
Example 2:
  Text: contains no wit , only labored gags 
  Label: 0 (Negative)
Example 3:
  Text: that loves its characters and communicates something rather beautiful about human nature 
  Label: 1 (Positive)
Example 4:
  Text: remains utterly satisfied t

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Tokenizing the dataset...


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Tokenized train set: 67349 examples
Tokenized validation set: 872 examples


# Part 4: Setting Up the Model with LoRA

In [9]:
# ====================================

print("\n# 5. Setting Up the Model with LoRA")

# Load the base model
print(f"Loading base model: {BASE_MODEL}")
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL, 
    num_labels=2,  # Binary classification
    return_dict=True
)

# Count original parameters
original_params = sum(p.numel() for p in model.parameters())
print(f"Original model parameters: {original_params:,}")

# Apply LoRA configuration to the model
print("Applying LoRA to the model...")
peft_model = get_peft_model(model, lora_config)

# Count trainable parameters
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print(f"Trainable parameters after applying LoRA: {trainable_params:,}")
print(f"Parameter reduction factor: {original_params / trainable_params:.2f}x")
print(f"Percentage of parameters trained: {100 * trainable_params / original_params:.2f}%")

# Print the model architecture with LoRA adapters
print("\nModel structure with LoRA adapters:")
print(peft_model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



# 5. Setting Up the Model with LoRA
Loading base model: distilbert-base-uncased
Original model parameters: 66,955,010
Applying LoRA to the model...
Trainable parameters after applying LoRA: 739,586
Parameter reduction factor: 90.53x
Percentage of parameters trained: 1.10%

Model structure with LoRA adapters:
PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.L

# Part 5: Training with LoRA

In [10]:
print("\n# 6. Training the Model with LoRA")

# Define evaluation metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy metrics from predictions and labels."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# Set up training arguments
batch_size = 16
num_epochs = 3

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Create Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

# Train the model
print("Starting LoRA fine-tuning...")
train_result = trainer.train()

# Print training metrics
print("\nTraining metrics:")
print(f"Total training time: {train_result.metrics['train_runtime']:.2f} seconds")
print(f"Training loss: {train_result.metrics['train_loss']:.4f}")
print(f"Training samples per second: {train_result.metrics['train_samples_per_second']:.2f}")

# Evaluate the model
print("\nEvaluating fine-tuned model...")
eval_results = trainer.evaluate()
print(f"Validation accuracy: {eval_results['eval_accuracy']:.4f}")



# 6. Training the Model with LoRA


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting LoRA fine-tuning...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# Part 6: Analyzing Results


In [None]:
print("\n# 7. Analyzing Results")

# Function to get predictions
def get_predictions(model, dataset):
    """Get predictions from model for the given dataset."""
    dataloader = DataLoader(dataset, batch_size=16)
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"]
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
            
            all_preds.extend(predictions)
            all_labels.extend(labels.numpy())
    
    return np.array(all_preds), np.array(all_labels)

# Get predictions
val_preds, val_labels = get_predictions(peft_model, tokenized_datasets["validation"])

# Calculate classification report
print("\nClassification Report:")
print(classification_report(val_labels, val_preds, target_names=["Negative", "Positive"]))

# Plot confusion matrix
cm = confusion_matrix(val_labels, val_preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", 
            xticklabels=["Negative", "Positive"],
            yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.savefig("confusion_matrix.png")
plt.show()






In [None]:
# Part 7: Saving and Loading LoRA Adapters
# ======================================

print("\n# 8. Saving and Loading LoRA Adapters")

# Save the LoRA adapter
adapter_path = "./lora_sst2_adapter"
peft_model.save_pretrained(adapter_path)
print(f"Saved LoRA adapter to: {adapter_path}")

# Check the size of the saved adapter
adapter_size = sum(os.path.getsize(os.path.join(adapter_path, f)) for f in os.listdir(adapter_path))
adapter_size_mb = adapter_size / (1024 * 1024)
print(f"LoRA adapter size: {adapter_size_mb:.2f} MB")

# Compare to full model size (estimated)
model_size_mb = original_params * 4 / (1024 * 1024)  # Assuming 4 bytes per parameter
print(f"Estimated full model size: {model_size_mb:.2f} MB")
print(f"Size reduction: {model_size_mb / adapter_size_mb:.2f}x")

# Load the adapter to a new model
print("\nLoading the LoRA adapter to a new model instance...")

# Load a new base model
new_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL, 
    num_labels=2,
)

# Load the LoRA adapter
loaded_model = PeftModel.from_pretrained(new_model, adapter_path)
print("LoRA adapter loaded successfully!")

In [None]:
# Part 8: Inference with LoRA-Fine-Tuned Model
# ==========================================

print("\n# 9. Inference with LoRA-Fine-Tuned Model")

# Function for inference
def predict_sentiment(text, model, tokenizer):
    """Predict sentiment for the given text."""
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = outputs.logits.argmax(dim=-1).cpu().numpy()[0]
        probs = F.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    
    sentiment = "Positive" if prediction == 1 else "Negative"
    confidence = probs[prediction]
    
    return sentiment, confidence

# Test new sentences
test_sentences = [
    "This movie was absolutely fantastic and I enjoyed every moment.",
    "The restaurant was terrible and the service was even worse.",
    "I'm not sure how I feel about this product yet.",
    "While it had some good moments, overall I was disappointed.",
    "The book started slow but the ending was mind-blowing!"
]

print("Predicting sentiment for test sentences:")
for sentence in test_sentences:
    sentiment, confidence = predict_sentiment(sentence, loaded_model, tokenizer)
    print(f"\nText: {sentence}")
    print(f"Predicted sentiment: {sentiment} (confidence: {confidence:.4f})")