# Low Rank Adaptation
---

In [1]:
# !pip install torch
# !pip install torch==2.0.1+cu118 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install numpy==1.26.4

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch

In [4]:
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic=True
    DEVICE=torch.device('cuda')
else:
    DEVICE=torch.device('cpu')

print(DEVICE)

cuda


In [5]:
if torch.cuda.is_available():
    """
    NVIDIA CUDA Deep Neural Network (cuDNN) is a GPU-accelerated library of primitives for deep neural networks
    """
    torch.backends.cudnn.deterministic=True

class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        std_dev=1/torch.sqrt(torch.tensor(rank).float())
        self.A=nn.Parameter(torch.randn(in_dim, rank)*std_dev)
        self.B=nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha=alpha
        
    def forward(self, x):
        x=self.alpha*(x@self.A@self.B) 
        # Here, @ denotes matrix multiplication
        return x
    
class LinearWithLoRA(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear=linear
        self.lora=LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )
    
    def forward(self, x):
        return self.linear(x)+self.lora(x)

In [1]:
# # Hyperparameters
# random_seed=123

# torch.manual_seed(random_seed)
# layer=nn.Linear(10,2)
# x=torch.randn((1, 10))

# print(x)
# print(layer)
# print('Original output:', layer(x))

### Applying LoRA to Linear Layer
Let's apply LoRA to the Linear layer, we see that the results are the same since we haven't trained the LoRA weights yet. In other words, everything works as expected:

In [6]:
## Applying LoRA to Linear Layer
layer=nn.Linear(10,2)
x=torch.randn((1, 10))
layer_lora_1=LinearWithLoRA(layer, rank=2, alpha=4)
print(layer_lora_1(x))

tensor([[0.4645, 0.3197]], grad_fn=<AddBackward0>)


In [7]:
import torch.nn.functional as F

# This LoRA code is equivalent to LinearWithLoRA
class LinearWithLoRAMerged(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )
    
    def forward(self, x):
        lora=self.lora.A @ self.lora.B # combine LoRA metrices
        # then combine LoRA original weights
        combined_weight = self.linear.weight + self.lora.alpha*lora.T
        return F.linear(x, combined_weight, self.linear.bias)

In [8]:
layer_lora_2=LinearWithLoRAMerged(layer, rank=2, alpha=4)
print(layer_lora_2(x))

tensor([[0.4645, 0.3197]], grad_fn=<AddmmBackward0>)


In [9]:
# Architecture
num_features=784
num_hidden_1=128
num_hidden_2=256
num_classes=10

# Settings
DEVICE=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
learning_rate=0.005
num_epochs=10

In [10]:
class MultilayerPerceptron(nn.Module):
    def __init__(self, num_features, num_hidden_1, num_hidden_2, num_classes):
        super().__init__()
        self.layers=nn.Sequential(
            nn.Linear(num_features, num_hidden_1),
            nn.ReLU(),
            nn.Linear(num_hidden_1, num_hidden_2),
            nn.ReLU(),
            nn.Linear(num_hidden_2, num_classes)
        )
    
    def forward(self, x):
        x=self.layers(x)
        return x
    
model=MultilayerPerceptron(
    num_features = num_features,
    num_hidden_1 = num_hidden_1,
    num_hidden_2 = num_hidden_2,
    num_classes = num_classes
)

model.to(DEVICE)
optimizer_pretrained=torch.optim.Adam(model.parameters(), lr=learning_rate)
print(DEVICE)
print(model)
print(optimizer_pretrained)

cuda
MultilayerPerceptron(
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=10, bias=True)
  )
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.005
    maximize: False
    weight_decay: 0
)


In [11]:
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader

BATCH_SIZE=64

# Note: transforms.ToTensor() scales input images to 0-1 range
train_dataset=datasets.MNIST(root='data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset=datasets.MNIST(root='data', train=False, transform=transforms.ToTensor())
train_loader=DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader=DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

for images, labels in train_loader:
    print('Image batch dimensions:', images.shape)
    print('Image label dimensions:', labels.shape)
    break

Image batch dimensions: torch.Size([64, 1, 28, 28])
Image label dimensions: torch.Size([64])


In [12]:
def compute_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples=0,0
    with torch.no_grad():
        for features, targets in data_loader:
            features=features.view(-1, 28*28).to(device)
            targets=targets.to(device)
            logits=model(features)
            _, predicted_labels=torch.max(logits,1)
            num_examples+=targets.size(0)
            correct_pred+=(predicted_labels==targets).sum()
        return correct_pred.float()/num_examples*100

In [13]:
import time

def train(num_epochs, model, optimizer, train_loader, device):
    start_time=time.time()
    for epoch in range(num_epochs):
        model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):
            features=features.view(-1, 28*28).to(device)
            targets=targets.to(device)
            
            # forward and back propagation
            logits=model(features)
            loss=F.cross_entropy(logits, targets)
            optimizer.zero_grad()
            
            loss.backward()
            
            # update model parameters
            optimizer.step()
            
            # logging
            if not batch_idx %400:
                print('Epoch: %03d/%03d|Batch %03d/%03d| Loss: %.4f' % (epoch+1, num_epochs, batch_idx, len(train_loader), loss))
        
        with torch.set_grad_enabled(False):
            print('Epoch: %03d/%03d training accuracy: %.2f%%' % (epoch+1, num_epochs, compute_accuracy(model, train_loader, device)))
        
        print('Time elapsed: %.2f min' % ((time.time() - start_time)/60))
    print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))
                  
                  
train(num_epochs, model, optimizer_pretrained, train_loader, DEVICE)
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/010|Batch 000/938| Loss: 2.3076
Epoch: 001/010|Batch 400/938| Loss: 0.1613
Epoch: 001/010|Batch 800/938| Loss: 0.1605
Epoch: 001/010 training accuracy: 96.76%
Time elapsed: 0.17 min
Epoch: 002/010|Batch 000/938| Loss: 0.1003
Epoch: 002/010|Batch 400/938| Loss: 0.0263
Epoch: 002/010|Batch 800/938| Loss: 0.0424
Epoch: 002/010 training accuracy: 97.36%
Time elapsed: 0.33 min
Epoch: 003/010|Batch 000/938| Loss: 0.0978
Epoch: 003/010|Batch 400/938| Loss: 0.1006
Epoch: 003/010|Batch 800/938| Loss: 0.1845
Epoch: 003/010 training accuracy: 97.83%
Time elapsed: 0.48 min
Epoch: 004/010|Batch 000/938| Loss: 0.2168
Epoch: 004/010|Batch 400/938| Loss: 0.1372
Epoch: 004/010|Batch 800/938| Loss: 0.0543
Epoch: 004/010 training accuracy: 98.26%
Time elapsed: 0.64 min
Epoch: 005/010|Batch 000/938| Loss: 0.0336
Epoch: 005/010|Batch 400/938| Loss: 0.0326
Epoch: 005/010|Batch 800/938| Loss: 0.0873
Epoch: 005/010 training accuracy: 97.96%
Time elapsed: 0.79 min
Epoch: 006/010|Batch 000/938| Loss:

In [14]:
import copy

model_lora=copy.deepcopy(model)

model_lora.layers[0]=LinearWithLoRAMerged(model_lora.layers[0], rank=4, alpha=8)
model_lora.layers[2]=LinearWithLoRAMerged(model_lora.layers[2], rank=4, alpha=8)
model_lora.layers[4]=LinearWithLoRAMerged(model_lora.layers[4], rank=4, alpha=8)
model_lora.to(DEVICE)
optimizer_lora=torch.optim.Adam(model_lora.parameters(), lr=learning_rate)
print(model_lora)

MultilayerPerceptron(
  (layers): Sequential(
    (0): LinearWithLoRAMerged(
      (linear): Linear(in_features=784, out_features=128, bias=True)
      (lora): LoRALayer()
    )
    (1): ReLU()
    (2): LinearWithLoRAMerged(
      (linear): Linear(in_features=128, out_features=256, bias=True)
      (lora): LoRALayer()
    )
    (3): ReLU()
    (4): LinearWithLoRAMerged(
      (linear): Linear(in_features=256, out_features=10, bias=True)
      (lora): LoRALayer()
    )
  )
)


In [15]:
print(f'Test accuracy orig model:{compute_accuracy(model, test_loader, DEVICE):.2f}%')
print(f'Test accuracy LoRA model:{compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

Test accuracy orig model:96.81%
Test accuracy LoRA model:96.81%


In [16]:
def freeze_linear_layers(model):
    for child in model.children():
        if isinstance(child, nn.Linear):
            for param in child.parameters():
                param.requires_grad=False
        else:
            # recursively freeze linear layers in children modules
            freeze_linear_layers(child)

freeze_linear_layers(model_lora)
for name, param in model_lora.named_parameters():
    print(f'{name}:{param.requires_grad}')

layers.0.linear.weight:False
layers.0.linear.bias:False
layers.0.lora.A:True
layers.0.lora.B:True
layers.2.linear.weight:False
layers.2.linear.bias:False
layers.2.lora.A:True
layers.2.lora.B:True
layers.4.linear.weight:False
layers.4.linear.bias:False
layers.4.lora.A:True
layers.4.lora.B:True


In [17]:
optimizer_lora=torch.optim.Adam(model_lora.parameters(), lr=learning_rate)
train(num_epochs, model_lora, optimizer_lora, train_loader, DEVICE)
print(f'Test accuracy LoRA finetune: {compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

Epoch: 001/010|Batch 000/938| Loss: 0.0253
Epoch: 001/010|Batch 400/938| Loss: 0.0189
Epoch: 001/010|Batch 800/938| Loss: 0.1044
Epoch: 001/010 training accuracy: 98.33%
Time elapsed: 0.18 min
Epoch: 002/010|Batch 000/938| Loss: 0.0302
Epoch: 002/010|Batch 400/938| Loss: 0.0162
Epoch: 002/010|Batch 800/938| Loss: 0.0117
Epoch: 002/010 training accuracy: 99.02%
Time elapsed: 0.35 min
Epoch: 003/010|Batch 000/938| Loss: 0.0307
Epoch: 003/010|Batch 400/938| Loss: 0.0063
Epoch: 003/010|Batch 800/938| Loss: 0.0574
Epoch: 003/010 training accuracy: 98.81%
Time elapsed: 0.54 min
Epoch: 004/010|Batch 000/938| Loss: 0.0488
Epoch: 004/010|Batch 400/938| Loss: 0.0111
Epoch: 004/010|Batch 800/938| Loss: 0.0052
Epoch: 004/010 training accuracy: 98.99%
Time elapsed: 0.72 min
Epoch: 005/010|Batch 000/938| Loss: 0.0004
Epoch: 005/010|Batch 400/938| Loss: 0.0139
Epoch: 005/010|Batch 800/938| Loss: 0.0059
Epoch: 005/010 training accuracy: 98.89%
Time elapsed: 0.90 min
Epoch: 006/010|Batch 000/938| Loss:

In [18]:
print(f'Test accuracy orig model:{compute_accuracy(model, test_loader, DEVICE):.2f}%')
print(f'Test accuracy LoRA model:{compute_accuracy(model_lora, test_loader, DEVICE):.2f}%')

Test accuracy orig model:96.81%
Test accuracy LoRA model:96.71%


In [1]:
import numpy as np

# Original matrix W
W = np.random.rand(10, 10)

# Low-rank approximation matrices A and B
rank = 2
A = np.random.rand(10, rank)
B = np.random.rand(rank, 10)

# Approximate W with A * B
W_approx = np.dot(A, B)

# Print the matrices
print("Original Matrix W:")
print(W)
print("\nMatrix A:")
print(A)
print("\nMatrix B:")
print(B)
print("\nApproximated Matrix W_approx (A * B):")
print(W_approx)

# Number of parameters
params_W = W.size
params_A_B = A.size + B.size

print(f"\nNumber of parameters in W: {params_W}")
print(f"Number of parameters in A and B: {params_A_B}")

Original Matrix W:
[[0.27623515 0.63215998 0.92459435 0.12553922 0.1324817  0.62949987
  0.67642814 0.20044365 0.15545272 0.31994525]
 [0.31620771 0.73463356 0.40055041 0.87588128 0.78227843 0.90167607
  0.32959735 0.28369558 0.42492724 0.133431  ]
 [0.87525289 0.32988044 0.83585347 0.93829371 0.09390059 0.25781041
  0.51288144 0.91023738 0.93678052 0.40134471]
 [0.10761006 0.26557536 0.89544327 0.32928028 0.46488922 0.97024327
  0.52649051 0.00352671 0.82629554 0.43071364]
 [0.52791184 0.59928369 0.46913586 0.68322513 0.67940471 0.08802975
  0.80061053 0.97934069 0.46685688 0.69116909]
 [0.647555   0.25602217 0.25908842 0.8408801  0.90313406 0.43487419
  0.48209494 0.22609687 0.05766792 0.06151783]
 [0.99683345 0.76136488 0.65484794 0.43741365 0.75294596 0.20422666
  0.32325106 0.07004434 0.73373486 0.81046958]
 [0.15282686 0.09544934 0.19786853 0.60568179 0.36603849 0.42335153
  0.17983766 0.71277869 0.92063546 0.78758077]
 [0.34864647 0.54255032 0.58589173 0.94692057 0.28321406 0.92