In [None]:
import time
import copy
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch

# Only adjust the cuDNN backend when a CUDA device is available: ask cuDNN to
# stick to deterministic algorithms so repeated GPU runs are more reproducible.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True


In [None]:
# Training settings
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
BATCH_SIZE = 64
random_seed = 123
learning_rate = 0.005
num_epochs = 10

# Dataset: both MNIST splits are stored under data/ and converted to tensors.
# Only the training split asks torchvision to download the files.
train_dataset = datasets.MNIST(
    root='data/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = datasets.MNIST(
    root='data/',
    train=False,
    transform=transforms.ToTensor(),
)

# Loaders: training batches are reshuffled, test batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# One line per fact, same text as three separate print() calls.
print(
    f"Device: {DEVICE}",
    f"Training samples: {len(train_dataset)}",
    f"Test samples: {len(test_dataset)}",
    sep="\n",
)


100%|██████████| 9.91M/9.91M [00:00<00:00, 16.1MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 479kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.42MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 5.77MB/s]

Device: cuda
Training samples: 60000
Test samples: 10000





In [None]:
# LoRA and DoRA Layer Implementations
class LoRALayer(nn.Module):
    """Low-rank branch that maps ``x`` to ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and is drawn from a standard normal
    scaled by ``1 / sqrt(rank)``. ``B`` has shape ``(rank, out_dim)`` and
    starts at zero, so a freshly constructed layer outputs zeros.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        rank: Inner dimension shared by ``A`` and ``B``.
        alpha: Constant multiplier applied to the low-rank product.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        rank_as_float = torch.tensor(rank).float()
        init_scale = 1 / rank_as_float.sqrt()
        # A is registered before B so parameter order and RNG use stay the same.
        self.A = nn.Parameter(torch.randn(in_dim, rank) * init_scale)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        down_projected = x @ self.A
        low_rank_out = down_projected @ self.B
        return self.alpha * low_rank_out

class LinearWithLoRA(nn.Module):
    """Linear layer whose weight is augmented by a LoRA update at call time.

    The wrapped ``linear`` is kept as a submodule and a ``LoRALayer`` with
    matching input/output sizes is attached as ``self.lora``. On every forward
    pass the effective weight is ``linear.weight + alpha * (A @ B).T`` and the
    original bias is reused.

    Args:
        linear: Layer to wrap; its ``in_features``, ``out_features``,
            ``weight`` and ``bias`` attributes are read.
        rank: Rank passed to the LoRA branch.
        alpha: Scaling passed to the LoRA branch.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features,
            linear.out_features,
            rank,
            alpha,
        )

    def forward(self, x):
        """Apply the merged (base + low-rank) weight to ``x``."""
        # A @ B is laid out (in, out); transpose it to match linear.weight.
        weight_delta = (self.lora.A @ self.lora.B).T
        merged_weight = self.linear.weight + self.lora.alpha * weight_delta
        return F.linear(x, merged_weight, self.linear.bias)

class LinearWithDoRA(nn.Module):
    """Linear layer with a DoRA-style magnitude/direction reparameterisation.

    The effective weight is ``m * V / ||V||`` where
    ``V = linear.weight + alpha * (A @ B).T`` and the L2 norm is taken over
    dimension 0 of ``V`` (kept as a broadcastable row). ``m`` is a learnable
    parameter initialised to that same norm of the wrapped layer's weight.

    Args:
        linear: Layer to wrap; its ``in_features``, ``out_features``,
            ``weight`` and ``bias`` attributes are read.
        rank: Rank passed to the LoRA branch.
        alpha: Scaling passed to the LoRA branch.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        # Magnitude starts at the norm of the wrapped weight along dim 0.
        initial_magnitude = self.linear.weight.norm(p=2, dim=0, keepdim=True)
        self.m = nn.Parameter(initial_magnitude)

    def forward(self, x):
        """Apply the re-normalised, magnitude-scaled weight to ``x``."""
        weight_delta = (self.lora.A @ self.lora.B).T
        adapted_weight = self.linear.weight + self.lora.alpha * weight_delta
        # Normalise along dim 0, then rescale by the learnable magnitude.
        norms = adapted_weight.norm(p=2, dim=0, keepdim=True)
        direction = adapted_weight / norms
        rescaled_weight = self.m * direction
        return F.linear(x, rescaled_weight, self.linear.bias)

# MLP Architecture
class MultilayerPerceptron(nn.Module):
    """Fully connected classifier: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The stack is stored in ``self.layers`` with the linear layers at indices
    0, 2 and 4.

    Args:
        num_features: Number of values per flattened input example.
        num_hidden_1: Width of the first hidden layer.
        num_hidden_2: Width of the second hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, num_features=784, num_hidden_1=128, num_hidden_2=256, num_classes=10):
        super().__init__()
        # Linear layers are created in input-to-output order.
        first_hidden = nn.Linear(num_features, num_hidden_1)
        second_hidden = nn.Linear(num_hidden_1, num_hidden_2)
        output_layer = nn.Linear(num_hidden_2, num_classes)
        self.layers = nn.Sequential(
            first_hidden,
            nn.ReLU(),
            second_hidden,
            nn.ReLU(),
            output_layer,
        )

    def forward(self, x):
        """Flatten everything after the batch dimension and return logits."""
        batch_size = x.size(0)
        flattened = x.view(batch_size, -1)  # Flatten
        return self.layers(flattened)

print("MLP architecture defined")


MLP architecture defined


In [None]:
class SimpleCNN(nn.Module):
    """Two conv/pool stages followed by a three-layer fully connected head.

    ``self.features`` holds the convolutional part; ``self.classifier`` holds
    ``Flatten`` plus the linear head, with the linear layers at indices 1, 3
    and 5. The first linear layer expects ``64 * 7 * 7`` inputs, which is what
    the feature extractor produces for single-channel 28x28 images.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        first_stage = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),  # 28x28 -> 28x28
            nn.ReLU(),
            nn.MaxPool2d(2),  # 28x28 -> 14x14
        ]
        second_stage = [
            nn.Conv2d(32, 64, kernel_size=3, padding=1),  # 14x14 -> 14x14
            nn.ReLU(),
            nn.MaxPool2d(2),  # 14x14 -> 7x7
        ]
        self.features = nn.Sequential(*first_stage, *second_stage)

        head = [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run the feature extractor, then the classifier head."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)

print("CNN architecture defined")


CNN architecture defined


In [None]:
class SimpleAttention(nn.Module):
    """Scaled dot-product self-attention with learned q/k/v/output projections.

    Args:
        dim: Size of the last input dimension; all four projections are
            ``dim -> dim``.
        num_heads: Number of heads; each head gets ``dim // num_heads``
            channels.
    """

    def __init__(self, dim, num_heads=1):
        super().__init__()
        self.num_heads = num_heads
        self.dim = dim
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5

        # Created and registered in the order q, k, v, out.
        for proj_name in ("q_proj", "k_proj", "v_proj", "out_proj"):
            setattr(self, proj_name, nn.Linear(dim, dim))

    def _split_heads(self, projected, batch, tokens):
        """Reshape ``(batch, tokens, dim)`` to ``(batch, heads, tokens, head_dim)``."""
        per_head = projected.reshape(batch, tokens, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Attend over the second dimension of a ``(B, N, C)`` input."""
        batch, tokens, channels = x.shape

        queries = self._split_heads(self.q_proj(x), batch, tokens)
        keys = self._split_heads(self.k_proj(x), batch, tokens)
        values = self._split_heads(self.v_proj(x), batch, tokens)

        scores = (queries @ keys.transpose(-2, -1)) * self.scale
        weights = F.softmax(scores, dim=-1)

        # Put the heads back next to each other before the output projection.
        merged = (weights @ values).transpose(1, 2).reshape(batch, tokens, channels)
        return self.out_proj(merged)

class SimpleAttentionModel(nn.Module):
    """Patch-based classifier: embed patches, attend once, average, classify.

    The image is cut into non-overlapping ``patch_size x patch_size`` tiles
    along dimensions 2 and 3, each tile is linearly embedded, a learned
    positional embedding is added, one ``SimpleAttention`` block is applied,
    and the mean over tokens feeds a LayerNorm + two-layer head.

    Args:
        patch_size: Side length of each square patch.
        embed_dim: Width of the patch embedding and of the attention block.
        num_classes: Number of output logits.
    """

    def __init__(self, patch_size=4, embed_dim=128, num_classes=10):
        super().__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        # Patch count assumes a 28-pixel image side: 7 * 7 = 49 for patch_size=4.
        patches_per_side = 28 // patch_size
        self.num_patches = patches_per_side ** 2

        pixels_per_patch = patch_size * patch_size
        self.patch_embed = nn.Linear(pixels_per_patch, embed_dim)
        self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches, embed_dim))
        self.attention = SimpleAttention(embed_dim, num_heads=1)

        head = [
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits for a batch of images."""
        batch = x.shape[0]
        side = self.patch_size

        # Convert to patches: tile dims 2 and 3, then flatten each tile.
        tiles = x.unfold(2, side, side).unfold(3, side, side)
        tokens = tiles.contiguous().view(batch, -1, side * side)

        # Patch embedding + positional embedding
        tokens = self.patch_embed(tokens) + self.pos_embed

        # Attention, global average pooling over tokens, classification
        attended = self.attention(tokens)
        pooled = attended.mean(dim=1)
        return self.classifier(pooled)

print("Attention architecture defined")


Attention architecture defined


In [None]:
# Training and evaluation functions
def compute_accuracy(model, data_loader, device, is_cnn=False):
    """Return the top-1 accuracy of ``model`` over ``data_loader``, in percent.

    The model is evaluated in eval mode under ``torch.no_grad()``; whatever
    train/eval mode it was in before the call is restored afterwards, so
    calling this in the middle of training does not leave the model in eval
    mode.

    Args:
        model: Module mapping a batch of features to ``(batch, classes)`` logits.
        data_loader: Iterable yielding ``(features, targets)`` batches.
        device: Device the batches are moved to before the forward pass.
        is_cnn: When False, each example is flattened to a vector before being
            passed to the model; when True, batches are passed through
            unchanged.

    Returns:
        A 0-dim float tensor on ``device`` holding the accuracy in [0, 100].

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    was_training = model.training
    model.eval()
    # Accumulate on the target device; also keeps the result a tensor even
    # when the loop body never runs.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0
    try:
        with torch.no_grad():
            for features, targets in data_loader:
                if not is_cnn:
                    # Flatten per example instead of assuming 28*28 values.
                    features = features.flatten(start_dim=1)
                features = features.to(device)
                targets = targets.to(device)

                predicted_labels = model(features).argmax(dim=1)
                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    if num_examples == 0:
        raise ValueError("compute_accuracy received a data_loader with no examples")
    return correct_pred.float() / num_examples * 100

def train_model(model, train_loader, device, num_epochs, learning_rate, model_name, is_cnn=False,
                eval_loader=None):
    """Train ``model`` with Adam and cross-entropy, then report held-out accuracy.

    Progress is printed every 400 batches, training accuracy and elapsed time
    after every epoch, and the held-out accuracy once at the end.

    Args:
        model: Module to train; moved to ``device`` in place.
        train_loader: Iterable of ``(features, targets)`` training batches.
        device: Device used for the model and every batch.
        num_epochs: Number of full passes over ``train_loader``.
        learning_rate: Learning rate handed to ``torch.optim.Adam``.
        model_name: Label used in the printed progress lines.
        is_cnn: When False, each example is flattened to a vector before the
            forward pass; when True, batches are passed through unchanged.
        eval_loader: Loader used for the final accuracy. Defaults to the
            notebook-level ``test_loader`` when omitted.

    Returns:
        The final accuracy (percent) as returned by ``compute_accuracy``.
    """
    if eval_loader is None:
        # Backward-compatible default: the loader defined in the settings cell.
        eval_loader = test_loader

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print(f"Training {model_name}...")
    start_time = time.time()

    for epoch in range(num_epochs):
        # The accuracy check at the end of each epoch puts the model in eval
        # mode, so training mode has to be re-enabled here, not once up front.
        model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):
            if not is_cnn:
                features = features.flatten(start_dim=1)  # Flatten for MLP
            features = features.to(device)
            targets = targets.to(device)

            logits = model(features)
            loss = F.cross_entropy(logits, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 400 == 0:
                print(f'Epoch: {epoch+1:03d}/{num_epochs:03d} | Batch {batch_idx:03d}/{len(train_loader):03d} | Loss: {loss.item():.4f}')

        # compute_accuracy already disables gradient tracking internally.
        train_acc = compute_accuracy(model, train_loader, device, is_cnn)
        print(f'Epoch: {epoch+1:03d}/{num_epochs:03d} training accuracy: {train_acc:.2f}%')

        print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

    print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')

    # Leave the model in eval mode once training is finished.
    model.eval()

    # Test accuracy
    test_acc = compute_accuracy(model, eval_loader, device, is_cnn)
    # A real newline separates consecutive runs (previously a literal "\n" was printed).
    print(f'{model_name} test accuracy: {test_acc:.2f}%\n')
    return test_acc


In [None]:
# Create and train baseline models
torch.manual_seed(random_seed)

# Positional arguments shared by every baseline run, in train_model order:
# (train_loader, device, num_epochs, learning_rate).
shared_train_args = (train_loader, DEVICE, num_epochs, learning_rate)

# MLP: batches are flattened before the forward pass.
mlp_model = MultilayerPerceptron()
mlp_acc = train_model(mlp_model, *shared_train_args, "MLP Baseline", is_cnn=False)

# CNN: batches keep their image shape.
cnn_model = SimpleCNN()
cnn_acc = train_model(cnn_model, *shared_train_args, "CNN Baseline", is_cnn=True)

# Attention: is_cnn=True only disables flattening, so the model can cut the
# image-shaped batch into patches itself.
attn_model = SimpleAttentionModel()
attn_acc = train_model(attn_model, *shared_train_args, "Attention Baseline", is_cnn=True)


Training MLP Baseline...
Epoch: 001/010 | Batch 000/938 | Loss: 2.2971
Epoch: 001/010 | Batch 400/938 | Loss: 0.1529
Epoch: 001/010 | Batch 800/938 | Loss: 0.1094
Epoch: 001/010 training accuracy: 96.01%
Time elapsed: 0.28 min
Epoch: 002/010 | Batch 000/938 | Loss: 0.1192
Epoch: 002/010 | Batch 400/938 | Loss: 0.0593
Epoch: 002/010 | Batch 800/938 | Loss: 0.0806
Epoch: 002/010 training accuracy: 97.23%
Time elapsed: 0.52 min
Epoch: 003/010 | Batch 000/938 | Loss: 0.2192
Epoch: 003/010 | Batch 400/938 | Loss: 0.0174
Epoch: 003/010 | Batch 800/938 | Loss: 0.0418
Epoch: 003/010 training accuracy: 98.11%
Time elapsed: 0.77 min
Epoch: 004/010 | Batch 000/938 | Loss: 0.0389
Epoch: 004/010 | Batch 400/938 | Loss: 0.1433
Epoch: 004/010 | Batch 800/938 | Loss: 0.1529
Epoch: 004/010 training accuracy: 98.16%
Time elapsed: 1.02 min
Epoch: 005/010 | Batch 000/938 | Loss: 0.0858
Epoch: 005/010 | Batch 400/938 | Loss: 0.1327
Epoch: 005/010 | Batch 800/938 | Loss: 0.0151
Epoch: 005/010 training accur

In [None]:
def apply_adaptations(model, adaptation_type, rank=4, alpha=8):
    """Return a deep copy of ``model`` with its linear layers wrapped in LoRA or DoRA.

    The original ``model`` is never modified. Which layers are wrapped depends
    on the model class:

    * ``MultilayerPerceptron``: ``layers[0]``, ``layers[2]``, ``layers[4]``.
    * ``SimpleCNN``: ``classifier[1]``, ``classifier[3]``, ``classifier[5]``.
    * ``SimpleAttentionModel``: the four attention projections (with doubled
      ``rank`` and ``alpha``), ``patch_embed``, ``classifier[1]`` and
      ``classifier[3]``.

    Args:
        model: Trained base model to adapt.
        adaptation_type: ``'lora'`` or ``'dora'`` (case-insensitive).
        rank: Rank of the low-rank update.
        alpha: Scaling of the low-rank update.

    Returns:
        The adapted copy.

    Raises:
        ValueError: If ``adaptation_type`` is neither ``'lora'`` nor ``'dora'``.
            Previously every other value silently selected DoRA.
        TypeError: If ``model`` is not one of the supported architectures.
            Previously an unadapted copy was returned without warning.
    """
    # Validate first so a typo never silently selects the wrong adapter.
    kind = adaptation_type.lower()
    if kind not in ('lora', 'dora'):
        raise ValueError(
            f"adaptation_type must be 'lora' or 'dora', got {adaptation_type!r}"
        )
    wrapper_cls = LinearWithLoRA if kind == 'lora' else LinearWithDoRA

    def wrap(linear, factor=1):
        """Wrap one linear layer, optionally scaling rank and alpha together."""
        return wrapper_cls(linear, rank * factor, alpha * factor)

    model_adapted = copy.deepcopy(model)

    if isinstance(model, MultilayerPerceptron):
        # Apply to linear layers in MLP
        for idx in (0, 2, 4):
            model_adapted.layers[idx] = wrap(model_adapted.layers[idx])

    elif isinstance(model, SimpleCNN):
        # Apply to classifier layers in CNN
        for idx in (1, 3, 5):
            model_adapted.classifier[idx] = wrap(model_adapted.classifier[idx])

    elif isinstance(model, SimpleAttentionModel):
        # Apply to attention projections (doubled rank/alpha) and classifier.
        # Wrapping order is kept as q, k, v, out, patch_embed, classifier so
        # the random initialisation of the adapters is drawn in the same order.
        attention = model_adapted.attention
        for proj_name in ('q_proj', 'k_proj', 'v_proj', 'out_proj'):
            setattr(attention, proj_name, wrap(getattr(attention, proj_name), factor=2))
        model_adapted.patch_embed = wrap(model_adapted.patch_embed)
        for idx in (1, 3):
            model_adapted.classifier[idx] = wrap(model_adapted.classifier[idx])

    else:
        raise TypeError(
            f"apply_adaptations does not support models of type {type(model).__name__}"
        )

    return model_adapted

def freeze_base_parameters(model, adaptation_type):
    """Set ``requires_grad`` so that only adapter parameters are trained.

    For ``adaptation_type`` ``'lora'`` or ``'dora'`` (case-insensitive) a
    parameter stays trainable only if it is

    * ``A`` or ``B`` of a submodule named ``lora``, or
    * a parameter named exactly ``m`` (the DoRA magnitude).

    Every other parameter is frozen. For any other ``adaptation_type`` all
    parameters are made trainable.

    Matching is done on whole dotted-name components. A plain substring test
    for ``'.m'`` would also match unrelated paths such as ``head.mlp.weight``
    and leave base weights trainable.

    Args:
        model: Module whose parameters are updated in place.
        adaptation_type: Name of the adaptation scheme.
    """
    adapters_only = adaptation_type.lower() in ('lora', 'dora')
    lora_suffixes = (['lora', 'A'], ['lora', 'B'])

    for name, param in model.named_parameters():
        if not adapters_only:
            param.requires_grad = True
            continue

        parts = name.split('.')
        is_lora_factor = parts[-2:] in lora_suffixes
        is_magnitude = parts[-1] == 'm'
        param.requires_grad = is_lora_factor or is_magnitude


In [None]:
# Apply LoRA and DoRA to all models
print("Applying adaptations and fine-tuning...\n")

results = {}


def _adapt_pair(base_model):
    """Build the LoRA and DoRA copies of ``base_model`` and freeze their base weights."""
    lora_model = apply_adaptations(base_model, 'lora', rank=4, alpha=8)
    dora_model = apply_adaptations(base_model, 'dora', rank=4, alpha=8)
    freeze_base_parameters(lora_model, 'lora')
    freeze_base_parameters(dora_model, 'dora')
    return lora_model, dora_model


def _finetune(adapted_model, label, is_cnn):
    """Fine-tune one adapted model with the notebook-wide training settings."""
    return train_model(adapted_model, train_loader, DEVICE, num_epochs, learning_rate, label, is_cnn=is_cnn)


# MLP with LoRA/DoRA
mlp_lora, mlp_dora = _adapt_pair(mlp_model)
results['mlp_lora'] = _finetune(mlp_lora, "MLP + LoRA", is_cnn=False)
results['mlp_dora'] = _finetune(mlp_dora, "MLP + DoRA", is_cnn=False)

# CNN with LoRA/DoRA
cnn_lora, cnn_dora = _adapt_pair(cnn_model)
results['cnn_lora'] = _finetune(cnn_lora, "CNN + LoRA", is_cnn=True)
results['cnn_dora'] = _finetune(cnn_dora, "CNN + DoRA", is_cnn=True)

# Attention with LoRA/DoRA (is_cnn=True keeps the image shape for patching)
attn_lora, attn_dora = _adapt_pair(attn_model)
results['attn_lora'] = _finetune(attn_lora, "Attention + LoRA", is_cnn=True)
results['attn_dora'] = _finetune(attn_dora, "Attention + DoRA", is_cnn=True)


Applying adaptations and fine-tuning...

Training MLP + LoRA...
Epoch: 001/010 | Batch 000/938 | Loss: 0.0048
Epoch: 001/010 | Batch 400/938 | Loss: 0.0304
Epoch: 001/010 | Batch 800/938 | Loss: 0.2098
Epoch: 001/010 training accuracy: 98.29%
Time elapsed: 0.26 min
Epoch: 002/010 | Batch 000/938 | Loss: 0.0198
Epoch: 002/010 | Batch 400/938 | Loss: 0.0164
Epoch: 002/010 | Batch 800/938 | Loss: 0.0015
Epoch: 002/010 training accuracy: 98.86%
Time elapsed: 0.53 min
Epoch: 003/010 | Batch 000/938 | Loss: 0.0142
Epoch: 003/010 | Batch 400/938 | Loss: 0.0454
Epoch: 003/010 | Batch 800/938 | Loss: 0.0057
Epoch: 003/010 training accuracy: 99.02%
Time elapsed: 0.79 min
Epoch: 004/010 | Batch 000/938 | Loss: 0.0592
Epoch: 004/010 | Batch 400/938 | Loss: 0.0063
Epoch: 004/010 | Batch 800/938 | Loss: 0.0117
Epoch: 004/010 training accuracy: 99.11%
Time elapsed: 1.05 min
Epoch: 005/010 | Batch 000/938 | Loss: 0.0011
Epoch: 005/010 | Batch 400/938 | Loss: 0.1442
Epoch: 005/010 | Batch 800/938 | Los

In [None]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Only parameters with ``requires_grad`` set are counted, so frozen weights
    do not contribute.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# One row per architecture: display title, key prefix used in `results`,
# baseline accuracy, and the baseline / LoRA / DoRA models to count.
report_rows = [
    ("MLP", "mlp", mlp_acc, mlp_model, mlp_lora, mlp_dora),
    ("CNN", "cnn", cnn_acc, cnn_model, cnn_lora, cnn_dora),
    ("Attention", "attn", attn_acc, attn_model, attn_lora, attn_dora),
]

print("=== Final Results Summary ===")
print()

print("📊 PERFORMANCE COMPARISON:")
print()

for title, key, baseline_acc, base_net, lora_net, dora_net in report_rows:
    print(f"{title} Results:")
    print(f"  Baseline:     {baseline_acc:.2f}%")
    print(f"  + LoRA:       {results[key + '_lora']:.2f}%")
    print(f"  + DoRA:       {results[key + '_dora']:.2f}%")
    print()

print("🔧 PARAMETER COUNTS:")
print()

# The baseline count is of trainable parameters too; nothing is frozen there.
for title, key, baseline_acc, base_net, lora_net, dora_net in report_rows:
    print(f"{title}:")
    print(f"  Baseline:     {count_parameters(base_net):,} parameters")
    print(f"  + LoRA:       {count_parameters(lora_net):,} trainable parameters")
    print(f"  + DoRA:       {count_parameters(dora_net):,} trainable parameters")
    print()

print("✅ SUMMARY:")
print("DoRA consistently shows competitive or superior performance to LoRA")
print("across all three architectures (MLP, CNN, Attention) while maintaining")
print("similar parameter efficiency through low-rank adaptation.")

=== Final Results Summary ===

📊 PERFORMANCE COMPARISON:

MLP Results:
  Baseline:     97.07%
  + LoRA:       97.32%
  + DoRA:       97.66%

CNN Results:
  Baseline:     98.76%
  + LoRA:       98.55%
  + DoRA:       98.93%

Attention Results:
  Baseline:     78.25%
  + LoRA:       38.91%
  + DoRA:       78.51%

🔧 PARAMETER COUNTS:

MLP:
  Baseline:     136,074 parameters
  + LoRA:       6,248 trainable parameters
  + DoRA:       7,416 trainable parameters

CNN:
  Baseline:     429,258 parameters
  + LoRA:       14,120 trainable parameters
  + DoRA:       17,448 trainable parameters

Attention:
  Baseline:     83,658 parameters
  + LoRA:       9,832 trainable parameters
  + DoRA:       10,552 trainable parameters

✅ SUMMARY:
DoRA consistently shows competitive or superior performance to LoRA
across all three architectures (MLP, CNN, Attention) while maintaining
similar parameter efficiency through low-rank adaptation.
