# Implementing Low-Rank Adaptation (LoRA) from Scratch
---

### [tutorial by Sebastian Raschka](https://lightning.ai/lightning-ai/studios/code-lora-from-scratch?view=public&section=all)

In [1]:
!pip install evaluate -q

In [2]:
import numpy as np
import torch
import torch.nn as nn
from functools import partial
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import evaluate



In [3]:
# Hugging Face Hub identifier of the pretrained checkpoint; reused below for
# both the tokenizer and the sequence-classification model.
model_name = 'microsoft/deberta-v3-base'

In [4]:
# Download (or load from the local cache) the IMDB dataset; the "train" and
# "test" splits are the ones used for training and evaluation further down.
imdb = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Tokenizer matching the checkpoint in model_name, with its maximum sequence
# length set to 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(model_name,model_max_length=512)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [6]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize every split; batched=True hands preprocess_function a batch of
# examples per call instead of a single example.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [7]:
# Collator handed to the Trainer below; it uses the tokenizer to pad the
# examples of each batch (no padding is applied in preprocess_function).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Accuracy metric object shared by every compute_metrics call.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions are
            reduced with ``argmax`` over axis 1 (presumably per-class
            scores of shape ``(num_examples, num_labels)`` -- confirm
            against the Trainer).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [9]:
# Class index -> human-readable label, and the inverse mapping derived from it
# so the two can never drift apart.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {label: index for index, label in id2label.items()}

# Adapting Model with LoRA

In [10]:
# Pretrained encoder with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Parameter count of the model before any LoRA layers are added.
total_params = sum(param.numel() for param in model.parameters())

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## LoRA Config

In [11]:
# Settings consumed when building and attaching the LoRA layers below.
lora_config = {
    # Inner dimension of the low-rank factors A and B.
    "rank": 4,
    # Multiplier applied to the low-rank branch output.
    "alpha": 8,
    # Dropout probability applied to the low-rank branch output.
    "lora_dropout": 0.05,
    # Names of the self-attention projections that get wrapped.
    "modules": [
        "query_proj",
        "value_proj",
    ],
}

## LoRA layer to adapt nn.Linear

In [12]:
class LoRALinear(nn.Module):
    """Wrap an ``nn.Linear`` layer and add a trainable low-rank branch.

    The forward pass computes::

        linear(x) + dropout(alpha * (x @ A @ B))

    ``A`` has shape ``(in_features, rank)`` and is initialised from a normal
    distribution scaled by ``1 / sqrt(rank)``; ``B`` has shape
    ``(rank, out_features)`` and is initialised to zeros, so a freshly
    wrapped layer returns exactly the output of the original layer.

    Args:
        linear_layer: The layer to adapt. It is stored as-is; this class
            does not touch ``requires_grad`` on its parameters.
        rank: Inner dimension of the low-rank factors.
        alpha: Plain multiplier applied to the low-rank branch (it is not
            divided by ``rank``).
        lora_dropout: Dropout probability applied to the output of the
            low-rank branch.
    """

    def __init__(self, linear_layer: nn.Linear, rank: int, alpha: float,
                 lora_dropout: float = 0.):
        super().__init__()

        self.linear = linear_layer

        self.in_features = self.linear.in_features
        self.out_features = self.linear.out_features
        self.rank = rank
        self.alpha = alpha

        # Create the LoRA factors with the dtype and on the device of the
        # wrapped weight. Otherwise wrapping a layer that is already
        # half/double precision, or already on an accelerator, fails with a
        # dtype/device mismatch on the first forward pass.
        base_weight = self.linear.weight
        factory_kwargs = {'device': base_weight.device, 'dtype': base_weight.dtype}

        std_dev = 1 / torch.sqrt(torch.tensor(self.rank).float())

        self.A = nn.Parameter(
            torch.randn(self.in_features, self.rank, **factory_kwargs) * std_dev
        )
        self.B = nn.Parameter(
            torch.zeros(self.rank, self.out_features, **factory_kwargs)
        )
        self.dropout = nn.Dropout(lora_dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the wrapped layer's output plus the scaled low-rank update."""
        base_out = self.linear(x)
        lora_out = self.alpha * (x @ self.A @ self.B)
        # Dropout acts on the low-rank branch only; the base output is untouched.
        lora_out = self.dropout(lora_out)
        return base_out + lora_out

In [13]:
# apply_lora(layer) wraps a single linear layer in LoRALinear, with the
# hyper-parameters from lora_config already bound.
apply_lora = partial(
    LoRALinear,
    **{key: lora_config[key] for key in ('rank', 'alpha', 'lora_dropout')},
)

In [14]:
# Freeze every existing weight, then re-enable training for the
# classification head only.
# NOTE(review): the checkpoint-loading warning also lists pooler.dense.* as
# newly initialized; those weights stay frozen here -- confirm this is intended.
model.requires_grad_(False)
model.classifier.requires_grad_(True)

# Replace the selected self-attention projections of each encoder layer with
# their LoRA-wrapped versions. The wrappers are created after the freeze
# above, so their new A/B parameters are not affected by it.
selected_modules = lora_config['modules']
for encoder_layer in model.deberta.encoder.layer:
    self_attention = encoder_layer.attention.self
    for proj_name in ('query_proj', 'key_proj', 'value_proj'):
        if proj_name in selected_modules:
            wrapped = apply_lora(getattr(self_attention, proj_name))
            setattr(self_attention, proj_name, wrapped)

In [15]:
# Total parameter count after the LoRA layers were attached ...
lora_model_params = sum(param.numel() for param in model.parameters())
# ... and the subset of it that will receive gradient updates.
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)

In [16]:
# Report the parameter counts before LoRA, after LoRA, and the trainable
# subset, each formatted with thousands separators.
print(f'{total_params=:,} {lora_model_params=:,} {trainable_params=:,}')

total_params=184,423,682 lora_model_params=184,571,138 trainable_params=148,994


# Training

In [17]:
# Training configuration: 3 epochs, batch size 4 per device for both training
# and evaluation, with evaluation and checkpointing once per epoch.
training_args = TrainingArguments(
    # Named after the checkpoint that is actually fine-tuned
    # (microsoft/deberta-v3-base); the previous name said "large".
    output_dir="deberta-v3-base-lora_imdb",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='none'
)

# The IMDB "test" split serves as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [18]:
# Run training; only parameters with requires_grad=True (the classifier head
# and the LoRA A/B matrices set up above) receive gradient updates.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2878,0.245164,0.94352
2,0.2366,0.24589,0.95036
3,0.1932,0.252232,0.95396




TrainOutput(global_step=9375, training_loss=0.26010551920572916, metrics={'train_runtime': 7806.1144, 'train_samples_per_second': 9.608, 'train_steps_per_second': 1.201, 'total_flos': 1.848516839291376e+16, 'train_loss': 0.26010551920572916, 'epoch': 3.0})