In [2]:
import csv
import time
import json
import warnings
from functools import partial
warnings.filterwarnings('always')

import torch
import numpy as np
import pandas as pd
torch.manual_seed(0)
from torch import nn
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

from lora import *
from datasets import GLUEDatasetRoberta

In [None]:
# NOTE(review): scratch cell that was never executed (its execution count is empty).
# Unlike the call in the "Prepare model" cell, no checkpoint name is passed here,
# so this presumably raises on a fresh Restart & Run All — confirm and remove.
RobertaForSequenceClassification.from_pretrained()

We follow the LoRA-FA experimental setting: the paper (see Section 2.2) applies LoRA to all four linear layers in multi-head attention — the three projections that produce $Q$, $K$ and $V$, and the final output projection.

<center>
<img src="imgs/mha.png" width=300 height=400/>
</center>

In [2]:
def construct_lorafa_config(model, rank, init_method='svd'):
    """Build the per-layer LoRA-FA parametrization config for attention linears.

    Scans ``model.named_modules()`` and selects every ``nn.Linear`` whose
    qualified name contains ``'.attention.'`` and whose last dotted component
    is one of ``dense``, ``query``, ``key`` or ``value``. Module names are
    unique within the hierarchy, so they are used as the config keys.

    Args:
        model: module whose submodules are scanned.
        rank: forwarded as ``rank`` to ``LoRAFAParametrization.from_linear``.
        init_method: forwarded as ``init_method`` (defaults to ``'svd'``).

    Returns:
        dict mapping each selected module name to
        ``{nn.Linear: {"weight": partial(LoRAFAParametrization.from_linear, ...)}}``.
    """
    target_leaf_names = ('dense', 'query', 'key', 'value')

    def is_attention_projection(layer_name, layer):
        # Only linear layers that live under an attention sub-module qualify.
        if not isinstance(layer, nn.Linear):
            return False
        if '.attention.' not in layer_name:
            return False
        return layer_name.split('.')[-1] in target_leaf_names

    return {
        layer_name: {
            nn.Linear: {
                "weight": partial(
                    LoRAFAParametrization.from_linear,
                    rank=rank,
                    init_method=init_method,
                    # the layer's own weight is handed over so an SVD-style
                    # initialization can use it
                    original_weights=layer.weight,
                ),
            }
        }
        for layer_name, layer in model.named_modules()
        if is_attention_projection(layer_name, layer)
    }

# Fine-tuning stage

## MRPC

Hyperparameters from the LoRA-FA paper:

<center>
<img src="imgs/MRPC hyperparams.png" width=1000/>
</center>

### Prepare model

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Load the pretrained roberta-base tokenizer and a sequence-classification model
# built on the same checkpoint (the classifier head is newly initialized, as the
# warning printed below this cell reports).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# example: a sanity-check forward pass on three sentence pairs
text1 = ["Simple text", "Cat", "House"]
text2 = ["Hard text", "Dog", "home"]

# Each text1[i] is encoded together with text2[i] as one pair input.
encoded_pair = tokenizer(
    text1,
    text2,
    padding='max_length',
    truncation=True,
    max_length=128,  # Maximum sequence length, adjust as needed
    return_tensors='pt'
)

# The last expression is displayed as the cell output: one row of logits per pair.
output = model(**encoded_pair)
output.logits

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[0.1352, 0.0986],
        [0.1422, 0.1042],
        [0.1397, 0.1045]], grad_fn=<AddmmBackward0>)

In [5]:
def freeze_nonlora(model):
    """Turn off gradients for every parameter that is neither LoRA nor classifier.

    A parameter is left untouched when ``name_is_lora(name)`` is true or when
    its name contains ``'classifier'`` (the original final layer is not frozen).
    The model is modified in place; nothing is returned.
    """
    for param_name, param in model.named_parameters():
        keep_as_is = name_is_lora(param_name) or 'classifier' in param_name
        if keep_as_is:
            continue
        param.requires_grad = False

In [6]:
def get_hot_parameters(model):
    """Yield the parameters of ``model`` that currently require gradients.

    Parameters are produced lazily, in ``model.named_parameters()`` order.
    """
    yield from (
        tensor
        for _name, tensor in model.named_parameters()
        if tensor.requires_grad
    )

In [7]:
# LoRA rank used for every selected attention layer.
rank = 2
# Build the per-layer config; init_method is 'kaiming' for this run ('svd' is the alternative).
lora_roberta_config = construct_lorafa_config(model, rank=rank, init_method='kaiming') # kaiming, svd
# To inspect which layers were selected: list(lora_roberta_config.keys())
# presumably attaches the parametrizations to `model` in place (helper comes from
# the local `lora` module; its return value is not used here) — TODO confirm
add_lora_by_layer_names(model, lora_roberta_config)
# Freeze everything except LoRA parameters and the classifier.
freeze_nonlora(model)

model = model.to(device)

# Single optimizer parameter group holding only the parameters that still
# require gradients after freezing.
parameters = [
    {"params": list(get_hot_parameters(model))},
]

### Prepare data

In [8]:
# MRPC train/test split files, located relative to the notebook.
data_dir = 'data/MRPC'
train_file = f'{data_dir}/msr_paraphrase_train.txt'
test_file = f'{data_dir}/msr_paraphrase_test.txt'

In [10]:
# Both splits are built the same way: same tokenizer, same 'mrpc' benchmark setting.
train_dataset, test_dataset = (
    GLUEDatasetRoberta(split_file, tokenizer, benchmark='mrpc')
    for split_file in (train_file, test_file)
)

In [11]:
batch_size = 64
# Only the training loader shuffles; the test loader iterates in dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

### Train

In [12]:
# Schedule length: one optimizer step per training batch, for n_epoch epochs;
# the first warmup_ratio fraction of those steps is warmup.
n_epoch, warmup_ratio = 30, 0.06
n_steps = n_epoch * len(train_loader)
warmup_steps = n_steps * warmup_ratio

def lr_lambda(current_step):
    """Learning-rate multiplier: linear warmup followed by linear decay to zero.

    Reads the module-level ``warmup_steps`` and ``n_steps``.

    Args:
        current_step: number of scheduler steps taken so far (0-based).

    Returns:
        A float in ``[0.0, 1.0]`` that ``LambdaLR`` multiplies with the base
        learning rate.
    """
    if current_step < warmup_steps:
        # +1 keeps step 0 from running at a zero learning rate; the cap keeps
        # the ramp from overshooting the peak when warmup_steps is fractional.
        return min(1.0, (current_step + 1) / max(1, warmup_steps))
    # Clamp at zero so stepping past n_steps can never yield a negative rate.
    remaining_steps = n_steps - current_step
    return max(0.0, remaining_steps / max(1, n_steps - warmup_steps))

# Cross-entropy over the classifier logits is the training and validation loss.
criteria = nn.CrossEntropyLoss()
# AdamW updates only the parameter group collected above; the base learning
# rate of 4e-4 is scaled every step by lr_lambda.
optimizer = torch.optim.AdamW(params=parameters, lr=4e-4)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lr_lambda)

In [13]:
pbar = tqdm(total=n_steps, desc='Training', position=0)
pbar_test = tqdm(total=len(test_loader), desc='Validating', position=1)

# train_loss gets one entry per epoch; val_loss / val_acc get one entry only for
# the epochs in which validation actually runs.
training_history= {'train_loss': [],
                   'val_loss': [],
                   'val_acc': [],}

# Iterate over n_epoch rather than a literal so the loop length always matches
# n_steps, which sizes both the LR schedule and the training progress bar.
for epoch in range(n_epoch):

    step_loss = []
    model.train()
    for input_ids, attention_mask, label in train_loader: # training

        output = model(input_ids.to(device), attention_mask.to(device))
        loss = criteria(output.logits, label.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step() # the schedule advances once per batch, not per epoch

        step_loss.append(loss.item())

        pbar.update()
    train_loss = np.mean(step_loss)

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> VALIDATION AND PROGRESS BARS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #

    # Validate every 5th epoch, and every epoch once past 60% of training.
    if (epoch + 1) % 5 == 0 or epoch > 0.6 * n_epoch: # validation
        # Rewind the shared validation bar; a no-op on the first pass, so no
        # special-casing of the first validation epoch is needed.
        pbar_test.reset()
        step_loss = []
        total = 0
        correct = 0
        model.eval()
        # Evaluation needs no gradients; skipping the autograd graph saves memory.
        with torch.no_grad():
            for input_ids, attention_mask, label in test_loader:
                output = model(input_ids.to(device), attention_mask.to(device))
                loss = criteria(output.logits, label.to(device))
                step_loss.append(loss.item())

                # Predicted class = index of the largest logit in each row.
                _, preds = torch.max(output.logits, 1)
                total += label.size(0)
                correct += (preds.cpu() == label).sum().item()
                pbar_test.update()

        val_loss = np.mean(step_loss)
        val_acc = correct / total
        training_history['val_loss'].append(val_loss)
        training_history['val_acc'].append(val_acc)
        pbar_test.set_postfix({'val_loss': val_loss,
                               'val_acc': val_acc,})

    pbar.set_postfix({'train_loss': train_loss})
    training_history['train_loss'].append(train_loss)

pbar.close()
pbar_test.close()

Training:   0%|          | 0/7650 [00:00<?, ?it/s]

Validating:   0%|          | 0/108 [00:00<?, ?it/s]

In [None]:
# Persist the collected loss/accuracy history as JSON next to the notebook.
with open('history.kaiming.json', 'w') as history_file:
    json.dump(training_history, fp=history_file)