In [1]:
# Fetch the BERT_PEFT repository; presumably it provides the local `bert_model`
# and `lora` modules imported further down (TODO confirm against the repo).
# NOTE(review): the clone is not pinned to a commit or tag.
!git clone https://github.com/qangviet/BERT_PEFT.git

Cloning into 'BERT_PEFT'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (10/10), 6.64 KiB | 2.21 MiB/s, done.


In [2]:
# Switch the working directory to the clone, presumably so that its modules
# can be imported by plain name (assumes they sit at the repo root - TODO confirm).
%cd BERT_PEFT

/kaggle/working/BERT_PEFT


In [3]:
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from transformers import BertTokenizer
import tqdm
from sklearn.metrics import accuracy_score,  f1_score
from sklearn.model_selection import train_test_split
from datasets import load_dataset

from bert_model import BertForSequenceClassification as MyBert
from lora import (
    add_lora_layers,
    merge_lora_layers,
    freeze_model,
    unfreeze_model,
)

### Data Preprocessing

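Data: GLUE SST-2 (Stanford Sentiment Treebank, binary sentiment classification). Only the `train` split is used; a random 15,000-sentence subset of it is split 90/10 into training and validation sets.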

In [4]:
# Load GLUE/SST-2 and keep only its 'train' split; the bare name on the last
# line makes the notebook display the split's summary.
dataset = load_dataset('glue', 'sst2')['train']
dataset

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 3.11M/3.11M [00:00<00:00, 10.7MB/s]
Downloading data: 100%|██████████| 72.8k/72.8k [00:00<00:00, 408kB/s]
Downloading data: 100%|██████████| 148k/148k [00:00<00:00, 769kB/s]


Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [5]:
# Seed both RNGs used in this cell so the subset, the train/validation split
# and the training batch order are reproducible across kernel restarts:
# `random` drives the shuffle below, torch drives DataLoader(shuffle=True).
seed = 42
random.seed(seed)
torch.manual_seed(seed)

sample_size = 15000      # number of (sentence, label) pairs kept from the train split
train_fraction = 0.9     # share of the sampled pairs used for training
batch_size = 32

sentences = dataset['sentence']
labels = dataset['label']
combined_data = list(zip(sentences, labels))
random.shuffle(combined_data)
combined_data = combined_data[:sample_size]

# Derive the split point from what was actually sampled instead of repeating
# the literal 15000, so changing `sample_size` keeps the 90/10 ratio.
train_size = int(len(combined_data) * train_fraction)
train_dataset = combined_data[:train_size]
val_dataset = combined_data[train_size:]

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation needs no shuffling; a fixed order keeps the batch-averaged
# evaluation loss identical from one evaluation pass to the next.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### BERT Trainer

In [6]:
class BertTrainer:
    """Minimal train / evaluate loop for a sequence-classification model.

    Raw sentences are tokenized on the fly for every batch. After each pass
    over a data loader the average batch loss, macro F1 and accuracy are
    printed.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=...,
            attention_mask=...)`` whose output is fed to
            ``nn.CrossEntropyLoss`` as logits. It is moved to the selected
            device.
        tokenizer: Callable applied to the sentences of a batch; its result
            must provide ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` tensors.
        train_dataloader: Sized iterable of ``(sentences, labels)`` batches,
            where ``labels`` is a tensor.
        eval_dataloader: Optional loader with the same batch layout, used for
            evaluation passes.
        epochs: Number of passes over ``train_dataloader`` made by ``train``.
        lr: Learning rate handed to AdamW.
        max_length: Length every tokenized batch is padded / truncated to.
            Defaults to 256, the value that used to be hard-coded.
    """

    def __init__(self, model, tokenizer, train_dataloader, eval_dataloader=None,
                 epochs=1, lr=3e-5, max_length=256):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr, eps=1e-8)
        self.loss_fn = nn.CrossEntropyLoss()
        # The two attributes below are initialised but never updated by this
        # class; they are kept so existing code that reads them still works.
        self.eval_loss = float('inf')
        self.epochs = epochs
        self.epochs_best = 0
        self.max_length = max_length

    def train(self, evaluate=False):
        """Run ``self.epochs`` training passes.

        Args:
            evaluate: When true and an eval loader was supplied, run one
                evaluation pass after every training epoch.
        """
        for epoch in range(self.epochs):
            self.iteration(epoch, self.train_dataloader)
            if evaluate and self.eval_dataloader is not None:
                self.iteration(epoch, self.eval_dataloader, train=False)

    def evaluate(self):
        """Run a single evaluation pass over ``eval_dataloader``.

        Raises:
            ValueError: If the trainer was created without an eval loader.
        """
        if self.eval_dataloader is None:
            # Fail with a clear message instead of an opaque TypeError raised
            # from deep inside the loop.
            raise ValueError("evaluate() needs an eval_dataloader, but none was provided.")
        self.iteration(0, self.eval_dataloader, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Make one pass over ``data_loader`` and print its metrics.

        Args:
            epoch: Zero-based epoch index, used only for the progress label.
            data_loader: Sized iterable of ``(sentences, labels)`` batches.
            train: If true the model is put in training mode and optimized;
                otherwise it is put in eval mode and gradients are disabled.

        Returns:
            dict: ``{"loss": ..., "f1": ..., "accuracy": ...}`` for this pass,
            where ``loss`` is the mean of the per-batch losses.
        """
        loss_accumulated = 0.
        preds_all = []
        labels_all = []

        if train:
            self.model.train()
        else:
            self.model.eval()
        mode = 'train' if train else 'eval'

        batch_iter = tqdm.tqdm(
            data_loader,
            desc=f"Epoch ({mode}) {epoch + 1} / {self.epochs}",
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        # Autograd is only required when we are going to call backward();
        # switching it off for evaluation avoids building the graph, which
        # saves memory and time without changing the computed metrics.
        with torch.set_grad_enabled(bool(train)):
            for batch in batch_iter:
                # batch[0] holds the sentences, batch[1] the label tensor.
                batch_t = self.tokenizer(
                    list(batch[0]),
                    padding='max_length',
                    max_length=self.max_length,
                    truncation=True,
                    return_tensors='pt',
                )
                batch_t = {key: value.to(self.device) for key, value in batch_t.items()}
                input_labels = batch[1].to(self.device)

                logits = self.model(
                    input_ids=batch_t["input_ids"],
                    token_type_ids=batch_t["token_type_ids"],
                    attention_mask=batch_t['attention_mask'],
                )
                loss = self.loss_fn(logits, input_labels)

                if train:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                preds = logits.argmax(dim=-1)
                loss_accumulated += loss.item()
                preds_all.append(preds.detach())
                labels_all.append(input_labels.detach())

        preds_all = torch.cat(preds_all, dim=0).cpu()
        labels_all = torch.cat(labels_all, dim=0).cpu()

        accuracy = accuracy_score(labels_all, preds_all)
        f1 = f1_score(labels_all, preds_all, average='macro')
        avg_loss_epoch = loss_accumulated / len(data_loader)
        print("")
        print("  Avg Loss: {0:.4f}".format(avg_loss_epoch))
        print("  F1 Score: {0:.4f}".format(f1))
        print("  Accuracy: {0:.4f}".format(accuracy))

        return {"loss": avg_loss_epoch, "f1": f1, "accuracy": accuracy}
    

### Fine-tuning

#### BERT-base

In [7]:
# Tokenizer and classifier both start from the 'bert-base-uncased' checkpoint.
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base = MyBert.from_pretrained(
    model_type='bert-base-uncased',
    # Two output classes; max_seq_len matches the max_length=256 that
    # BertTrainer pads / truncates every batch to.
    config_args={
        "vocab_size": 30522, 'n_classes': 2, "max_seq_len": 256
    },
    # NOTE(review): presumably lets checkpoint weights be copied into tensors
    # of a different shape - confirm in bert_model.
    adaptive_weight_copy=True,
)
# Fall back to CPU when no GPU is present instead of crashing on a hard-coded
# 'cuda'; this mirrors the device selection done inside BertTrainer.
bert_base.to('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading weights from pretrained model: bert-base-uncased


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(256, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [8]:
# Full fine-tuning baseline: the model was not frozen, so the trainer's
# optimizer receives every parameter. Arguments are spelled out by keyword
# in the order of the BertTrainer signature.
trainer_bert_base = BertTrainer(
    model=bert_base,
    tokenizer=tokenizer_base,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    epochs=4,
    lr=5e-5,
)
# Evaluate on the validation loader after each of the 4 epochs.
trainer_bert_base.train(evaluate=True)

Epoch (train) 1 / 4: 100%|| 422/422 [05:27<00:00,  1.29it/s]



  Avg Loss: 0.2942
  F1 Score: 0.8790
  Accuracy: 0.8808


Epoch (eval) 1 / 4: 100%|| 47/47 [00:12<00:00,  3.63it/s]



  Avg Loss: 0.2158
  F1 Score: 0.9167
  Accuracy: 0.9187


Epoch (train) 2 / 4: 100%|| 422/422 [05:28<00:00,  1.28it/s]



  Avg Loss: 0.1217
  F1 Score: 0.9566
  Accuracy: 0.9573


Epoch (eval) 2 / 4: 100%|| 47/47 [00:12<00:00,  3.64it/s]



  Avg Loss: 0.2765
  F1 Score: 0.9101
  Accuracy: 0.9133


Epoch (train) 3 / 4: 100%|| 422/422 [05:28<00:00,  1.28it/s]



  Avg Loss: 0.0652
  F1 Score: 0.9786
  Accuracy: 0.9790


Epoch (eval) 3 / 4: 100%|| 47/47 [00:12<00:00,  3.63it/s]



  Avg Loss: 0.2728
  F1 Score: 0.9180
  Accuracy: 0.9200


Epoch (train) 4 / 4: 100%|| 422/422 [05:28<00:00,  1.28it/s]



  Avg Loss: 0.0427
  F1 Score: 0.9858
  Accuracy: 0.9861


Epoch (eval) 4 / 4: 100%|| 47/47 [00:12<00:00,  3.63it/s]


  Avg Loss: 0.3079
  F1 Score: 0.9153
  Accuracy: 0.9173





#### LoRA BERT-base

LoRA rank r = 8, scaling factor lora_alpha = 16

In [9]:
# Reload a fresh pretrained model: `bert_base` is rebound here, so the fully
# fine-tuned model from the previous section is no longer reachable by name.
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base = MyBert.from_pretrained(
    model_type='bert-base-uncased',
    config_args={
        "vocab_size": 30522, 'n_classes': 2, "max_seq_len": 256
    },
    adaptive_weight_copy=True,
)
# Insert LoRA adapters first, then freeze.
# NOTE(review): freeze_model presumably leaves only the LoRA weights and the
# classifier head trainable - confirm in lora.py.
add_lora_layers(bert_base, r=8, lora_alpha=16)
freeze_model(bert_base)
# Fall back to CPU when no GPU is present instead of crashing on a hard-coded
# 'cuda'; this mirrors the device selection done inside BertTrainer.
bert_base.to('cuda' if torch.cuda.is_available() else 'cpu')

Loading weights from pretrained model: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tally parameter counts in one sweep: every tensor contributes to the total,
# and only tensors with requires_grad=True contribute to the trainable count.
param_sizes = [
    (param.numel(), param.requires_grad)
    for _, param in bert_base.named_parameters()
]
n_params = sum(size for size, _ in param_sizes)
n_trainable_params = sum(size for size, trainable in param_sizes if trainable)

print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {round(n_trainable_params / n_params * 100, 2)}%")

Total parameters: 109582082
Trainable parameters: 296450
Percentage trainable: 0.27%


In [11]:
#bert base lora all r = 8
trainer_bert_base_lora = BertTrainer(
    bert_base,
    tokenizer_base,
    lr=5e-4,
    epochs=4,
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
)

trainer_bert_base_lora.train(evaluate=True)

Epoch (train) 1 / 4: 100%|| 422/422 [03:55<00:00,  1.79it/s]



  Avg Loss: 0.3114
  F1 Score: 0.8595
  Accuracy: 0.8628


Epoch (eval) 1 / 4: 100%|| 47/47 [00:13<00:00,  3.53it/s]



  Avg Loss: 0.2542
  F1 Score: 0.9038
  Accuracy: 0.9060


Epoch (train) 2 / 4: 100%|| 422/422 [03:55<00:00,  1.79it/s]



  Avg Loss: 0.1903
  F1 Score: 0.9244
  Accuracy: 0.9257


Epoch (eval) 2 / 4: 100%|| 47/47 [00:13<00:00,  3.53it/s]



  Avg Loss: 0.2213
  F1 Score: 0.9132
  Accuracy: 0.9147


Epoch (train) 3 / 4: 100%|| 422/422 [03:55<00:00,  1.79it/s]



  Avg Loss: 0.1202
  F1 Score: 0.9549
  Accuracy: 0.9557


Epoch (eval) 3 / 4: 100%|| 47/47 [00:13<00:00,  3.53it/s]



  Avg Loss: 0.2292
  F1 Score: 0.9249
  Accuracy: 0.9267


Epoch (train) 4 / 4: 100%|| 422/422 [03:56<00:00,  1.79it/s]



  Avg Loss: 0.0715
  F1 Score: 0.9776
  Accuracy: 0.9780


Epoch (eval) 4 / 4: 100%|| 47/47 [00:13<00:00,  3.52it/s]


  Avg Loss: 0.2846
  F1 Score: 0.9201
  Accuracy: 0.9220



