<a href="https://colab.research.google.com/github/rprimi/colB5BERT/blob/main/python/colB5BERT_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **colB5BERT:** Fine tuning colBERT with Big Five dataset

* Notebook inspirado no buscador denso de Leandro Carísio Fernandes

In [1]:
# Mount Google Drive at /content/drive; the dataset files read further down
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install transformers


In [3]:
!git clone https://github.com/rprimi/colB5BERT.git

%cd /content/colB5BERT
!git pull


Cloning into 'colB5BERT'...
remote: Enumerating objects: 254, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 254 (delta 43), reused 18 (delta 18), pack-reused 196[K
Receiving objects: 100% (254/254), 32.69 MiB | 11.17 MiB/s, done.
Resolving deltas: 100% (158/158), done.
/content/colB5BERT
Already up to date.


### Libraries

In [4]:
import sys
sys.path.append('/content/colB5BERT/python/')

import utils
import vsm
import sst

import os
import pandas as pd
import numpy as np
import textwrap
import pickle
import h5py
import logging

from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizer

import torch
from torch.nn.functional import cosine_similarity
from tqdm import tqdm

import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split


from transformers import AutoTokenizer
from torch.utils import data
from torch.utils.data import DataLoader
from transformers import BatchEncoding

import torch
from transformers import AutoModel

from transformers import Trainer, TrainingArguments
from transformers import get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, AdamW
from tqdm.auto import tqdm

In [5]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Jun 26 20:38:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
# Load the test and train pair datasets (tab-separated files) from Google Drive.
df_test = pd.read_csv('/content/drive/MyDrive/colB5BERT/dataset_test.tsv', sep='\t')
df_train = pd.read_csv('/content/drive/MyDrive/colB5BERT/dataset_train.tsv', sep='\t')

# Only the last expression of a cell is displayed, so report both sizes explicitly.
print(f'test rows: {len(df_test)} | train rows: {len(df_train)}')

# Sample data to test the pipeline.
# A fixed random_state makes the 8% subsample identical on every run.
df_test = df_test.sample(frac=0.08, random_state=42)
df_train = df_train.sample(frac=0.08, random_state=42)

302846

In [12]:
# Columns extracted from each split, in order: query text, document text and
# the positive-example indicator.
_pair_columns = ('item_pt_text', 'texto_dividido', 'postive_ex')

queries_train, docs_train, positive_ex_train = (
    df_train[column].tolist() for column in _pair_columns
)

# The validation lists are taken from the test dataframe.
queries_val, docs_val, positive_ex_val = (
    df_test[column].tolist() for column in _pair_columns
)

type(queries_val)

list

## Fine-tuning dos encoders

Define os datasets e dataloaders:

In [14]:
# Hyperparameters used by the cells below.
# max_length: default max_seq_length of the Dataset class (forwarded to the tokenizer).
max_length = 512
# batch_size: batch size given to every DataLoader.
batch_size = 50
# epochs: number of iterations of the outer training loop.
epochs = 20
# lr: learning rate given to both AdamW optimizers.
lr = 2e-5


In [15]:
# Dataset definition
class Dataset(data.Dataset):
    """Tokenizes texts on demand and pairs each one with its label.

    Args:
        tokenizer: callable applied to a single text with the ``padding``,
            ``truncation`` and ``max_length`` keyword arguments.
        textos: indexable collection of texts.
        labels: indexable collection of labels, aligned with ``textos``.
        max_seq_length: value forwarded to the tokenizer as ``max_length``.
    """

    # Receives one vector of texts and one vector of labels
    def __init__(self, tokenizer, textos, labels, max_seq_length = max_length):
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.textos = textos
        self.labels = labels
        # Maps str(idx) -> (tokenized text, label) for the items already requested.
        self.cache = {}

    def __len__(self):
        """Return the number of texts."""
        return len(self.textos)

    def __getitem__(self, idx):
        """Return ``(tokenized text, label)`` for ``idx``, tokenizing each item at most once."""
        key = str(idx)
        # An explicit membership test is needed: passing the tokenizer call as the
        # default of dict.get() evaluates it on every access, even on a cache hit.
        if key not in self.cache:
            encoded = self.tokenizer(self.textos[idx],
                                     padding=True,
                                     truncation=True,
                                     max_length=self.max_seq_length)
            self.cache[key] = (encoded, self.labels[idx])
        return self.cache[key]


In [16]:
# Create datasets
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
bert_weights_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(bert_weights_name)

# Queries and documents get separate datasets that share the same label list,
# so item i of the query dataset and item i of the document dataset form a pair.
dataset_queries_train, dataset_docs_train = (
    Dataset(tokenizer, textos, positive_ex_train) for textos in (queries_train, docs_train)
)
dataset_queries_val, dataset_docs_val = (
    Dataset(tokenizer, textos, positive_ex_val) for textos in (queries_val, docs_val)
)


# Dataloaders para os datasets

#collate_fn = lambda batch: BatchEncoding(tokenizer.pad(batch, return_tensors='pt'))
#def collate_fn(batch):
#    #print('Dentro de collate_fn')
#    #print(BatchEncoding(tokenizer.pad(batch, return_tensors='pt')))
#    return BatchEncoding(tokenizer.pad(batch, return_tensors='pt'))

# collate function that also handles the labels
def collate_fn(batch):
    """Merge a list of ``(encoding, label)`` items into one padded batch.

    Args:
        batch: sequence of items whose position 0 is a tokenized text and whose
            position 1 is its label.

    Returns:
        Tuple ``(BatchEncoding, labels)``: the encodings padded by the
        module-level ``tokenizer`` and returned as PyTorch tensors, and the
        labels gathered into a single tensor.
    """
    encodings = []
    targets = []
    for item in batch:
        encodings.append(item[0])
        targets.append(item[1])
    padded = tokenizer.pad(encodings, return_tensors='pt')
    return BatchEncoding(padded), torch.tensor(targets)


# Create dataloaders
# The query and document loaders are consumed in lockstep (zipped together), so
# all of them keep the dataset order (shuffle=False): batch i of the queries
# then lines up with batch i of the documents.
_loader_options = dict(batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

dataloader_queries_train = DataLoader(dataset_queries_train, **_loader_options)
dataloader_docs_train = DataLoader(dataset_docs_train, **_loader_options)

dataloader_queries_val = DataLoader(dataset_queries_val, **_loader_options)
dataloader_docs_val = DataLoader(dataset_docs_val, **_loader_options)



# Adapt your DataLoader object
#class PositiveExDataLoader:
#    def __init__(self, dataloader, positive_ex):
#        self.dataloader = dataloader
#        self.positive_ex = positive_ex
#
#    def __iter__(self):
#        for batch, pos_ex in zip(self.dataloader, self.positive_ex):
#            yield batch, pos_ex
#    def __len__(self):
#        return len(self.dataloader)
## Wrap your original dataloaders
#dataloader_queries_train = PositiveExDataLoader(dataloader_queries_train, positive_ex_train)
#dataloader_docs_train = PositiveExDataLoader(dataloader_docs_train, positive_ex_train)
#
#dataloader_queries_val = PositiveExDataLoader(dataloader_queries_val, positive_ex_val)
#dataloader_docs_val = PositiveExDataLoader(dataloader_docs_val, positive_ex_val)


Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Carrega os modelos:

In [17]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Two separate encoders are loaded from the same pretrained checkpoint:
# one for the queries and one for the documents.
model_query = AutoModel.from_pretrained(bert_weights_name)
model_query = model_query.to(device)
model_doc = AutoModel.from_pretrained(bert_weights_name)
model_doc = model_doc.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predicti

Define a função para o cálculo da loss (modifiquei essa função para trabalhar com um vetor indicando quais pares são relevantes e quais não são):

In [20]:
# This function already works on a whole batch:
def compute_loss_com_gradiente(model_query, tokenized_queries, model_doc, tokenized_docs, positive_ex):
    """Contrastive loss between the [CLS] vectors of aligned query/document batches.

    Query i and document i form a pair. Every query is scored against every
    document of the batch with a dot product, and for each positive pair the
    loss is ``-log(exp(sim[i, i]) / sum_j exp(sim[i, j]))``, where ``j`` runs
    over the documents whose ``positive_ex`` entry is non-zero. The computation
    is done in log space so large similarities do not overflow.

    Args:
        model_query: encoder called with the query inputs as keyword arguments;
            its output must expose ``last_hidden_state``.
        tokenized_queries: mapping of input name -> tensor for the queries.
        model_doc: encoder for the documents (same contract as ``model_query``).
        tokenized_docs: mapping of input name -> tensor for the documents.
        positive_ex: 1-D tensor with one entry per pair; a non-zero entry marks
            a positive pair (assumed to be a 0/1 indicator).

    Returns:
        Scalar tensor with the mean loss over the positive pairs. If the batch
        has no positive pair the result is a zero that is still attached to the
        graph, so ``backward()`` remains valid.
    """
    # Inputs are moved to the device each encoder lives on.
    device_query = next(model_query.parameters()).device
    device_doc = next(model_doc.parameters()).device

    outputs_queries = model_query(**{k: v.to(device_query) for k, v in tokenized_queries.items()})
    outputs_docs = model_doc(**{k: v.to(device_doc) for k, v in tokenized_docs.items()})

    # Last hidden layer at position 0, i.e. the [CLS] token
    tcls_queries = outputs_queries.last_hidden_state[:, 0, :]
    tcls_docs = outputs_docs.last_hidden_state[:, 0, :]

    # sim(q, d) for every query/document combination of the batch
    similaridade = torch.matmul(tcls_queries, torch.transpose(tcls_docs, 0, 1))

    # We are considering only the positive examples (where positive_ex is non-zero)
    positivos = positive_ex.to(similaridade.device) != 0
    if not bool(positivos.any()):
        return similaridade.sum() * 0.0

    # Documents of non-positive pairs are removed from the denominator; a fill
    # value of -inf contributes exp(-inf) = 0 to the sum.
    sim_mascarada = similaridade.masked_fill(~positivos.unsqueeze(0), float('-inf'))
    log_soma_linhas = torch.logsumexp(sim_mascarada, dim=1)

    # -log(exp(s_ii) / sum_j exp(s_ij)) == logsumexp_j(s_ij) - s_ii
    log_loss = log_soma_linhas - torch.diagonal(similaridade)

    # Rows of non-positive pairs have no valid numerator, so they are left out of the mean.
    return log_loss[positivos].mean()

def compute_loss_sem_gradiente(model_query, tokenized_queries, model_doc, tokenized_docs, positive_ex):
    """Evaluate ``compute_loss_com_gradiente`` with gradient tracking disabled.

    Takes the same arguments and returns the same value as
    ``compute_loss_com_gradiente``; the call simply runs inside ``torch.no_grad()``.
    """
    argumentos = (model_query, tokenized_queries, model_doc, tokenized_docs, positive_ex)
    with torch.no_grad():
        loss = compute_loss_com_gradiente(*argumentos)
    return loss

def compute_loss_dataloaders(model_query, dataloader_query, model_doc, dataloader_docs):
    """Average the gradient-free batch loss over two aligned dataloaders.

    Both dataloaders are iterated together, so batch i of the queries is scored
    against batch i of the documents; iteration stops at the shorter of the two.

    Args:
        model_query: encoder used for the query batches.
        dataloader_query: iterable yielding ``(tokenized queries, positive_ex)``.
        model_doc: encoder used for the document batches.
        dataloader_docs: iterable yielding ``(tokenized docs, positive_ex)``.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: if no batch is produced, since the mean is undefined then.
    """
    loss = 0
    n_batches = 0
    # Only the labels that come with the query batch are used. They are not moved
    # to a device here because compute_loss_com_gradiente does that itself.
    for (batch_query, positive_ex_query), (batch_docs, _) in zip(dataloader_query, dataloader_docs):
        loss = loss + compute_loss_sem_gradiente(model_query, batch_query, model_doc, batch_docs, positive_ex_query)
        n_batches += 1
    if n_batches == 0:
        raise ValueError('compute_loss_dataloaders received dataloaders that produced no batches')
    return loss/n_batches

In [None]:
%%time
# Só pra medir o tempo que ele demora para calcular a loss em todo o dataset de treinamento
model_query.eval()
model_doc.eval()
print(f'Loss de treinamento: {compute_loss_dataloaders(model_query, dataloader_queries_train, model_doc, dataloader_docs_train)}')
print(f'Loss de validação: {compute_loss_dataloaders(model_query, dataloader_queries_val, model_doc, dataloader_docs_val)}')

Agora treina os dois encoders simultaneamente:

In [69]:
%%time

treinar_e_salvar_modelos  = True
# Diretório onde vai salvando o modelo a cada época
dir_modelos = '/content/drive/MyDrive/colB5BERT'


if treinar_e_salvar_modelos:
  # Training loop
  optimizer_query = AdamW(model_query.parameters(), lr=lr)
  optimizer_doc = AdamW(model_doc.parameters(), lr=lr)

  num_training_steps = epochs * len(dataloader_queries_train)
  num_warmup_steps = int(num_training_steps * 0.1)

  # get_linear_schedule_with_warmup get_cosine_with_hard_restarts_schedule_with_warmup
  scheduler_query = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer_query, num_warmup_steps, num_training_steps)
  scheduler_doc = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer_doc, num_warmup_steps, num_training_steps)

  for epoch in tqdm(range(epochs), desc='Epochs'):
      model_query.train()
      model_doc.train()

      train_losses = []
      for (batch_query, positive_ex_query), (batch_docs, positive_ex_docs) in tqdm(list(zip(dataloader_queries_train, dataloader_docs_train)), mininterval=0.5, desc='Train', disable=False):
        optimizer_query.zero_grad()
        optimizer_doc.zero_grad()

        # Ensure positive_ex_query and positive_ex_docs are tensors and on the same device as your models
        batch_query = batch_query.to(device)
        batch_docs = batch_docs.to(device)
        positive_ex_query = torch.tensor(positive_ex_query).to(device)
        positive_ex_docs = torch.tensor(positive_ex_docs).to(device)

        loss = compute_loss_com_gradiente(model_query, batch_query, model_doc, batch_docs, positive_ex_query, positive_ex_docs)
        loss.backward()

        optimizer_query.step()
        optimizer_doc.step()

        scheduler_query.step()
        scheduler_doc.step()


      model_query.save_pretrained(f'{dir_modelos}{epoch}/query/')
      model_doc.save_pretrained(f'{dir_modelos}{epoch}/doc/')

      model_query.eval()
      model_doc.eval()

      print(f'Loss de treinamento {epoch}: {compute_loss_dataloaders(model_query, dataloader_queries_train, model_doc, dataloader_docs_train)}')
      print(f'Loss de validação {epoch}: {compute_loss_dataloaders(model_query, dataloader_queries_val, model_doc, dataloader_docs_val)}')