<a href="https://colab.research.google.com/github/rafavidal1709/projeto-aplicado-iii/blob/main/01%20-%20Modelos%20de%20embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Carregando o dataset de avaliação

Carregue abaixo o arquivo 'dataset_original.json' para que iniciemos o tratamento dos dados e façamos o _embedding_ dos textos.

In [16]:
import json
from google.colab import files

# Load the dataset
def upload_dataset():
  """Ask for a file through the Colab upload widget and parse it as JSON.

  Only the first uploaded file is read; any further files are ignored.

  Returns:
    The Python object decoded from the file's UTF-8 encoded JSON content.

  Raises:
    ValueError: if the upload finished without returning any file.
  """
  uploaded = files.upload()
  if not uploaded:
    # Previously this case surfaced as an opaque IndexError from `list(...)[0]`.
    raise ValueError("No file was uploaded; please select 'dataset_original.json'.")
  first_name = next(iter(uploaded))
  return json.loads(uploaded[first_name].decode('utf-8'))

df = upload_dataset()

# Reshape the original dataset into the layout used by the rest of the notebook
def format_dataset(data):
  """Flatten category groups into parallel text/category lists.

  Args:
    data: sequence of mappings, each with an 'examples' iterable and a
      'category' value.

  Returns:
    A dict with four keys: 'text' (every example, in input order),
    'category' (the category of the group each example came from, aligned
    index-by-index with 'text'), and two empty dicts, 'embedding' and
    'accuracy'.
  """
  # One (example, category) pair per example; the category is looked up per
  # example, so a group without examples never has its 'category' read.
  pairs = [
    (example, group['category'])
    for group in data
    for example in group['examples']
  ]
  return {
    'text': [example for example, _ in pairs],
    'category': [category for _, category in pairs],
    'embedding': {},
    'accuracy': {},
  }

# Flatten each split of the uploaded file ('base' first, then 'real').
dataset_base, dataset_real = format_dataset(df['base']), format_dataset(df['real'])

Saving dataset_original.json to dataset_original (4).json


# Embedding com Longformer

In [17]:
import torch
from transformers import LongformerTokenizer, LongformerModel

class LongformerEmbedding:
    """Text embeddings computed with ``allenai/longformer-large-4096``.

    The model is placed on the GPU when one is available and on the CPU
    otherwise; every CUDA-only call is guarded so the CPU path also works.
    """

    def __init__(self):
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cuda":
            torch.cuda.empty_cache()  # Release cached GPU memory before loading the model
        # Load the Longformer-large tokenizer and model.
        self.tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096")
        self.model = LongformerModel.from_pretrained("allenai/longformer-large-4096").to(self.device)
        # Inference only: gradient checkpointing (previously enabled here) trades
        # compute for memory during the backward pass, which never runs under
        # `torch.no_grad()`, so the model is simply put in eval mode instead.
        self.model.eval()

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average the token vectors along dim 1, skipping positions whose mask is 0."""
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp avoids a division by zero if a row has no attended token at all.
        token_count = mask.sum(dim=1).clamp(min=1)
        return (hidden_states * mask).sum(dim=1) / token_count

    def process_text(self, text, max_length=2048):
        """Embed ``text`` and return two pooled representations.

        Args:
            text: the text handed to the tokenizer.
            max_length: length the input is truncated/padded to (default 2048,
                the value previously hard-coded here).

        Returns:
            A dict of CPU tensors with one row per input text:
            ``"longformer_global_attention"`` is the last hidden state of the
            first token (the only token given global attention) and
            ``"longformer_output_mean"`` is the mean of the last hidden states
            over the non-padding tokens.
        """
        with torch.no_grad():  # No gradients are needed for inference
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    padding="max_length", max_length=max_length)

            input_ids = inputs["input_ids"].to(self.device)
            # Use the tokenizer's mask (0 on padding). An all-ones mask would let
            # the model attend to the padding added by `padding="max_length"`.
            attention_mask = inputs["attention_mask"].to(self.device)

            # Global attention on the first token only (position 0).
            global_attention_mask = torch.zeros_like(input_ids)
            global_attention_mask[:, 0] = 1

            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask,
                                 global_attention_mask=global_attention_mask)
            hidden_states = outputs.last_hidden_state

            # Embedding of the first token (the one with global attention).
            global_attention_embedding = hidden_states[:, 0, :].cpu()

            # Mean over real tokens only; padding positions must not dilute it.
            output_mean_embedding = self._masked_mean(hidden_states, attention_mask).cpu()

            # Drop references to the device tensors before releasing cached memory.
            del inputs, input_ids, attention_mask, global_attention_mask, outputs, hidden_states
            if self.device.type == "cuda":
                # These calls are CUDA-only; synchronize() raises without a GPU.
                torch.cuda.synchronize()
                torch.cuda.empty_cache()

            return {"longformer_global_attention": global_attention_embedding, "longformer_output_mean": output_mean_embedding}

longformer_embedding = LongformerEmbedding()

# Embed every text of the "base" dataset first, then of the "real" dataset.
# Each text index maps to a dict of {embedding name: list of floats}.
for target in (dataset_base, dataset_real):
    for idx, text in enumerate(target['text']):
        vectors = longformer_embedding.process_text(text)
        # Reuse the per-text entry if one already exists for this index.
        slot = target['embedding'].setdefault(idx, {})
        for name, tensor in vectors.items():
            # Keep only the first row of the returned tensor, as a plain list.
            slot[name] = tensor.numpy().tolist()[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/803 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

# Embedding com BERT

In [18]:
import torch
from transformers import BertTokenizer, BertModel

class BertEmbedding:
    """Text embeddings computed with ``bert-large-uncased``.

    The model is placed on the GPU when one is available and on the CPU
    otherwise; every CUDA-only call is guarded so the CPU path also works.
    """

    def __init__(self):
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cuda":
            torch.cuda.empty_cache()  # Release cached GPU memory before loading the model
        # Load the BERT-large tokenizer and model.
        self.tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
        self.model = BertModel.from_pretrained("bert-large-uncased").to(self.device)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average the token vectors along dim 1, skipping positions whose mask is 0."""
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp avoids a division by zero if a row has no attended token at all.
        token_count = mask.sum(dim=1).clamp(min=1)
        return (hidden_states * mask).sum(dim=1) / token_count

    def process_text(self, text, max_length=512):
        """Embed ``text`` and return two pooled representations.

        Args:
            text: the text handed to the tokenizer.
            max_length: length the input is truncated/padded to (default 512,
                the value previously hard-coded here).

        Returns:
            A dict of CPU tensors with one row per input text:
            ``"bert_pooler_output"`` is the model's ``pooler_output`` and
            ``"bert_output_mean"`` is the mean of the last hidden states over
            the non-padding tokens.
        """
        with torch.no_grad():  # No gradients are needed for inference
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    padding="max_length", max_length=max_length)

            # Move every tokenizer output to the model's device.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}

            outputs = self.model(**inputs)

            # Pooler output: the model's own summary vector for the text.
            pooler_output = outputs.pooler_output.cpu()

            # Mean over real tokens only: the input is padded to `max_length`,
            # and averaging the padding positions too would dilute the result.
            output_mean_embedding = self._masked_mean(
                outputs.last_hidden_state, inputs["attention_mask"]
            ).cpu()

            # Drop references to the device tensors before releasing cached memory.
            del inputs, outputs
            if self.device.type == "cuda":
                # These calls are CUDA-only; synchronize() raises without a GPU.
                torch.cuda.synchronize()
                torch.cuda.empty_cache()

            return {"bert_pooler_output": pooler_output, "bert_output_mean": output_mean_embedding}

bert_embedding = BertEmbedding()

# Embed every text of the "base" dataset first, then of the "real" dataset.
# Each text index maps to a dict of {embedding name: list of floats}.
for target in (dataset_base, dataset_real):
    for idx, text in enumerate(target['text']):
        vectors = bert_embedding.process_text(text)
        # Reuse the per-text entry if one already exists for this index.
        slot = target['embedding'].setdefault(idx, {})
        for name, tensor in vectors.items():
            # Keep only the first row of the returned tensor, as a plain list.
            slot[name] = tensor.numpy().tolist()[0]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

# Embedding com BERTimbau

In [19]:
import torch
from transformers import BertTokenizer, BertModel

class BertimbauEmbedding:
    """Text embeddings computed with ``neuralmind/bert-large-portuguese-cased``.

    The model is placed on the GPU when one is available and on the CPU
    otherwise; every CUDA-only call is guarded so the CPU path also works.
    """

    def __init__(self):
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cuda":
            torch.cuda.empty_cache()  # Release cached GPU memory before loading the model
        # Load the BERTimbau-large tokenizer and model.
        self.tokenizer = BertTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")
        self.model = BertModel.from_pretrained("neuralmind/bert-large-portuguese-cased").to(self.device)

    @staticmethod
    def _masked_mean(hidden_states, attention_mask):
        """Average the token vectors along dim 1, skipping positions whose mask is 0."""
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        # clamp avoids a division by zero if a row has no attended token at all.
        token_count = mask.sum(dim=1).clamp(min=1)
        return (hidden_states * mask).sum(dim=1) / token_count

    def process_text(self, text, max_length=512):
        """Embed ``text`` and return two pooled representations.

        Args:
            text: the text handed to the tokenizer.
            max_length: length the input is truncated/padded to (default 512,
                the value previously hard-coded here).

        Returns:
            A dict of CPU tensors with one row per input text:
            ``"bertimbau_pooler_output"`` is the model's ``pooler_output`` and
            ``"bertimbau_output_mean"`` is the mean of the last hidden states
            over the non-padding tokens.
        """
        with torch.no_grad():  # No gradients are needed for inference
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                    padding="max_length", max_length=max_length)

            # Move every tokenizer output to the model's device.
            inputs = {key: value.to(self.device) for key, value in inputs.items()}

            outputs = self.model(**inputs)

            # Pooler output: the model's own summary vector for the text.
            pooler_output = outputs.pooler_output.cpu()

            # Mean over real tokens only: the input is padded to `max_length`,
            # and averaging the padding positions too would dilute the result.
            output_mean_embedding = self._masked_mean(
                outputs.last_hidden_state, inputs["attention_mask"]
            ).cpu()

            # Drop references to the device tensors before releasing cached memory.
            del inputs, outputs
            if self.device.type == "cuda":
                # These calls are CUDA-only; synchronize() raises without a GPU.
                torch.cuda.synchronize()
                torch.cuda.empty_cache()

            return {"bertimbau_pooler_output": pooler_output, "bertimbau_output_mean": output_mean_embedding}

bertimbau_embedding = BertimbauEmbedding()

# Embed every text of the "base" dataset first, then of the "real" dataset.
# Each text index maps to a dict of {embedding name: list of floats}.
for target in (dataset_base, dataset_real):
    for idx, text in enumerate(target['text']):
        vectors = bertimbau_embedding.process_text(text)
        # Reuse the per-text entry if one already exists for this index.
        slot = target['embedding'].setdefault(idx, {})
        for name, tensor in vectors.items():
            # Keep only the first row of the returned tensor, as a plain list.
            slot[name] = tensor.numpy().tolist()[0]

tokenizer_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

# Salvando o dataset

In [20]:
# Bundle both datasets under the same top-level keys as the uploaded file.
df = {'base': dataset_base, 'real': dataset_real}
# Write through a context manager so the file is flushed and closed before
# the download of that same path is triggered.
with open('dataset_embeddings.json', 'w') as output_file:
    json.dump(df, output_file)
files.download('dataset_embeddings.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>