## Criando modelo

In [2]:
import torch
import torch.nn as nn
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import torch.nn.functional as F
import numpy as np

In [3]:
import torch

def align_deeppavlov_to_mbart(deeppavlov_labels, source_tokens, target_tokens):
    """
    Alinha as labels do deeppavlov dos texto de referência, com as labels geradas pelo modelo MBartforConditionalGeneration para cálculo do loss.
    Função necessária pois os tokenizadores do MBartforConditionalGeneration e do DeepPavlov são diferentes.
    """
    aligned_labels = []
    source_idx = 0  # Percorre os tokens gerados pelo DeepPavlov

    for i, target_token in enumerate(target_tokens):
        # Tokens de Exceção (associa ao label de não-NE)
        if target_token in ["en_XX", "de_DE", "ar_AR", "es_XX", "ja_XX", "fr_XX", "it_IT", "</s>", "▁", ",", ".", ";", ":", "!", "?", "<pad>"]:  # Add other special tokens as necessary
            aligned_labels.append(0)
            continue

        #alinhamento
        if source_idx < len(deeppavlov_labels):
            aligned_labels.append(deeppavlov_labels[source_idx])

        # Checa se o próximo token inicia uma nova palavra
        if i + 1 < len(target_tokens):
            next_target_token = target_tokens[i + 1]
            if next_target_token.startswith("▁") and source_idx < len(deeppavlov_labels) - 1:
                source_idx += 1

    return torch.tensor(aligned_labels, dtype=torch.long)



77


In [4]:
relu = nn.ReLU()
softmax = nn.Softmax(dim=-1)

In [5]:
class MBart_NER_Translation(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.MBart = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        self.tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        hidden_size = 1024
        num_ne_classes = 37 # (18 classes com I-B, e o O)
        self.W_src_NE = nn.Linear(hidden_size, hidden_size)
        self.E_s_ne = nn.Linear(hidden_size, num_ne_classes)
        self.W_tgt_NE = nn.Linear(hidden_size, hidden_size)
        self.E_t_ne = nn.Linear(hidden_size, num_ne_classes)
        self.device = device

        #Congel camada de embeddings
        for param in self.MBart.model.shared.parameters():
          param.requires_grad = False

        #Congela primeiras 8 camadas do Encoder
        for layer in self.MBart.model.encoder.layers[:8]:
          for param in layer.parameters():
            param.requires_grad = False

        #Congela primeiras 8 camadas do Decoder
        for layer in self.MBart.model.decoder.layers[:8]:
          for param in layer.parameters():
            param.requires_grad = False

    def forward(self, input_texts, source_lang="en_XX", target_lang="fr_XX"):
      self.tokenizer.src_lang = source_lang
      self.tokenizer.tgt_lang = target_lang

      # Tokenização em batch
      inputs = self.tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(self.device)
      input_ids = inputs.input_ids
      attention_mask = inputs.attention_mask

      # Encoder
      encoder_outputs = self.MBart.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
      H_src = encoder_outputs.last_hidden_state

      # NER no lado da fonte
      H_ne = F.relu(self.W_src_NE(H_src))
      X_ne = F.softmax(self.E_s_ne(H_ne), dim=-1)
      # Tradução
      forced_bos_token_id = self.tokenizer.lang_code_to_id[target_lang]
      translated_outputs = self.MBart.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          max_length=50,
          num_beams=5,
          early_stopping=True,
          forced_bos_token_id=forced_bos_token_id
      )

      # Decodificação batch
      translated_texts = self.tokenizer.batch_decode(translated_outputs, skip_special_tokens=True)

      # Converter ids para tokens em batch
      source_mbart_tokens = [self.tokenizer.convert_ids_to_tokens(seq.tolist()) for seq in input_ids]
      target_mbart_tokens = [self.tokenizer.convert_ids_to_tokens(seq.tolist()) for seq in translated_outputs]

      # Decoder para NER no lado destino
      decoder_outputs = self.MBart.model.decoder(
          input_ids=translated_outputs,
          encoder_hidden_states=H_src + H_ne,  # Combinação do encoder com info de NE
          encoder_attention_mask=attention_mask
      )

      H_t_tgt = decoder_outputs.last_hidden_state
      Y_ne = F.softmax(self.E_t_ne(F.relu(self.W_tgt_NE(H_t_tgt))), dim=-1)

      #saídas: labels de NER do input, labels de NER do output, texto traduzido, tokens do input, tokens do output, ids do input
      return X_ne, Y_ne, translated_texts, source_mbart_tokens, target_mbart_tokens, input_ids


In [None]:
import pandas as pd
import html
import ast

# Carregamento dos datasets de Alemão
df_train = pd.read_csv("./en_de_train_ner.csv", on_bad_lines='skip')
df_valid = pd.read_csv("./en_de_valid_ner.csv", on_bad_lines='skip')

# Parsing do ner para array
def parse_ner_list(ner_str):
    if pd.isna(ner_str) or ner_str == '':
        return []
    try:
        return ast.literal_eval(ner_str)
    except (SyntaxError, ValueError):
        return []

df_train['source_text_ner'] = df_train['source_text_ner'].apply(parse_ner_list)
df_train['target_text_ner'] = df_train['target_text_ner'].apply(parse_ner_list)

df_valid['source_text_ner'] = df_valid['source_text_ner'].apply(parse_ner_list)
df_valid['target_text_ner'] = df_valid['target_text_ner'].apply(parse_ner_list)

df_train.head()

Unnamed: 0,source_text,target_text,source_text_ner,target_text_ner,source_text_ner_tokens,target_text_ner_tokens
0,"and it can be a very complicated thing , what ...","und was menschliche gesundheit ist , kann auch...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O]","['and', 'it', 'can', 'be', 'a', 'very', 'compl...","['und', 'was', 'menschliche', 'gesundheit', 'i..."
1,and bringing those two together might seem a v...,"und diese zwei zusammen zu bringen , erscheint...","[O, O, O, B-CARDINAL, O, O, O, O, O, O, O, O, ...","[O, O, B-CARDINAL, O, O, O, O, O, O, O, O, O, ...","['and', 'bringing', 'those', 'two', 'together'...","['und', 'diese', 'zwei', 'zusammen', 'zu', 'br..."
2,and those simple themes aren 't really themes ...,und diese einfachen themen sind eigentlich kei...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","['and', 'those', 'simple', 'themes', 'aren', ""...","['und', 'diese', 'einfachen', 'themen', 'sind'..."
3,and i 'm going to start with this one : if mom...,und ich werde mit dieser hier anfangen : wenn ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","['and', 'i', ""'m"", 'going', 'to', 'start', 'wi...","['und', 'ich', 'werde', 'mit', 'dieser', 'hier..."
4,"we know that , right ? we 've experienced that .",kennen wir das nicht alle ? das haben wir alle...,"[O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","['we', 'know', 'that', ',', 'right', '?', 'we'...","['kennen', 'wir', 'das', 'nicht', 'alle', '?',..."


In [8]:
# mapeando as classes de entidades nomeadas
entity_to_idx = {
    "O": 0,
    "B-PERSON": 1,
    "I-PERSON": 2,
    "B-NORP": 3,
    "I-NORP": 4,
    "B-FACILITY": 5,
    "I-FACILITY": 6,
    "B-ORGANIZATION": 7,
    "I-ORGANIZATION": 8,
    "B-GPE": 9,
    "I-GPE": 10,
    "B-LOCATION": 11,
    "I-LOCATION": 12,
    "B-PRODUCT": 13,
    "I-PRODUCT": 14,
    "B-EVENT": 15,
    "I-EVENT": 16,
    "B-WORK_OF_ART": 17,
    "I-WORK_OF_ART": 18,
    "B-LAW": 19,
    "I-LAW": 20,
    "B-LANGUAGE": 21,
    "I-LANGUAGE": 22,
    "B-DATE": 23,
    "I-DATE": 24,
    "B-TIME": 25,
    "I-TIME": 26,
    "B-PERCENT": 27,
    "I-PERCENT": 28,
    "B-MONEY": 29,
    "I-MONEY": 30,
    "B-QUANTITY": 31,
    "I-QUANTITY": 32,
    "B-ORDINAL": 33,
    "I-ORDINAL": 34,
    "B-CARDINAL": 35,
    "I-CARDINAL": 36
}

def convert_labels_to_indices(label_list):
    return [entity_to_idx.get(label) for label in label_list]  # Default to 0 ('O') if label not found


df_train['source_ner_indices'] = df_train['source_text_ner'].apply(convert_labels_to_indices)
df_train['target_ner_indices'] = df_train['target_text_ner'].apply(convert_labels_to_indices)

df_valid['source_ner_indices'] = df_valid['source_text_ner'].apply(convert_labels_to_indices)
df_valid['target_ner_indices'] = df_valid['target_text_ner'].apply(convert_labels_to_indices)

NameError: name 'df_train' is not defined

In [None]:
#Criando a classe Train dataset para usar no DataLoader

class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, input_texts, target_texts, ne_labels_src, ne_labels_tgt, source_lang, target_lang, source_text_ner_tokens, target_text_ner_tokens):
        self.input_texts = input_texts
        self.target_texts = target_texts
        self.ne_labels_src = ne_labels_src
        self.ne_labels_tgt = ne_labels_tgt
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.source_text_ner_tokens = source_text_ner_tokens #adicionado para o alinhamento
        self.target_text_ner_tokens = target_text_ner_tokens #adicionado para o alinhamento

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
      return {
            "input_text": self.input_texts[idx],
            "target_text": self.target_texts[idx],
            "ne_labels_src": self.ne_labels_src[idx],
            "ne_labels_tgt": self.ne_labels_tgt[idx],
            "source_lang": self.source_lang,
            "target_lang": self.target_lang,
            "target_text_ner_tokens": self.target_text_ner_tokens[idx], #adicionado para o alinhamento
            "source_text_ner_tokens": self.source_text_ner_tokens[idx]  #adicionado para o alinhamento
        }

In [None]:

# Substituir os valores None por 0 na coluna 'source_text_ner_tokens'
df_train['source_ner_indices'] = df_train['source_ner_indices'].apply(
    lambda tokens: [label if label is not None else 0 for label in tokens]
)

# Substituir os valores None por 0 na coluna 'target_text_ner_tokens'
df_train['target_ner_indices'] = df_train['target_ner_indices'].apply(
    lambda tokens: [label if label is not None else 0 for label in tokens]
)

In [None]:
train_dataset = TrainDataset(
    input_texts=df_train['source_text'].tolist(),
    target_texts=df_train['target_text'].tolist(),
    ne_labels_src=df_train['source_ner_indices'].tolist(),
    ne_labels_tgt=df_train['target_ner_indices'].tolist(),
    source_text_ner_tokens = df_train['source_text_ner_tokens'].tolist(), #adicionado para o alinhamento
    target_text_ner_tokens = df_train['target_text_ner_tokens'].tolist(), #adicionado para o alinhamento
    source_lang="en_XX",
    target_lang="de_DE"
)

In [None]:
from torch import optim
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Criação do modelo e passagem pro device
model = MBart_NER_Translation(device).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

## Treiando com dados do artigo

In [None]:
import torch

import torch

def my_collate_fn(batch):

    # Extração de colunas dos itens do batch
    input_texts = [item['input_text'] for item in batch]
    target_texts = [item['target_text'] for item in batch]
    ne_labels_src_list = [item['ne_labels_src'] for item in batch]
    ne_labels_tgt_list = [item['ne_labels_tgt'] for item in batch]
    source_text_ner_tokens_list = [item['source_text_ner_tokens'] for item in batch]
    target_text_ner_tokens_list = [item['target_text_ner_tokens'] for item in batch]
    source_langs = [item['source_lang'] for item in batch]
    target_langs = [item['target_lang'] for item in batch]


    #comprimento máximo
    max_src_len = max(len(lbls) for lbls in ne_labels_src_list)
    max_tgt_len = max(len(lbls) for lbls in ne_labels_tgt_list)


    padded_ne_labels_src = []
    for lbls in ne_labels_src_list:
        padded = lbls + [0] * (max_src_len - len(lbls))
        padded_ne_labels_src.append(padded)


    padded_ne_labels_tgt = []
    for lbls in ne_labels_tgt_list:
        padded = lbls + [0] * (max_tgt_len - len(lbls))
        padded_ne_labels_tgt.append(padded)

    #transforma em tensor
    padded_ne_labels_src = torch.tensor(padded_ne_labels_src, dtype=torch.long)
    padded_ne_labels_tgt = torch.tensor(padded_ne_labels_tgt, dtype=torch.long)


    def ensure_list_of_tokens(tokens):
        if isinstance(tokens, str):
            tokens = [tokens]
        return tokens

    #garante que é lista
    source_text_ner_tokens_list = [ensure_list_of_tokens(toks) for toks in source_text_ner_tokens_list]
    target_text_ner_tokens_list = [ensure_list_of_tokens(toks) for toks in target_text_ner_tokens_list]

    #comprimento máximo
    max_src_ner_len = max(len(toks) for toks in source_text_ner_tokens_list)
    max_tgt_ner_len = max(len(toks) for toks in target_text_ner_tokens_list)

    #Padding
    padded_source_text_ner_tokens = []
    for toks in source_text_ner_tokens_list:
        padded = toks + ["<PAD>"] * (max_src_ner_len - len(toks))
        padded_source_text_ner_tokens.append(padded)

    # Padding
    padded_target_text_ner_tokens = []
    for toks in target_text_ner_tokens_list:
        padded = toks + ["<PAD>"] * (max_tgt_ner_len - len(toks))
        padded_target_text_ner_tokens.append(padded)


    return {
        'input_text': input_texts,
        'target_text': target_texts,
        'ne_labels_src': padded_ne_labels_src,
        'ne_labels_tgt': padded_ne_labels_tgt,
        'source_text_ner_tokens': padded_source_text_ner_tokens,
        'target_text_ner_tokens': padded_target_text_ner_tokens,
        'source_lang': source_langs,
        'target_lang': target_langs
    }

from tqdm import tqdm
# Funções de Loss
loss_ner = nn.CrossEntropyLoss()
loss_translation = nn.CrossEntropyLoss(ignore_index=model.tokenizer.pad_token_id)

# Hiperparâmetros
alpha = 0.5
beta = 0.5
gamma = 1.0


base_params = []
new_params = []

for name, param in model.named_parameters():
    if param.requires_grad:
        if any(nd in name for nd in ['W_src_NE', 'E_s_ne', 'W_tgt_NE', 'E_t_ne']):
            new_params.append(param)
        else:
            base_params.append(param)

# Otimizador
optimizer = torch.optim.AdamW([
    {'params': base_params, 'lr': 1e-5},
    {'params': new_params, 'lr': 1e-4}
])

# Criação do DataLoader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=my_collate_fn
)

# Treinamento
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}")
    for i, batch in progress_bar:
        # Se batch["source_lang"] e batch["target_lang"] forem listas, e o batch for homogêneo:
        src_lang = batch["source_lang"][0]
        tgt_lang = batch["target_lang"][0]

        # Mandando o batch para o device
        batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

        input_texts = batch["input_text"]  # lista de strings
        target_texts = batch["target_text"]  # lista de strings

        ne_labels_src = torch.tensor(batch["ne_labels_src"], dtype=torch.long, device=device)
        ne_labels_tgt = torch.tensor(batch["ne_labels_tgt"], dtype=torch.long, device=device)
        source_text_ner_tokens = batch["source_text_ner_tokens"]
        target_text_ner_tokens = batch["target_text_ner_tokens"]

        # Forward pass (agora em batch)
        X_ne, Y_ne, translated_texts, source_mbart_tokens, target_mbart_tokens, input_ids = model(
            input_texts, source_lang=src_lang, target_lang=tgt_lang
        )

        # Tokenização do texto esperado na tradução (batch)
        target_ids = model.tokenizer(
            target_texts, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(device)

        # Alinhar labels de NE em batch
        aligned_ne_labels_src = []
        for src_lbl, src_toks, src_mbart_toks in zip(ne_labels_src, source_text_ner_tokens, source_mbart_tokens):
            aligned_ne_labels_src.append(align_deeppavlov_to_mbart(src_lbl, src_toks, src_mbart_toks))
        ne_labels_src = torch.stack(aligned_ne_labels_src).to(device)

        aligned_ne_labels_tgt = []
        for tgt_lbl, tgt_toks, tgt_mbart_toks in zip(ne_labels_tgt, target_text_ner_tokens, target_mbart_tokens):
            aligned_ne_labels_tgt.append(align_deeppavlov_to_mbart(tgt_lbl, tgt_toks, tgt_mbart_toks))
        ne_labels_tgt = torch.stack(aligned_ne_labels_tgt).to(device)

        # Cálculo das perdas
        loss_s_ne = loss_ner(X_ne.view(-1, X_ne.size(-1)), ne_labels_src.view(-1))
        loss_t_ne = loss_ner(Y_ne.view(-1, Y_ne.size(-1)), ne_labels_tgt.view(-1))

        probs_ne_t = Y_ne.view(-1, Y_ne.size(-1))[:, 1:].sum(dim=-1)
        weights = 1 + (probs_ne_t ** gamma)

        translation_outputs = model.MBart(
            input_ids=input_ids,
            labels=target_ids
        )
        translation_logits = translation_outputs.logits
        translation_loss = loss_translation(
            translation_logits.view(-1, translation_logits.size(-1)),
            target_ids.view(-1)
        )
        weighted_translation_loss = (weights * translation_loss).mean()

        total_task_loss = weighted_translation_loss + alpha * loss_s_ne + beta * loss_t_ne

        optimizer.zero_grad()
        total_task_loss.backward()
        optimizer.step()

        total_loss += total_task_loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}")


    # Print para acompanharmos se o modelo está melhorando a cada época
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}")


  ne_labels_src = torch.tensor(batch["ne_labels_src"], dtype=torch.long, device=device)
  ne_labels_tgt = torch.tensor(batch["ne_labels_tgt"], dtype=torch.long, device=device)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Epoch 1/2: 100%|██████████| 10015/10015 [4:53:12<00:00,  1.76s/it]


Epoch 1/2, Loss: 3.8962507089613916
Epoch 1/2, Loss: 3.8962507089613916


Epoch 2/2:   0%|          | 3/10015 [00:05<5:18:03,  1.91s/it]


KeyboardInterrupt: 

In [None]:
model_save_path = "mbart_ner_translation_model.pt"
tokenizer_save_path = "mbart_ner_translation_tokenizer"

# Salva modelo inteiro
torch.save(model, model_save_path)

# Salva o tokenizador
model.tokenizer.save_pretrained(tokenizer_save_path)

('mbart_ner_translation_tokenizer/tokenizer_config.json',
 'mbart_ner_translation_tokenizer/special_tokens_map.json',
 'mbart_ner_translation_tokenizer/sentencepiece.bpe.model',
 'mbart_ner_translation_tokenizer/added_tokens.json')

In [None]:
from google.colab import drive

drive.mount('/content/drive')

drive_model_save_path = "/content/drive/My Drive/mbart_ner_translation_model.pt"
torch.save(model, drive_model_save_path)

Mounted at /content/drive


## Treinando com dados do SemEval

In [None]:
import pandas as pd

In [None]:
df_de = pd.read_csv('./semeval_de_ner.csv')
df_es = pd.read_csv('./semeval_es_ner.csv')
df_fr = pd.read_csv('./semeval_fr_ner.csv')
df_it = pd.read_csv('./semeval_it_ner.csv')
df_ja = pd.read_csv('./semeval_ja_ner.csv')

In [None]:
from sklearn.model_selection import train_test_split

def split_dataframe(df, test_size=0.2):
    train, test = train_test_split(df, test_size=test_size, random_state=42)
    return train, test

# Split dos dataframes (esquecemos de fazer do árabe, por isso não conseguimos avaliar)
train_de, test_de = split_dataframe(df_de)
train_es, test_es = split_dataframe(df_es)
train_fr, test_fr = split_dataframe(df_fr)
train_it, test_it = split_dataframe(df_it)
train_ja, test_ja = split_dataframe(df_ja)

In [None]:
from transformers import MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.save_pretrained("./model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/sentencepiece.bpe.model',
 './model/added_tokens.json')

In [None]:
import torch
from transformers import MBartTokenizer, MBartForConditionalGeneration
from google.colab import drive

drive.mount('/content/drive')

tokenizer_path = "./model"
tokenizer = MBartTokenizer.from_pretrained(tokenizer_path)

path = '/content/drive/MyDrive/mbart_ner_translation_model.pt'
model = torch.load(path)

Mounted at /content/drive


  model = torch.load(path)


In [None]:
# código replicado do treinamento feito anteriormente (com o dataset inglês-alemão), agor com os dados do semeval
from torch.utils.data import DataLoader
from google.colab import files
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def convert_labels_to_indices(label_list):
  return [entity_to_idx.get(label) for label in label_list]  # Default 0 ('O')


dataframes = [
    {'df': df_de, 'code': 'de_DE'}, {'df': df_es, 'code': 'es_XX'}, {'df': train_fr, 'code': 'fr_XX'}, {'df': train_it, 'code': 'it_IT'}, {'df': df_ja, 'code': 'ja_XX'}
]

for data in dataframes:
  df = data['df']
  entity_to_idx = {
      "O": 0,
      "B-PERSON": 1,
      "I-PERSON": 2,
      "B-NORP": 3,
      "I-NORP": 4,
      "B-FACILITY": 5,
      "I-FACILITY": 6,
      "B-ORGANIZATION": 7,
      "I-ORGANIZATION": 8,
      "B-GPE": 9,
      "I-GPE": 10,
      "B-LOCATION": 11,
      "I-LOCATION": 12,
      "B-PRODUCT": 13,
      "I-PRODUCT": 14,
      "B-EVENT": 15,
      "I-EVENT": 16,
      "B-WORK_OF_ART": 17,
      "I-WORK_OF_ART": 18,
      "B-LAW": 19,
      "I-LAW": 20,
      "B-LANGUAGE": 21,
      "I-LANGUAGE": 22,
      "B-DATE": 23,
      "I-DATE": 24,
      "B-TIME": 25,
      "I-TIME": 26,
      "B-PERCENT": 27,
      "I-PERCENT": 28,
      "B-MONEY": 29,
      "I-MONEY": 30,
      "B-QUANTITY": 31,
      "I-QUANTITY": 32,
      "B-ORDINAL": 33,
      "I-ORDINAL": 34,
      "B-CARDINAL": 35,
      "I-CARDINAL": 36
  }


  df['source_ner_indices'] = df['source_text_ner'].apply(convert_labels_to_indices)
  df['target_ner_indices'] = df['target_text_ner'].apply(convert_labels_to_indices)

  df['source_ner_indices'] = df['source_text_ner'].apply(convert_labels_to_indices)
  df['target_ner_indices'] = df['target_text_ner'].apply(convert_labels_to_indices)

  # Substituir os valores None por 0 na coluna 'source_text_ner_tokens'
  df['source_ner_indices'] = df['source_ner_indices'].apply(
      lambda tokens: [label if label is not None else 0 for label in tokens]
  )

  # Substituir os valores None por 0 na coluna 'target_text_ner_tokens'
  df['target_ner_indices'] = df['target_ner_indices'].apply(
      lambda tokens: [label if label is not None else 0 for label in tokens]
  )

def my_collate_fn(batch):


    # Extração das colunas
    input_texts = [item['input_text'] for item in batch]
    target_texts = [item['target_text'] for item in batch]
    ne_labels_src_list = [item['ne_labels_src'] for item in batch]
    ne_labels_tgt_list = [item['ne_labels_tgt'] for item in batch]
    source_text_ner_tokens_list = [item['source_text_ner_tokens'] for item in batch]
    target_text_ner_tokens_list = [item['target_text_ner_tokens'] for item in batch]
    source_langs = [item['source_lang'] for item in batch]
    target_langs = [item['target_lang'] for item in batch]


    # Comprimento máximo
    max_src_len = max(len(lbls) for lbls in ne_labels_src_list)
    max_tgt_len = max(len(lbls) for lbls in ne_labels_tgt_list)

    # Padding
    padded_ne_labels_src = []
    for lbls in ne_labels_src_list:
        padded = lbls + [0] * (max_src_len - len(lbls))
        padded_ne_labels_src.append(padded)

    # Padding
    padded_ne_labels_tgt = []
    for lbls in ne_labels_tgt_list:
        padded = lbls + [0] * (max_tgt_len - len(lbls))
        padded_ne_labels_tgt.append(padded)

    # Converter para tensores
    padded_ne_labels_src = torch.tensor(padded_ne_labels_src, dtype=torch.long)
    padded_ne_labels_tgt = torch.tensor(padded_ne_labels_tgt, dtype=torch.long)

    def ensure_list_of_tokens(tokens):
        if isinstance(tokens, str):
            tokens = [tokens]
        return tokens

    # Garante que é lista
    source_text_ner_tokens_list = [ensure_list_of_tokens(toks) for toks in source_text_ner_tokens_list]
    target_text_ner_tokens_list = [ensure_list_of_tokens(toks) for toks in target_text_ner_tokens_list]

    # Comprimento máximo
    max_src_ner_len = max(len(toks) for toks in source_text_ner_tokens_list)
    max_tgt_ner_len = max(len(toks) for toks in target_text_ner_tokens_list)

    # Padding
    padded_source_text_ner_tokens = []
    for toks in source_text_ner_tokens_list:
        padded = toks + ["<PAD>"] * (max_src_ner_len - len(toks))
        padded_source_text_ner_tokens.append(padded)

    # Padding
    padded_target_text_ner_tokens = []
    for toks in target_text_ner_tokens_list:
        padded = toks + ["<PAD>"] * (max_tgt_ner_len - len(toks))
        padded_target_text_ner_tokens.append(padded)


    return {
        'input_text': input_texts,
        'target_text': target_texts,
        'ne_labels_src': padded_ne_labels_src,
        'ne_labels_tgt': padded_ne_labels_tgt,
        'source_text_ner_tokens': padded_source_text_ner_tokens,
        'target_text_ner_tokens': padded_target_text_ner_tokens,
        'source_lang': source_langs,
        'target_lang': target_langs
    }

for data in dataframes:
  df = data['df']
  code = data['code']

  train_dataset = TrainDataset(
    input_texts=df['source_text'].tolist(),
    target_texts=df['target_text'].tolist(),
    ne_labels_src=df['source_ner_indices'].tolist(),
    ne_labels_tgt=df['target_ner_indices'].tolist(),
    source_text_ner_tokens = df['source_text_ner_tokens'].tolist(), #adicionado para o alinhamento
    target_text_ner_tokens = df['target_text_ner_tokens'].tolist(), #adicionado para o alinhamento
    source_lang="en_XX",
    target_lang=code
  )

  from tqdm import tqdm
  # Funções de Loss
  loss_ner = nn.CrossEntropyLoss()
  loss_translation = nn.CrossEntropyLoss(ignore_index=model.tokenizer.pad_token_id)

  # Hiperparâmetros
  alpha = 0.5
  beta = 0.5
  gamma = 1.0


  base_params = []
  new_params = []

  for name, param in model.named_parameters():
      if param.requires_grad:
          if any(nd in name for nd in ['W_src_NE', 'E_s_ne', 'W_tgt_NE', 'E_t_ne']):
              new_params.append(param)
          else:
              base_params.append(param)

  # Otimizador
  optimizer = torch.optim.AdamW([
      {'params': base_params, 'lr': 1e-5},
      {'params': new_params, 'lr': 1e-4}
  ])

  # Criação do DataLoader
  train_dataloader = DataLoader(
      train_dataset,
      batch_size=16,
      shuffle=True,
      collate_fn=my_collate_fn
  )

  # Treinamento
  num_epochs = 1

  for epoch in range(num_epochs):
      model.train()
      total_loss = 0
      progress_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}")
      for i, batch in progress_bar:
          # Se batch["source_lang"] e batch["target_lang"] forem listas, e o batch for homogêneo:
          src_lang = batch["source_lang"][0]
          tgt_lang = batch["target_lang"][0]

          # Mandando o batch para o device
          batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

          input_texts = batch["input_text"]
          target_texts = batch["target_text"]

          ne_labels_src = torch.tensor(batch["ne_labels_src"], dtype=torch.long, device=device)
          ne_labels_tgt = torch.tensor(batch["ne_labels_tgt"], dtype=torch.long, device=device)
          source_text_ner_tokens = batch["source_text_ner_tokens"]
          target_text_ner_tokens = batch["target_text_ner_tokens"]

          # Forward pass (agora em batch)
          X_ne, Y_ne, translated_texts, source_mbart_tokens, target_mbart_tokens, input_ids = model(
              input_texts, source_lang=src_lang, target_lang=tgt_lang
          )

          # Tokenização do texto esperado na tradução (batch)
          target_ids = model.tokenizer(
              target_texts, return_tensors="pt", padding=True, truncation=True
          ).input_ids.to(device)

          # Alinhar labels de NE em batch
          aligned_ne_labels_src = []
          for src_lbl, src_toks, src_mbart_toks in zip(ne_labels_src, source_text_ner_tokens, source_mbart_tokens):
              aligned_ne_labels_src.append(align_deeppavlov_to_mbart(src_lbl, src_toks, src_mbart_toks))
          ne_labels_src = torch.stack(aligned_ne_labels_src).to(device)

          aligned_ne_labels_tgt = []
          for tgt_lbl, tgt_toks, tgt_mbart_toks in zip(ne_labels_tgt, target_text_ner_tokens, target_mbart_tokens):
              aligned_ne_labels_tgt.append(align_deeppavlov_to_mbart(tgt_lbl, tgt_toks, tgt_mbart_toks))
          ne_labels_tgt = torch.stack(aligned_ne_labels_tgt).to(device)

          # Cálculo das perdas
          loss_s_ne = loss_ner(X_ne.view(-1, X_ne.size(-1)), ne_labels_src.view(-1))
          loss_t_ne = loss_ner(Y_ne.view(-1, Y_ne.size(-1)), ne_labels_tgt.view(-1))

          probs_ne_t = Y_ne.view(-1, Y_ne.size(-1))[:, 1:].sum(dim=-1)
          weights = 1 + (probs_ne_t ** gamma)

          translation_outputs = model.MBart(
              input_ids=input_ids,
              labels=target_ids
          )
          translation_logits = translation_outputs.logits
          translation_loss = loss_translation(
              translation_logits.view(-1, translation_logits.size(-1)),
              target_ids.view(-1)
          )
          weighted_translation_loss = (weights * translation_loss).mean()

          total_task_loss = weighted_translation_loss + alpha * loss_s_ne + beta * loss_t_ne

          optimizer.zero_grad()
          total_task_loss.backward()
          optimizer.step()

          total_loss += total_task_loss.item()

      print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}")


      # Print para acompanharmos se o modelo está melhorando a cada época
      print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_dataloader)}")

  #salvar modelo
  torch.save(model, f"./mbart_final_{code}.pt")

  ne_labels_src = torch.tensor(batch["ne_labels_src"], dtype=torch.long, device=device)
  ne_labels_tgt = torch.tensor(batch["ne_labels_tgt"], dtype=torch.long, device=device)
Epoch 1/1: 100%|██████████| 256/256 [10:15<00:00,  2.41s/it]


Epoch 1/1, Loss: 3.4578680619597435
Epoch 1/1, Loss: 3.4578680619597435


Epoch 1/1: 100%|██████████| 323/323 [13:06<00:00,  2.44s/it]


Epoch 1/1, Loss: 3.631565395154451
Epoch 1/1, Loss: 3.631565395154451


Epoch 1/1: 100%|██████████| 277/277 [12:29<00:00,  2.71s/it]


Epoch 1/1, Loss: 3.471730204695829
Epoch 1/1, Loss: 3.471730204695829


Epoch 1/1: 100%|██████████| 187/187 [07:41<00:00,  2.47s/it]


Epoch 1/1, Loss: 3.9929312652445095
Epoch 1/1, Loss: 3.9929312652445095


Epoch 1/1: 100%|██████████| 452/452 [20:34<00:00,  2.73s/it]


Epoch 1/1, Loss: 3.794059912715338
Epoch 1/1, Loss: 3.794059912715338


Avaliação

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import torch
from transformers import MBartTokenizer, MBartForConditionalGeneration

#importar tokenizer
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.save_pretrained("./model")
tokenizer_path = "./model"
tokenizer = MBartTokenizer.from_pretrained(tokenizer_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


In [8]:
path = '/content/drive/MyDrive/mbart_final.pt'
#importar modelo
model = torch.load(path)

  model = torch.load(path)


In [9]:
import pandas as pd

df_test_de_1 = pd.read_csv('/content/en_de_test_ner.csv')
df_test_de_2 = pd.read_csv('/content/semeval_de_ner_test.csv')

df_test_de_1.head()

Unnamed: 0,source_text,target_text,source_text_ner,target_text_ner,source_text_ner_tokens,target_text_ner_tokens
0,"you know , one of the intense pleasures of tra...","wissen sie , eines der großen vernügen beim re...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['you', 'know', ',', 'one', 'of', 'the', 'inte...","['wissen', 'sie', ',', 'eines', 'der', 'großen..."
1,just to know that jaguar shamans still journey...,"einfach das wissen , dass jaguar-schamanen noc...","['O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['just', 'to', 'know', 'that', 'jaguar', 'sham...","['einfach', 'das', 'wissen', ',', 'dass', 'jag..."
2,"and of course , we all share the same adaptive...",und natürlich teilen wir alle dieselben anpass...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']","['and', 'of', 'course', ',', 'we', 'all', 'sha...","['und', 'natürlich', 'teilen', 'wir', 'alle', ..."
3,we 're all born . we all bring our children in...,wir werden alle geboren . wir bringen kinder z...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","['we', ""'re"", 'all', 'born', '.', 'we', 'all',...","['wir', 'werden', 'alle', 'geboren', '.', 'wir..."
4,we go through initiation rites .,wir durchlaufen initiationsrituale .,"['O', 'O', 'O', 'O', 'O', 'O', 'O']","['O', 'O', 'O', 'O', 'O']","['we', 'go', 'through', 'initiation', 'rites',...","['wir', 'durchlaufen', 'initiationsrituale', '..."


In [10]:
#concatenação dos dataframes de teste em alemão de duas fontes
df_test_de = pd.concat([df_test_de_1[['source_text','target_text']], df_test_de_2[['source_text','target_text']]])

In [38]:
df_test_fr = pd.read_csv('/content/semeval_fr_ner_test.csv')
df_test_it = pd.read_csv('/content/semeval_it_ner_test.csv')
df_test_ja = pd.read_csv('/content/semeval_ja_ner_test.csv')
df_test_es = pd.read_csv('/content/semeval_es_ner_test.csv')

df_test_fr = df_test_fr[['source_text','target_text']]
df_test_it = df_test_it[['source_text','target_text']]
df_test_ja = df_test_ja[['source_text','target_text']]
df_test_es = df_test_es[['source_text','target_text']]

In [77]:
import torch
from nltk.translate.bleu_score import sentence_bleu
import nltk
import numpy as np


nltk.download('punkt')

# Função para calcular o BLEU
def calculate_bleu(references, hypothesis):
  bleu_scores = []
  for i in range(len(references)):
    bleu_score = sentence_bleu([references[i]], hypothesis[i])
    bleu_scores.append(bleu_score)
  return np.mean(bleu_scores)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [78]:
import gc
from tqdm import tqdm

# Função para processar um batch de linhas do DataFrame
def process_batch(model, tokenizer, batch, target_lang):
    results = []
    for _, row in batch.iterrows():
        source_text = row['source_text']
        target_text = row['target_text']

        # Processar sem manter tudo em memória
        with torch.no_grad():
            outputs = model(input_texts=source_text, target_lang=target_lang)
            generated_text = outputs[4]
        text_with_lang = f"{target_lang} {target_text}"
        ref_tokens = tokenizer.tokenize(text_with_lang)
        hyp_tokens = tokens[0]
        results.append((ref_tokens, hyp_tokens))
    return results

# Função principal
def evaluate_translation_memory_optimized(model, tokenizer, data, max_length=512, target_lang="fr_XX", batch_size=10):
    model.eval()
    references = []
    hypotheses = []

    # Processa os batches sequencialmente
    for i in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        batch = data.iloc[i:i + batch_size]

        # Processa o batch
        batch_results = process_batch(model, tokenizer, batch, target_lang)


        for ref_tokens, hyp_tokens in batch_results:
            references.append(ref_tokens)
            hypotheses.append(hyp_tokens)

        # Libera memória
        del batch, batch_results
        gc.collect()

    # BLEU médio
    avg_bleu = calculate_bleu(references, hypotheses)

    return avg_bleu


In [40]:
#avaliação do modelo em francês
with open("evaluation_results_fr.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_fr, max_length=512, target_lang='fr_XX')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Francês:")
  print(f"BLEU Score médio: {avg_bleu}")


Processing Batches: 100%|██████████| 111/111 [09:31<00:00,  5.15s/it]

Evaluation for de_DE:
Average BLEU Score: 1.0262586988078325e-232



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [41]:
#avaliação do modelo em espanhol
with open("evaluation_results_es.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_es, max_length=512, target_lang='es_XX')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Espanhol:")
  print(f"BLEU Score médio: {avg_bleu}")

Processing Batches: 100%|██████████| 104/104 [07:55<00:00,  4.57s/it]

Avaliação de_DE:
BLEU Score médio: 6.488822733008327e-233



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [42]:
with open("evaluation_results_ja.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_ja, max_length=512, target_lang='ja_XX')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação {lang}:")
  print(f"BLEU Score médio: {avg_bleu}")

Processing Batches: 100%|██████████| 145/145 [14:53<00:00,  6.16s/it]


Avaliação de_DE:
BLEU Score médio: 2.0269331746232353e-232


In [44]:
#avaliação do modelo em italiano
with open("evaluation_results_it.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_it, max_length=512, target_lang='it_IT')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Italiano:")
  print(f"BLEU Score médio: {avg_bleu}")

Processing Batches: 100%|██████████| 75/75 [05:45<00:00,  4.60s/it]

Avaliação-Italiano:
BLEU Score médio: 1.4773691655718286e-232





In [46]:
#avaliação do modelo em alemão
with open("evaluation_results_de.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_de, max_length=512, target_lang='de_DE')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Alemão:")
  print(f"BLEU Score médio: {avg_bleu}")

Processing Batches: 100%|██████████| 757/757 [1:14:18<00:00,  5.89s/it]


Avaliação-Alemão:
BLEU Score médio: 6.004471714225603e-158


Avaliação do Base Model MBart (a título de comparação)


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [79]:
import gc
from tqdm import tqdm

# Função para processar um batch de linhas do DataFrame
def process_batch(model, tokenizer, batch, target_lang):
    results = []
    for _, row in batch.iterrows():
        source_text = row['source_text']
        target_text = row['target_text']

        # Processar sem manter tudo em memória
        with torch.no_grad():
            outputs = model(input_texts=source_text, target_lang=target_lang)
            generated_text = outputs[0]
        text_with_lang = f"{target_lang} {target_text}"
        ref_tokens = tokenizer.tokenize(text_with_lang)
        hyp_tokens = tokens[0]
        results.append((ref_tokens, hyp_tokens))
    return results

# Função principal
def evaluate_translation_memory_optimized(model, tokenizer, data, max_length=512, target_lang="fr_XX", batch_size=10):
    model.eval()
    references = []
    hypotheses = []

    # Processa os batches sequencialmente
    for i in tqdm(range(0, len(data), batch_size), desc="Processing Batches"):
        batch = data.iloc[i:i + batch_size]

        # Processa o batch
        batch_results = process_batch(model, tokenizer, batch, target_lang)


        for ref_tokens, hyp_tokens in batch_results:
            references.append(ref_tokens)
            hypotheses.append(hyp_tokens)

        # Libera memória
        del batch, batch_results
        gc.collect()

    # BLEU médio
    avg_bleu = calculate_bleu(references, hypotheses)

    return avg_bleu


In [73]:
class MBart_Basic(nn.Module):
    def __init__(self, device):
        super().__init__()
        self.MBart = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        self.tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        hidden_size = 1024
        self.device = device

    def forward(self, input_texts, source_lang="en_XX", target_lang="fr_XX"):
      self.tokenizer.src_lang = source_lang
      self.tokenizer.tgt_lang = target_lang

      # Tokenização em batch
      inputs = self.tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(self.device)
      input_ids = inputs.input_ids
      attention_mask = inputs.attention_mask

      # Encoder
      encoder_outputs = self.MBart.model.encoder(input_ids=input_ids, attention_mask=attention_mask)

      # Tradução
      forced_bos_token_id = self.tokenizer.lang_code_to_id[target_lang]
      translated_outputs = self.MBart.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          max_length=50,
          num_beams=5,
          early_stopping=True,
          forced_bos_token_id=forced_bos_token_id
      )

      # Decodificação batch
      translated_texts = self.tokenizer.batch_decode(translated_outputs, skip_special_tokens=True)

      # Converter ids para tokens em batch
      source_mbart_tokens = [self.tokenizer.convert_ids_to_tokens(seq.tolist()) for seq in input_ids]
      target_mbart_tokens = [self.tokenizer.convert_ids_to_tokens(seq.tolist()) for seq in translated_outputs]


      #saídas: labels de NER do input, labels de NER do output, texto traduzido, tokens do input, tokens do output, ids do input
      return translated_texts, source_mbart_tokens, target_mbart_tokens, input_ids


In [84]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Criação do modelo e passagem pro device
model = MBart_Basic(device).to(device)
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [85]:
#avaliação do modelo em francês
with open("evaluation_results_base_fr.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_fr, max_length=512, target_lang='fr_XX')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Francês:")
  print(f"BLEU Score médio: {avg_bleu}")


Processing Batches:   0%|          | 0/111 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing Batches: 100%|██████████| 111/111 [08:46<00:00,  4.74s/it]

Avaliação-Francês:
BLEU Score médio: 1.0262586988078325e-232



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [86]:
#avaliação do modelo em francês
with open("evaluation_results_base_it.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_it, max_length=512, target_lang='it_IT')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Italiano:")
  print(f"BLEU Score médio: {avg_bleu}")

Processing Batches: 100%|██████████| 75/75 [05:16<00:00,  4.22s/it]

Avaliação-Italiano:
BLEU Score médio: 1.4773691655718286e-232





In [None]:
#avaliação do modelo em francês
with open("evaluation_results_base_es.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_es, max_length=512, target_lang='es_XX')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Espanhol:")
  print(f"BLEU Score médio: {avg_bleu}")

In [None]:
#avaliação do modelo em francês
with open("evaluation_results_base_ja.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_ja, max_length=512, target_lang='ja_XX')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Japonês:")
  print(f"BLEU Score médio: {avg_bleu}")

In [87]:
#avaliação do modelo em francês
with open("evaluation_results_base_de.txt", "w") as file:
  avg_bleu = evaluate_translation_memory_optimized(model=model, tokenizer=tokenizer, data=df_test_de, max_length=512, target_lang='de_DE')

  # Escrever os resultados no arquivo
  file.write(f"BLEU Score médio: {avg_bleu}\n\n")

  # Exibir na tela (opcional)
  print(f"Avaliação-Alemão:")
  print(f"BLEU Score médio: {avg_bleu}")

Processing Batches: 100%|██████████| 757/757 [1:08:37<00:00,  5.44s/it]


Avaliação-Alemão:
BLEU Score médio: 6.004471714225603e-158
