# ByteNet: BLEU-Score
Evaluate the encoder-decoder network on character-to-character machine translation.

In [1]:
# Import necessary libraries
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json
from transformers import AutoTokenizer

In [2]:
# Select the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define BytenetEncoder and BytenetDecoder classes
class ResidualBlockReLu(nn.Module):
    """Bottleneck residual block shared by the ByteNet encoder and decoder.

    The block maps a ``(batch, 2 * d, seq_len)`` tensor to a tensor of the same
    shape through three LayerNorm -> ReLU -> Conv1d stages (1x1 reduction to
    ``d`` channels, dilated ``k``-wide convolution, 1x1 expansion back to
    ``2 * d`` channels) and adds the block input onto the result.

    Args:
        d: Number of channels inside the bottleneck; the block's input and
            output carry ``2 * d`` channels.
        dilation: Dilation of the middle convolution.
        k: Kernel size of the middle convolution.
        decoder: If True, the input of the middle convolution is padded on the
            left only (in ``forward``); otherwise the convolution pads both
            sides itself.
        seq_len: Size of the last input axis. Every LayerNorm in the block is
            built with this normalized shape, so inputs must have exactly this
            many positions. Defaults to 128, the ``max_length`` the test data
            is padded to.
    """

    def __init__(self, d, dilation, k, decoder=False, seq_len=128):
        super().__init__()
        self.decoder = decoder
        # LayerNorm(seq_len) normalizes over the LAST axis, i.e. over sequence
        # positions of the channel-first input, not over channels.
        # NOTE(review): statistics are therefore shared across time steps, so
        # the decoder block is not strictly causal even though its convolution
        # is left-padded - confirm this is intended.
        self.layer_norm1 = nn.LayerNorm(seq_len)
        self.reLu1 = nn.ReLU()
        self.conv1 = nn.Conv1d(d * 2, d, 1)
        self.layer_norm2 = nn.LayerNorm(seq_len)
        self.reLu2 = nn.ReLU()
        if decoder:
            # Amount of left padding applied in forward() so the unpadded
            # convolution returns the input length.
            self.receptive_field = (k - 1) * dilation
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation)
        else:
            # Symmetric padding; keeps the length only when (k - 1) * dilation
            # is even (e.g. any odd kernel size).
            padding = (k - 1) * dilation // 2
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation, padding=padding)
        self.layer_norm3 = nn.LayerNorm(seq_len)
        self.reLu3 = nn.ReLU()
        self.conv3 = nn.Conv1d(d, d * 2, 1)

    def forward(self, x):
        """Apply the three stages to ``x`` and return the result plus ``x``."""
        residual = x
        x = self.conv1(self.reLu1(self.layer_norm1(x)))
        x = self.reLu2(self.layer_norm2(x))
        if self.decoder and self.receptive_field > 0:
            # Pad only on the left of the last axis.
            x = torch.nn.functional.pad(x, (self.receptive_field, 0))
        x = self.conv2(x)
        x = self.conv3(self.reLu3(self.layer_norm3(x)))
        return x + residual

class BytenetEncoder(nn.Module):
    """Stack of non-decoder residual blocks between two 1x1 convolutions.

    A 1x1 convolution first maps ``emb_size`` input channels to
    ``2 * hidden_channels``. It is followed by ``num_sets`` groups of
    ``set_size`` ``ResidualBlockReLu`` blocks whose dilation doubles inside a
    group (1, 2, 4, ...) and is capped at ``max_dilation_rate``. A final 1x1
    convolution (``encoder_out_conv``) keeps the channel count.

    ``kernel_size`` is only stored on the instance; the residual blocks are
    built with ``masked_kernel_size``.
    """

    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, emb_size=1600):
        super().__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size
        wide_channels = hidden_channels * 2

        # Collect the modules in order, then register them in a single step.
        stack = [nn.Conv1d(in_channels=emb_size, out_channels=wide_channels, kernel_size=1)]
        for _set_index in range(num_sets):
            for block_index in range(set_size):
                # Dilation restarts at 1 for every set and doubles per block.
                capped_dilation = min(2 ** block_index, max_dilation_rate)
                stack.append(ResidualBlockReLu(hidden_channels, capped_dilation, masked_kernel_size))
        self.layers = nn.Sequential(*stack)

        self.encoder_out_conv = nn.Conv1d(in_channels=wide_channels, out_channels=wide_channels, kernel_size=1)

    def forward(self, x):
        """Cast ``x`` to float, run it through the stack and the output convolution."""
        hidden = self.layers(x.float())
        return self.encoder_out_conv(hidden)

class BytenetDecoder(nn.Module):
    """Stack of decoder-mode residual blocks followed by a two-layer output head.

    ``num_sets`` groups of ``set_size`` ``ResidualBlockReLu(decoder=True)``
    blocks are created; inside a group the dilation doubles (1, 2, 4, ...) and
    is capped at ``max_dilation_rate``. The head is a 1x1 convolution from
    ``2 * hidden_channels`` to ``hidden_channels``, a ReLU, and a 1x1
    convolution to ``output_channels``.

    ``kernel_size`` is only stored on the instance; the residual blocks are
    built with ``masked_kernel_size``.
    """

    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, output_channels=384):
        super().__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size

        # Collect the modules in order, then register them in a single step.
        stack = []
        for _set_index in range(num_sets):
            for block_index in range(set_size):
                # Dilation restarts at 1 for every set and doubles per block.
                capped_dilation = min(2 ** block_index, max_dilation_rate)
                stack.append(ResidualBlockReLu(hidden_channels, capped_dilation, masked_kernel_size,
                                               decoder=True))
        # Output head: channel reduction, non-linearity, per-position scores.
        stack.append(nn.Conv1d(hidden_channels * 2, hidden_channels, 1))
        stack.append(nn.ReLU())
        stack.append(nn.Conv1d(hidden_channels, output_channels, 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the result."""
        return self.layers(x)

class EncoderDecoderStacking(nn.Module):
    """Token embedding followed by a ByteNet encoder with a decoder stacked on top.

    ``forward`` takes integer token ids, embeds them, moves the embedding axis
    in front of the sequence axis, and feeds the result through the encoder
    and then the decoder.
    """

    def __init__(self, vocab_size, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, n_sets=6, blocks_per_set=5,
                 hidden_channels=800, output_channels=384, emb_size=1600):
        super().__init__()
        # Settings the encoder and the decoder have in common.
        shared_config = dict(
            kernel_size=kernel_size,
            max_dilation_rate=max_dilation_rate,
            masked_kernel_size=masked_kernel_size,
            num_sets=n_sets,
            set_size=blocks_per_set,
            hidden_channels=hidden_channels,
        )
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.encoder = BytenetEncoder(emb_size=emb_size, **shared_config)
        self.decoder = BytenetDecoder(output_channels=output_channels, **shared_config)

    def forward(self, x):
        """Embed ``x``, swap the last two axes, then apply encoder and decoder."""
        # The embedding puts features on the last axis; the Conv1d layers
        # downstream expect them on axis 1.
        channels_first = self.embed(x).permute(0, 2, 1)
        encoded = self.encoder(channels_first)
        return self.decoder(encoded)

In [3]:
# Rebuild the architecture and restore the trained weights from a state dict.
# Loading a whole pickled model instead is kept below for reference only:
# model = torch.load('model_whole_small.pth', map_location=device)
# model.to(device)

vocab_size = 384
# NOTE(review): num_sets is not used below - the model is built with n_sets=3.
# Confirm which value the checkpoint was trained with.
num_sets = 6
set_size = 5
embed_size = 1600

encoder_decoder = EncoderDecoderStacking(vocab_size=vocab_size, n_sets=3, blocks_per_set=set_size,
                                         output_channels=vocab_size, emb_size=embed_size).to(device)
loaded_model = encoder_decoder
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved on a GPU can still be loaded on a CPU-only machine.
loaded_model.load_state_dict(torch.load('model_state_small_new.pth', map_location=device))

# Define a custom dataset class for loading the test data
class TranslationDataset(Dataset):
    """Pairs every source item with the target item stored at the same index."""

    def __init__(self, source_texts, target_texts):
        self.source_texts, self.target_texts = source_texts, target_texts

    def __len__(self):
        # The source side alone determines the reported size.
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_item = self.source_texts[idx]
        target_item = self.target_texts[idx]
        return source_item, target_item

# Load the pretrained tokenizer published under the name "google/byt5-small".
# The same tokenizer encodes the test data and decodes the model predictions.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

# Define function to load and tokenize test data
def load_and_tokenize(file_path, max_length=128, source_lang='de', target_lang='en'):
    """Read a JSON-lines translation file and tokenize both language sides.

    Every non-blank line of the file must be a JSON object that holds the two
    texts under ``['row']['translation'][<language>]``. Both sides are encoded
    with the module-level ``tokenizer``, truncated and padded to exactly
    ``max_length`` ids.

    Args:
        file_path: Path of the UTF-8 encoded JSON-lines file.
        max_length: Number of token ids per sequence after truncation and
            padding. The default of 128 matches the ``LayerNorm(128)`` layers
            of the model defined in this file.
        source_lang: Key of the source text inside ``translation``.
        target_lang: Key of the target text inside ``translation``.

    Returns:
        A ``(source_tokens, target_tokens)`` pair holding the ``input_ids``
        tensors the tokenizer returns for the source and target texts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (for example an empty last line) carry no record and
        # would make json.loads raise, so they are skipped.
        data = [json.loads(line) for line in f if line.strip()]

    translations = [item['row']['translation'] for item in data]
    source_texts = [pair[source_lang] for pair in translations]
    target_texts = [pair[target_lang] for pair in translations]

    def encode(texts):
        # Pad or truncate every text to the same fixed length.
        encoded = tokenizer(texts, max_length=max_length, padding='max_length', truncation=True,
                            return_tensors="pt")
        return encoded['input_ids']

    return encode(source_texts), encode(target_texts)

# Tokenize the test file once, up front
source_tokens, target_tokens = load_and_tokenize('wmt19_json1/wmt_19_de_en.json')

# Pair the two tensors in a dataset and iterate it in fixed order, 32 pairs at a time
translation_dataset = TranslationDataset(source_texts=source_tokens, target_texts=target_tokens)
test_loader = DataLoader(dataset=translation_dataset, batch_size=32, shuffle=False)



# Average BLEU-score 
- per sentence
- on batch level
- on character level

Note on character-level BLEU scores: they may differ from (e.g. be lower than) word-level scores because the evaluation is more fine-grained; n-grams are matched over characters rather than words.

In [4]:
#average bleu score per sentence

def evaluate_model_sentence(model, test_loader, tokenizer):
    """Print and return the mean word-level sentence BLEU over ``test_loader``.

    Predictions and targets are decoded to text, split on whitespace and
    scored one sentence at a time with ``sentence_bleu`` using smoothing
    ``method1``.

    Args:
        model: Network called as ``model(inputs)``. Its output is read as
            channel-first scores ``(batch, vocab, seq_len)``, which is what
            ``EncoderDecoderStacking`` returns.
        test_loader: Iterable of ``(inputs, targets)`` batches of token-id
            tensors; must support ``len()``.
        tokenizer: Object whose ``decode(ids, skip_special_tokens=True)``
            turns a list of token ids into text.

    Returns:
        The mean sentence BLEU score, or 0.0 when the loader yields no
        sentences.
    """
    model.eval()
    smooth_fn = SmoothingFunction().method1
    total_bleu_score = 0.0
    count = 0

    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, total=len(test_loader)):
            inputs = inputs.to(device)

            # The vocabulary axis of the output is axis 1 (the channel axis of
            # the final Conv1d); the last axis is the sequence position. The
            # predicted token id per position is therefore the argmax over
            # dim 1, which gives a (batch, seq_len) tensor.
            predictions = torch.argmax(model(inputs), dim=1)

            # Targets are only decoded, so they never need to leave the CPU.
            for output_tokens, reference_tokens in zip(predictions.cpu().tolist(), targets.tolist()):
                output_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
                target_text = tokenizer.decode(reference_tokens, skip_special_tokens=True)

                total_bleu_score += sentence_bleu([target_text.split()], output_text.split(),
                                                  smoothing_function=smooth_fn)
                count += 1

    # Guard against an empty loader instead of dividing by zero.
    avg_bleu_score = total_bleu_score / count if count else 0.0
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")
    return avg_bleu_score



In [5]:
#average bleu score on batch level

def evaluate_model_batch(model, test_loader, tokenizer):
    """Print and return word-level BLEU averaged per sentence and per batch.

    Every sentence is decoded, tokenized with NLTK's ``word_tokenize`` and
    scored with ``sentence_bleu`` using smoothing ``method4``. Two averages
    are reported:

    * per sentence - the mean over all sentences of the loader;
    * on batch level - the mean sentence score is computed inside each batch
      and those batch means are averaged, so every batch weighs the same
      regardless of its size.

    Args:
        model: Network called as ``model(inputs)``. Its output is read as
            channel-first scores ``(batch, vocab, seq_len)``, which is what
            ``EncoderDecoderStacking`` returns.
        test_loader: Iterable of ``(inputs, targets)`` batches of token-id
            tensors; must support ``len()``.
        tokenizer: Object whose ``decode(ids, skip_special_tokens=True)``
            turns a list of token ids into text.

    Returns:
        ``(sentence_average, batch_average)``; both are 0.0 when the loader
        yields no sentences.
    """
    # Local import: the file only imports names from nltk.translate.bleu_score.
    # NOTE(review): word_tokenize needs NLTK's tokenizer data to be installed
    # locally - confirm it is available in the evaluation environment.
    from nltk.tokenize import word_tokenize

    model.eval()
    smooth = SmoothingFunction().method4
    total_bleu_score = 0.0
    count = 0
    total_bleu_score_batch = 0.0
    num_batches = 0

    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, total=len(test_loader)):
            inputs = inputs.to(device)

            # The vocabulary axis of the output is axis 1, so the predicted
            # token id per position is the argmax over dim 1.
            predictions = torch.argmax(model(inputs), dim=1)

            batch_scores = []
            for predicted_ids, target_ids in zip(predictions.cpu().tolist(), targets.tolist()):
                # Decode the predicted and target sequences
                predicted_seq = tokenizer.decode(predicted_ids, skip_special_tokens=True)
                target_seq = tokenizer.decode(target_ids, skip_special_tokens=True)

                # sentence_bleu expects a list of references, each a token list
                predicted_tokens = word_tokenize(predicted_seq)
                reference_tokens = [word_tokenize(target_seq)]

                batch_scores.append(sentence_bleu(reference_tokens, predicted_tokens,
                                                  smoothing_function=smooth))

            if batch_scores:
                batch_total = sum(batch_scores)
                total_bleu_score += batch_total
                count += len(batch_scores)
                total_bleu_score_batch += batch_total / len(batch_scores)
                num_batches += 1

    # Guard both averages against an empty loader.
    avg_bleu_score = total_bleu_score / count if count else 0.0
    print(f"Average BLEU Score per Sentence : {avg_bleu_score:.4f}")

    avg_bleu_score_batch = total_bleu_score_batch / num_batches if num_batches else 0.0
    print(f'Average BLEU Score on batch level: {avg_bleu_score_batch:.4f}')

    return avg_bleu_score, avg_bleu_score_batch

   


In [6]:
def evaluate_model_char_level(model, test_loader, tokenizer):
    """Print and return the mean character-level sentence BLEU over ``test_loader``.

    Predictions and targets are decoded to text and split into single
    characters, so the n-grams scored by ``sentence_bleu`` (smoothing
    ``method1``) are character n-grams.

    Args:
        model: Network called as ``model(inputs)``. Its output is read as
            channel-first scores ``(batch, vocab, seq_len)``, which is what
            ``EncoderDecoderStacking`` returns.
        test_loader: Iterable of ``(inputs, targets)`` batches of token-id
            tensors; must support ``len()``.
        tokenizer: Object whose ``decode(ids, skip_special_tokens=True)``
            turns a list of token ids into text.

    Returns:
        The mean character-level BLEU score, or 0.0 when the loader yields no
        sentences.
    """
    model.eval()
    smooth_fn = SmoothingFunction().method1
    total_bleu_score = 0.0
    count = 0

    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, total=len(test_loader)):
            inputs = inputs.to(device)

            # The vocabulary axis of the output is axis 1 (the channel axis of
            # the final Conv1d); the last axis is the sequence position. The
            # predicted token id per position is therefore the argmax over
            # dim 1, which gives a (batch, seq_len) tensor.
            predictions = torch.argmax(model(inputs), dim=1)

            # Targets are only decoded, so they never need to leave the CPU.
            for output_tokens, reference_tokens in zip(predictions.cpu().tolist(), targets.tolist()):
                output_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
                target_text = tokenizer.decode(reference_tokens, skip_special_tokens=True)

                # list(str) yields one item per character
                output_chars = list(output_text)
                target_chars = list(target_text)

                total_bleu_score += sentence_bleu([target_chars], output_chars,
                                                  smoothing_function=smooth_fn)
                count += 1

    # Guard against an empty loader instead of dividing by zero.
    avg_bleu_score = total_bleu_score / count if count else 0.0
    print(f"Average BLEU Score on Character-Level : {avg_bleu_score:.4f}")
    return avg_bleu_score

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Evaluate the model once with each BLEU variant. Every call receives the
# module-level tokenizer, i.e. the one that encoded the test data.
evaluate_model_sentence(loaded_model, test_loader, tokenizer)
evaluate_model_batch(loaded_model, test_loader, tokenizer)
evaluate_model_char_level(loaded_model, test_loader, tokenizer)

  0%|          | 0/1250 [00:00<?, ?it/s]2024-06-26 11:14:32.820442: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-26 11:14:32.872634: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-26 11:14:32.872688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-26 11:14:32.875086: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-26 11:14:32.886265: I t