In [1]:
# Import necessary libraries
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json
import os
from transformers import AutoTokenizer
from data.data_loader import WMTLoader, WMT19JSONLoader, download_entire_de_en_dataset

In [2]:
# Define device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define BytenetEncoder and BytenetDecoder classes
class ResidualBlockReLu(nn.Module):
    """
    Implementation of residual Layer for Bytenet machine translation task.

    :param d: The number of input features.
    :param dilation: The initial dilation rate for the convolution layers.
    """
    def __init__(self, d, dilation, k, decoder=False):
        super(ResidualBlockReLu, self).__init__()
        self.decoder = decoder
        self.layer_norm1 = nn.LayerNorm(128)
        self.reLu1 = nn.ReLU()
        self.conv1 = nn.Conv1d(d * 2, d, 1)
        self.layer_norm2 = nn.LayerNorm(128)
        self.reLu2 = nn.ReLU()
        if decoder:
            self.receptive_field = (k - 1) * dilation
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation)
        else:
            padding = (k - 1) * dilation // 2
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation, padding=padding)
        self.layer_norm3 = nn.LayerNorm(128)
        self.reLu3 = nn.ReLU()
        self.conv3 = nn.Conv1d(d, d * 2, 1)

    def forward(self, x):
        residual = x
        x = self.layer_norm1(x)
        x = self.reLu1(x)
        x = self.conv1(x)
        x = self.layer_norm2(x)
        x = self.reLu2(x)
        if self.decoder and self.receptive_field > 0:
            x = torch.nn.functional.pad(x, (self.receptive_field, 0))
        x = self.conv2(x)
        x = self.layer_norm3(x)
        x = self.reLu3(x)
        x = self.conv3(x)
        x += residual
        return x

class BytenetEncoder(nn.Module):
    """
    Implementation of the ByteNet Encoder. Default Parameters are set to the ones used in the paper.
    
    :param kernel_size: The kernel size for the unmasked (padded) convolution in the residual block.
    :param max_dilation_rate: The maximum dilation rate for the convolution layers.
    :param masked_kernel_size: The kernel size for the masked convolution in the residual block (only interesting for decoder).
    :param num_sets: The number of sets of residual blocks.
    :param set_size: The number of residual blocks in each set.
    :param hidden_channels: The number of hidden channels in the model.
    """
    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, emb_size=1600):
        super(BytenetEncoder, self).__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size
        self.layers = nn.Sequential()
        self.layers.append(nn.Conv1d(in_channels=emb_size, out_channels=hidden_channels * 2, kernel_size=1))
        for _ in range(num_sets):
            dilation_rate = 1
            for _ in range(set_size):
                self.layers.append(ResidualBlockReLu(hidden_channels,
                                                     dilation_rate if dilation_rate <= max_dilation_rate else max_dilation_rate,
                                                     masked_kernel_size))
                dilation_rate *= 2
        self.encoder_out_conv = nn.Conv1d(in_channels=hidden_channels * 2, out_channels=2 * hidden_channels, kernel_size=1)

    def forward(self, x):
        x = x.float()
        for layer in self.layers:
            x = layer(x)
        x = self.encoder_out_conv(x)
        return x

class BytenetDecoder(nn.Module):
    """
    Implementation of the ByteNet Decoder. Default Parameters are set to the ones used in the paper.
    
    :param kernel_size: The kernel size for the unmasked (padded) convolution in the residual block (not important for decoder).
    :param max_dilation_rate: The maximum dilation rate for the convolution layers.
    :param masked_kernel_size: The kernel size for the masked convolution in the residual block.
    :param num_sets: The number of sets of residual blocks.
    :param set_size: The number of residual blocks in each set.
    :param hidden_channels: The number of hidden channels in the model.
    """
    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, output_channels=384):
        super(BytenetDecoder, self).__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size
        self.layers = nn.Sequential()
        for _ in range(num_sets):
            dilation_rate = 1
            for _ in range(set_size):
                self.layers.append(ResidualBlockReLu(hidden_channels,
                                                     dilation_rate if dilation_rate <= max_dilation_rate else max_dilation_rate,
                                                     masked_kernel_size, decoder=True))
                dilation_rate *= 2
        self.layers.append(nn.Conv1d(hidden_channels * 2, hidden_channels, 1))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Conv1d(hidden_channels, output_channels, 1))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class EncoderDecoderStacking(nn.Module):
    """
    Stacks the encoder and decoder for the ByteNet model.
    This means passing the output of the encoder as input to the decoder.
    
    :param kernel_size: The kernel size for the unmasked (padded) convolution in the residual block (for Encoder).
    :param max_dilation_rate: The maximum dilation rate for the convolution layers.
    :param masked_kernel_size: The kernel size for the masked convolution in the residual block (for Decoder).
    :param num_sets: The number of sets of residual blocks.
    :param set_size: The number of residual blocks in each set.
    :param hidden_channels: The number of hidden channels in the model.
    :param output_channels: The number of output channels in the model (vocab size).

    :return x: The output of the decoder.
    """
    def __init__(self, vocab_size, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, n_sets=6, blocks_per_set=5,
                 hidden_channels=800, output_channels=384, emb_size=1600):
        super(EncoderDecoderStacking, self).__init__()
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.encoder = BytenetEncoder(kernel_size=kernel_size, max_dilation_rate=max_dilation_rate,
                                      masked_kernel_size=masked_kernel_size, num_sets=n_sets, set_size=blocks_per_set,
                                      hidden_channels=hidden_channels, emb_size=emb_size)
        self.decoder = BytenetDecoder(kernel_size=kernel_size, max_dilation_rate=max_dilation_rate,
                                      masked_kernel_size=masked_kernel_size, num_sets=n_sets, set_size=blocks_per_set,
                                      hidden_channels=hidden_channels, output_channels=output_channels)

    def forward(self, x):
        embed_x = self.embed(x).permute(0, 2, 1)
        x = self.encoder(embed_x)
        x = self.decoder(x)
        return x


In [3]:

# Define function to load and tokenize test data
def load_and_tokenize(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    source_texts = [item['row']['translation']['de'] for item in data]
    target_texts = [item['row']['translation']['en'] for item in data]

    source_tokens = tokenizer(source_texts, max_length=128, padding='max_length', truncation=True, return_tensors="pt")['input_ids']
    target_tokens = tokenizer(target_texts, max_length=128, padding='max_length', truncation=True, return_tensors="pt")['input_ids']
    
    return source_tokens, target_tokens

# Load and tokenize test data
# source_tokens, target_tokens = load_and_tokenize('wmt19_json1/wmt_19_de_en.json')

# Create dataset and dataloader
# translation_dataset = TranslationDataset(source_tokens, target_tokens)
# test_loader = DataLoader(translation_dataset, batch_size=32, shuffle=False,  num_workers=4)


In [4]:
# Load the entire model
loaded_model = torch.load('model_whole.pth', map_location=device)
loaded_model.to(device)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_workers = 4
batch_size = 64

current_path = os.getcwd()
output_dir = current_path+'/wmt19_json_eval'
download_entire_de_en_dataset(batch_size, output_dir, 4)
wmt_json_loader = WMT19JSONLoader(output_dir)


Downloading dataset-offset: 128
Downloading dataset-offset: 0
Downloading dataset-offset: 192
Downloading dataset-offset: 64
Downloading dataset-offset: 256
Downloading dataset-offset: 320


In [5]:

# Define a custom dataset class for loading the test data
class TranslationDataset(Dataset):
    def __init__(self, source_texts, target_texts):
        self.source_texts = source_texts
        self.target_texts = target_texts

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        return self.source_texts[idx], self.target_texts[idx]


In [6]:
cache_dir = current_path+'/wmt19_cache'
# wmt_loader = WMTLoader(split="train", cache_dir=cache_dir)
# index = 0
# source, target = wmt_loader[index]
# print("Source:", source)
# print("Target:", target)

# use drive in which to save dataset in cache
source_tokens, target_tokens = wmt_json_loader.load_and_tokenize(current_path+'/wmt19_json_eval/wmt_19_de_en.json')

translation_dataset = TranslationDataset(source_tokens, target_tokens)
test_loader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=True)

#dataset_size = len(translation_dataset)
#train_size = int(0.8 * dataset_size)
#test_size = dataset_size - train_size
#train_dataset, test_dataset = torch.utils.data.random_split(translation_dataset, [train_size, test_size])

#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Average BLEU-score 
- per sentence
- on batch level
- on character level

Character-Level BLEU Scores: Character-level BLEU scores may be lower than word-level scores because the evaluation becomes more fine-grained.

In [7]:
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/e12230488/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def evaluate_model(model, test_loader, tokenizer):
    
    #average bleu score per sentence  
    #average bleu score on batch level
    #average bleu score on character level
    
    
    model.eval()

    smooth_sentence = SmoothingFunction().method1
    smooth_batch = SmoothingFunction().method4
    smooth_char = SmoothingFunction().method1

    total_bleu_score_sentence = 0.0
    total_bleu_score_batch = 0.0
    total_bleu_score_char = 0.0

    count_sentence = 0
    num_batches = 0 
    count_char = 0

    bleu_scores_sentence = []
    bleu_scores_batch = []
    bleu_scores_char = []

    
    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, total=len(test_loader)):
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(inputs)
            outputs = torch.argmax(outputs, dim=-1)

            # Sentence level BLEU
            output_token_ids = outputs.cpu().tolist()
            target_token_ids = targets.cpu().tolist()

            for output_tokens, target_tokens in zip(output_token_ids, target_token_ids):
                output_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
                target_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

                bleu_score_sentence = sentence_bleu([target_text.split()], output_text.split(), smoothing_function=smooth_sentence)
                total_bleu_score_sentence += bleu_score_sentence
                count_sentence += 1
           
            # Character level BLEU
            for output_tokens, target_tokens in zip(output_token_ids, target_token_ids):
                output_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
                target_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

                output_chars = list(output_text)
                target_chars = list(target_text)

                bleu_score_char = sentence_bleu([target_chars], output_chars, smoothing_function=smooth_char)
                total_bleu_score_char += bleu_score_char
                count_char += 1
                
            # Batch level BLEU
            for i in range(outputs.size(0)):
                predicted_seq = tokenizer.decode(outputs[i].cpu().numpy(), skip_special_tokens=True)
                target_seq = tokenizer.decode(targets[i].cpu().numpy(), skip_special_tokens=True)

                predicted_tokens = nltk.word_tokenize(predicted_seq)
                target_tokens = [nltk.word_tokenize(target_seq)]

                bleu_score_batch = sentence_bleu(target_tokens, predicted_tokens, smoothing_function=smooth_batch)
                total_bleu_score_batch += bleu_score_batch

                num_batches += 1

            avg_bleu_score_sentence = total_bleu_score_sentence / count_sentence
            print(f"Average BLEU Score per Sentence : {avg_bleu_score_sentence:.4f}")
   
            avg_bleu_score_batch = total_bleu_score_batch / num_batches
            print(f'Average BLEU Score on batch level: {avg_bleu_score_batch:.4f}')

            avg_bleu_score_char = total_bleu_score_char / count_char
            print(f'Average BLEU Score on character level: {avg_bleu_score_char:.4f}')

    
    avg_bleu_score_sentence = total_bleu_score_sentence / count_sentence
    print(f"Average BLEU Score per Sentence : {avg_bleu_score_sentence:.4f}")
    
    avg_bleu_score_batch = total_bleu_score_batch / num_batches
    print(f'Average BLEU Score on batch level: {avg_bleu_score_batch:.4f}')

    avg_bleu_score_char = total_bleu_score_char / count_char
    print(f'Average BLEU Score on character level: {avg_bleu_score_char:.4f}')

   

In [9]:
evaluate_model(loaded_model, test_loader, wmt_json_loader.tokenizer)

  0%|          | 0/6 [00:00<?, ?it/s]2024-06-26 16:51:58.640958: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-26 16:51:58.683613: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-26 16:51:58.683666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-26 16:51:58.686060: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-26 16:51:58.697603: I tens

Average BLEU Score per Sentence : 0.0000
Average BLEU Score on batch level: 0.0032
Average BLEU Score on character level: 0.0011


 33%|███▎      | 2/6 [00:29<00:57, 14.37s/it]

Average BLEU Score per Sentence : 0.0000
Average BLEU Score on batch level: 0.0032
Average BLEU Score on character level: 0.0011


 50%|█████     | 3/6 [00:42<00:41, 13.84s/it]

Average BLEU Score per Sentence : 0.0000
Average BLEU Score on batch level: 0.0031
Average BLEU Score on character level: 0.0011


 67%|██████▋   | 4/6 [00:56<00:27, 13.87s/it]

Average BLEU Score per Sentence : 0.0000
Average BLEU Score on batch level: 0.0031
Average BLEU Score on character level: 0.0011


 83%|████████▎ | 5/6 [01:09<00:13, 13.52s/it]

Average BLEU Score per Sentence : 0.0001
Average BLEU Score on batch level: 0.0028
Average BLEU Score on character level: 0.0011


100%|██████████| 6/6 [01:22<00:00, 13.81s/it]

Average BLEU Score per Sentence : 0.0000
Average BLEU Score on batch level: 0.0027
Average BLEU Score on character level: 0.0011
Average BLEU Score per Sentence : 0.0000
Average BLEU Score on batch level: 0.0027
Average BLEU Score on character level: 0.0011



