In [1]:
# pip install nltk

In [2]:
# pip install datasets requests transformers tqdm

In [3]:
# Import necessary libraries
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json
import os
from transformers import AutoTokenizer
from data.data_loader import WMTLoader, WMT19JSONLoader, download_entire_de_en_dataset

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Define BytenetEncoder and BytenetDecoder classes
class ResidualBlockReLu(nn.Module):
    """
    Residual block for the ByteNet machine translation model.

    The block applies LayerNorm -> ReLU -> Conv1d three times: a 1x1 convolution
    that reduces ``2 * d`` channels to ``d``, a dilated convolution of width ``k``
    on ``d`` channels, and a 1x1 convolution that expands back to ``2 * d``
    channels. The block input is then added to the result.

    :param d: Number of channels inside the block; input and output carry ``2 * d`` channels.
    :param dilation: Dilation rate of the width-``k`` convolution.
    :param k: Kernel size of the dilated convolution.
    :param decoder: If True the dilated convolution is unpadded and ``forward``
        left-pads its input by ``(k - 1) * dilation``; otherwise the convolution
        pads both sides by ``(k - 1) * dilation // 2``.
    :param seq_len: Size of the last input dimension, i.e. the dimension the three
        ``nn.LayerNorm`` layers normalise over. Defaults to 128, the value that
        used to be hard-coded.
    """
    def __init__(self, d, dilation, k, decoder=False, seq_len=128):
        super(ResidualBlockReLu, self).__init__()
        self.decoder = decoder
        # Attribute names are kept as-is so existing checkpoints keep loading.
        # NOTE(review): LayerNorm(seq_len) normalises along the last (position)
        # axis of the (batch, channels, length) tensors used by Conv1d, so every
        # output position depends on all positions -- including later ones when
        # decoder=True. Confirm this is intended.
        self.layer_norm1 = nn.LayerNorm(seq_len)
        self.reLu1 = nn.ReLU()
        self.conv1 = nn.Conv1d(d * 2, d, 1)
        self.layer_norm2 = nn.LayerNorm(seq_len)
        self.reLu2 = nn.ReLU()
        if decoder:
            # Number of zeros prepended in forward() so the unpadded convolution
            # returns a tensor of the same length as its input.
            self.receptive_field = (k - 1) * dilation
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation)
        else:
            padding = (k - 1) * dilation // 2
            self.conv2 = nn.Conv1d(d, d, k, dilation=dilation, padding=padding)
        self.layer_norm3 = nn.LayerNorm(seq_len)
        self.reLu3 = nn.ReLU()
        self.conv3 = nn.Conv1d(d, d * 2, 1)

    def forward(self, x):
        """
        Apply the block.

        :param x: Tensor whose last dimension has size ``seq_len`` and whose
            channel dimension has size ``2 * d``.
        :return: Tensor of the same shape as ``x``.
        """
        residual = x
        x = self.layer_norm1(x)
        x = self.reLu1(x)
        x = self.conv1(x)
        x = self.layer_norm2(x)
        x = self.reLu2(x)
        # `receptive_field` only exists on decoder blocks; the short-circuit
        # keeps encoder blocks from touching it.
        if self.decoder and self.receptive_field > 0:
            x = torch.nn.functional.pad(x, (self.receptive_field, 0))
        x = self.conv2(x)
        x = self.layer_norm3(x)
        x = self.reLu3(x)
        x = self.conv3(x)
        x += residual
        return x

class BytenetEncoder(nn.Module):
    """
    ByteNet encoder: a 1x1 input convolution, ``num_sets * set_size`` residual
    blocks, and a final 1x1 output convolution.

    Inside every set the dilation doubles from block to block (1, 2, 4, ...),
    capped at ``max_dilation_rate``, and starts again at 1 for the next set.

    :param kernel_size: Stored as ``self.kernel_size``; it is not passed to the residual blocks.
    :param max_dilation_rate: Upper bound for the dilation of any residual block.
    :param masked_kernel_size: Kernel size handed to every residual block as its ``k`` argument.
    :param num_sets: Number of sets of residual blocks.
    :param set_size: Number of residual blocks in each set.
    :param hidden_channels: ``d`` of every residual block; tensors between blocks carry ``2 * hidden_channels`` channels.
    :param emb_size: Number of input channels expected by the first convolution.
    """
    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, emb_size=1600):
        super().__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size

        wide_channels = hidden_channels * 2
        # Modules are created in execution order so parameter initialisation
        # and state_dict indices stay the same.
        stack = [nn.Conv1d(in_channels=emb_size, out_channels=wide_channels, kernel_size=1)]
        for _set_index in range(num_sets):
            for block_index in range(set_size):
                capped_dilation = min(2 ** block_index, max_dilation_rate)
                stack.append(ResidualBlockReLu(hidden_channels, capped_dilation, masked_kernel_size))
        self.layers = nn.Sequential(*stack)
        self.encoder_out_conv = nn.Conv1d(in_channels=wide_channels, out_channels=wide_channels,
                                          kernel_size=1)

    def forward(self, x):
        """
        Encode ``x``.

        :param x: Tensor with ``emb_size`` channels; it is cast to float before the first layer.
        :return: Output of the final 1x1 convolution, with ``2 * hidden_channels`` channels.
        """
        hidden = self.layers(x.float())
        return self.encoder_out_conv(hidden)

class BytenetDecoder(nn.Module):
    """
    ByteNet decoder: ``num_sets * set_size`` residual blocks built with
    ``decoder=True``, followed by Conv1d(1x1) -> ReLU -> Conv1d(1x1) that maps
    ``2 * hidden_channels`` channels to ``output_channels``.

    Inside every set the dilation doubles from block to block (1, 2, 4, ...),
    capped at ``max_dilation_rate``, and starts again at 1 for the next set.

    :param kernel_size: Stored as ``self.kernel_size``; it is not passed to the residual blocks.
    :param max_dilation_rate: Upper bound for the dilation of any residual block.
    :param masked_kernel_size: Kernel size handed to every residual block as its ``k`` argument.
    :param num_sets: Number of sets of residual blocks.
    :param set_size: Number of residual blocks in each set.
    :param hidden_channels: ``d`` of every residual block; the decoder input carries ``2 * hidden_channels`` channels.
    :param output_channels: Number of channels produced by the last convolution.
    """
    def __init__(self, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, num_sets=6, set_size=5,
                 hidden_channels=800, output_channels=384):
        super().__init__()
        self.num_channels = hidden_channels
        self.kernel_size = kernel_size

        # Residual blocks first, then the projection head, in execution order.
        stack = [
            ResidualBlockReLu(hidden_channels, min(2 ** block_index, max_dilation_rate),
                              masked_kernel_size, decoder=True)
            for _set_index in range(num_sets)
            for block_index in range(set_size)
        ]
        stack.append(nn.Conv1d(hidden_channels * 2, hidden_channels, 1))
        stack.append(nn.ReLU())
        stack.append(nn.Conv1d(hidden_channels, output_channels, 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """
        Decode ``x``.

        :param x: Tensor with ``2 * hidden_channels`` channels.
        :return: Tensor with ``output_channels`` channels.
        """
        return self.layers(x)

class EncoderDecoderStacking(nn.Module):
    """
    Full ByteNet model: token embedding -> encoder -> decoder.
    The encoder output is fed directly into the decoder.

    :param vocab_size: Number of rows of the embedding table.
    :param kernel_size: Forwarded to both encoder and decoder as ``kernel_size``.
    :param max_dilation_rate: Maximum dilation rate of the residual blocks.
    :param masked_kernel_size: Forwarded to both encoder and decoder as ``masked_kernel_size``.
    :param n_sets: Number of sets of residual blocks (``num_sets`` of encoder and decoder).
    :param blocks_per_set: Number of residual blocks per set (``set_size`` of encoder and decoder).
    :param hidden_channels: Number of hidden channels in encoder and decoder.
    :param output_channels: Number of channels produced by the decoder.
    :param emb_size: Embedding dimension; also the number of input channels of the encoder.
    """
    def __init__(self, vocab_size, kernel_size=3, max_dilation_rate=16, masked_kernel_size=3, n_sets=6, blocks_per_set=5,
                 hidden_channels=800, output_channels=384, emb_size=1600):
        super().__init__()
        # Settings that encoder and decoder have in common.
        shared_config = dict(
            kernel_size=kernel_size,
            max_dilation_rate=max_dilation_rate,
            masked_kernel_size=masked_kernel_size,
            num_sets=n_sets,
            set_size=blocks_per_set,
            hidden_channels=hidden_channels,
        )
        self.embed = nn.Embedding(vocab_size, emb_size)
        self.encoder = BytenetEncoder(emb_size=emb_size, **shared_config)
        self.decoder = BytenetDecoder(output_channels=output_channels, **shared_config)

    def forward(self, x):
        """
        Run the stacked model.

        :param x: 2-D tensor of token ids (the embedding output is permuted as a 3-D tensor).
        :return: The output of the decoder.
        """
        # Move the embedding dimension to axis 1, where Conv1d expects channels.
        channels_first = self.embed(x).permute(0, 2, 1)
        return self.decoder(self.encoder(channels_first))


In [5]:
# Define function to load and tokenize test data
def load_and_tokenize(file_path, tokenizer=None, max_length=128):
    """
    Read a JSON-lines translation file and tokenize its German and English sides.

    Every non-blank line must be a JSON object that stores the sentence pair under
    ``['row']['translation']['de']`` (source) and ``['row']['translation']['en']`` (target).

    :param file_path: Path of the UTF-8 encoded JSON-lines file.
    :param tokenizer: Callable invoked as ``tokenizer(texts, max_length=..., padding='max_length',
        truncation=True, return_tensors="pt")`` whose result is indexed with ``'input_ids'``.
        When omitted, a module-level variable named ``tokenizer`` is used if one exists.
    :param max_length: Length every sequence is padded or truncated to (default 128).
    :return: Tuple ``(source_tokens, target_tokens)`` holding the ``'input_ids'`` of both sides.
    :raises ValueError: If no tokenizer is passed and no module-level ``tokenizer`` is defined.
    """
    if tokenizer is None:
        # Earlier versions silently relied on a global `tokenizer`; honour it
        # when present, but fail with a clear message instead of a NameError.
        tokenizer = globals().get('tokenizer')
        if tokenizer is None:
            raise ValueError(
                "No tokenizer available: pass `tokenizer=` or define a module-level `tokenizer`."
            )

    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline) are skipped; json.loads would reject them.
        pairs = [json.loads(line)['row']['translation'] for line in f if line.strip()]

    source_texts = [pair['de'] for pair in pairs]
    target_texts = [pair['en'] for pair in pairs]

    encode_kwargs = dict(max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
    source_tokens = tokenizer(source_texts, **encode_kwargs)['input_ids']
    target_tokens = tokenizer(target_texts, **encode_kwargs)['input_ids']

    return source_tokens, target_tokens


In [6]:

# Define a custom dataset class for loading the test data
class TranslationDataset(Dataset):
    """
    Dataset that pairs source and target sequences by position.

    :param source_texts: Indexable collection of source items; its length is the dataset length.
    :param target_texts: Indexable collection of target items, indexed with the same positions.
    """
    def __init__(self, source_texts, target_texts):
        self.source_texts, self.target_texts = source_texts, target_texts

    def __len__(self):
        # Only the source side determines the reported size.
        return len(self.source_texts)

    def __getitem__(self, idx):
        source_item = self.source_texts[idx]
        target_item = self.target_texts[idx]
        return source_item, target_item


In [7]:
# Evaluation configuration.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
num_workers = 4
batch_size = 64

current_path = os.getcwd()


# Load the entire pickled model (architecture + weights), not just a state_dict.
# weights_only=False is stated explicitly because PyTorch >= 2.6 defaults to
# weights_only=True, which refuses to unpickle a full nn.Module.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints
# that come from a trusted source.
loaded_model = torch.load('model_whole.pth', map_location=device, weights_only=False)
loaded_model.to(device)

output_dir = current_path+'/wmt19_json_eval'

download_entire_de_en_dataset(batch_size, output_dir, 4)

wmt_json_loader = WMT19JSONLoader(output_dir)


In [8]:
# Directory intended for caching the dataset; no code visible in this notebook reads it.
cache_dir = current_path+'/wmt19_cache'

# Tokenize the WMT19 evaluation file with the project loader.
source_tokens, target_tokens = wmt_json_loader.load_and_tokenize(current_path+'/wmt19_json_eval/wmt_19_de_en.json')

translation_dataset = TranslationDataset(source_tokens, target_tokens)
# shuffle=False keeps the file order; the worker count comes from the
# `num_workers` setting of the configuration cell instead of a repeated literal.
test_loader = DataLoader(translation_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)


Second evaluation test with WMT14

TODOs if you want to run these tests:
- some changes in the dataloader file are needed for reproduction!
- uncomment the code in the cell below
- uncomment the last cell


In [None]:
#output_dir = current_path+'/wmt14_json_eval'

# download_entire_de_en_dataset(batch_size, output_dir, 4)

#source_tokens2, target_tokens2 = wmt_json_loader.load_and_tokenize(current_path+'/wmt14_json_eval/wmt_14_de_en.json')
#translation_dataset2 = TranslationDataset(source_tokens2, target_tokens2)
#test_loader2 = DataLoader(translation_dataset2, batch_size=batch_size, shuffle=False, num_workers=4)


# Average BLEU-score 
- per sentence
- on batch level
- on character level

Character-Level BLEU Scores: Character-level BLEU scores may be lower than word-level scores because the evaluation becomes more fine-grained.

Results reported in the paper:
BLEU score 22.85 (0.380 bits/character) and 25.53 (0.389 bits/character),
which corresponds to 0.2285 and 0.2553 on the 0-1 scale used by the scores below.

In [9]:
from torch.utils.data import DataLoader
import zlib
import matplotlib.pyplot as plt
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/e12230488/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
def translate(to_translate, model, loader):
    """
    Translate one text with a single forward pass of ``model``.

    The text is tokenized with ``loader.tokenize_texts``, the model is run once
    without gradient tracking, the highest-scoring class is picked along dim 0 of
    the output (after the leading batch axis is squeezed away), and the resulting
    ids are decoded with ``loader.tokenizer.decode``.

    :param to_translate: The text to translate; it is passed to the loader wrapped in a one-element list.
    :param model: ``nn.Module`` to run; it is switched to eval mode.
    :param loader: Object providing ``tokenize_texts(list)`` and a ``tokenizer`` with ``decode``.
    :return: The decoded prediction with special tokens skipped.
    """
    model.eval()

    # Put the input on the device the model's weights live on rather than
    # relying on a notebook-level `device` variable that may not match.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    inp = loader.tokenize_texts([to_translate])[0].unsqueeze(0).to(model_device)
    with torch.no_grad():
        out = model(inp)

    # Drop the batch axis, then take the arg-max over the class axis for every position.
    token_ids = torch.argmax(out.squeeze(0), dim=0).tolist()
    return loader.tokenizer.decode(token_ids, skip_special_tokens=True)

Translating: ['Wiederaufnahme der Sitzungsperiode']


2024-06-27 08:42:05.952390: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-27 08:42:05.952467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-27 08:42:05.955611: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-27 08:42:05.970346: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Translated text: Resumption of the session


In [None]:
# Manual smoke test: translate one sentence and print the result.

loaded_model.eval()
# translate() wraps its argument in a list itself, so pass the plain string
# (as the evaluation loop does) instead of a one-element list.
text = "Wiederaufnahme der Sitzungsperiode"
print(f"Translating: {text}")
translated_texts = translate(text, loaded_model, wmt_json_loader)
print(f"Translated text: {translated_texts}")

In [11]:
def _score_sentence_pair(predicted_text, target_text, smooth_sentence, smooth_batch, smooth_char):
    """Compute the per-sentence statistics for one prediction/reference pair."""
    return {
        # BLEU on whitespace-separated tokens.
        'bleu_sentence': sentence_bleu([target_text.split()], predicted_text.split(),
                                       smoothing_function=smooth_sentence),
        # BLEU on NLTK word tokens; reported under the "Batch Level" label.
        'bleu_batch': sentence_bleu([nltk.word_tokenize(target_text)], nltk.word_tokenize(predicted_text),
                                    smoothing_function=smooth_batch),
        # BLEU on individual characters.
        'bleu_char': sentence_bleu([list(target_text)], list(predicted_text),
                                   smoothing_function=smooth_char),
        # Size in bits of the zlib-compressed UTF-8 prediction.
        'compressed_bits': len(zlib.compress(predicted_text.encode('utf-8'))) * 8,
        'predicted_chars': len(predicted_text),
        # Position-wise comparison over the common prefix length only (zip truncates).
        'correct_chars': sum(1 for pred_char, true_char in zip(predicted_text, target_text)
                             if pred_char == true_char),
        'compared_chars': min(len(predicted_text), len(target_text)),
    }


def _safe_ratio(numerator, denominator, default):
    """Return ``numerator / denominator``, or ``default`` when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else default


def evaluate_model_with_translate(model, test_loader, loader):
    """
    Translate every sample of ``test_loader`` with ``translate`` and print averaged metrics.

    For each batch the input and target ids are decoded to text with
    ``loader.tokenizer.decode``; every input text is translated on its own and
    compared with its target text. The first sample is printed for inspection.

    Reported metrics (all averaged over sentences):

    - "per Sentence": ``sentence_bleu`` on whitespace tokens, smoothing method1.
    - "Batch Level": ``sentence_bleu`` on ``nltk.word_tokenize`` tokens, smoothing
      method4. Despite the label it is computed per sentence, not per batch.
    - "Character Level": ``sentence_bleu`` on characters, smoothing method1.
    - "BPC": zlib-compressed size of the predictions in bits divided by their
      character count. NOTE(review): this is a compression proxy and presumably
      not comparable with bits/character figures derived from model likelihoods.
    - "Character-Level Accuracy": share of equal characters at equal positions,
      counted only up to the length of the shorter text.

    :param model: Model handed to ``translate``; it is switched to eval mode.
    :param test_loader: Sized iterable yielding ``(inputs, targets)`` batches whose
        elements support ``.tolist()``.
    :param loader: Object with a ``tokenizer`` providing ``decode``; also handed to ``translate``.
    :return: Dict with the keys ``bleu_sentence``, ``bleu_batch``, ``bleu_char``,
        ``bpc``, ``char_accuracy`` and ``num_sentences``. With no samples the BLEU
        averages and the accuracy are 0.0 and ``bpc`` is ``inf``.
    """
    model.eval()
    smooth_sentence = SmoothingFunction().method1
    smooth_batch = SmoothingFunction().method4
    smooth_char = SmoothingFunction().method1

    totals = {
        'bleu_sentence': 0.0,
        'bleu_batch': 0.0,
        'bleu_char': 0.0,
        'compressed_bits': 0,
        'predicted_chars': 0,
        'correct_chars': 0,
        'compared_chars': 0,
    }
    # One counter is enough: every metric is accumulated once per sentence.
    num_sentences = 0

    with torch.no_grad():
        for inputs, targets in tqdm(test_loader, total=len(test_loader)):
            # Decode the token ids back to text; translate() tokenizes the text again itself.
            input_texts = [loader.tokenizer.decode(ids.tolist(), skip_special_tokens=True) for ids in inputs]
            target_texts = [loader.tokenizer.decode(ids.tolist(), skip_special_tokens=True) for ids in targets]

            for input_text, target_text in zip(input_texts, target_texts):
                predicted_text = translate(input_text, model, loader)

                # Print the first sample
                if num_sentences == 0:
                    print(f"Input: {input_text}")
                    print(f"Predicted: {predicted_text}")
                    print(f"Target: {target_text}")

                scores = _score_sentence_pair(predicted_text, target_text,
                                              smooth_sentence, smooth_batch, smooth_char)
                for key, value in scores.items():
                    totals[key] += value
                num_sentences += 1

    # Guarded divisions: an empty loader used to raise ZeroDivisionError here.
    avg_bleu_score_sentence = _safe_ratio(totals['bleu_sentence'], num_sentences, 0.0)
    avg_bleu_score_batch = _safe_ratio(totals['bleu_batch'], num_sentences, 0.0)
    avg_bleu_score_char = _safe_ratio(totals['bleu_char'], num_sentences, 0.0)

    bpc = _safe_ratio(totals['compressed_bits'], totals['predicted_chars'], float('inf'))
    char_accuracy = _safe_ratio(totals['correct_chars'], totals['compared_chars'], 0)

    print(f"Average BLEU Score per Sentence: {avg_bleu_score_sentence:.4f}")
    print(f"Average BLEU Score on Batch Level: {avg_bleu_score_batch:.4f}")
    print(f"Average BLEU Score on Character Level: {avg_bleu_score_char:.4f}")
    print(f"Total Bits per Character (BPC): {bpc:.4f}")
    print(f"Character-Level Accuracy: {char_accuracy:.4f}")

    return {
        'bleu_sentence': avg_bleu_score_sentence,
        'bleu_batch': avg_bleu_score_batch,
        'bleu_char': avg_bleu_score_char,
        'bpc': bpc,
        'char_accuracy': char_accuracy,
        'num_sentences': num_sentences,
    }



# Evaluate the model with wmt19
dataset we trained with

In [12]:
# Score the loaded model on the WMT19 loader built above; the function prints the metrics.
evaluate_model_with_translate(loaded_model, test_loader, wmt_json_loader)

  0%|          | 0/6 [00:00<?, ?it/s]

Input: Wiederaufnahme der Sitzungsperiode
Predicted: Resumption of the session
Target: Resumption of the session


100%|██████████| 6/6 [01:02<00:00, 10.37s/it]

Average BLEU Score per Sentence: 0.8439
Average BLEU Score on Batch Level: 0.8571
Average BLEU Score on Character Level: 0.9712
Total Bits per Character (BPC): 6.8211
Character-Level Accuracy: 0.9872





# Evaluate the model with wmt14
If you want to try this dataset, uncomment the cell below and the commented-out cell above that downloads and loads WMT14 (the cell after cell 8).

In [13]:
# evaluate_model_with_translate(loaded_model, test_loader2, wmt_json_loader)

  0%|          | 0/6 [00:00<?, ?it/s]

Input: Weshalb also sollten Waffenhersteller in der EU auf Kosten unschuldiger Menschen Profite einstreichen?
Predicted: Soywhse he tho warmes  hes ye te te we th ie  ar  eo  chtt oh   ee  h  h  ph ne?  n   t?
Target: So why should EU arms producers profit at the expense of innocent people?


100%|██████████| 6/6 [00:59<00:00,  9.84s/it]

Average BLEU Score per Sentence: 0.0429
Average BLEU Score on Batch Level: 0.0540
Average BLEU Score on Character Level: 0.2166
Total Bits per Character (BPC): 6.3390
Character-Level Accuracy: 0.1754



