In [1]:
import torch
import random
from torch.utils.data import Dataset, DataLoader
import itertools
import re
from tokenizers import ByteLevelBPETokenizer
import os
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Path of the parallel corpus read by load_dataset (one tab-separated pair per line).
FILE_PATH = 'data/en-pt.txt'
# Passed as `limit` to load_dataset when the corpus is loaded.
NUM_PHRASES = 1_000_000
# Passed to train_tokenizer as the plain-text file it writes the training sentences to.
OUTPUT_FILE = 'data/en-pt_sentences.txt'
# Default directory where train_tokenizer saves, and load_tokenizer reads, the tokenizer files.
TOKENIZER_DIR = 'tokenizer'
# Default vocabulary size used by train_tokenizer.
VOCAB_SIZE = 64_000

In [3]:
def load_dataset(dataset_path, limit=1000000):
    """
    Loads tab-separated sentence pairs from the given path.

    Each line is expected to hold a source sentence and a target sentence
    separated by a tab. Lines with fewer than two tab-separated fields
    (blank lines, lines without a tab) are skipped, and any columns after
    the second one are ignored, so every returned item is exactly a
    (source, target) 2-tuple that can be safely unpacked by callers.

    Args:
        dataset_path: The path to the dataset file.
        limit: The maximum number of sentence pairs to load (default: 1000000).
            Skipped lines do not count towards the limit.

    Returns:
        A list of (source, target) sentence pairs.
    """
    def iter_pairs(lines):
        # Lazily yield well-formed pairs so islice can stop reading early.
        for line in lines:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                # Blank or tab-less line: there is no pair to extract.
                continue
            yield fields[0], fields[1]

    with open(dataset_path, "r", encoding="utf-8") as file:
        sentence_pairs = list(itertools.islice(iter_pairs(file), limit))

    return sentence_pairs

In [4]:
def preprocess_text(text):
    """
    Normalizes a sentence before tokenizer training and encoding.

    The text is lowercased, every run of characters outside the allowed
    alphabet (a-z plus the accented letters listed in the pattern below) is
    replaced by a single space, and leading/trailing whitespace is removed
    so that punctuation at the edges of a sentence leaves no stray spaces.

    Args:
        text: The input text.

    Returns:
        The cleaned text as a single string (empty if nothing is left).
    """
    # Convert the text to lowercase
    text = text.lower()

    # Collapse each run of disallowed characters (digits, punctuation,
    # whitespace, ...) into one space
    text = re.sub(r'[^a-záàâãéèêíïóôõöúçñ]+', ' ', text)

    # Drop the spaces left behind by leading/trailing punctuation
    return text.strip()

In [5]:
def train_tokenizer(sentence_pairs,temp_sentences_path, vocab_size=VOCAB_SIZE, min_frequency=2, output_dir=TOKENIZER_DIR):
    """
    Trains a byte-level BPE tokenizer on the given sentence pairs and saves it.

    Args:
        sentence_pairs: The list of (source, target) sentence pairs.
        temp_sentences_path: Path of the text file the sentences are written to
            (one sentence per line, source then target) before training.
        vocab_size: The vocabulary size for the tokenizer.
        min_frequency: The minimum frequency for a token to be included in the vocabulary.
        output_dir: The directory to save the tokenizer files.
    """
    # Create the destination folders up front: open() does not create parent
    # directories, and the tokenizer files are written into output_dir.
    temp_dir = os.path.dirname(temp_sentences_path)
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save all sentences to a temporary file
    with open(temp_sentences_path, "w", encoding="utf-8") as file:
        for src, tgt in sentence_pairs:
            file.write(src + "\n")
            file.write(tgt + "\n")

    # Train the tokenizer
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[temp_sentences_path], vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Save the tokenizer
    tokenizer.save_model(output_dir)



In [6]:
class TranslationDataset(Dataset):
    """
    Dataset over (source, target) sentence pairs that tokenizes on access.

    Items are produced lazily: each lookup encodes the two sentences of one
    pair with the tokenizer and keeps at most `max_length` ids per sentence.
    """

    def __init__(self, sentence_pairs, tokenizer, max_length):
        # Nothing is tokenized here; the arguments are only stored.
        self.sentence_pairs = sentence_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # One item per sentence pair.
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        # Returns (source_ids, target_ids), each truncated to max_length ids.
        src_text, tgt_text = self.sentence_pairs[idx]
        src_ids = self.tokenizer.encode(src_text).ids
        src_ids = src_ids[:self.max_length]
        tgt_ids = self.tokenizer.encode(tgt_text).ids
        tgt_ids = tgt_ids[:self.max_length]
        return src_ids, tgt_ids

In [7]:
def load_tokenizer(tokenizer_dir=TOKENIZER_DIR):
    """
    Builds a byte-level BPE tokenizer from files stored in a directory.

    Args:
        tokenizer_dir: The directory holding `vocab.json` and `merges.txt`.

    Returns:
        A ByteLevelBPETokenizer created from those two files with
        `add_prefix_space=True`.
    """
    # Both files are looked up directly inside tokenizer_dir.
    vocab_path = f"{tokenizer_dir}{os.sep}vocab.json"
    merges_path = f"{tokenizer_dir}{os.sep}merges.txt"
    return ByteLevelBPETokenizer(vocab_path, merges_path, add_prefix_space=True)

In [8]:
def preprocess_data(sentence_pairs, tokenizer, max_length):
    """
    Wraps the sentence pairs in a TranslationDataset.

    No tokenization happens here; the dataset is only constructed with the
    given tokenizer and length limit.

    Args:
        sentence_pairs: A list of (source, target) sentence pairs.
        tokenizer: The tokenizer handed to the dataset.
        max_length: The maximum sequence length handed to the dataset.

    Returns:
        A TranslationDataset object.
    """
    return TranslationDataset(sentence_pairs, tokenizer, max_length)

In [9]:
def create_dataloader(dataset, batch_size,tokenizer, shuffle=True, num_workers=0):
    """
    Creates a DataLoader that pads every batch to its longest sequence.

    Each dataset item must be a (source_ids, target_ids) pair. The id
    sequences may be plain Python sequences of ints or tensors; non-tensor
    sequences are converted to long tensors before padding.

    Args:
        dataset: A PyTorch Dataset object.
        batch_size: The batch size to use in the DataLoader.
        tokenizer: Tokenizer whose "<pad>" token id is used as padding value.
        shuffle: Whether to shuffle the dataset before creating the DataLoader.
        num_workers: The number of worker processes to use for loading the data.

    Returns:
        A DataLoader object yielding (src_batch, tgt_batch) tensors with the
        batch dimension first.

    Raises:
        ValueError: If the tokenizer has no "<pad>" token.
    """
    # Resolve the padding id once instead of on every batch, and fail early
    # with a clear message if the vocabulary lacks the token.
    pad_id = tokenizer.token_to_id("<pad>")
    if pad_id is None:
        raise ValueError('Tokenizer vocabulary does not contain a "<pad>" token.')

    def to_padded_batch(sequences):
        # pad_sequence only accepts tensors, so convert list-like id
        # sequences; tensors are passed through untouched.
        tensors = [
            seq if isinstance(seq, torch.Tensor) else torch.as_tensor(seq, dtype=torch.long)
            for seq in sequences
        ]
        return pad_sequence(tensors, batch_first=True, padding_value=pad_id)

    def collate_fn(batch):
        src_sequences, tgt_sequences = zip(*batch)
        return to_padded_batch(src_sequences), to_padded_batch(tgt_sequences)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

    return dataloader

In [10]:
# Load up to NUM_PHRASES sentence pairs from the corpus file
sentence_pairs = load_dataset(dataset_path=FILE_PATH, limit=NUM_PHRASES)

In [11]:
# Normalize both sides of every pair with preprocess_text
preprocessed_pairs = list(
    (preprocess_text(src_text), preprocess_text(tgt_text))
    for src_text, tgt_text in sentence_pairs
)

In [12]:
# Fit the tokenizer on the cleaned pairs; OUTPUT_FILE receives the training sentences
train_tokenizer(sentence_pairs=preprocessed_pairs, temp_sentences_path=OUTPUT_FILE)

In [11]:
# Reload the tokenizer from the files saved in TOKENIZER_DIR
tokenizer = load_tokenizer(tokenizer_dir=TOKENIZER_DIR)

In [12]:
# Encode every cleaned pair into (source ids, target ids).
# NOTE(review): tokenized_pairs is not referenced by the later cells shown
# here, which tokenize again inside TranslationDataset - confirm it is needed.
tokenized_pairs = list(
    (tokenizer.encode(src_text).ids, tokenizer.encode(tgt_text).ids)
    for src_text, tgt_text in preprocessed_pairs
)

In [12]:
# Split the PREPROCESSED pairs: the tokenizer was trained on text cleaned by
# preprocess_text, so the datasets must feed it text in the same form.
# A copy is shuffled with a fixed seed so that re-running this cell is
# reproducible and the earlier lists are left untouched.
shuffled_pairs = list(preprocessed_pairs)
random.Random(42).shuffle(shuffled_pairs)
split_idx = int(len(shuffled_pairs) * 0.9)  # 90% for training, 10% for validation
train_sentence_pairs = shuffled_pairs[:split_idx]
val_sentence_pairs = shuffled_pairs[split_idx:]

In [13]:
# Build both datasets with the same tokenizer and a 5000-id cap per sentence
train_dataset, val_dataset = (
    preprocess_data(pairs, tokenizer, 5000)
    for pairs in (train_sentence_pairs, val_sentence_pairs)
)

In [14]:
# Batches of 32; only the training loader is shuffled
train_dataloader = create_dataloader(
    dataset=train_dataset, batch_size=32, tokenizer=tokenizer, shuffle=True, num_workers=0
)
val_dataloader = create_dataloader(
    dataset=val_dataset, batch_size=32, tokenizer=tokenizer, shuffle=False, num_workers=0
)