# BEST CATALAN DATASET

This script aims to create the best dataset for the Catalan language. The dataset is created by translating the `realnewslike`
variant of the well-known C4 dataset into Catalan, resulting in around 25 GB of news-article text. The motivation is the lack of
Catalan datasets in the NLP community and the need for a good dataset to train models in this language.

The dataset is downloaded in batches of 1 GB (configurable via the `max_size_gb` parameter) in order to avoid memory issues during the translation and training process.

In [2]:
import datasets

class C4NewsBatchLoader:
    """
    Stream the 'realnewslike' configuration of allenai/c4 and dump its text
    into numbered plain-text files of bounded size.

    Attributes:
    dataset: The streaming dataset returned by `datasets.load_dataset`.
    dataset_iter: Iterator over `dataset`; consumed by `split_to_files`.
    bytes_written (int): UTF-8 bytes accumulated for the batch currently being built.
    file_count (int): Number used to name the next output file (starts at 1).
    """

    def __init__(self, split='train'):
        """
        Initialize the C4NewsBatchLoader with a specific split.

        Parameters:
        split (str): The split of the dataset to load (train, validation, test).
        """
        # streaming=True lets the examples be iterated without materialising the whole split first.
        self.dataset = datasets.load_dataset('allenai/c4', 'realnewslike', split=split, streaming=True)
        self.dataset_iter = iter(self.dataset)
        self.bytes_written = 0
        self.file_count = 1

    def _save_to_file(self, text, file_count):
        """
        Save text to a file, creating the output directory if it is missing.

        Parameters:
        text (str): The text to save.
        file_count (int): The current file count for naming the file.
        """
        import os  # local import keeps this class self-contained

        file_name = f"../data/CA_realnewslike{file_count}.txt"
        print(f"Saving to {file_name}")
        # Without this, the first save fails with FileNotFoundError when ../data
        # does not exist yet -- after a full batch has already been streamed.
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(text)

    def split_to_files(self, max_size_gb=1):
        """
        Split the dataset into files of approximately max_size_gb GB each.

        Documents are separated by a newline inside each file so that the end
        of one article is not glued to the start of the next. A whole batch is
        held in memory until it is written, so peak memory grows with
        max_size_gb.

        Parameters:
        max_size_gb (int or float): The maximum size of each file in GB. Must be positive.

        Raises:
        ValueError: If max_size_gb is not positive.
        """
        if max_size_gb <= 0:
            raise ValueError("max_size_gb must be positive")

        max_size_bytes = max_size_gb * 1024**3  # Convert GB to bytes
        current_text = []

        for example in self.dataset_iter:
            text = example['text']
            current_text.append(text)
            # +1 accounts for the newline separator written between documents.
            self.bytes_written += len(text.encode('utf-8')) + 1

            if self.bytes_written >= max_size_bytes:
                self._save_to_file('\n'.join(current_text), self.file_count)
                self.file_count += 1
                current_text = []
                self.bytes_written = 0

        # The iterator is exhausted: flush whatever is left as a final, smaller file.
        if current_text:
            self._save_to_file('\n'.join(current_text), self.file_count)

# Example usage: stream the 'train' split and write it out in files of roughly 1 GB each.
# NOTE: these statements run as soon as the cell/module executes and consume the whole split.
batch_loader = C4NewsBatchLoader(split='train')
batch_loader.split_to_files(max_size_gb=1)

## TRANSLATE DATA USING LOCAL TRANSFORMERS MODEL

In the end, it was not possible to translate the whole dataset using the local transformers model due to computational limitations. The script is kept here because it works and can be used to translate smaller datasets. If you have a powerful machine, it is possible to translate the whole dataset by changing the `batch_size` parameter to a higher value. If you do so, please let me know how it went!

In [None]:
import nltk
import torch
from transformers import MarianMTModel, MarianTokenizer
from nltk.tokenize import sent_tokenize

# Download the NLTK data needed for sentence tokenization.
# Older NLTK releases load the 'punkt' resource; recent releases (3.9 and later)
# load 'punkt_tab' for sent_tokenize instead, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

def translate_sentences(sentences, model, tokenizer):
    """
    Translate a batch of sentences with a sequence-to-sequence model.

    Parameters:
    sentences (list of str): The sentences to translate. May be empty.
    model: A model exposing `device` and `generate(**inputs)` (e.g. a MarianMTModel).
    tokenizer: The tokenizer matching `model`; called on the batch and used to decode the output.

    Returns:
    list of str: The decoded translations (empty when `sentences` is empty).
    """
    # Nothing to translate: skip the tokenizer/model call entirely.
    if not sentences:
        return []

    # Inputs longer than 512 tokens are truncated rather than rejected.
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Follow the model's own device instead of assuming CUDA, so the same
    # function works whether the caller placed the model on GPU or CPU.
    device = model.device
    inputs = {key: value.to(device) for key, value in inputs.items()}
    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

def translate_file(input_path, output_path, chunk_size=1024*1024, batch_size=32):  # 1MB chunk size, batch size 32
    """
    Translate an English text file to Catalan with the Helsinki-NLP/opus-mt-en-ca model.

    The input is read in chunks of `chunk_size` characters, split into
    sentences, translated `batch_size` sentences at a time, and written to
    `output_path` as one line of space-joined translations per processed chunk.

    Parameters:
    input_path (str): Path of the UTF-8 text file to translate.
    output_path (str): Path of the file to write; overwritten if it exists.
    chunk_size (int): Number of characters read from the input per iteration.
    batch_size (int): Number of sentences sent to the model per call.
    """
    model_name = "Helsinki-NLP/opus-mt-en-ca"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    # Use the GPU when there is one, but do not crash on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    # Text held back from the previous chunk: a fixed-size read almost always
    # ends mid-sentence, so the trailing sentence is completed by the next read
    # instead of being translated as two unrelated fragments.
    carry = ""

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        while True:
            text_chunk = infile.read(chunk_size)
            at_eof = not text_chunk
            buffer = carry + text_chunk
            carry = ""

            sentences = sent_tokenize(buffer) if buffer else []
            if sentences and not at_eof:
                # Hold back the last (possibly incomplete) sentence. Slice it from
                # the raw buffer so any trailing whitespace is preserved and
                # words are not merged when the next chunk is appended.
                last = sentences.pop()
                cut = buffer.rfind(last)
                carry = buffer[cut:] if cut >= 0 else last

            if sentences:
                translated_sentences = []
                for i in range(0, len(sentences), batch_size):
                    batch = sentences[i:i + batch_size]
                    translated_batch = translate_sentences(batch, model, tokenizer)
                    translated_sentences.extend(translated_batch)

                translated_text = " ".join(translated_sentences)
                outfile.write(translated_text + "\n")
                print("Translated and wrote a chunk.")

            if at_eof:
                break