# Creating the training corpus

This document downloads the most popular Catalan datasets from projecte-aina and builds a training corpus from them. The datasets are:
- Oscar +5GB
- Catalan_textual_corpus +10GB

Although these datasets are among the best options for training an LLM, they are still not sufficiently filtered and preprocessed, so after downloading the texts we
will try to build a cleaner dataset for training. This is crucial for small LLMs, as they need to be trained on a good dataset to perform well.

In [None]:
from datasets import load_dataset

# Load the dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True presumably allows the dataset repository's
# own loading script to run locally — confirm this is acceptable for this source.
dataset = load_dataset('projecte-aina/catalan_general_crawling', trust_remote_code=True)

# Take the 'text' column of the 'train' split and join all of its entries into a
# single space-separated string (the whole corpus is held in memory at once).
corpus_text = " ".join(dataset['train']['text'])

## Storing different data sizes from the full dataset

In [None]:
def limit_dataset_size(corpus_text, size_mb):
    """
    Return the longest prefix of a text that fits in a given UTF-8 size.

    Parameters:
    corpus_text (str): The text to truncate.
    size_mb (int | float): Maximum size in MB (1 MB = 1024 * 1024 bytes).
        Fractional values are accepted and rounded down to whole bytes.

    Returns:
    str: The prefix of corpus_text whose UTF-8 encoding is at most the
        requested size. A multi-byte character cut by the limit is dropped.

    Raises:
    ValueError: If size_mb is negative.
    """
    if size_mb < 0:
        # A negative slice bound would silently trim from the END of the text.
        raise ValueError(f"size_mb must be non-negative, got {size_mb}")

    max_bytes = int(size_mb * 1024 * 1024)  # Convert MB to whole bytes

    # Every character encodes to at least one byte, so the first max_bytes
    # bytes always lie within the first max_bytes characters. Slicing first
    # avoids encoding a multi-gigabyte corpus just to keep a small prefix.
    encoded_text = corpus_text[:max_bytes].encode('utf-8')

    # errors='ignore' discards the trailing partial character, if any.
    return encoded_text[:max_bytes].decode('utf-8', errors='ignore')

In [None]:
def saving_text_to_file(text, filename):
    """
    Write text to ../data/<filename>.txt, replacing any existing file.

    Parameters:
    text (str): The text to write.
    filename (str): File name without the ".txt" extension.
    """
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # Catalan characters, and the corpus files are read back as UTF-8.
    with open(f"../data/{filename}.txt", "w", encoding="utf-8") as f:
        f.write(text)


# Truncate the in-memory corpus to its first 10 MB and persist the result
# as the "tiny_corpus" text file
corpus_text = limit_dataset_size(corpus_text, size_mb=10)
saving_text_to_file(text=corpus_text, filename="tiny_corpus")

In [None]:
# Define the function to save dataset texts to a .txt file
def save_text_to_file(texts, file_path):
    """
    Write an iterable of texts to a file, one text per line.

    Parameters:
    texts (Iterable[str]): The texts to write; consumed in a single pass, so
        a generator works as well as a list.
    file_path (str): Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (Catalan text contains non-ASCII characters).
    with open(file_path, "w", encoding="utf-8") as f:
        # Newlines inside a text are replaced by spaces so that every text
        # occupies exactly one line of the output file.
        f.writelines(line.replace("\n", " ") + "\n" for line in texts)

In [None]:
# Load the original dataset
with open('../data/catalan_oscar.txt', 'r', encoding='utf-8') as f:
    corpus_text = f.read()  # the entire file is read into memory

# Limit the dataset size to 50 MB
limited_text = limit_dataset_size(corpus_text, 50)
saving_text_to_file(limited_text, "small_catalan_oscar")

In [None]:
# Load the OSCAR dataset for Catalan
oscar_dataset = load_dataset("oscar", "unshuffled_deduplicated_ca", split="train")

# Stream the texts lazily instead of building a list: the corpus is several GB
# and is only iterated once while being written to disk.
oscar_text = (example['text'] for example in oscar_dataset)

# Save under ../data/, which is where the cell that builds the small OSCAR
# corpus reads the file from ('../data/catalan_oscar.txt').
save_text_to_file(oscar_text, "../data/catalan_oscar.txt")

In [None]:
import requests
import bz2
import xml.etree.ElementTree as ET
from datasets import load_dataset

# Define the function to save dataset texts to a .txt file
def save_text_to_file(texts, file_path):
    """
    Write an iterable of texts to a file, one text per line.

    Parameters:
    texts (Iterable[str]): The texts to write; consumed in a single pass, so
        a generator works as well as a list.
    file_path (str): Destination path; an existing file is overwritten.
    """
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (Catalan text contains non-ASCII characters).
    with open(file_path, "w", encoding="utf-8") as f:
        # Newlines inside a text are replaced by spaces so that every text
        # occupies exactly one line of the output file.
        f.writelines(line.replace("\n", " ") + "\n" for line in texts)

# Function to download a Wikipedia dump (the file is saved as-is, not extracted)
def download_wikipedia_dump(url, output_file, chunk_size=1024 * 1024, timeout=60):
    """
    Download a file over HTTP and stream it to disk.

    Parameters:
    url (str): The URL of the dump to download.
    output_file (str): Path of the local file to write (overwritten if present).
    chunk_size (int): Number of bytes requested per streamed chunk.
    timeout (float | tuple): Timeout in seconds passed to requests.get.

    Raises:
    requests.HTTPError: If the server answers with an error status code.
    """
    # stream=True keeps the body out of memory; the dump is far too large to
    # hold in response.content all at once.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as if it were the dump.
        response.raise_for_status()
        with open(output_file, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)
    print(f"Downloaded Wikipedia dump to {output_file}")

def parse_wikipedia_dump(dump_file):
    """
    Lazily yield the content of every non-empty <text> element in a dump.

    Parameters:
    dump_file (str): Path to a bz2-compressed XML dump.

    Yields:
    str: The text of each <text> element, in document order. Elements with
        no content are skipped.
    """
    # Binary mode lets the XML parser handle decoding itself, instead of
    # depending on the platform's default text encoding.
    with bz2.open(dump_file, 'rb') as f:
        context = ET.iterparse(f, events=('start', 'end'))
        # The first event is the start of the root element; keep a handle on it
        # so already-processed children can be detached as parsing advances.
        _, root = next(context)
        for event, elem in context:
            if event != 'end':
                continue
            # Tags may carry an XML namespace as "{namespace}text"; compare the
            # local name exactly rather than by suffix.
            if elem.tag.rpartition('}')[2] == 'text' and elem.text is not None:
                yield elem.text
            # Release the element's content, then drop the references the root
            # still holds to finished elements so memory use stays bounded.
            elem.clear()
            root.clear()

# Download Wikipedia dump
wikipedia_dump_url = 'https://dumps.wikimedia.org/cawiki/latest/cawiki-latest-pages-articles.xml.bz2'
wikipedia_dump_file = 'cawiki-latest-pages-articles.xml.bz2'
download_wikipedia_dump(wikipedia_dump_url, wikipedia_dump_file)

# Parse Wikipedia dump
# The parser is a generator, so hand it straight to the writer: articles are
# written as they are parsed instead of all being collected in memory first.
wikipedia_texts = parse_wikipedia_dump(wikipedia_dump_file)
save_text_to_file(wikipedia_texts, "catalan_wikipedia.txt")

### Preprocess data by removing empty lines

In [2]:
# Name of the original file and the output file
input_filename = '../data/catalan_textual_corpus.txt'
output_filename = '../data/CatGPT_dataset.txt'

# Open the original file in read mode and the output file in write mode.
# Both are opened explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (the corpus contains non-ASCII Catalan text).
with open(input_filename, 'r', encoding='utf-8') as infile, open(output_filename, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # A line that is empty after stripping whitespace is dropped;
        # every other line is copied through unchanged.
        if line.strip():
            outfile.write(line)

print(f"Empty lines have been removed and the result has been saved in {output_filename}.")


Empty lines have been removed and the result has been saved in ../data/CatGPT_dataset.txt.


# PATUFET DATASET

After training on the previous datasets, the model was fine-tuned with the patufet-textbooks dataset. This dataset is a collection of synthetic Catalan texts generated with a much larger LLM, gemini-flash. It is organized into many different fields and topics, such as Mathematics, History, Geography, etc. Moreover, the content is written for different audiences, such as kids, the general public, or even researchers. The dataset is available in the Hugging Face datasets library and can be downloaded with the following code:

In [None]:
from datasets import load_dataset

# Load the patufet-textbooks dataset from Hugging Face.
# Note: this rebinds `dataset`, the same name the corpus-creation cell uses.
dataset = load_dataset("pauhidalgoo/patufet-textbooks")

# C4 CATALAN DATASET

This part of the script aims to create one of the best datasets for the Catalan language. The dataset will be created by translating the
realnewslike variant of the well-known C4 dataset into Catalan, resulting in around 25 GB of news-article text. The motivation
comes from the lack of Catalan datasets in the NLP community and the need for a good dataset to train models in this language.

The dataset is downloaded in batches of 1 GB (configurable through a parameter) in order to avoid memory issues during the translation and training process.

In [None]:
import datasets

class C4NewsBatchLoader:
    """
    Stream the 'realnewslike' configuration of allenai/c4 and write its texts
    to numbered files of approximately a chosen size.
    """

    def __init__(self, split='train'):
        """
        Initialize the C4NewsBatchLoader with a specific split.

        Parameters:
        split (str): The split of the dataset to load (e.g. 'train').
        """
        # streaming=True: examples are fetched while iterating rather than
        # the whole dataset being downloaded up front.
        self.dataset = datasets.load_dataset('allenai/c4', 'realnewslike', split=split, streaming=True)
        self.dataset_iter = iter(self.dataset)
        self.bytes_written = 0  # UTF-8 bytes buffered for the file being built
        self.file_count = 1  # Number used in the name of the next file

    def _save_to_file(self, text, file_count):
        """
        Save text to a file.

        Parameters:
        text (str): The text to save.
        file_count (int): The current file count for naming the file.
        """
        file_name = f"../data/CA_realnewslike{file_count}.txt"
        print(f"Saving to {file_name}")
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(text)

    def split_to_files(self, max_size_gb=1):
        """
        Split the dataset into files of approximately max_size_gb GB each.

        Every document is terminated by a newline so that consecutive articles
        are not fused together in the output files.

        Parameters:
        max_size_gb (int | float): The maximum size of each file in GB.

        Raises:
        ValueError: If max_size_gb is not positive.
        """
        if max_size_gb <= 0:
            # A non-positive limit would create one file per document.
            raise ValueError(f"max_size_gb must be positive, got {max_size_gb}")

        max_size_bytes = max_size_gb * 1024**3  # Convert GB to bytes
        current_text = []

        for example in self.dataset_iter:
            text = example['text']
            # Keep a boundary between documents; joining them with nothing in
            # between would glue the end of one article to the start of the next.
            if not text.endswith('\n'):
                text += '\n'
            current_text.append(text)
            self.bytes_written += len(text.encode('utf-8'))

            # The size check runs after appending, so a file may exceed the
            # limit by at most one document.
            if self.bytes_written >= max_size_bytes:
                self._save_to_file(''.join(current_text), self.file_count)
                self.file_count += 1
                current_text = []
                self.bytes_written = 0

        # Flush whatever is left once the dataset is exhausted.
        if current_text:
            self._save_to_file(''.join(current_text), self.file_count)

# Example usage: iterate over the 'train' split and write it out as numbered
# ../data/CA_realnewslike<N>.txt files of roughly 1 GB each
batch_loader = C4NewsBatchLoader(split='train')
batch_loader.split_to_files(max_size_gb=1)

## TRANSLATE DATA USING LOCAL TRANSFORMERS MODEL

In the end, it was not possible to translate the whole dataset with the local transformers model due to computational and economic limitations. The script is still provided because it works and can be used to translate smaller datasets. If you have a powerful machine, you can translate the whole dataset by raising the `batch_size` parameter. If you do so, please let me know how it went! If possible, it would be much better to translate with an LLM, which would give higher-quality results.

In [None]:
import nltk
import torch
from transformers import MarianMTModel, MarianTokenizer
from nltk.tokenize import sent_tokenize

# Download the NLTK data needed for sentence tokenization (used by sent_tokenize)
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

def translate_sentences(sentences, model, tokenizer):
    """
    Translate a batch of sentences with a seq2seq model.

    Parameters:
    sentences (list[str]): The sentences to translate.
    model: The translation model; called through model.generate.
    tokenizer: The tokenizer matching the model.

    Returns:
    list[str]: One translated string per input sentence; an empty list when
        no sentences are given.
    """
    if not sentences:
        # Nothing to tokenize or generate for an empty batch.
        return []

    # Inputs longer than 512 tokens are truncated, shorter ones padded to match.
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Put the inputs on whatever device the model lives on, rather than
    # assuming a GPU is present.
    device = model.device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    translated = model.generate(**inputs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

def translate_file(input_path, output_path, chunk_size=1024*1024, batch_size=32):  # 1MB chunk size, batch size 32
    """
    Translate a UTF-8 text file with the Helsinki-NLP/opus-mt-en-ca model.

    The input is read in chunks, split into sentences and translated in
    batches. The last sentence of each chunk is held back and prepended to the
    next chunk, so a sentence cut by a chunk boundary is translated whole.

    Parameters:
    input_path (str): Path of the text file to translate.
    output_path (str): Path of the file receiving the translation (overwritten).
    chunk_size (int): Number of characters read from the input per chunk
        (the file is opened in text mode, so this counts characters).
    batch_size (int): Number of sentences translated per model call.
    """
    model_name = "Helsinki-NLP/opus-mt-en-ca"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        pending = ""  # Trailing, possibly incomplete sentence from the previous chunk
        while True:
            text_chunk = infile.read(chunk_size)
            reached_eof = not text_chunk
            text = pending + text_chunk
            pending = ""

            sentences = sent_tokenize(text) if text.strip() else []
            if sentences and not reached_eof:
                # The final sentence may continue in the next chunk: hold it back.
                # Slice it out of the raw text so its trailing whitespace is kept
                # and it joins cleanly with the characters read next.
                last_sentence = sentences.pop()
                start = text.rfind(last_sentence)
                pending = text[start:] if start >= 0 else last_sentence

            translated_sentences = []
            for i in range(0, len(sentences), batch_size):
                batch = sentences[i:i + batch_size]
                translated_batch = translate_sentences(batch, model, tokenizer)
                translated_sentences.extend(translated_batch)

            # Skip the write when every sentence of this chunk was held back.
            if translated_sentences:
                translated_text = " ".join(translated_sentences)
                outfile.write(translated_text + "\n")
                print("Translated and wrote a chunk.")

            if reached_eof:
                break