## Train a Catalan-specific tokenizer using byte pair encoding

In [None]:
import os
from tokenizers import ByteLevelBPETokenizer

# Directory that receives the trained model files (vocab.json and merges.txt).
TOKENIZER_DIR = 'tokenizer'

# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Learn the merges from the reduced Catalan OSCAR sample. The special tokens
# are listed first so they are part of the vocabulary from the start.
tokenizer.train(
    files=['data/small_catalan_oscar.txt'],
    vocab_size=32768,
    min_frequency=2,
    special_tokens=[
        "<s>",     # Start of sequence token
        "<pad>",   # Padding token
        "</s>",    # End of sequence token
        "<unk>",   # Unknown token
        "<mask>",  # Mask token for masked language modeling
    ],
)

# save_model() writes into an existing directory and does not create it, so
# make sure the target exists; otherwise a fresh run fails with
# "No such file or directory".
os.makedirs(TOKENIZER_DIR, exist_ok=True)
tokenizer.save_model(TOKENIZER_DIR)

print("Tokenizer successfully trained and saved.")

## Load the trained tokenizer

In [10]:
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer

# Load the tokenizer from the directory the training cell saved it to
# (tokenizer.save_model('tokenizer') writes vocab.json and merges.txt there).
tokenizer = ByteLevelBPETokenizer(
    'tokenizer/vocab.json',
    'tokenizer/merges.txt'
)

# Encode a sample text and check that decoding round-trips it
sample_text = "Aquesta és una frase de prova."
encoded = tokenizer.encode(sample_text)
print(encoded.ids)
print(tokenizer.decode(encoded.ids))

# Encode the same text with the pretrained GPT-2 tokenizer for comparison
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT2Tokenizer.encode returns a plain list of token ids
gpt2_encoded = gpt2_tokenizer.encode(sample_text)
print(gpt2_encoded)
print(gpt2_tokenizer.decode(gpt2_encoded))

[6726, 373, 363, 6710, 264, 3361, 18]
Aquesta és una frase de prova.
[32, 6138, 64, 220, 20954, 555, 64, 1216, 589, 390, 899, 64, 13]
Aquesta és una frase de prova.


### Comparing the performance of the trained tokenizer and the GPT-2 tokenizer

In [20]:
# Compare how compactly each tokenizer represents the same sentence:
# character count of the raw text versus token count per tokenizer.
n_chars = len(sample_text)
n_custom_tokens = len(encoded.ids)
n_gpt2_tokens = len(gpt2_encoded)

print(f'Lenght of the sample text: {n_chars}')
print(f'Length of the encoded text with ByteLevelBPETokenizer: {n_custom_tokens}')
print(f'Length of the encoded text with GPT-2 tokenizer: {n_gpt2_tokens}')

Lenght of the sample text: 30
Length of the encoded text with ByteLevelBPETokenizer: 7
Length of the encoded text with GPT-2 tokenizer: 13
