# Preliminary dataset analysis

In [49]:
!pip install datasets




In [50]:
from datasets import load_dataset

# Load the "en-fr" (English-French) configuration of the opus_books dataset
dataset = load_dataset("opus_books", "en-fr")



In [51]:
# Shape of every split, reported as (rows, columns)
dataset.shape

{'train': (127085, 2)}

In [53]:
# Peek at the training split: its (rows, columns) shape, then the first six records.
train_preview = dataset["train"]
print(train_preview.shape)
print(train_preview[:6])

(127085, 2)
{'id': ['0', '1', '2', '3', '4', '5'], 'translation': [{'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}, {'en': 'Alain-Fournier', 'fr': 'Alain-Fournier'}, {'en': 'First Part', 'fr': 'PREMIÈRE PARTIE'}, {'en': 'I', 'fr': 'CHAPITRE PREMIER'}, {'en': 'THE BOARDER', 'fr': 'LE PENSIONNAIRE'}, {'en': 'He arrived at our home on a Sunday of November, 189-.', 'fr': 'Il arriva chez nous un dimanche de novembre 189-…'}]}


In [7]:
# Sentence Length Analysis
# "Length" is the number of whitespace-separated words. Both languages are
# measured in a single pass over the training split, instead of iterating the
# whole split once per language.
english_lengths = []
french_lengths = []
for example in dataset["train"]:
    pair = example['translation']
    english_lengths.append(len(pair['en'].split()))
    french_lengths.append(len(pair['fr'].split()))

print(f"Average English sentence length: {sum(english_lengths) / len(english_lengths)}")
print(f"Average French sentence length: {sum(french_lengths) / len(french_lengths)}")
print(f"Maximum English sentence length: {max(english_lengths)}")
print(f"Maximum French sentence length: {max(french_lengths)}")




Average English sentence length: 21.364212928355037
Average French sentence length: 20.79159617578786
Maximum English sentence length: 372
Maximum French sentence length: 324


In [8]:
# Unique Tokens (basic count for now)
# A "token" here is any whitespace-separated string, so punctuation stays
# attached to the neighbouring word and case is preserved.
english_tokens = set()
french_tokens = set()

for example in dataset["train"]:
    pair = example['translation']
    english_tokens |= set(pair['en'].split())
    french_tokens |= set(pair['fr'].split())

print(f"Unique English tokens: {len(english_tokens)}")
print(f"Unique French tokens: {len(french_tokens)}")

Unique English tokens: 146031
Unique French tokens: 169205


In [40]:
# Find the translation pair whose English side has the most
# whitespace-separated words, then show that word count and the text itself.
train_pairs = dataset['train']['translation']
max_length_en_item = max(train_pairs, key=lambda pair: len(pair['en'].split()))
longest_en_text = max_length_en_item['en']
print(len(longest_en_text.split()))
print(longest_en_text)


372
Upon the whole, I was by this time so fixed upon my design of going over with him to the continent that I told him we would go and make one as big as that, and he should go home in it. He answered not one word, but looked very grave and sad. I asked him what was the matter with him. He asked me again, “Why you angry mad with Friday?—what me done?” I asked him what he meant. I told him I was not angry with him at all. “No angry!” says he, repeating the words several times; “why send Friday home away to my nation?” “Why,” says I, “Friday, did not you say you wished you were there?” “Yes, yes,” says he, “wish we both there; no wish Friday there, no master there.” In a word, he would not think of going there without me. “I go there, Friday?” says I; “what shall I do there?” He turned very quick upon me at this. “You do great deal much good,” says he; “you teach wild mans be good, sober, tame mans; you tell them know God, pray God, and live new life.” “Alas, Friday!” says I, “thou knowe

# Build Vocabulary

In [41]:
!pip install tokenizers




In [42]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

def get_all_sentences(ds, lang):
    """Lazily yield the `lang` text of every entry in ``ds['translation']``.

    Args:
        ds: Object indexable by ``'translation'``, giving an iterable of
            per-example dicts keyed by language code.
        lang: Key to read from each example (the notebook passes 'en' or 'fr').

    Yields:
        ``item[lang]`` for each item, in the original order.
    """
    yield from (pair[lang] for pair in ds['translation'])


# Initialize tokenizer: word-level model in which unknown words map to "[UNK]"
tokenizer_en = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer_en.pre_tokenizer = Whitespace()

# From this point on `dataset` is the training split itself, not the dict of
# splits. The isinstance guard keeps the cell safe to re-run: once `dataset` is
# already the split, indexing it with "train" a second time would fail.
if isinstance(dataset, dict):
    dataset = dataset["train"]

# Train tokenizer
# The special tokens receive the first IDs ([UNK]=0, [PAD]=1, [SOS]=2, [EOS]=3);
# min_frequency=2 is meant to drop words seen only once.
# NOTE(review): no vocab_size is passed, so the trainer's default cap applies
# (the recorded vocabulary size is 30000) -- confirm that cap is intended.
trainer_en = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)
tokenizer_en.train_from_iterator(get_all_sentences(dataset, 'en'), trainer=trainer_en)





In [43]:
# French tokenizer: same word-level model and whitespace pre-tokenizer as the
# English one, trained on the 'fr' side of the corpus.
tokenizer_fr = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer_fr.pre_tokenizer = Whitespace()

# Same special tokens and minimum word frequency as the English trainer.
special_tokens_fr = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
trainer_fr = WordLevelTrainer(special_tokens=special_tokens_fr, min_frequency=2)
french_sentences = get_all_sentences(dataset, 'fr')
tokenizer_fr.train_from_iterator(french_sentences, trainer=trainer_fr)

In [47]:
# Vocabulary size of each trained tokenizer: English first, then French.
for trained_tokenizer in (tokenizer_en, tokenizer_fr):
    print(trained_tokenizer.get_vocab_size())

30000
30000


# Tokenize Dataset

In [17]:
def tokenize_data(dataset, tokenizer_en, tokenizer_fr):
    """Encode every translation pair into English and French token-ID lists.

    Args:
        dataset: Object indexable by ``'translation'``, giving an iterable of
            dicts with 'en' and 'fr' text entries.
        tokenizer_en: Tokenizer whose ``encode(text)`` result exposes ``.ids``;
            applied to the 'en' text.
        tokenizer_fr: Same interface, applied to the 'fr' text.

    Returns:
        A ``(english_ids, french_ids)`` tuple of parallel lists, one ID
        sequence per example, in dataset order.
    """
    # Encode both sides of each pair in one pass so the two lists stay aligned.
    encoded_pairs = [
        (tokenizer_en.encode(pair['en']).ids, tokenizer_fr.encode(pair['fr']).ids)
        for pair in dataset['translation']
    ]
    english_ids = [en_ids for en_ids, _ in encoded_pairs]
    french_ids = [fr_ids for _, fr_ids in encoded_pairs]
    return english_ids, french_ids


tokenized_data_en,  tokenized_data_fr = tokenize_data(dataset, tokenizer_en, tokenizer_fr)

In [18]:
# Token IDs of the first five sentences, English then French.
# ID 0 is "[UNK]", i.e. a word that is missing from the trained vocabulary.
for token_id_lists in (tokenized_data_en, tokenized_data_fr):
    print(token_id_lists[:5])

[[46, 0], [0, 31, 0], [2293, 9371], [11], [904, 0]]
[[82, 157, 774], [0, 14, 0], [29730, 14265], [1033, 17335], [2150, 0]]


In [27]:
# Longest English sequence after tokenization (the first one wins on ties);
# print its token count followed by the IDs themselves.
max_length_item = max(tokenized_data_en, key=lambda token_ids: len(token_ids))
longest_token_count = len(max_length_item)
print(longest_token_count)
print(max_length_item)


471
[2363, 5, 306, 4, 11, 14, 42, 43, 98, 55, 882, 97, 32, 2388, 7, 223, 122, 22, 38, 8, 5, 3536, 15, 11, 248, 38, 59, 54, 138, 9, 163, 56, 26, 763, 26, 15, 4, 9, 17, 117, 138, 454, 12, 18, 6, 66, 478, 27, 56, 307, 4, 45, 243, 84, 1284, 9, 1252, 6, 11, 222, 38, 81, 14, 5, 488, 22, 38, 6, 66, 222, 34, 131, 4, 836, 424, 23, 1444, 1169, 22, 1308, 19423, 81, 34, 312, 4496, 11, 222, 38, 81, 17, 1372, 6, 11, 248, 38, 11, 14, 27, 1444, 22, 38, 29, 50, 6, 836, 226, 1444, 6954, 465, 17, 4, 2650, 5, 326, 530, 484, 16, 836, 527, 953, 1308, 454, 171, 8, 32, 3677, 4496, 836, 424, 2483, 465, 11, 4, 836, 1308, 4, 101, 27, 23, 155, 23, 657, 23, 48, 70, 4496, 836, 247, 4, 813, 2483, 465, 17, 4, 836, 487, 59, 361, 70, 16, 64, 487, 1308, 70, 4, 64, 379, 70, 2411, 169, 10, 307, 4, 17, 54, 27, 202, 7, 223, 70, 127, 34, 6, 836, 11, 138, 70, 4, 1308, 4496, 465, 11, 16, 836, 81, 175, 11, 100, 70, 4496, 66, 334, 84, 1696, 97, 34, 29, 43, 6, 836, 128, 100, 141, 860, 142, 134, 2483, 465, 17, 16, 836, 23, 3397, 8

# Padding

In [19]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Shape of the inputs, for reference (first five sentences):
# tokenized_data_en = [[46, 0], [0, 31, 0], [2293, 9371], [11], [904, 0]]
# tokenized_data_fr = [[82, 157, 774], [0, 14, 0], [29730, 14265], [1033, 17335], [2150, 0]]

# Convert to PyTorch tensors. The dtype is pinned to long so that an empty
# token list (which torch.tensor would otherwise infer as float32) still
# produces an integer ID tensor.
tokenized_tensors_en = [torch.tensor(seq, dtype=torch.long) for seq in tokenized_data_en]
tokenized_tensors_fr = [torch.tensor(seq, dtype=torch.long) for seq in tokenized_data_fr]

# Pad sequences with each tokenizer's own "[PAD]" ID instead of a hard-coded 1,
# so the padding value cannot drift from the vocabulary. ID 0 must not be used
# here: it is "[UNK]".
pad_id_en = tokenizer_en.token_to_id("[PAD]")
pad_id_fr = tokenizer_fr.token_to_id("[PAD]")

# NOTE: this pads every sentence to the longest sequence in the whole dataset;
# padding per batch would produce much smaller tensors.
padded_en = pad_sequence(tokenized_tensors_en, batch_first=True, padding_value=pad_id_en)
padded_fr = pad_sequence(tokenized_tensors_fr, batch_first=True, padding_value=pad_id_fr)



In [24]:
# First five padded English rows, then the padded sequence length (row width).
first_padded_rows = padded_en[:5]
print(first_padded_rows)
print(padded_en.shape[1])

tensor([[  46,    0,    1,  ...,    1,    1,    1],
        [   0,   31,    0,  ...,    1,    1,    1],
        [2293, 9371,    1,  ...,    1,    1,    1],
        [  11,    1,    1,  ...,    1,    1,    1],
        [ 904,    0,    1,  ...,    1,    1,    1]])
471


# Trouble-shooting vocabulary issues

In [25]:
# Hand-written copy of the first five training records, in the same
# {'id': [...], 'translation': [...]} layout the dataset returns when sliced.
dataset_sample = {
    'id': ['0', '1', '2', '3', '4'],
    'translation': [
        {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'},
        {'en': 'Alain-Fournier', 'fr': 'Alain-Fournier'},
        {'en': 'First Part', 'fr': 'PREMIÈRE PARTIE'},
        {'en': 'I', 'fr': 'CHAPITRE PREMIER'},
        {'en': 'THE BOARDER', 'fr': 'LE PENSIONNAIRE'}
    ]
}


In [55]:
# Look up the ID the English tokenizer assigned to each special token.
for special_token in ("[UNK]", "[PAD]", "[SOS]", "[EOS]"):
    print(f"ID for {special_token}:", tokenizer_en.token_to_id(special_token))



ID for [UNK]: 0
ID for [PAD]: 1
ID for [SOS]: 2
ID for [EOS]: 3


In [44]:
# Round trip: encode a phrase made of common words, then decode the IDs back
# to text to confirm the vocabulary mapping is consistent.
encoded = tokenizer_en.encode("whom I used to call")
sample_ids = encoded.ids
print(sample_ids)
print(tokenizer_en.decode(sample_ids))


[264, 11, 584, 8, 608]
whom I used to call


In [23]:
# Check whether the word 'Wanderer' made it into the English vocabulary.
vocab_en = tokenizer_en.get_vocab()
wanderer_known = 'Wanderer' in vocab_en
print("Is 'Wanderer' in the vocabulary?", wanderer_known)


Is 'Wanderer' in the vocabulary? False


In [45]:
# Peek at ten entries of the English token -> ID mapping and its total size.
# The dict is not ordered by ID, so these are arbitrary entries, not the most
# frequent words.
# NOTE(review): the recorded output lists French-looking entries (e.g.
# 'prudemment', 'Malgré') in the English vocabulary -- worth checking whether
# the 'en' side of the corpus contains untranslated French text.
vocab = tokenizer_en.get_vocab()

first_few = dict(list(vocab.items())[:10])
print(first_few)
print(len(vocab))



{'prudemment': 28107, 'exasperate': 18383, 'occupés': 20843, 'rude': 4108, 'Chékina': 11446, 'Japanese': 5934, 'snored': 21237, 'Valley': 22064, 'Beauty': 29543, 'Malgré': 9705}
30000


In [47]:
# Peek at ten entries of the French token -> ID mapping and its total size.
# The dict is not ordered by ID, so these are arbitrary entries.
vocab = tokenizer_fr.get_vocab()

first_few = dict(list(vocab.items())[:10])
print(first_few)
print(len(vocab))

{'vitre': 3120, 'vouée': 29051, 'considéré': 7364, 'idole': 16665, 'humiliante': 19522, 'jugeant': 16709, '_Il': 29925, 'affreusement': 9930, 'interroge': 18013, 'quais': 7881}
30000


In [39]:
# Count the English sentences that contain the substring 'Wanderer'.
# `dataset` is rebound to the training split when the English tokenizer is
# trained, so indexing it with "train" here fails on a top-to-bottom run.
# Accept either form: the dict of splits or the split itself.
train_split = dataset["train"] if isinstance(dataset, dict) else dataset
wanderer_count = sum(1 for sentence in get_all_sentences(train_split, 'en') if 'Wanderer' in sentence)
# NOTE(review): this counts sentences by substring match, not token
# occurrences. If the word really occurs only twice, its absence from the
# vocabulary is presumably due to the vocabulary size cap -- verify.
print(f"'Wanderer' appears {wanderer_count} times.")


'Wanderer' appears 2 times.


# Glossary

## Hugging Face offers two primary libraries for tokenization

Hugging Face offers two primary libraries for tokenization, and it can be a bit confusing. Here's a breakdown:

### 1. `tokenizers` Library:
- **Purpose**: The `tokenizers` library is designed to be a standalone, fast, and efficient tokenization library. It's written in Rust with Python bindings to ensure performance, making it much faster than pure Python tokenization methods.
- **Capabilities**: It supports a wide variety of tokenization methods, such as Byte-Pair Encoding (BPE), WordPiece, Unigram, and more.
- **Flexibility**: This library provides a lot of flexibility to customize the tokenization process, such as adding special tokens, controlling tokenization at the character or word level, and building custom pre- and post-tokenization pipelines.
- **Training**: The library allows you to train your tokenizers from scratch on custom datasets.

### 2. `transformers` Library:
- **Purpose**: The `transformers` library is primarily designed to provide pre-trained models and their associated tokenizers for various NLP tasks. While it does offer tokenization capabilities, it's deeply integrated with the models it provides.
- **Capabilities**: It wraps around the tokenizers from the `tokenizers` library and provides easy-to-use methods that are tied to specific pre-trained models.
- **Ease of Use**: If you're using a model from the `transformers` library, it's often easier to use the associated tokenizer from the same library because they're designed to work together seamlessly.
- **Pre-trained Tokenizers**: The library provides pre-trained tokenizers for many popular models, ensuring that the tokenization process aligns perfectly with the pre-trained model you're using.

### Why Both?
You might wonder why Hugging Face provides two libraries that seemingly overlap in functionality. The reason is modularity and flexibility. The `tokenizers` library is designed for speed and versatility in tokenization without any assumptions about the downstream model. In contrast, the `transformers` library provides an end-to-end solution for using pre-trained models, including tokenization that's tailored for each model.

### When to Use Which?
- If you're building a custom tokenization pipeline or need high-speed tokenization without necessarily using a pre-trained model from the `transformers` library, you might prefer the `tokenizers` library.
- If you're using a pre-trained model from the `transformers` library (like BERT, GPT-2, T5, etc.), it's generally easier and more straightforward to use the associated tokenizer from the `transformers` library.

In essence, while there's overlap, the two libraries serve slightly different needs in the NLP ecosystem.

## Hugging Face's Transformers tokenizers

Hugging Face's Transformers library provides a variety of tokenizers suitable for different models and use-cases. Here are some of its tokenizer classes (most of them are subword-level rather than word-level):

1. **BertTokenizer**: Uses WordPiece tokenization. As seen in the example, it can break words into smaller subwords prefixed by `##`.

2. **RobertaTokenizer**: Uses Byte-Pair Encoding (BPE) similar to GPT-2. It's another form of subword tokenization.

3. **XLNetTokenizer**: Uses SentencePiece tokenization, which can also produce subwords.

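4. **OpenAIGPTTokenizer**: Tokenizer for the original OpenAI GPT model; uses Byte-Pair Encoding (BPE). (Transformers has no `OpenAIApiTokenizer` class; GPT-3 and later OpenAI API models are tokenized with OpenAI's separate `tiktoken` library.)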

5. **GPT2Tokenizer**: Uses Byte-Pair Encoding (BPE).

6. **T5Tokenizer**: Tokenizer for the T5 models. It uses SentencePiece.

7. **LongformerTokenizer**: Tokenizer for the Longformer model. It uses the same byte-level Byte-Pair Encoding (BPE) as the RobertaTokenizer.

8. **DistilBertTokenizer**: Tokenizer for the DistilBERT model, which is a distilled version of BERT. Uses WordPiece tokenization.

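9. **WhitespaceTokenizer**: Splits text on whitespace only. This is one of the simplest tokenization methods. Note that `transformers` does not ship a class by this name; the equivalent building block is the `WhitespaceSplit` pre-tokenizer in the `tokenizers` library.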

10. **BasicTokenizer**: A very basic tokenizer that splits the text on whitespaces and punctuation.

Out of these, the **WhitespaceTokenizer** and **BasicTokenizer** are the simplest. The **WhitespaceTokenizer** only splits on whitespace, making no other modifications, while the **BasicTokenizer** additionally splits on punctuation and can handle case and Unicode normalization.

For most tasks, especially when starting, the **BasicTokenizer** provides a simple yet effective tokenization method. However, for advanced models like BERT and its variants, their respective tokenizers (like BertTokenizer) are recommended because they align with the pre-training data and techniques used for those models.

## Hugging Face's `tokenizers` library

Hugging Face's `tokenizers` library is a powerful and efficient tool for tokenizing text, which is a fundamental step in any NLP pipeline. Here's a high-level overview:

### 1. **Purpose**:
- The library is designed to provide an extremely fast and efficient tokenization, leveraging Rust's performance with Python bindings.

### 2. **Tokenization Techniques**:
- Supports a variety of tokenization methods:
  - **Byte-Pair Encoding (BPE)**: A data compression algorithm that's been adapted for word segmentation in NLP tasks.
  - **WordPiece**: A tokenization method often used with models like BERT.
  - **Unigram**: A probabilistic subword tokenization method, most commonly used through SentencePiece (for example by T5 and XLNet).
  - **SentencePiece**: While not directly a method, the library supports models trained with Google's SentencePiece.
  
### 3. **Key Features**:

- **Custom Tokenization**: Beyond the standard methods, users can define custom tokenization processes.
  
- **Training from Scratch**: Users can train their tokenizers on custom datasets, allowing creation of domain-specific tokenizers.

- **Pre-tokenizers & Post-processors**: These allow users to customize the tokenization process. For instance, a pre-tokenizer might split text into words or sentences before the main tokenization, and a post-processor might add special tokens (e.g., [CLS], [SEP]).

- **Normalization**: The library supports various text normalization techniques, such as lowercasing, stripping accents, and more.

- **Fast**: Being implemented in Rust, the library offers performance that's much faster than pure Python tokenization methods.

### 4. **Decoding**:
- Converts token IDs back to human-readable text, which is crucial for tasks like text generation.

### 5. **Handles Special Tokens**:
- Provides an easy way to deal with special tokens, like padding, start-of-sentence, and end-of-sentence.

### 6. **Batching**:
- Efficiently tokenizes batches of text, which is essential for processing large datasets or for real-time applications.

### 7. **Files and Serialization**:
- Tokenizers can be saved to and loaded from files, ensuring consistent tokenization across different stages and applications.

### 8. **Alignment with Original Text**:
- The library can keep track of alignments between tokens and their corresponding parts in the original text, which is useful for tasks like Named Entity Recognition (NER) where you need to map model predictions back to the original text.

### 9. **Integration with `transformers` Library**:
- Although the `tokenizers` library can be used standalone, it's also designed to work seamlessly with Hugging Face's `transformers` library. Many pre-trained models in `transformers` use tokenizers built with this library.

### Conclusion:
The `tokenizers` library provides a versatile and efficient solution for one of the foundational steps in NLP, ensuring that both researchers and practitioners can have fast and consistent tokenization across various tasks and models.

## "Training" the tokenizer

"Training" the tokenizer refers to the process of learning the most optimal way to split text into smaller units (tokens) based on a given dataset. This concept can be a bit confusing because, in many contexts, "training" is typically associated with supervised learning models. However, for tokenizers, "training" is more about data-driven statistical analysis.

Here's a deeper look:

### 1. **Why Train a Tokenizer?**

Tokenization is not always as straightforward as splitting text on spaces or punctuation. Different tasks or languages might require different tokenization strategies. By training a tokenizer, you're essentially letting it learn the most common patterns in your dataset, which can help in breaking down the text in the most meaningful and consistent way.

### 2. **How Does It Work?**

For methods like Byte-Pair Encoding (BPE) or WordPiece, the training process typically involves:

- **Initialization**: Start with a base vocabulary of individual characters (or bytes), so that every word is initially represented as a sequence of single-character tokens.
  
- **Iterative Merging**: Repeatedly merge the most common pair of consecutive tokens. For instance, if "h" and "i" appear next to each other more frequently than any other pair of characters or tokens, merge them to create the token "hi".

- **Termination**: Stop when you've reached a predefined number of merges or when merges no longer improve the tokenization quality based on some criterion.

### 3. **Vocabulary Building**:

While training, the tokenizer also builds its vocabulary, which is a list of tokens that the tokenizer knows about. This vocabulary is then used to tokenize new unseen texts. Tokens in the unseen texts that are not in the vocabulary are further split until they match the known tokens or are represented as special tokens (like [UNK] for "unknown").

### 4. **Special Tokens and Rules**:

During training, the tokenizer can also learn or be provided with special rules or tokens. For instance, certain tokenization methods might decide to treat numbers or dates in a specific way.

### 5. **Benefits**:

- **Adaptability**: Training allows the tokenizer to adapt to the specifics of the language or domain in the dataset. For instance, a tokenizer trained on medical texts might tokenize medical jargon differently than one trained on general news articles.
  
- **Optimized Vocabulary**: By training on a large corpus, the tokenizer can build a vocabulary optimized for that corpus, leading to more efficient token representations.

In summary, training a tokenizer is about letting it learn the best way to split texts from a specific dataset, ensuring that the resulting tokens are meaningful, consistent, and optimized for downstream tasks.

## `tokenizer.tokenize()`

The `tokenizer.tokenize()` method is part of Hugging Face's tokenization utilities. It's responsible for breaking down a given text into smaller pieces, typically words or subwords, depending on the tokenizer.

In the context of the `BertTokenizer` that we've been using, it splits text into words or subwords that the BERT model was trained on. BERT uses WordPiece tokenization, which means it can split words into smaller pieces if those words are not in its vocabulary. This allows BERT to handle a wide variety of out-of-vocabulary words.

Here's a simple example:

```python
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize a sample sentence
sentence = "ChatGPT is part of OpenAI's family of models."
tokens = tokenizer.tokenize(sentence)

print(tokens)
```

In the above example, the tokenizer will break down the sentence into individual words or subwords. With `bert-base-uncased`, "OpenAI's" is lowercased and split into "open", "##ai", "'", and "s", because "OpenAI's" is not a single token in BERT's vocabulary but its constituent parts are. The `##` prefix marks a piece that continues the preceding piece of the same word.

This approach helps the model generalize better to unseen or rare words in real-world data by representing them as a combination of seen subwords.

## Build the vocabulary using the tokenized sentences
Now, we'll build the vocabulary using the tokenized sentences. We'll focus on words that appear more than once to filter out potential typos or very rare words. We'll also limit the vocabulary size for efficiency.

### Rationale:

- **Word-level Tokenization**: At this stage, we're breaking down sentences into individual words, which will serve as the basic units for our model.
  
- **Building Vocabulary**: The vocabulary represents the set of words our model will recognize. Words not in the vocabulary will be treated as unknown (`[UNK]`). By focusing on more common words (those appearing more than once) and setting a limit on vocabulary size, we aim to strike a balance between covering a broad range of words and computational efficiency.

- **Special Tokens**: These tokens serve specific purposes in the tokenized sequences:
  - `[PAD]`: Used for padding shorter sequences to a fixed length.
  - `[SOS]` and `[EOS]`: Indicate the beginning (start of sequence) and end (end of sequence) of sequences.
  - `[UNK]`: Represents any token that is not in the vocabulary.

The steps outlined above should help in processing the data for use in the Transformer model. After tokenization and vocabulary creation, the next steps would involve converting words to their corresponding integer IDs from the vocabulary, padding sequences to a fixed length, and creating the necessary masks. But first, let's ensure the tokenization and vocabulary-building steps work correctly in your Colab environment.

In [25]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that belongs to the uncased base BERT checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Split one sentence into BERT word pieces; pieces that continue a word carry
# a "##" prefix in the printed result.
sentence = "ChatGPT is part of OpenAI's family of models."
tokens = tokenizer.tokenize(sentence)
print(tokens)

['chat', '##gp', '##t', 'is', 'part', 'of', 'open', '##ai', "'", 's', 'family', 'of', 'models', '.']


## Size of the vocabulary and computation

The size of the vocabulary directly influences various computational aspects of a Transformer-based model, especially in the context of machine translation:

1. **Embedding Layer**: Each word in the vocabulary has a corresponding embedding vector. A larger vocabulary means the embedding matrix (where each row is a word's embedding) will be larger. This increases the number of parameters and the memory required.

2. **Output Layer**: In the case of machine translation, the output layer (or the projection layer) will have a size equivalent to the target vocabulary size. The logits for each word in the vocabulary are computed, and then a softmax is applied to get the probabilities. A larger vocabulary increases the computation required for this step.

3. **Softmax Computation**: The softmax operation in the output layer, which normalizes logits into probabilities, is more computationally intensive with a larger vocabulary.

4. **Training Time**: A larger vocabulary means more parameters to train (in the embedding and output layers), which can lead to longer training times.

5. **Memory Footprint**: The larger embedding and output matrices require more memory, both in terms of model storage and during runtime (when matrices are loaded into memory/GPU).

6. **Generalization**: A model with more parameters (due to a larger vocabulary) has a higher capacity. While this might allow it to fit the training data better, there's also a risk of overfitting, especially if the dataset isn't large enough.

In summary, while the embedding and output layers are directly affected by vocabulary size, the repercussions are felt throughout the training and inference processes in terms of computational time, memory usage, and potential model generalization. However, it's also essential to strike a balance. A vocabulary that's too small can hinder the model's ability to generalize and perform well on unseen data, especially in tasks like translation where capturing nuances is crucial.

## Limiting the vocabulary size

Limiting the vocabulary size to a certain number, such as 30,000, is a common practice in NLP, especially in machine translation tasks. This is done for several reasons:

1. **Memory and Computational Efficiency**: Large vocabularies increase the size of embedding matrices, which directly impacts the memory requirements and computational cost. By limiting the vocabulary size, models become more manageable in terms of memory and speed.

2. **Out-of-Vocabulary Handling**: No matter how large the vocabulary, there will always be words that are not included. Having a fixed-size vocabulary means there's a systematic way to handle out-of-vocabulary (OOV) words, often by using a special token like `[UNK]` (unknown).

3. **Rare Words**: In any language, there are words that appear very infrequently. These rare words can be problematic for training because the model doesn't see them enough to learn their meaning reliably. By limiting the vocabulary size, many of these rare words are excluded, and the model focuses on more common and relevant words.

4. **Regularization**: Limiting the vocabulary can act as a form of regularization, preventing the model from fitting too closely to the training data and potentially improving generalization to new, unseen data.

5. **Consistency with Pretrained Models**: Many pretrained models, like those from BERT or GPT families, use a fixed vocabulary size (often around 30,000 to 50,000). When fine-tuning or adapting these models to specific tasks, it's beneficial to maintain consistency with their vocabulary size.

6. **Subword Tokenization**: Modern tokenizers, like SentencePiece or the BPE (Byte-Pair Encoding) algorithm, split words into smaller units (subwords or even characters). This allows the model to handle OOV words by representing them as a sequence of known subwords. With subword tokenization, even a seemingly limited vocabulary can cover a vast majority of the language's words and phrases.

Given the benefits, it's not surprising that the tokenizers you're examining have a vocabulary size of 30,000. However, the specific size is a hyperparameter and can be adjusted based on the needs of the specific task and the amount of available data.

## Tokenizing the dataset.

### Goal of Tokenizing the Dataset:

The primary goal of tokenizing is to convert human-readable text (sentences) into a format that a machine learning model can understand, i.e., sequences of numbers. Each word or sub-word in the text gets mapped to a unique number (token ID) based on the vocabulary of the tokenizer.

### Desired Output:

For your dataset, after tokenization, we want to transform each English and French sentence into its corresponding sequence of token IDs.

For instance, if the English sentence is "The Wanderer" and after tokenization it becomes `[46, 763]`, and the French sentence "Le grand Meaulnes" becomes `[35, 89, 123]`, our tokenized dataset should reflect these transformations.

### Shape and Structure:

Given your dataset structure, the tokenized dataset might look something like this:

```python
{
    'en': [
        [46, 763],
        [24, 55],
        ...
    ],
    'fr': [
        [35, 89, 123],
        [35, 55],
        ...
    ]
}
```

Each list inside 'en' or 'fr' represents a tokenized sentence. So, the shape is essentially two lists of lists, where the outer list's length is the number of sentences in the dataset, and the inner list's length varies based on the sentence length.

### Post-tokenization:

Once the data is tokenized, most of the operations (like padding, batching, and passing data through the model) will be performed on the tokenized data. The raw text data won't be used for training the model. However, you will occasionally need the original or the detokenized data for tasks like evaluations, for instance, when calculating BLEU scores for translations, or when you want to convert model predictions back into human-readable format.

### Steps:

1. Use the trained tokenizers to convert each English and French sentence in the dataset to their corresponding token IDs.
2. Store these tokenized sequences in a structured format (like the one shown above).

## SOS and EOS tokens

You'll typically add both SOS and EOS tokens to both the source and target sequences, but for slightly different reasons:

1. **Source Sequence (English in your case)**:
    - **SOS**: Helps the model know where the sequence starts, especially if you're batching multiple sequences together.
    - **EOS**: Signals the end of the sequence. This is particularly useful when you're working with attention mechanisms, as it allows the model to know where the valid data for each sequence ends.

2. **Target Sequence (French in your case)**:
    - **SOS**: This is crucial for the decoding phase. During training, models like the Transformer are often trained with a technique called "teacher forcing", where the true previous token (rather than the predicted one) is fed as input for predicting the next token. The SOS token provides the starting point for this process.
    - **EOS**: Signals the end of the target sequence. During the decoding/generation phase, the EOS token indicates to the model that it should stop generating further tokens.

In summary, while you could technically get away with not using SOS or EOS for the source sequence (though it's still recommended to use them), they are pretty much essential for the target sequence due to the reasons stated above.

## `torch.nn.utils.rnn.pad_sequence`

`pad_sequence` is a utility function provided by PyTorch to pad a list of variable-length sequences with zeros (or any specified value) to make them all of the same length.

#### Parameters:

- **sequences**: A list of sequences, where each sequence is a 1D tensor of variable lengths.
- **batch_first**: (Optional) If `True`, the returned tensor will have shape `(batch_size, max_sequence_length)`. If `False`, the returned tensor will have shape `(max_sequence_length, batch_size)`. Default is `False`.
- **padding_value**: (Optional) The value used for padding. Default is `0`.

#### Returns:

- A 2D tensor where sequences are padded to equal length.

#### Example:

Suppose you have sequences of token IDs with different lengths:

```python
sequences = [
    torch.tensor([1, 2, 3]),
    torch.tensor([4, 5]),
    torch.tensor([6])
]
```

You can use `pad_sequence` to pad these sequences:

```python
from torch.nn.utils.rnn import pad_sequence

padded_sequences = pad_sequence(sequences, batch_first=True)
```

The `padded_sequences` tensor will look like:

```
tensor([[1, 2, 3],
        [4, 5, 0],
        [6, 0, 0]])
```

Here, `0` is used as the padding value, and the sequences are padded to the length of the longest sequence.

#### When to use:

`pad_sequence` is especially useful in natural language processing tasks where you're dealing with sequences of words/tokens of variable lengths, and you want to batch them together for processing with deep learning models. Padding is necessary because models typically require input data to have consistent dimensions.

However, note that while padding allows sequences to be processed in batches, it might introduce computational inefficiencies (since the model would process padding tokens which don't carry meaningful information). To counteract this, masking is often used in conjunction to inform the model which tokens are real and which are padding.

## Padding

1. **Goal of Padding**: When training neural networks (especially deep learning models), it's often necessary to process data in batches for computational efficiency. However, sequences (like sentences) can vary in length. Since deep learning models expect input data to have consistent dimensions, we pad shorter sequences to match the length of the longest sequence in a batch. This ensures that every sequence in the batch has the same length.

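2. **Output Format**: After padding, for every sequence in your dataset, you'll have a consistent list of integers of a given length (either a pre-defined fixed length or the length of the longest sequence in the batch). Sequences shorter than this length will have additional integers added (typically 0 or some other designated "padding" value) to bring them up to the required length. The padding value must be the ID of the tokenizer's padding token: with the tokenizers trained in this notebook, `[PAD]` has ID 1 and ID 0 is `[UNK]`, so padding with 0 would make padding indistinguishable from unknown words.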

3. **Shape**: The shape will depend on the number of sequences you're processing in a batch and the length you've padded them to. For instance, if you're processing batches of 32 sequences and you've padded them all to a length of 20 tokens, then the shape of your input tensor for each batch would be `[32, 20]`.

### How to go about padding:

With PyTorch and HuggingFace, padding is quite straightforward.

1. First, decide on a consistent length to which you want to pad your sequences. You can either:
   - Use the length of the longest sequence in the entire dataset.
   - Use the length of the longest sequence in each batch (this is dynamic padding and is more memory efficient).
   - Define an arbitrary fixed length.

2. Use PyTorch's `pad_sequence` function to pad sequences in your dataset to the desired length.

Here's how you can do it:

```python
from torch.nn.utils.rnn import pad_sequence
import torch

# Assuming you have tokenized data like the one you've shown
tokenized_data_en = [[46, 0], [0, 31, 0], [2293, 9371], [11], [904, 0]]
tokenized_data_fr = [[82, 157, 774], [0, 14, 0], [29730, 14265], [1033, 17335], [2150, 0]]

# Convert to PyTorch tensors
tokenized_tensors_en = [torch.tensor(seq) for seq in tokenized_data_en]
tokenized_tensors_fr = [torch.tensor(seq) for seq in tokenized_data_fr]

# Pad sequences
padded_en = pad_sequence(tokenized_tensors_en, batch_first=True, padding_value=0)
padded_fr = pad_sequence(tokenized_tensors_fr, batch_first=True, padding_value=0)

print(padded_en)
print(padded_fr)
```

3. When creating DataLoader batches, ensure that each batch has sequences of the same length, either by padding all sequences in the dataset to a fixed length or by dynamically padding each batch.
