In [1]:
%autosave 300
%reload_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [2]:
import os

os.chdir(
    "/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers"
)
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers


### Non-Conventional way - Spacy tokenization and collate batch
https://kikaben.com/transformers-data-loader/m

In [3]:
from torch.utils.data import IterableDataset
from torchtext import datasets
from typing import Tuple

In [4]:
def load_dataset(
    name: str, split: str, language_pair: Tuple[str, str]
) -> IterableDataset:
    dataset_class = eval(f"datasets.{name}")
    dataset = dataset_class(split=split, language_pair=language_pair)
    return dataset

In [5]:
# The dataset has 29K pairs of German and English sentences.
dataset = load_dataset("Multi30k", "train", ("de", "en"))

In [6]:
next(iter(dataset))

('Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'Two young, White males are outside near many bushes.')

In [7]:
for de_text, en_text in dataset:
    print(de_text, "\n", en_text)
    break

Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche. 
 Two young, White males are outside near many bushes.




In [8]:
# Tokenization using spacy
import spacy
from collections import Counter
from typing import List

In [9]:
de_tokenizer = spacy.load("de_core_news_sm")
en_tokenizer = spacy.load("en_core_web_sm")

In [10]:
print([token.text for token in de_tokenizer(de_text)])
print([token.text for token in en_tokenizer(en_text)])

['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']
['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [11]:
def engTokenize(text):
    """
    Tokenize an English text and return a list of tokens
    """
    return [token.text for token in en_tokenizer.tokenizer(text)]


def deTokenize(text):
    """
    Tokenize a German text and return a list of tokens
    """
    return [token.text for token in de_tokenizer.tokenizer(text)]

In [12]:
print(engTokenize(en_text))
print(deTokenize(de_text))

['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']


In [13]:
# First we split out the source text and target text
source_text = [en_text for de_text, en_text in dataset]
target_text = [de_text for de_text, en_text in dataset]



In [14]:
len(source_text), len(target_text)

(29001, 29001)

In [15]:
def unique_tokens(tokenizer, texts):
    """
    Return a list of unique tokens in the texts
    """
    counter = Counter()
    for doc in tokenizer.pipe(texts):
        token_texts = []
        for token in doc:
            token_text = token.text.strip()
            if len(token_text) > 0:  # not a white space
                token_texts.append(token_text)
        counter.update(token_texts)

    # unique tokens
    tokens = [token for token, count in counter.most_common()]
    return tokens

In [16]:
en_tokens = unique_tokens(en_tokenizer, source_text)
de_tokens = unique_tokens(de_tokenizer, target_text)

In [17]:
len(en_tokens), len(de_tokens)

(10832, 19207)

In [18]:
# special token indices
UNK_IDX = 0
PAD_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

UNK = "<unk>"  # Unknown
PAD = "<pad>"  # Padding
SOS = "<sos>"  # Start of sentence
EOS = "<eos>"  # End of sentence

SPECIAL_TOKENS = [UNK, PAD, SOS, EOS]


class Vocab:
    def __init__(
        self, tokenizer: spacy.language.Language, tokens: List[str] = []
    ) -> None:
        self.tokenizer = tokenizer
        self.tokens = SPECIAL_TOKENS + tokens  # addition of special tokens
        self.index_lookup = {self.tokens[i]
            : i for i in range(len(self.tokens))}

    def __len__(self) -> int:
        return len(self.tokens)  # vocab size

    def __call__(self, text: str) -> List[int]:
        text = text.strip()
        return [self.to_index(token.text) for token in self.tokenizer(text)]

    def to_index(self, token: str) -> int:
        return self.index_lookup[token] if token in self.index_lookup else UNK_IDX

In [19]:
en_vocab = Vocab(en_tokenizer, en_tokens)
de_vocab = Vocab(de_tokenizer, de_tokens)

In [20]:
# en_vocab(en_text)

In [21]:
# de_vocab(de_text)

In [22]:
# PyTorch’s DataLoader and collate_fn to encapsulate tokenization and token index processing
# We prepend SOS_IDX and append EOS_IDX for target sentences. Finally, we convert token indices into Tensor and keep them in a list.

In [23]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [24]:
class CustomData(Dataset):
    def __init__(
        self,
        source_text,
        target_text,
        source_vocab,
        target_vocab,
        pad_idx,
        sos_idx,
        eos_idx,
    ):
        super().__init__()
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.source_text = source_text
        self.target_text = target_text

    def __len__(self):
        return len(self.source_text)

    def __getitem__(self, idx):
        source_text_sample = self.source_text[idx]
        target_text_sample = self.target_text[idx]

        # convert text to token indices
        source_token_indices = self.source_vocab(source_text_sample)
        target_token_indices = self.target_vocab(target_text_sample)

        # prepend SOS_IDX and append EOS_IDX for source sentences
        source_token_indices = [self.sos_idx] + source_token_indices + [self.eos_idx]
        # prepend SOS_IDX and append EOS_IDX for target sentences
        target_token_indices = [self.sos_idx] + target_token_indices + [self.eos_idx]

        return {
            "encoder_input": torch.tensor(source_token_indices, dtype=torch.long),
            "decoder_input": torch.tensor(target_token_indices, dtype=torch.long),
        }

In [25]:
custom_dataset = CustomData(
    source_text=source_text,
    target_text=target_text,
    source_vocab=en_vocab,
    target_vocab=de_vocab,
    pad_idx=PAD_IDX,
    sos_idx=SOS_IDX,
    eos_idx=EOS_IDX,
)

In [26]:
def create_masks(src_batch: torch.Tensor, tgt_batch: torch.Tensor):
    # ----------------------------------------------------------------------
    # [1] padding mask
    # ----------------------------------------------------------------------

    # (batch_size, 1, max_tgt_seq_len)
    src_pad_mask = (src_batch != PAD_IDX).unsqueeze(1)

    # (batch_size, 1, max_src_seq_len)
    tgt_pad_mask = (tgt_batch != PAD_IDX).unsqueeze(1)

    # ----------------------------------------------------------------------
    # [2] subsequent mask for decoder inputs
    # ----------------------------------------------------------------------
    max_tgt_sequence_length = tgt_batch.shape[1]
    tgt_attention_square = (max_tgt_sequence_length, max_tgt_sequence_length)

    # full attention
    full_mask = torch.full(tgt_attention_square, 1)

    # subsequent sequence should be invisible to each token position
    subsequent_mask = torch.tril(full_mask)

    # add a batch dim (1, max_tgt_seq_len, max_tgt_seq_len)
    subsequent_mask = subsequent_mask.unsqueeze(0)

    return src_pad_mask, tgt_pad_mask & subsequent_mask

In [27]:
def collate_fn(batch):
    source_batch = [sample["encoder_input"] for sample in batch]
    target_batch = [sample["decoder_input"] for sample in batch]

    source_batch = pad_sequence(source_batch, padding_value=PAD_IDX, batch_first=True)
    target_batch = pad_sequence(target_batch, padding_value=PAD_IDX, batch_first=True)

    label_batch = target_batch[:, 1:]  # remove <sos> from target
    target_batch = target_batch[:, :-1]  # remove <eos> from target

    source_mask, target_mask = create_masks(source_batch, target_batch)

    all_batches = [source_batch, target_batch, label_batch, source_mask, target_mask]

    # move everything to the target device
    return [x.to(device) for x in all_batches]

In [28]:
custom_dataloader = DataLoader(
    custom_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn
)

In [29]:
for idx, (source, target, label, source_mask, target_mask) in enumerate(
    custom_dataloader
):
    print("source", source.shape)
    print("target", target.shape)
    print("label", label.shape)
    print("source_mask", source_mask.shape)
    print("target_mask", target_mask.shape)

    if idx == 100:
        break

    print("\n")

source torch.Size([10, 20])
target torch.Size([10, 18])
label torch.Size([10, 18])
source_mask torch.Size([10, 1, 20])
target_mask torch.Size([10, 18, 18])


source torch.Size([10, 24])
target torch.Size([10, 22])
label torch.Size([10, 22])
source_mask torch.Size([10, 1, 24])
target_mask torch.Size([10, 22, 22])


source torch.Size([10, 19])
target torch.Size([10, 19])
label torch.Size([10, 19])
source_mask torch.Size([10, 1, 19])
target_mask torch.Size([10, 19, 19])


source torch.Size([10, 23])
target torch.Size([10, 18])
label torch.Size([10, 18])
source_mask torch.Size([10, 1, 23])
target_mask torch.Size([10, 18, 18])


source torch.Size([10, 20])
target torch.Size([10, 22])
label torch.Size([10, 22])
source_mask torch.Size([10, 1, 20])
target_mask torch.Size([10, 22, 22])


source torch.Size([10, 26])
target torch.Size([10, 18])
label torch.Size([10, 18])
source_mask torch.Size([10, 1, 26])
target_mask torch.Size([10, 18, 18])


source torch.Size([10, 26])
target torch.Size([10, 2

### Optimized code

In [110]:
from torch.utils.data import IterableDataset
from torchtext import datasets
from typing import Tuple
import spacy
from collections import Counter
from typing import List
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [111]:
# Function to load the dataset based on given parameters
def load_dataset(
    name: str, split: str, language_pair: Tuple[str, str]
) -> IterableDataset:
    """Loads a dataset from torchtext.datasets based on the provided name, split, and language pair"""
    dataset_class = eval(f"datasets.{name}")
    dataset = dataset_class(split=split, language_pair=language_pair)
    return dataset


# Load the Multi30k dataset (German-English)
dataset = load_dataset("Multi30k", "train", ("de", "en"))

# Display the first sample from the dataset to verify the loading process
for de_text, en_text in dataset:
    print(de_text, "\n", en_text)
    break

Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche. 
 Two young, White males are outside near many bushes.


In [112]:
# Initialize SpaCy tokenizers for German and English
de_tokenizer = spacy.load("de_core_news_sm")
en_tokenizer = spacy.load("en_core_web_sm")

# Tokenize and display tokens for a sample text
print([token.text for token in de_tokenizer(de_text)])
print([token.text for token in en_tokenizer(en_text)])


# Function to tokenize English text
def engTokenize(text):
    """Tokenize an English text and return a list of tokens"""
    return [token.text for token in en_tokenizer.tokenizer(text)]


# Function to tokenize German text
def deTokenize(text):
    """Tokenize a German text and return a list of tokens"""
    return [token.text for token in de_tokenizer.tokenizer(text)]


# Verify tokenization functions
print(engTokenize(en_text))
print(deTokenize(de_text))

['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']
['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']


In [113]:
# Split the dataset into source and target texts
source_text = [en_text for de_text, en_text in dataset]
target_text = [de_text for de_text, en_text in dataset]
print(len(source_text), len(target_text))



29001 29001


In [114]:
# Function to return unique tokens in the texts
def unique_tokens(tokenizer, texts):
    """Return a list of unique tokens in the texts"""
    counter = Counter()
    for doc in tokenizer.pipe(texts):
        token_texts = []
        for token in doc:
            token_text = token.text.strip()
            if len(token_text) > 0:  # Not a white space
                token_texts.append(token_text)
        counter.update(token_texts)

    # Unique tokens
    tokens = [token for token, count in counter.most_common()]
    return tokens


# Get unique tokens for English and German texts
en_tokens = unique_tokens(en_tokenizer, source_text)
de_tokens = unique_tokens(de_tokenizer, target_text)
print(len(en_tokens), len(de_tokens))

10832 19207


In [115]:
# Special token indices
UNK_IDX = 0
PAD_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

# Special tokens
UNK = "<unk>"  # Unknown
PAD = "<pad>"  # Padding
SOS = "<sos>"  # Start of sentence
EOS = "<eos>"  # End of sentence

SPECIAL_TOKENS = [UNK, PAD, SOS, EOS]


# Vocabulary class
class Vocab:
    def __init__(
        self, tokenizer: spacy.language.Language, tokens: List[str] = []
    ) -> None:
        """Initialize the vocabulary with a tokenizer and an optional list of tokens"""
        self.tokenizer = tokenizer
        self.tokens = SPECIAL_TOKENS + tokens  # Addition of special tokens
        self.index_lookup = {self.tokens[i]: i for i in range(len(self.tokens))}

    def __len__(self) -> int:
        """Return the size of the vocabulary"""
        return len(self.tokens)

    def __call__(self, text: str) -> List[int]:
        """Convert a text to a list of token indices"""
        text = text.strip()
        return [self.to_index(token.text) for token in self.tokenizer(text)]

    def to_index(self, token: str) -> int:
        """Convert a token to its corresponding index"""
        return self.index_lookup[token] if token in self.index_lookup else UNK_IDX


# Create vocabularies for English and German
en_vocab = Vocab(en_tokenizer, en_tokens)
de_vocab = Vocab(de_tokenizer, de_tokens)

In [116]:
# Device configuration (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [123]:
# Custom dataset class for translation data
class SpacyData(Dataset):
    def __init__(
        self,
        source_text,
        target_text,
        source_vocab,
        target_vocab,
        pad_idx,
        sos_idx,
        eos_idx,
    ):
        """Initialize the dataset with source and target texts, vocabularies, and special token indices"""
        super().__init__()
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.source_text = source_text
        self.target_text = target_text

    def __len__(self):
        """Return the number of samples in the dataset"""
        return len(self.source_text)

    def __getitem__(self, idx):
        """Return a sample from the dataset at the given index"""
        source_text_sample = self.source_text[idx]
        target_text_sample = self.target_text[idx]

        # Convert text to token indices
        source_token_indices = self.source_vocab(source_text_sample)
        target_token_indices = self.target_vocab(target_text_sample)

        # Prepend SOS_IDX and append EOS_IDX for source and target sentences
        source_token_indices = [self.sos_idx] + source_token_indices + [self.eos_idx]
        target_token_indices = [self.sos_idx] + target_token_indices + [self.eos_idx]

        return {
            "encoder_input": torch.tensor(source_token_indices, dtype=torch.long),
            "decoder_input": torch.tensor(target_token_indices, dtype=torch.long),
            "src_texts": source_text_sample,
            "tgt_texts": target_text_sample,
        }

In [124]:
# Create custom dataset
custom_dataset = SpacyData(
    source_text=source_text,
    target_text=target_text,
    source_vocab=en_vocab,
    target_vocab=de_vocab,
    pad_idx=PAD_IDX,
    sos_idx=SOS_IDX,
    eos_idx=EOS_IDX,
)

In [125]:
# Function to create masks for source and target sequences
def create_masks(src_batch: torch.Tensor, tgt_batch: torch.Tensor):
    # Padding masks
    src_pad_mask = (src_batch != PAD_IDX).unsqueeze(1)
    tgt_pad_mask = (tgt_batch != PAD_IDX).unsqueeze(1)

    # Subsequent mask for decoder inputs (causal mask)
    max_tgt_sequence_length = tgt_batch.shape[1]
    tgt_attention_square = (max_tgt_sequence_length, max_tgt_sequence_length)

    full_mask = torch.full(tgt_attention_square, 1)
    subsequent_mask = torch.tril(full_mask).unsqueeze(0)

    return src_pad_mask, tgt_pad_mask & subsequent_mask


# Collate function for batching
def collate_fn(batch):
    source_batch = [sample["encoder_input"] for sample in batch]
    target_batch = [sample["decoder_input"] for sample in batch]
    src_texts = [sample["src_texts"] for sample in batch]
    tgt_texts = [sample["tgt_texts"] for sample in batch]

    # Pad sequences to the same length within the batch
    source_batch = pad_sequence(source_batch, padding_value=PAD_IDX, batch_first=True)
    target_batch = pad_sequence(target_batch, padding_value=PAD_IDX, batch_first=True)

    # Remove <sos> from target and create label batch
    label_batch = target_batch[:, 1:]
    target_batch = target_batch[:, :-1]

    # Create masks for source and target sequences
    source_mask, target_mask = create_masks(source_batch, target_batch)

    # Move all batches to the target device (GPU or CPU)
    all_batches = [source_batch, target_batch, label_batch, source_mask, target_mask]
    return [x.to(device) for x in all_batches] + [src_texts, tgt_texts]

In [127]:
# Create DataLoader
custom_dataloader = DataLoader(
    custom_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn
)

# Iterate over DataLoader and print batch shapes for debugging
for idx, (
    source,
    target,
    label,
    source_mask,
    target_mask,
    src_texts,
    tgt_texts,
) in enumerate(custom_dataloader):
    print("source", source.shape)
    print("target", target.shape)
    print("label", label.shape)
    print("source_mask", source_mask.shape)
    print("target_mask", target_mask.shape)
    print("src_texts", src_texts)
    print("tgt_texts", tgt_texts)

    if idx == 0:
        break

    print("\n")

source torch.Size([10, 24])
target torch.Size([10, 33])
label torch.Size([10, 33])
source_mask torch.Size([10, 1, 24])
target_mask torch.Size([10, 33, 33])
src_texts ['The most amazing recreational activity outdoor camping.', 'Two girls standing on the side of the road.', 'Four girls in a swimming pool the one getting ready to jump in.', 'People staring up at the fair.', 'A man wearing a hoodie and jacket is sitting on a bench in a park.', 'Two young ladies, one blond talking on the cellphone, one brunette texting, are walking on the summer street.', 'A man and a woman carrying bags are walking down the sidewalk.', 'Several individuals getting together to play the bagpipes and eat some cuisine.', 'A boy in a black helmet jumps a bicycle with a small town visible in the background.', 'A woman and a man are dancing in public']
tgt_texts ['Eine absolut faszinierend Freizeitaktivität: im Freien Campen.', 'Zwei Mädchen stehen am Straßenrand.', 'Vier Mädchen in einem Schwimmbad, von denen ei

Here's an analysis of the entire code provided, explaining what each part is doing and why it's necessary for training a transformer model in PyTorch:

### Loading the Dataset
```python
from torch.utils.data import IterableDataset
from torchtext import datasets
from typing import Tuple

def load_dataset(name: str, split: str, language_pair: Tuple[str, str]) -> IterableDataset:
    """Loads a dataset from torchtext.datasets based on the provided name, split, and language pair"""
    dataset_class = eval(f"datasets.{name}")
    dataset = dataset_class(split=split, language_pair=language_pair)
    return dataset

# Load the Multi30k dataset (German-English)
dataset = load_dataset("Multi30k", "train", ("de", "en"))

# Display the first sample from the dataset to verify the loading process
for de_text, en_text in dataset:
    print(de_text, "\n", en_text)
    break
```
- **Purpose**: This block defines a function to load datasets from the `torchtext.datasets` module. The `load_dataset` function is flexible, allowing different dataset names, splits (e.g., train, test), and language pairs.
- **Verification**: The first sample from the loaded dataset is printed to verify that the dataset is loaded correctly.

### Tokenization
```python
# Initialize SpaCy tokenizers for German and English
import spacy
de_tokenizer = spacy.load("de_core_news_sm")
en_tokenizer = spacy.load("en_core_web_sm")

# Tokenize and display tokens for a sample text
print([token.text for token in de_tokenizer(de_text)])
print([token.text for token in en_tokenizer(en_text)])

# Function to tokenize English text
def engTokenize(text):
    """Tokenize an English text and return a list of tokens"""
    return [token.text for token in en_tokenizer.tokenizer(text)]

# Function to tokenize German text
def deTokenize(text):
    """Tokenize a German text and return a list of tokens"""
    return [token.text for token in de_tokenizer.tokenizer(text)]

# Verify tokenization functions
print(engTokenize(en_text))
print(deTokenize(de_text))
```
- **Purpose**: This section initializes SpaCy tokenizers for German and English and defines functions to tokenize text in both languages.
- **Verification**: Sample tokenizations are printed to ensure that the tokenizers work correctly.

### Splitting and Analyzing the Dataset
```python
# Split the dataset into source and target texts
source_text = [en_text for de_text, en_text in dataset]
target_text = [de_text for de_text, en_text in dataset]
print(len(source_text), len(target_text))

# Function to return unique tokens in the texts
from collections import Counter
def unique_tokens(tokenizer, texts):
    """Return a list of unique tokens in the texts"""
    counter = Counter()
    for doc in tokenizer.pipe(texts):
        token_texts = []
        for token in doc:
            token_text = token.text.strip()
            if len(token_text) > 0:  # Not a white space
                token_texts.append(token_text)
        counter.update(token_texts)

    # Unique tokens
    tokens = [token for token, count in counter.most_common()]
    return tokens

# Get unique tokens for English and German texts
en_tokens = unique_tokens(en_tokenizer, source_text)
de_tokens = unique_tokens(de_tokenizer, target_text)
print(len(en_tokens), len(de_tokens))
```
- **Purpose**: This block splits the dataset into source and target texts (English and German, respectively) and calculates unique tokens in both languages using the tokenizers.
- **Verification**: The lengths of the source and target texts, as well as the number of unique tokens, are printed for validation.

### Vocabulary Creation
```python
# Special token indices
UNK_IDX = 0
PAD_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

# Special tokens
UNK = "<unk>"  # Unknown
PAD = "<pad>"  # Padding
SOS = "<sos>"  # Start of sentence
EOS = "<eos>"  # End of sentence

SPECIAL_TOKENS = [UNK, PAD, SOS, EOS]

# Vocabulary class
class Vocab:
    def __init__(self, tokenizer: spacy.language.Language, tokens: List[str] = []) -> None:
        """Initialize the vocabulary with a tokenizer and an optional list of tokens"""
        self.tokenizer = tokenizer
        self.tokens = SPECIAL_TOKENS + tokens  # Addition of special tokens
        self.index_lookup = {self.tokens[i]: i for i in range(len(self.tokens))}

    def __len__(self) -> int:
        """Return the size of the vocabulary"""
        return len(self.tokens)

    def __call__(self, text: str) -> List[int]:
        """Convert a text to a list of token indices"""
        text = text.strip()
        return [self.to_index(token.text) for token in self.tokenizer(text)]

    def to_index(self, token: str) -> int:
        """Convert a token to its corresponding index"""
        return self.index_lookup[token] if token in self.index_lookup else UNK_IDX

# Create vocabularies for English and German
en_vocab = Vocab(en_tokenizer, en_tokens)
de_vocab = Vocab(de_tokenizer, de_tokens)
```
- **Purpose**: This section defines special tokens and their indices, then creates a `Vocab` class to handle token-to-index conversions, including special tokens for unknown words, padding, start of sentence, and end of sentence.
- **Initialization**: Two vocabularies are created for English and German using the `Vocab` class.

### Custom Dataset
```python
# Device configuration (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Custom dataset class for translation data
class CustomData(Dataset):
    def __init__(self, source_text, target_text, source_vocab, target_vocab, pad_idx, sos_idx, eos_idx):
        """Initialize the dataset with source and target texts, vocabularies, and special token indices"""
        super().__init__()
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.pad_idx = pad_idx
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.source_text = source_text
        self.target_text = target_text

    def __len__(self):
        """Return the number of samples in the dataset"""
        return len(self.source_text)

    def __getitem__(self, idx):
        """Return a sample from the dataset at the given index"""
        source_text_sample = self.source_text[idx]
        target_text_sample = self.target_text[idx]

        # Convert text to token indices
        source_token_indices = self.source_vocab(source_text_sample)
        target_token_indices = self.target_vocab(target_text_sample)

        # Prepend SOS_IDX and append EOS_IDX for source and target sentences
        source_token_indices = [self.sos_idx] + source_token_indices + [self.eos_idx]
        target_token_indices = [self.sos_idx] + target_token_indices + [self.eos_idx]

        return {
            "encoder_input": torch.tensor(source_token_indices, dtype=torch.long),
            "decoder_input": torch.tensor(target_token_indices, dtype=torch.long),
        }

# Create custom dataset
custom_dataset = CustomData(
    source_text=source_text,
    target_text=target_text,
    source_vocab=en_vocab,
    target_vocab=de_vocab,
    pad_idx=PAD_IDX,
    sos_idx=SOS_IDX,
    eos_idx=EOS_IDX,
)
```
- **Purpose**: This block defines a `CustomData` class to handle the translation data, converting texts to token indices and adding special tokens (SOS and EOS). It also initializes the custom dataset.
- **Device Configuration**: The device (GPU or CPU) is configured based on availability.

### Mask Creation and Collate Function
```python
# Function to create masks for source and target sequences
def create_masks(src_batch: torch.Tensor, tgt_batch: torch.Tensor):
    # Padding masks
    src_pad_mask = (src_batch != PAD_IDX).unsqueeze(1)
    tgt_pad_mask = (tgt_batch != PAD_IDX).unsqueeze(1)

    # Subsequent mask for decoder inputs (causal mask)
    max_tgt_sequence_length = tgt_batch.shape[1]
    tgt_attention_square = (max_tgt_sequence_length, max_tgt_sequence_length)

    full_mask = torch.full(tgt_attention_square, 1)
    subsequent_mask = torch.tril(full_mask).unsqueeze(0)

    return src_pad_mask, tgt_pad_mask & subsequent_mask

# Collate function for batching
def collate_fn(batch):
    source_batch = [sample["encoder_input"] for sample in batch]
    target_batch = [sample["decoder_input"] for sample in batch]

    # Pad sequences to the same length within the batch
    source_batch = pad_sequence(source_batch, padding_value=PAD_IDX, batch_first=True)
    target_batch = pad_sequence(target_batch, padding_value=PAD_IDX, batch_first=True)

    # Remove <sos> from target and create label batch
    label_batch = target_batch[:, 1:]
    target_batch = target_batch[:, :-1]

    # Create masks for source and target sequences
    source_mask, target_mask = create_masks(source_batch, target_batch)

    # Move all batches to the target device (GPU or CPU)
    all_batches = [source_batch, target_batch, label_batch, source_mask, target_mask]
    return [x.to(device) for x in all_batches

]
```
- **Purpose**: 
  - `create_masks` function creates padding masks for the source and target sequences and a subsequent mask for the target sequence, crucial for transformer models to handle variable-length sequences and maintain causality in the decoder.
  - `collate_fn` function pads sequences within a batch to the same length, adjusts target sequences, creates masks, and moves all data to the target device.

### DataLoader and Debugging
```python
# Create DataLoader
from torch.utils.data import DataLoader
custom_dataloader = DataLoader(custom_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn)

# Iterate over DataLoader and print batch shapes for debugging
for idx, (source, target, label, source_mask, target_mask) in enumerate(custom_dataloader):
    print("source", source.shape)
    print("target", target.shape)
    print("label", label.shape)
    print("source_mask", source_mask.shape)
    print("target_mask", target_mask.shape)

    if idx == 100:
        break

    print("\n")

# TODO: Test the above dataloader with a transformer model
```
- **Purpose**: 
  - A `DataLoader` is created for batching and shuffling the custom dataset.
  - The loop iterates over batches from the `DataLoader` and prints their shapes for debugging purposes, ensuring the data is correctly batched and masked.

### Summary
The code systematically loads and processes a translation dataset, tokenizes the text, creates vocabularies, and defines a custom dataset class. It prepares data batches with padding and masks, suitable for feeding into a transformer model. The debugging step verifies the data shapes, ensuring everything is set up correctly before testing with a transformer model.

In [37]:
# TODO:Test the above dataloader with a transformer model

### Converntional way - using Hugging face tokenizer

#### Download the data

In [38]:
from datasets import load_dataset
from S16_code.config import get_config

In [39]:
ds_raw = load_dataset(
    get_config()["datasource"],
    f"{get_config()['src_lang']}-{get_config()['tgt_lang']}",
    split="train",
)
print(len(ds_raw))

32332


In [40]:
def get_all_sentences(ds, lang):
    """Iterate over all sentences in the dataset and yield them."""
    for pair in ds:
        yield pair["translation"][lang]

#### Multi-lang tokenizer

In [41]:
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase

In [42]:
def build_tokenizer(ds, lang):
    """Function to build a tokenizer for the given language and dataset"""

    print(f"Building tokenizer for {lang}")
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = normalizers.Sequence([Lowercase()])
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    os.makedirs("./tokenizer", exist_ok=True)
    return tokenizer

In [43]:
tokenizer_src = build_tokenizer(ds_raw, get_config()["src_lang"])
tokenizer_tgt = build_tokenizer(ds_raw, get_config()["tgt_lang"])

Building tokenizer for en


Building tokenizer for it


In [44]:
tokenizer_src.get_vocab_size()

14554

In [45]:
tokenizer_tgt.get_vocab_size()

21401

#### Custom Dataset

In [62]:
import torch
from torch.utils.data import Dataset, DataLoader

In [55]:
def causal_mask(size):
    # Creating a square matrix of dimensions 'size x size' filled with ones
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0

In [59]:
class BilingualDataset(Dataset):

    def __init__(
        self,
        ds,
        tokenizer_src,
        tokenizer_tgt,
        src_lang,
        tgt_lang,
    ):
        super().__init__()

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        self.sos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64
        )
        self.eos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64
        )
        self.pad_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64
        )

        # print(self.sos_token, self.eos_token, self.pad_token)

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        src_target_pair = self.ds[idx]
        src_text = src_target_pair["translation"][self.src_lang]
        tgt_text = src_target_pair["translation"][self.tgt_lang]

        # Transform the text into tokens
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Add <s> and </s> token
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
            ],
            dim=0,
        )

        # Add only <s> token
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(dec_input_tokens, dtype=torch.int64),
            ],
            dim=0,
        )

        # Add only </s> token as it offset by 1 from the decoder_input
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype=torch.int64),
                self.eos_token,
            ],
            dim=0,
        )

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

In [96]:
def collate_fn(batch):
    encoder_input = [sample["encoder_input"] for sample in batch]
    decoder_input = [sample["decoder_input"] for sample in batch]
    label = [sample["label"] for sample in batch]

    encoder_input = torch.nn.utils.rnn.pad_sequence(
        encoder_input,
        batch_first=True,
        padding_value=tokenizer_tgt.token_to_id("[PAD]"),
    )
    decoder_input = torch.nn.utils.rnn.pad_sequence(
        decoder_input,
        batch_first=True,
        padding_value=tokenizer_tgt.token_to_id("[PAD]"),
    )
    label = torch.nn.utils.rnn.pad_sequence(
        label, batch_first=True, padding_value=tokenizer_tgt.token_to_id("[PAD]")
    )

    # print(f"encoder_input: {encoder_input.shape}")
    # print(f"decoder_input: {decoder_input.shape}")
    # print(f"label: {label.shape}")

    encoder_mask = (
        (encoder_input != tokenizer_tgt.token_to_id("[PAD]"))
        .unsqueeze(1)
        .type(torch.int)
    )
    # print(f"encoder_mask: {encoder_mask.shape}")

    decoder_mask = (decoder_input != tokenizer_tgt.token_to_id("[PAD]")).unsqueeze(
        1
    ).type(torch.int) & causal_mask(decoder_input.size(1))
    # print(f"decoder_mask: {decoder_mask.shape}")

    return encoder_input, decoder_input, label, encoder_mask, decoder_mask

In [97]:
custom_dataset = BilingualDataset(
    ds_raw,
    tokenizer_src,
    tokenizer_tgt,
    get_config()["src_lang"],
    get_config()["tgt_lang"],
)

In [98]:
custom_dataloader = DataLoader(
    custom_dataset, batch_size=10, collate_fn=collate_fn)

In [99]:
for idx, (encoder_input, decoder_input, label, src_mask, tgt_mask) in enumerate(
    custom_dataloader
):
    print("encoder_input", encoder_input.shape)
    print("decoder_input", decoder_input.shape)
    print("label", label.shape)
    print("src_mask", src_mask.shape)
    print("tgt_mask", tgt_mask.shape)

    if idx == 10:
        break

    print("\n")

encoder_input torch.Size([10, 97])
decoder_input torch.Size([10, 77])
label torch.Size([10, 77])
src_mask torch.Size([10, 1, 97])
tgt_mask torch.Size([10, 77, 77])


encoder_input torch.Size([10, 85])
decoder_input torch.Size([10, 85])
label torch.Size([10, 85])
src_mask torch.Size([10, 1, 85])
tgt_mask torch.Size([10, 85, 85])


encoder_input torch.Size([10, 127])
decoder_input torch.Size([10, 124])
label torch.Size([10, 124])
src_mask torch.Size([10, 1, 127])
tgt_mask torch.Size([10, 124, 124])


encoder_input torch.Size([10, 63])
decoder_input torch.Size([10, 55])
label torch.Size([10, 55])
src_mask torch.Size([10, 1, 63])
tgt_mask torch.Size([10, 55, 55])


encoder_input torch.Size([10, 90])
decoder_input torch.Size([10, 92])
label torch.Size([10, 92])
src_mask torch.Size([10, 1, 90])
tgt_mask torch.Size([10, 92, 92])


encoder_input torch.Size([10, 72])
decoder_input torch.Size([10, 65])
label torch.Size([10, 65])
src_mask torch.Size([10, 1, 72])
tgt_mask torch.Size([10, 65, 65])


### Super Optimized Code 

In [100]:
# Import necessary libraries
from datasets import load_dataset
from S16_code.config import get_config
import os
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase
import torch
from torch.utils.data import Dataset, DataLoader

In [101]:
# Load the dataset based on the configuration settings
# The dataset is loaded using the Hugging Face datasets library, which simplifies accessing and managing datasets
ds_raw = load_dataset(
    get_config()["datasource"],
    f"{get_config()['src_lang']}-{get_config()['tgt_lang']}",
    split="train",
)
print(f"Dataset size: {len(ds_raw)}")

Dataset size: 32332


In [102]:
def get_all_sentences(ds, lang):
    """Iterate over all sentences in the dataset and yield them."""
    for pair in ds:
        yield pair["translation"][lang]

In [103]:
tokenizer_path = get_config()["tokenizer_path"]


def build_tokenizer(ds, lang):
    """Function to build a tokenizer for the given language and dataset"""
    print(f"Building tokenizer for {lang}")

    if os.path.exists(os.path.join(tokenizer_path, f"tokenizer_{lang}.json")):
        print(f"Tokenizer for {lang} already exists")
        tokenizer = Tokenizer.from_file(
            os.path.join(tokenizer_path, f"tokenizer_{lang}.json")
        )
        return tokenizer

    # Initialize a WordLevel tokenizer with an unknown token
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

    # Use whitespace for tokenization
    tokenizer.pre_tokenizer = Whitespace()

    # Convert all text to lowercase to ensure consistency
    tokenizer.normalizer = normalizers.Sequence([Lowercase()])

    # Define special tokens and set minimum frequency for words
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2
    )

    # Train tokenizer on the provided dataset
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)

    os.makedirs("./tokenizer", exist_ok=True)
    tokenizer.save(os.path.join(tokenizer_path, f"tokenizer_{lang}.json"))

    return tokenizer

In [104]:
# Build tokenizers for source and target languages
tokenizer_src = build_tokenizer(ds_raw, get_config()["src_lang"])
tokenizer_tgt = build_tokenizer(ds_raw, get_config()["tgt_lang"])

# Print vocabulary sizes
print(f"Source vocabulary size: {tokenizer_src.get_vocab_size()}")
print(f"Target vocabulary size: {tokenizer_tgt.get_vocab_size()}")

Building tokenizer for en
Tokenizer for en already exists
Building tokenizer for it
Tokenizer for it already exists
Source vocabulary size: 15698
Target vocabulary size: 22463


In [105]:
def causal_mask(size):
    """Create a causal mask to ensure each position can attend to previous positions"""
    # The causal mask ensures that each token can only attend to previous tokens (for autoregressive decoding)
    mask = torch.triu(torch.ones(size, size), diagonal=1).type(torch.bool)
    return ~mask

In [106]:
class BilingualDataset(Dataset):
    """Custom Dataset for bilingual translation data"""

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang):
        super().__init__()

        # Initialize dataset and tokenizers
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Token IDs for special tokens
        self.sos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64
        )
        self.eos_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64
        )
        self.pad_token = torch.tensor(
            [tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64
        )

    def __len__(self):
        """Return the number of samples in the dataset"""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return a sample from the dataset at the given index"""
        src_target_pair = self.ds[idx]
        src_text = src_target_pair["translation"][self.src_lang]
        tgt_text = src_target_pair["translation"][self.tgt_lang]

        # Tokenize source and target texts
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Add <SOS> and <EOS> tokens to the encoder input
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(enc_input_tokens, dtype=torch.int64),
                self.eos_token,
            ],
            dim=0,
        )

        # Add <SOS> token to the decoder input
        decoder_input = torch.cat(
            [self.sos_token, torch.tensor(dec_input_tokens, dtype=torch.int64)], dim=0
        )

        # Add <EOS> token to the label
        label = torch.cat(
            [torch.tensor(dec_input_tokens, dtype=torch.int64), self.eos_token], dim=0
        )

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

In [107]:
def collate_fn(batch):
    """Collate function to pad sequences and create masks for batching"""
    encoder_input = [sample["encoder_input"] for sample in batch]
    decoder_input = [sample["decoder_input"] for sample in batch]
    label = [sample["label"] for sample in batch]
    src_texts = [sample["src_text"] for sample in batch]
    tgt_texts = [sample["tgt_text"] for sample in batch]

    # Pad sequences for batching
    encoder_input = torch.nn.utils.rnn.pad_sequence(
        encoder_input,
        batch_first=True,
        padding_value=tokenizer_tgt.token_to_id("[PAD]"),
    )
    decoder_input = torch.nn.utils.rnn.pad_sequence(
        decoder_input,
        batch_first=True,
        padding_value=tokenizer_tgt.token_to_id("[PAD]"),
    )
    label = torch.nn.utils.rnn.pad_sequence(
        label, batch_first=True, padding_value=tokenizer_tgt.token_to_id("[PAD]")
    )

    # Create masks
    encoder_mask = (
        (encoder_input != tokenizer_tgt.token_to_id("[PAD]"))
        .unsqueeze(1)
        .type(torch.bool)
    )
    decoder_mask = (decoder_input != tokenizer_tgt.token_to_id("[PAD]")).unsqueeze(
        1
    ).type(torch.bool) & causal_mask(decoder_input.size(1))

    return {
        "encoder_input": encoder_input,  # (batch_size, seq_len)
        "decoder_input": decoder_input,  # (batch_size, seq_len)
        "label": label,  # (batch_size, seq_len)
        "encoder_mask": encoder_mask,  # (batch_size, 1, seq_len)
        "decoder_mask": decoder_mask,  # (batch_size, seq_len, seq_len)
        "src_texts": src_texts,  # List of source texts
        "tgt_texts": tgt_texts,  # List of target texts
    }
        

In [108]:
# Create custom dataset and dataloader
custom_dataset = BilingualDataset(
    ds_raw,
    tokenizer_src,
    tokenizer_tgt,
    get_config()["src_lang"],
    get_config()["tgt_lang"],
)
custom_dataloader = DataLoader(custom_dataset, batch_size=10, collate_fn=collate_fn)

In [109]:
# Iterate over the dataloader to check shapes and functionality
for idx, (
    encoder_input,
    decoder_input,
    label,
    encoder_mask,
    decoder_mask,
    src_texts,
    tgt_texts,
) in enumerate(custom_dataloader):
    print("encoder_input", encoder_input.shape)  # (batch_size, enc_seq_len)
    print("decoder_input", decoder_input.shape)  # (batch_size, dec_seq_len)
    print("label", label.shape)  # (batch_size, dec_seq_len)
    print("encoder_mask", encoder_mask.shape)  # (batch_size, 1, enc_seq_len)
    # (batch_size, dec_seq_len, dec_seq_len)
    print("decoder_mask", decoder_mask.shape)
    print("src_texts", src_texts)  # List of source texts
    print("tgt_texts", tgt_texts)  # List of target texts

    if idx == 10:
        break

    print("\n")

encoder_input torch.Size([10, 97])
decoder_input torch.Size([10, 77])
label torch.Size([10, 77])
encoder_mask torch.Size([10, 1, 97])
decoder_mask torch.Size([10, 77, 77])
src_texts ['Source: Project Gutenberg', 'Jane Eyre', 'Charlotte Bronte', 'CHAPTER I', 'There was no possibility of taking a walk that day.', 'We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further out-door exercise was now out of the question.', 'I was glad of it: I never liked long walks, especially on chilly afternoons: dreadful to me was the coming home in the raw twilight, with nipped fingers and toes, and a heart saddened by the chidings of Bessie, the nurse, and humbled by the consciousness of my physical inferiority to Eliza, John, and Georgiana Reed.', 'The said Eliza, John, and Georgiana were now clustered round their

### Detailed Steps and Reasoning

1. **Dataset Loading**:
   - The dataset is loaded using the Hugging Face `datasets` library, which simplifies accessing and managing datasets. The configuration specifies the data source and language pairs. This step ensures we have access to the raw data needed for training.

2. **Tokenizer Building**:
   - **Tokenizer Initialization**: A `WordLevel` tokenizer is initialized with an unknown token (`[UNK]`).
   - **Whitespace Tokenization**: Tokenization is performed using whitespace, which is simple and effective for many languages.
   - **Normalization**: Text is converted to lowercase to maintain consistency and reduce the vocabulary size.
   - **Training**: The tokenizer is trained on the dataset, ensuring it learns the vocabulary specific to the source and target languages. Special tokens are included to handle unknown words, padding, start of sequence, and end of sequence tokens.

3. **Bilingual Dataset Class**:
   - This class handles the preparation of bilingual translation data. It tokenizes the input text and adds special tokens required for the transformer model.
   - **Special Tokens**: `[SOS]` (start of sequence) and `[EOS]` (end of sequence) tokens are added to the inputs and labels to guide the model during training.

4. **Causal Mask**:
   - The causal mask ensures that each position in the decoder can only attend to previous positions. This is essential for autoregressive decoding, where the model generates one token at a time.

5. **Collate Function**:
   - The `collate_fn` function handles batching of data. It pads sequences to the same length within a batch and creates masks to ignore padding tokens during model training.
   - **Padding**: Sequences are padded to the maximum length within the batch using the `[PAD]` token.
   - **Masks**: Encoder and decoder masks are created to ignore padding tokens and enforce causal attention in the decoder.

6. **DataLoader**:


   - A `DataLoader` is created to handle batching and shuffling of data during training. The `batch_size` is set to 10, but this can be adjusted based on available memory and model requirements.
   - **Iteration and Checking**: The dataloader is iterated over to check the shapes and functionality of the batched data. This ensures that the data is correctly prepared for model training.

By following these steps, the code is structured to efficiently prepare data for training a transformer model, ensuring proper tokenization, masking, and batching. This refined code is optimized for readability, maintainability, and functionality, adhering to best practices for transformer model training in PyTorch.