## Testing Dataloaders

this notebook is to test the text-dataloaders for the translation task for the transformer model. 

In [1]:
import sys
import os

# adding project root directory to the sys path
project_root = os.path.abspath(os.path.join(
    os.getcwd(), '..'))
sys.path.append(project_root)

from src.data.text_dataloader import TextDataLoader
import torch
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


## load the downloaded datasets. 
## here, we will use the bert-base-multilingual-uncased tokenizer for the tokenization of the text data. 

In [2]:
src_val_data = os.path.join(project_root, 'Datasets', 'raw', 'en-hi', 'opus.en-hi-dev.en')
tgt_val_data = os.path.join(project_root, 'Datasets', 'raw', 'en-hi', 'opus.en-hi-dev.hi')
src_train_data = os.path.join(project_root, 'Datasets', 'raw', 'en-hi', 'opus.en-hi-train.en')
tgt_train_data = os.path.join(project_root, 'Datasets', 'raw', 'en-hi', 'opus.en-hi-train.hi')
src_test_data = os.path.join(project_root, 'Datasets', 'raw', 'en-hi', 'opus.en-hi-test.en')
tgt_test_data = os.path.join(project_root, 'Datasets', 'raw', 'en-hi', 'opus.en-hi-test.hi')

# print the first 10 lines of the source and target data for the dev, train and test data. 
with open(src_val_data, 'r') as f:
    print(f.readlines()[:10], "\n")

with open(tgt_val_data, 'r') as f:
    print(f.readlines()[:10], "\n")

with open(src_train_data, 'r') as f:
    print(f.readlines()[:10], "\n")

with open(tgt_train_data, 'r') as f:
    print(f.readlines()[:10], "\n")

with open(src_test_data, 'r') as f:
    print(f.readlines()[:10], "\n")

with open(tgt_test_data, 'r') as f:
    print(f.readlines()[:10], "\n")



# print the total number of samples in the source and target data for the dev, train and test data. 
print(f"Total number of samples in the source dev data: {len(open(src_val_data, 'r').readlines())}", "\n")
print(f"Total number of samples in the target dev data: {len(open(tgt_val_data, 'r').readlines())}", "\n")
print(f"Total number of samples in the source train data: {len(open(src_train_data, 'r').readlines())}", "\n")
print(f"Total number of samples in the target train data: {len(open(tgt_train_data, 'r').readlines())}", "\n")
print(f"Total number of samples in the source test data: {len(open(src_test_data, 'r').readlines())}", "\n")
print(f"Total number of samples in the target test data: {len(open(tgt_test_data, 'r').readlines())}", "\n")


['No, no, not so fast.\n', ', eject!\n', "I'm Dr. Messa.\n", 'So we notify the cops about big ticket sales and we even keep half a dozen Ukrainian ex-naval commandos in a van outside, just in case it all kicks off.\n', 'receiving what their Lord has given them, for they had been virtuous aforetime.\n', "Default folder to use for the '--add' and '--extract' commands\n", 'Hey, how are you? Beautiful day.\n', 'Dengue is a tropical virus carried by the Aedes aegypti mosquito with no known cure. According to the World Health Organization, about 40 percent of the world’s population is at risk from dengue.\n', 'Show right margin\n', '%s: not enough free space\n'] 

['तुम इतनी आसानी से छूट नहीं सकते.\n', ', बेदखल!\n', 'Messa हूँ.\n', 'तोहमबड़ीटिकटोंकीबिक्रीकेबारे मेंपुलिस सूचित... / मैं ... और हम भी रखना आधा दर्जन यूक्रेनी पूर्व नौसेना कमांडो...\n', 'जो कुछ उनके रब ने उन्हें दिया, वे उसे ले रहे होंगे। निस्संदेह वे इससे पहले उत्तमकारों में से थे\n', '--add और --extract कमान्ड में उपयोग हेतु डिफ

## Initialize and test the test the text dataloader. 

In [7]:
# initialize the text dataloader for the train data
text_train_dataloader = TextDataLoader(src_train_data, tgt_train_data, max_len=512)

# initialize the text dataloader for the validation data
text_val_dataloader = TextDataLoader(src_val_data, tgt_val_data, max_len=512)

# initialize the text dataloader for the test data
text_test_dataloader = TextDataLoader(src_test_data, tgt_test_data, max_len=512)

# load the training data
train_dataloader = text_train_dataloader.load_data()

# load the validation data
val_dataloader = text_val_dataloader.load_data()

# load the testing data
test_dataloader = text_test_dataloader.load_data()

# get the first batch of the training data to inspect
batch = next(iter(train_dataloader))
src_train_input_ids, src_train_attention_mask, train_tgt_input_ids, train_tgt_attention_mask = batch

# print the shapes of the source and target training data
print(f"Shape of the source input ids: {src_train_input_ids.shape}", "\n")
print(f"Shape of the source attention mask: {src_train_attention_mask.shape}", "\n")
print(f"Shape of the target input ids: {train_tgt_input_ids.shape}", "\n")
print(f"Shape of the target attention mask: {train_tgt_attention_mask.shape}", "\n")

# print the first few samples of the source and target training data
print(f"\nFirst few samples of the source input ids:\n{src_train_input_ids[:3]}", "\n")
print(f"\nFirst few samples of the target input ids:\n{train_tgt_input_ids[:3]}", "\n")

# print the first few samples of the source and target attention masks of the training data
print(f"\nFirst few samples of the source attention mask:\n{src_train_attention_mask[:3]}", "\n")
print(f"\nFirst few samples of the target attention mask:\n{train_tgt_attention_mask[:3]}", "\n")

# print the first few samples of the source and target input sequences in the training data
print("\nFirst few samples of the source input sequences:")
for i in range(3):  # Show first 3 sequences
    tokens = text_train_dataloader.tokenizer.convert_ids_to_tokens(src_train_input_ids[i])
    print(f"Sample {i+1}: {' '.join(tokens[:10])}...")  # Show first 10 tokens

print("\nFirst few samples of the target input sequences:")
for i in range(3):  # Show first 3 sequences
    tokens = text_train_dataloader.tokenizer.convert_ids_to_tokens(train_tgt_input_ids[i])
    print(f"Sample {i+1}: {' '.join(tokens[:10])}...")  # Show first 10 tokens

Shape of the source input ids: torch.Size([32, 512]) 

Shape of the source attention mask: torch.Size([32, 512]) 

Shape of the target input ids: torch.Size([32, 512]) 

Shape of the target attention mask: torch.Size([32, 512]) 


First few samples of the source input ids:
tensor([[101, 156, 102,  ...,   0,   0,   0],
        [101, 161, 102,  ...,   0,   0,   0],
        [101, 154, 102,  ...,   0,   0,   0]]) 


First few samples of the target input ids:
tensor([[101, 156, 102,  ...,   0,   0,   0],
        [101, 161, 102,  ...,   0,   0,   0],
        [101, 154, 102,  ...,   0,   0,   0]]) 


First few samples of the source attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 


First few samples of the target attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 


First few samples of the source input sequences:
Sample 1: [CLS] n [SEP] [PAD] [PAD] [PAD] 

## inspecting the validation dataloading

In [8]:
# get the first batch of the validation data to inspect
batch = next(iter(val_dataloader))
src_val_input_ids, src_val_attention_mask, val_tgt_input_ids, val_tgt_attention_mask = batch

# print the shapes of the source and target validation data
print(f"Shape of the source input ids: {src_val_input_ids.shape}", "\n")
print(f"Shape of the source attention mask: {src_val_attention_mask.shape}", "\n")
print(f"Shape of the target input ids: {val_tgt_input_ids.shape}", "\n")
print(f"Shape of the target attention mask: {val_tgt_attention_mask.shape}", "\n")

# print the first few samples of the source and target validation data
print(f"\nFirst few samples of the source input ids:\n{src_val_input_ids[:3]}", "\n")
print(f"\nFirst few samples of the target input ids:\n{val_tgt_input_ids[:3]}", "\n")

# print the first few samples of the source and target attention masks of the validation data
print(f"\nFirst few samples of the source attention mask:\n{src_val_attention_mask[:3]}", "\n")
print(f"\nFirst few samples of the target attention mask:\n{val_tgt_attention_mask[:3]}", "\n")

# print the total number of batches in the validation data


# print the first few samples of the source and target input sequences in the validation data
print("\nFirst few samples of the source input sequences:")
for i in range(3):  # Show first 3 sequences
    tokens = text_val_dataloader.tokenizer.convert_ids_to_tokens(src_val_input_ids[i])
    print(f"Sample {i+1}: {' '.join(tokens[:10])}...")  # Show first 10 tokens

print("\nFirst few samples of the target input sequences:")
for i in range(3):  # Show first 3 sequences
    tokens = text_val_dataloader.tokenizer.convert_ids_to_tokens(val_tgt_input_ids[i])
    print(f"Sample {i+1}: {' '.join(tokens[:10])}...")  # Show first 10 tokens

Shape of the source input ids: torch.Size([32, 512]) 

Shape of the source attention mask: torch.Size([32, 512]) 

Shape of the target input ids: torch.Size([32, 512]) 

Shape of the target attention mask: torch.Size([32, 512]) 


First few samples of the source input ids:
tensor([[101, 165, 102,  ...,   0,   0,   0],
        [101, 157, 102,  ...,   0,   0,   0],
        [101, 146, 102,  ...,   0,   0,   0]]) 


First few samples of the target input ids:
tensor([[101, 165, 102,  ...,   0,   0,   0],
        [101, 157, 102,  ...,   0,   0,   0],
        [101, 146, 102,  ...,   0,   0,   0]]) 


First few samples of the source attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 


First few samples of the target attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 


First few samples of the source input sequences:
Sample 1: [CLS] w [SEP] [PAD] [PAD] [PAD] 

## inspecting the test dataloading

In [9]:
# get the first batch of the training data to inspect
batch = next(iter(train_dataloader))
src_train_input_ids, src_train_attention_mask, train_tgt_input_ids, train_tgt_attention_mask = batch

# print the shapes of the source and target training data
print(f"Shape of the source input ids: {src_train_input_ids.shape}", "\n")
print(f"Shape of the source attention mask: {src_train_attention_mask.shape}", "\n")
print(f"Shape of the target input ids: {train_tgt_input_ids.shape}", "\n")
print(f"Shape of the target attention mask: {train_tgt_attention_mask.shape}", "\n")

# print the first few samples of the source and target training data
print(f"\nFirst few samples of the source input ids:\n{src_train_input_ids[:3]}", "\n")
print(f"\nFirst few samples of the target input ids:\n{train_tgt_input_ids[:3]}", "\n")

# print the first few samples of the source and target attention masks of the training data
print(f"\nFirst few samples of the source attention mask:\n{src_train_attention_mask[:3]}", "\n")
print(f"\nFirst few samples of the target attention mask:\n{train_tgt_attention_mask[:3]}", "\n")

# print the first few samples of the source and target input sequences in the training data
print("\nFirst few samples of the source input sequences:")
for i in range(3):  # Show first 3 sequences
    tokens = text_train_dataloader.tokenizer.convert_ids_to_tokens(src_train_input_ids[i])
    print(f"Sample {i+1}: {' '.join(tokens[:10])}...")  # Show first 10 tokens

print("\nFirst few samples of the target input sequences:")
for i in range(3):  # Show first 3 sequences
    tokens = text_train_dataloader.tokenizer.convert_ids_to_tokens(train_tgt_input_ids[i])
    print(f"Sample {i+1}: {' '.join(tokens[:10])}...")  # Show first 10 tokens

Shape of the source input ids: torch.Size([32, 512]) 

Shape of the source attention mask: torch.Size([32, 512]) 

Shape of the target input ids: torch.Size([32, 512]) 

Shape of the target attention mask: torch.Size([32, 512]) 


First few samples of the source input ids:
tensor([[101, 120, 102,  ...,   0,   0,   0],
        [101, 160, 102,  ...,   0,   0,   0],
        [101, 157, 102,  ...,   0,   0,   0]]) 


First few samples of the target input ids:
tensor([[101, 120, 102,  ...,   0,   0,   0],
        [101, 160, 102,  ...,   0,   0,   0],
        [101, 157, 102,  ...,   0,   0,   0]]) 


First few samples of the source attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 


First few samples of the target attention mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) 


First few samples of the source input sequences:
Sample 1: [CLS] / [SEP] [PAD] [PAD] [PAD] 

In [10]:
# Compute vocabulary size for English and Hindi sentences

# Function to get unique tokens from a dataset
def get_unique_tokens(dataloader):
    unique_tokens = set()
    for batch in dataloader:
        src_input_ids, _, tgt_input_ids, _ = batch
        unique_tokens.update(src_input_ids.unique().tolist())
        unique_tokens.update(tgt_input_ids.unique().tolist())
    return unique_tokens

# Get unique tokens for training and validation sets
train_tokens = get_unique_tokens(train_dataloader)
val_tokens = get_unique_tokens(val_dataloader)

# Combine unique tokens from both sets
all_tokens = train_tokens.union(val_tokens)

# Remove special tokens (assuming IDs 0-3 are special tokens like PAD, UNK, etc.)
vocab_tokens = [token for token in all_tokens if token > 3]

# Print vocabulary size
print(f"Total vocabulary size: {len(vocab_tokens)}")

# Get vocabulary for source (English) and target (Hindi) separately
src_vocab = set()
tgt_vocab = set()

for batch in train_dataloader:
    src_input_ids, _, tgt_input_ids, _ = batch
    src_vocab.update(src_input_ids.unique().tolist())
    tgt_vocab.update(tgt_input_ids.unique().tolist())

for batch in val_dataloader:
    src_input_ids, _, tgt_input_ids, _ = batch
    src_vocab.update(src_input_ids.unique().tolist())
    tgt_vocab.update(tgt_input_ids.unique().tolist())

# Remove special tokens
src_vocab = [token for token in src_vocab if token > 3]
tgt_vocab = [token for token in tgt_vocab if token > 3]

print(f"English vocabulary size: {len(src_vocab)}")
print(f"Hindi vocabulary size: {len(tgt_vocab)}")

# Print some sample tokens
print("\nSample English tokens:")
print(text_train_dataloader.tokenizer.convert_ids_to_tokens(src_vocab[:10]))
print("\nSample Hindi tokens:")
print(text_train_dataloader.tokenizer.convert_ids_to_tokens(tgt_vocab[:10]))


Total vocabulary size: 24
English vocabulary size: 24
Hindi vocabulary size: 24

Sample English tokens:
['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm']

Sample Hindi tokens:
['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm']
