In [1]:
import os
import nltk
import torch
import json
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


In [2]:
english_text = "../data/train.en/train.en"
hindi_text = "../data/train.hi/train.hi"

Tokenize data functions

In [3]:
def split_and_add_special(text: str) -> list[str]:
    tokens = text.split()
    tokens = ['sos'] + tokens + ['eos']
    return tokens

def flatten_and_unique(text_list: list[str]):
    tokens_list = [split_and_add_special(t) for t in text_list]
    vocabulary = set(token for tokens in tokens_list for token in tokens)
    
    # Map tokens to integers
    vocab_to_index = {token: idx for idx, token in enumerate(vocabulary)}
    return vocab_to_index

def indexed_tokens(text_list: list[str]):
    vocab_map = flatten_and_unique(text_list)
    vocab_map = {key: value + 1 for key, value in vocab_map.items()}
    
    tokens_list = [split_and_add_special(t) for t in text_list]
    indexed_tokens = [[vocab_map[token] for token in tokens] for tokens in tokens_list]
    return indexed_tokens, vocab_map

def indexed_tokens_per_text(text: str, vocab_map):
    tokens_list = split_and_add_special(text)
    indexed_tokens = [vocab_map[token] for token in tokens_list]
    return indexed_tokens
    

Test Train Split

In [4]:
with open(english_text, 'r') as f:
    english_data = f.readlines()
    
with open(hindi_text, 'r', encoding='utf-8') as f:
    hindi_data = f.readlines()


In [5]:
english_data = english_data[0:10000]
hindi_data = hindi_data[0:10000]

In [6]:
_, english_map = indexed_tokens(english_data)
_, hindi_map = indexed_tokens(hindi_data)

with open('../data/english_map.json', 'w', encoding='utf-8') as f:
        json.dump(english_map, f, ensure_ascii=False, indent=4)
        
with open('../data/hindi_map.json', 'w', encoding='utf-8') as f:
        json.dump(hindi_map, f, ensure_ascii=False, indent=4)

In [7]:
def data_split(x, y, split_pct: dict):
    length = len(x)
    
    train_index_start = 0
    train_index_end = int(split_pct['train'] * length)
    x_train, y_train = x[train_index_start: train_index_end], y[train_index_start: train_index_end]
    
    val_index_start = train_index_end
    val_index_end = int(split_pct['validation'] * length + train_index_end)
    x_val, y_val = x[val_index_start: val_index_end], y[val_index_start: val_index_end]
    
    test_index_start = val_index_end
    test_index_end = int(split_pct['test'] * length + val_index_end)
    x_test, y_test = x[test_index_start: test_index_end], y[test_index_start: test_index_end]
    
    return {
        "train":
            {
                "x": x_train,
                "y": y_train
            },
        "test": 
            {
                "x": x_test,
                "y": y_test
            },
        "validation":
            {
                "x": x_val,
                "y": y_val
            }
    }
    
split_pct = {
    "train": 0.7,
    "validation": 0.15,
    "test": 0.15
}
data = data_split(english_data, hindi_data, split_pct)

In [8]:
for key, value in data.items():
    with open(f'../data/{key}.json', 'w', encoding='utf-8') as f:
        json.dump(value, f, ensure_ascii=False, indent=4)

In [185]:
def collate_fn(batch):
    x_batch, y_batch = zip(*batch)

    # Find maximum sequence length (tokens) in x_batch and y_batch
    max_tokens_x = max(len(x) for x in x_batch)
    max_tokens_y = max(y.size(0) for y in y_batch)
    one_hot_size = y_batch[0].shape[1]  # Dimension of one-hot encoding

    # Pad x_batch with zeros (assuming 0 is the padding index for tokens)
    x_padded = torch.stack([
        torch.nn.functional.pad(
            x.clone(),
            (0, max_tokens_x - len(x)),
            value=0  # Padding index
        )
        for x in x_batch
    ])

    # Pad y_batch with zeros along the token dimension
    y_padded = torch.stack([
        torch.nn.functional.pad(
            y,
            (0, 0, 0, max_tokens_y - y.shape[0]),  # Padding tokens dimension only
            value=0  # Padding with zeros
        )
        for y in y_batch
    ])

    return x_padded, y_padded

class DatasetLanguage(Dataset):
    def __init__(self, data_path: str, x_vocab: str, y_vocab: str):
        self.data_path = data_path
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
            self.x_data = self.data['x']
            self.y_data = self.data['y']
            
        # load vocab map
        with open(x_vocab, 'r', encoding='utf-8') as f:
            self.x_vocab = json.load(f)
        
        with open(y_vocab, 'r', encoding='utf-8') as f:
            self.y_vocab = json.load(f)
            self.y_output_classes = len(self.y_vocab)
            print(self.y_output_classes)
                
    def get_one_hot_encoding(self, y):
        return torch.nn.functional.one_hot(y, num_classes=self.y_output_classes)
        
    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self, idx):
        x_data = self.x_data[idx]
        x_data = indexed_tokens_per_text(x_data, self.x_vocab)
        
        y_data = self.y_data[idx]
        y_data = indexed_tokens_per_text(y_data, self.y_vocab)
        y_data_one_hot = self.get_one_hot_encoding(torch.tensor(y_data))
        
        y_output = {
            'y': torch.tensor(y_data),
            'y_data_one_hot': y_data_one_hot
        }
        
        return torch.tensor(x_data), y_output

In [186]:
training_data = DatasetLanguage(data_path='../data/validation.json',
                                x_vocab='../data/english_map.json',
                                y_vocab='../data/hindi_map.json')

27473


In [189]:
train_dataloader = DataLoader(training_data, batch_size=10, shuffle=True, collate_fn=collate_fn)

In [190]:
train_features, train_labels = next(iter(train_dataloader))

torch.Size([22, 27473])
torch.Size([39, 27473])
torch.Size([26, 27473])
torch.Size([6, 27473])
torch.Size([12, 27473])
torch.Size([15, 27473])
torch.Size([14, 27473])
torch.Size([12, 27473])
torch.Size([19, 27473])
torch.Size([6, 27473])


In [191]:
train_features.size()

torch.Size([10, 32])

In [192]:
train_labels.size()

torch.Size([10, 39, 27473])