In [1]:
import torch
import torch.nn as nn
import numpy as np
from datasets import Dataset, DatasetDict
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_text(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every returned line keeps its trailing newline (when the file has one);
    stripping is left to the caller.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        lines = list(handle)
    return lines

In [3]:
def create_translation_dataset(english_file, hindi_file):
    """Build a parallel English/Hindi dataset from two line-aligned text files.

    Line i of ``english_file`` is paired with line i of ``hindi_file``; both
    sides are whitespace-stripped. The result has a single ``translation``
    column whose rows are ``{'english': ..., 'hindi': ...}`` dicts.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    en_lines = load_text(english_file)
    hi_lines = load_text(hindi_file)

    # The files are only usable as a parallel corpus if they align line by line.
    if len(en_lines) != len(hi_lines):
        raise ValueError("The number of lines in the English and Hindi files do not match.")

    pairs = []
    for en_line, hi_line in zip(en_lines, hi_lines):
        pairs.append({'english': en_line.strip(), 'hindi': hi_line.strip()})

    return Dataset.from_dict({'translation': pairs})

In [4]:
# Define file paths (relative, so they resolve against the current working directory)
english_file = '../data/v3/v2/en-hi/train.en'
hindi_file = '../data/v3/v2/en-hi/train.hi'

# Create the dataset.
# NOTE: load_text() reads each file completely with readlines(), so both
# corpora are held in memory as Python lists while the dataset is built.
dataset = create_translation_dataset(english_file, hindi_file)

# For demonstration, print the first example
print(dataset[0])

{'translation': {'english': "However, Paes, who was partnering Australia's Paul Hanley, could only go as far as the quarterfinals where they lost to Bhupathi and Knowles", 'hindi': 'आस्ट्रेलिया के पाल हेनली के साथ जोड़ी बनाने वाले पेस मियामी में क्वार्टरफाइनल तक ही पहुंच सके क्योंकि इस दौर में उन्हें भूपति और नोल्स ने हराया था।'}}


In [5]:
# Show the final example, to eyeball that the last lines of the two files still pair up.
print(dataset[-1])

{'translation': {'english': 'and guided them to a straight path.', 'hindi': 'और उनको राहे रास्त की भी ज़रूर हिदायत करते'}}


In [6]:
# Total number of English/Hindi sentence pairs in the dataset.
print(len(dataset))

10125690


In [7]:
# torch's `Dataset` is deliberately NOT imported under its bare name here:
# doing so rebinds `Dataset` (imported from `datasets` in the first cell), and
# create_translation_dataset() would then fail on `Dataset.from_dict` if its
# cell were re-run after this one.
from torch.utils.data import DataLoader, random_split

# Keep 90% for training, 10% for validation
train_ds_size = int(0.9 * len(dataset))
val_ds_size = len(dataset) - train_ds_size

# Seed the split explicitly so that Restart & Run All reproduces the same
# train/validation partition instead of a new random one on every run.
split_generator = torch.Generator().manual_seed(42)
train_ds_raw, val_ds_raw = random_split(
    dataset, [train_ds_size, val_ds_size], generator=split_generator
)

In [8]:
# Number of examples in the training split (int(0.9 * len(dataset))).
len(train_ds_raw)

9113121

In [9]:
# Number of examples in the validation split (the remainder after the training split).
len(val_ds_raw)

1012569

In [10]:
# Peek at the first validation example; it has the same {'translation': {...}} layout as `dataset`.
val_ds_raw[0]

{'translation': {'english': 'Serve with rice or bread.',
  'hindi': 'इसे रोटी या पूरी के साथ खाया जाता है.'}}

In [11]:
from datasets import load_dataset
from datasets import Dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers import Tokenizer, models, pre_tokenizers, trainers 
from tokenizers.pre_tokenizers import Whitespace

def get_or_build_bpe_tokenizer(config, ds, lang):
    """Load the saved tokenizer for ``lang`` from ``../tokenizer_<lang>.json``.

    Despite its name, this function never builds a tokenizer: it only loads
    an existing file. ``config`` and ``ds`` are kept in the signature for
    compatibility with existing callers but are not used.

    Args:
        config: Unused.
        ds: Unused.
        lang: Language code inserted into the file name (e.g. ``'en'``, ``'hi'``).

    Returns:
        The tokenizer returned by ``Tokenizer.from_file`` for that path.

    Raises:
        FileNotFoundError: If the tokenizer file does not exist.
    """
    tokenizer_path = Path('../tokenizer_{0}.json'.format(lang))
    if not tokenizer_path.exists():
        # Raising a plain string is itself an error in Python 3 ("exceptions
        # must derive from BaseException"), which hid the real problem.
        raise FileNotFoundError(
            "Tokenizer file for language {0!r} not found: {1}".format(lang, tokenizer_path)
        )
    return Tokenizer.from_file(str(tokenizer_path))

In [12]:
# Load the English tokenizer from ../tokenizer_en.json.
# config and ds are not used by the loader, hence None.
en_tokenizer = get_or_build_bpe_tokenizer(None, None, 'en')

In [13]:
# Load the Hindi tokenizer from ../tokenizer_hi.json.
# config and ds are not used by the loader, hence None.
hi_tokenizer = get_or_build_bpe_tokenizer(None, None, 'hi')

In [15]:
# Scan the whole corpus for the longest tokenized sentence in each language,
# to inform the choice of context length (the aim is roughly 1k tokens).
# d / d_h collect every index at which the running English / Hindi maximum
# was beaten, so the last entry of each list points at the longest example.
max_len_en, max_len_hi = 0, 0
d, d_h = [], []
for index, data in enumerate(dataset):
    en = len(en_tokenizer.encode(data['translation']['english']).ids)
    hi = len(hi_tokenizer.encode(data['translation']['hindi']).ids)
    if en > max_len_en:
        # New English record: remember where it happened and raise the bar.
        d.append(index)
        max_len_en = en
    if hi > max_len_hi:
        # New Hindi record: remember where it happened and raise the bar.
        d_h.append(index)
        max_len_hi = hi

In [16]:
# Longest tokenized sequence lengths (number of token ids) found for English and Hindi.
print(max_len_en, max_len_hi)

3822 2294


In [17]:
# Indices were appended while iterating enumerate(dataset), so both lists are
# already in ascending order; these sorts do not change them.
d.sort()
d_h.sort()

In [18]:
# Indices at which the running English maximum token length increased.
print(d)

[0, 1, 7, 10, 26, 66, 87, 89, 399, 7749, 10128, 39219, 169572, 196470, 303756, 938082, 2724105, 2958494, 3933023, 6424507, 7053659, 8647465]


In [19]:
# Indices at which the running Hindi maximum token length increased.
print(d_h)

[0, 1, 26, 28, 33, 87, 89, 399, 3716, 7749, 10128, 40303, 121122, 196470, 303756, 651857, 938082, 2074888, 2724105, 6766497]


In [20]:
# d[-1] is the last index at which the English maximum was raised, i.e. the
# example with the longest English token sequence found by the scan.
print(dataset[d[-1]])

{'translation': {'english': 'Come to the Joyful Praisers District Convention WHAT a fine theme has been chosen for the 1995 district conventions: Joyful Praisers ! This is certainly what Jehovahs Witnesses are. Praisers of whom? Why, Jehovah God of course! Jehovah is matchless, peerless, incomparable, unique in so many ways. He is omnipotent, omniscient, perfect in justice, and the personification of love. He is worthy, above all others, to receive our worship and praise. Surely we want to be joyful praisers of him! To help us, the Governing Body of Jehovahs Witnesses has arranged a fine three - day convention program, beginning in the summer of 1995. These three days will certainly be joyful ones, and every Witness of Jehovah will want to do his utmost to be present from the opening song on Friday morning until the closing song and prayer on Sunday afternoon. 1995 District Convention Locations JUNE 2 - 4 JACKSONVILLE, FL, Memorial Coliseum, Gator Bowl Sports Complex. LONG BEACH, CA, L

In [21]:
text = 'The other films in this section include, Venice Film Festival Horizon Award Best Actor winner A Son by Mehdi Barsaoui, India Premiere of And Then We Danced by Levan Akin which had won 9 international awards, Cannes Jury Prize winner Bacurau co-directed by Juliano Dornelles and Kleber Mendona Filho with 9 international awards, India Premiere of Locarno Film Festival winner Echo by Rnar Rnarsson, India Premiere of Silver Berlin Bear Best Director winner I was at Home, But by Angela Schanelec, India Premiere of La belle poque by Nicolas Bedos, India Premiere of Lara by Jan Ole Gerster, Cannes Best Actress winner Little Joe by Jessica Hausner, India Premiere of Karlovy Vary Crystal Globe Best Director winner Patrick by Tim Mielants, India Premiere of Silver Berlin Bear Best Screenplay winner Piranhas by Claudio Giovannesi, India Premiere of Cannes Best Screenplay winner Portrait of a Lady on Fire by Cline Sciamma, Sundance Film Festival Audience award winner with 9 international awards Queen of Hearts by May el-Toukhy, India Premiere of Berlin Silver Bear Best Actor and Actress winner So Long My Son by Xiaoshuai Wang, India Premiere of Serbian film Stitches with 10 international awards directed by Miroslav Terzic, Golden Berlin Bear Best Film winner Synonyms by Nadav Lapid, Most sought after film of the year System Crasher with 10 international awards by Nora Fingscheidt, Tremors by Jayro Bustamante, India Premiere of Crystal Globe Best Film winner at Karlovy Vary Film Festival The Father by Kristina Grozeva and Petar Valchanov and India Premiere of Venice FIPRESCI Prize winner Blanco en blanco directed by Tho Court'

In [22]:
# Encode the sample English passage and keep only the token ids.
ids = en_tokenizer.encode(text).ids

In [23]:
# Print the token ids and how many there are.
# NOTE(review): the many 0 ids in the output are presumably the unknown-token
# id (out-of-vocabulary words) — confirm against the tokenizer's vocabulary.
print(ids)
print(len(ids))

[13, 79, 845, 10, 34, 562, 499, 6, 27933, 2183, 3296, 0, 1834, 1843, 1747, 4065, 80, 4133, 23, 27262, 0, 6, 32, 0, 7, 123, 628, 70, 0, 23, 0, 0, 51, 48, 348, 366, 516, 2555, 6, 16089, 16322, 5967, 4065, 0, 1127, 14, 612, 23, 0, 0, 8, 0, 0, 0, 21, 366, 516, 2555, 6, 32, 0, 7, 0, 2183, 3296, 4065, 0, 23, 0, 0, 6, 32, 0, 7, 4631, 7858, 17729, 1843, 633, 4065, 41, 19, 31, 473, 6, 122, 23, 10962, 0, 6, 32, 0, 7, 8095, 0, 0, 23, 21267, 0, 6, 32, 0, 7, 12395, 23, 3408, 0, 0, 6, 16089, 1843, 2698, 4065, 9791, 4544, 23, 21516, 0, 6, 32, 0, 7, 0, 0, 24917, 17885, 1843, 633, 4065, 16706, 23, 5667, 0, 6, 32, 0, 7, 4631, 7858, 17729, 1843, 0, 4065, 0, 23, 0, 0, 6, 32, 0, 7, 16089, 1843, 0, 4065, 26794, 7, 11, 7818, 16, 2149, 23, 0, 0, 6, 0, 2183, 3296, 28453, 1281, 4065, 21, 366, 516, 2555, 5426, 7, 0, 23, 632, 29578, 14, 0, 6, 32, 0, 7, 7858, 4631, 17729, 1843, 1747, 8, 2698, 4065, 178, 5841, 591, 4133, 23, 0, 10702, 6, 32, 0, 7, 22437, 119, 0, 21, 156, 516, 2555, 612, 23, 0, 0, 6, 5722, 7858, 177

In [24]:
# Decode a single id back to text as a spot check; 13 is the first id of the encoded sample.
en_tokenizer.decode([13])

'The'