In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import numpy as np

In [3]:
english_file = '/content/drive/MyDrive/data/english.txt'
nepali_file = '/content/drive/MyDrive/data/nepali.txt'

# To start a sentence
START_TOKEN = ''

# To end a sentence
END_TOKEN = ''

# To pad the sentences to make them all of equal length
PADDING_TOKEN = ''

In [4]:
english_vocabulary = [
    START_TOKEN, ' ', '!', ':', ';', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', '<', '=', '>', '?', '@',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z',
    '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
    'y', 'z',
    '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN
]


nepali_vocabulary = [
    START_TOKEN, ' ', '!', ':', ';', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', '<', '=', '>', '?', 'ˌ',
    'ँ', 'आ', 'इ', 'ा', 'ि', 'ी', 'ु', 'ू',
    'क', 'ख', 'ग', 'घ', 'ङ',
    'च', 'छ', 'ज', 'झ', 'ञ',
    'ट', 'ठ', 'ड', 'ढ', 'ण',
    'त', 'थ', 'द', 'ध', 'न',
    'प', 'फ', 'ब', 'भ', 'म',
    'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह',
    '಼', 'ಽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॆ', 'े', 'ै', 'ो', 'ौ', '्', 'ॕ', 'ॖ', 'फ़', 'ॣ', 'ं', 'ः',
    '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', PADDING_TOKEN, END_TOKEN
]

In [5]:
text = 'पुस्तकालय'
list(text)

['प', 'ु', 'स', '्', 'त', 'क', 'ा', 'ल', 'य']

In [6]:
# Creating a dictionary that maps words to numbers and vice versa
index_to_nepali = {k:v for k,v in enumerate(nepali_vocabulary)}
nepali_to_index = {v:k for k,v in enumerate(nepali_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [7]:
with open(english_file, 'r') as file:
    english_sentences = file.readlines()
with open(nepali_file, 'r') as file:
    nepali_sentences = file.readlines()

# Limiting the number of sentences
TOTAL_SENTENCES = 92000
english_sentences = english_sentences[:TOTAL_SENTENCES]
nepali_sentences = nepali_sentences[:TOTAL_SENTENCES]

In [8]:
# Remove the "/n" after the end of every sentences
english_sentences = [sentence.rstrip('\n') for sentence in english_sentences]
nepali_sentences = [sentence.rstrip('\n') for sentence in nepali_sentences]

In [9]:
english_sentences[:10]

['York is a collegiate university and every student is allocated to one of the university’s nine colleges. The ninth college was founded in 2014 and was named Constantine after the Roman emperor Constantine the Great, who was proclaimed Augustus in York in 306 AD.',
 '40:8 All my enemies were whispering against me . They were thinking up evils against me .',
 'During the Presidency of Edmund J . James (1904-1920), James is credited for building the foundation of the large Chinese international student population on campus . James established ties with China through the Chinese Minister to the United States Wu Ting-Fang In addition, during James’s presidency, class rivalries and Bob Zuppke’s winning football teams contributed to campus morale . On June 11, 1929, the Alma Mater statue was unveiled . The Alma Mater was established by donations by the Alumni Fund and the classes of 1923-1929.',
 'These sequences are intercepted by the ANSI.SYS driver which is loaded in CONFIG.SYS on a numb

In [10]:
nepali_sentences[:10]

['न्यूयोर्क एक collegiate विश्वविद्यालय छ र हरेक विद्यार्थी विश्वविद्यालय गरेको नौ कलेज को एक विनियोजन गरिएको छ. नवौं कलेज मा स्थापित भएको थियो 2014 र रोमन सम्राट Constantine महान् पछि नाम थियो Constantine, जो मा योर्क अगस्तस घोषणा भएको थियो 306 ई. त्यहाँ निकट भविष्यमा दशौं कलेज निर्माण गर्ने योजना हो.',
 '40:8 मेरो सबै शत्रुलाई मेरो विरुद्धमा कानेखुसी थिए. तिनीहरू मेरो विरूद्धमा अप दुष्कर्मलाई सोच थियो.',
 'पुस्तकालय, जसमा स्कूल संग खोलियो 1868, सुरु 1,039 मात्रा. पछि, PresidentEdmund जे. जेम्स, मा न्यासी को बोर्ड एक बोलीमा 1912, एक अनुसन्धान पुस्तकालय बनाउन प्रस्तावित. यसलाई अहिले संसारको सबैभन्दा ठूलो सार्वजनिक शैक्षिक संग्रह को छ. मा 1870, को Mumford घरको विद्यालयको प्रयोगात्मक खेत लागि एक मोडेल गोठ रूपमा निर्माण गरिएको थियो. को Mumford हाउस परिसर मा पुरानो संरचना रहन्छ. मूल विश्वविद्यालय हल (1871) को 4th भवन बनाइएको थियो; आज Illini संघ जहाँ खडा यसलाई उभिए.',
 'यी दृश्यहरु पीसी गरेको एक नम्बर मा CONFIG.SYS मा लोड जो ANSI.SYS चालक अन्तर्खण्ड छन्। यो ट्रोजन अन केही कुञ्जीहरू redefine

In [11]:
max(len(x) for x in nepali_sentences), max(len(x) for x in english_sentences)

(1023, 1024)

In [12]:
PERCENTILE = 99
print( f"{PERCENTILE}th percentile length Nepali: {np.percentile([len(x) for x in nepali_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

99th percentile length Nepali: 696.0
99th percentile length English: 914.0


In [13]:
# Filter out data based on maximum sequence length and presence of token in vocab
max_sequence_length = 200

def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(nepali_sentences)):
    nepali_sentence, english_sentence = nepali_sentences[index], english_sentences[index]
    if is_valid_length(nepali_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(nepali_sentence, nepali_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(nepali_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 92000
Number of valid sentences: 6836


In [14]:
nepali_sentences = [nepali_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]

In [15]:
nepali_sentences[:3]

['मास्को मा स्पेस मौसम को तिहार.',
 'कार्यालयहरू, काम गर्ने कोठाहरू, प्रतिनिधि कार्यालयहरू 300',
 'विद्यालयको फोन नम्बरः ०२२२२४६५०९']

In [16]:
english_sentences[:3]

['Space weather Festival in Moscow.',
 'Offices, working rooms, offices of representation 300',
 'TEL: 022-224-6509 FAX: 022-224-6517 Establishment: July, 1992']

In [17]:
# Creating a Dataloader class
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, nepali_sentences):
        self.english_sentences = english_sentences
        self.nepali_sentences = nepali_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.nepali_sentences[idx]

In [18]:
dataset = TextDataset(english_sentences, nepali_sentences)

In [19]:
len(dataset)

6836

In [20]:
dataset[1]

('Offices, working rooms, offices of representation 300',
 'कार्यालयहरू, काम गर्ने कोठाहरू, प्रतिनिधि कार्यालयहरू 300')

In [21]:
batch_size = 3
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [22]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('Space weather Festival in Moscow.', 'Offices, working rooms, offices of representation 300', 'TEL: 022-224-6509 FAX: 022-224-6517 Establishment: July, 1992'), ('मास्को मा स्पेस मौसम को तिहार.', 'कार्यालयहरू, काम गर्ने कोठाहरू, प्रतिनिधि कार्यालयहरू 300', 'विद्यालयको फोन नम्बरः ०२२२२४६५०९')]
[('Department of Ancient and Medieval History', '27:6 The furnace tests the potter’s vessels, and the trial of the tribulation tests just men.', '27:6 The furnace tests the potter’s vessels, and the trial of the tribulation tests just men .'), ('प्राचीन र मध्यकालीन इतिहास विभाग', '27:6 आगोको भट्टी को पटर गरेको जहाजहरु परीक्षण, र कष्ट को परीक्षण बस मानिसहरू परीक्षण.', '27:6 आगोको भट्टी को पटर गरेको जहाजहरु परीक्षण, र कष्ट को परीक्षण बस मानिसहरू परीक्षण.')]
[('in National', 'AMANKOUA, Cotonou I am a man, 49 years old, seeking a woman from 26 till 46', '5 March 2016 / 0 Comments / in 1989, PILOTS / by iron For Giampiero Fatemian the Paris-Dakar race ended prematurely for a bad ruzzolone that did kee

In [23]:
def tokenize(sentence, language_to_index, start_token=True, end_token=True):
    sentence_word_indicies = [language_to_index[token] for token in list(sentence)]
    if start_token:
        sentence_word_indicies.insert(0, language_to_index[START_TOKEN])
    if end_token:
        sentence_word_indicies.append(language_to_index[END_TOKEN])
    for _ in range(len(sentence_word_indicies), max_sequence_length):
        sentence_word_indicies.append(language_to_index[PADDING_TOKEN])
    return torch.tensor(sentence_word_indicies)

In [24]:
batch

[('32:14 Then I will cause their waters to be very pure, and their rivers to be like oil, says the Lord God,',
  'Photo Gallery Purnabiram The Profile of our Honors &amp; Awards',
  'Double sheets lead curtain Double sheets lead screen'),
 ('32:14 त्यसपछि म आफ्नो पानी धेरै शुद्ध हुन गर्नाले, र आफ्नो नदीहरू तेल जस्तै हुन, प्रभु परमेश्वर भन्नुहुन्छ,',
  'हाम्रा सम्मान र पुरस्कारको प्रोफाइल',
  'डबल पानाहरू नेतृत्व स्क्रिन')]

In [25]:
eng_tokenized, ne_tokenized = [], []
for sentence_num in range(batch_size):
    eng_sentence, ne_sentence = batch[0][sentence_num], batch[1][sentence_num]
    eng_tokenized.append( tokenize(eng_sentence, english_to_index, start_token=False, end_token=False) )
    ne_tokenized.append( tokenize(ne_sentence, nepali_to_index, start_token=True, end_token=True) )
eng_tokenized = torch.stack(eng_tokenized)
ne_tokenized = torch.stack(ne_tokenized)

In [26]:
eng_tokenized

tensor([[22, 21, 29, 20, 23,  1, 54, 74, 71, 80,  1, 43,  1, 89, 75, 78, 78,  1,
         69, 67, 87, 85, 71,  1, 86, 74, 71, 75, 84,  1, 89, 67, 86, 71, 84, 85,
          1, 86, 81,  1, 68, 71,  1, 88, 71, 84, 91,  1, 82, 87, 84, 71, 15,  1,
         67, 80, 70,  1, 86, 74, 71, 75, 84,  1, 84, 75, 88, 71, 84, 85,  1, 86,
         81,  1, 68, 71,  1, 78, 75, 77, 71,  1, 81, 75, 78, 15,  1, 85, 67, 91,
         85,  1, 86, 74, 71,  1, 46, 81, 84, 70,  1, 41, 81, 70, 15, 98, 98, 98,
         98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
         98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
         98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
         98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
         98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
         98, 98],
        [50, 74, 81, 86, 81,  1, 41, 67, 78, 78, 71, 84, 91,  1, 50, 87, 84, 80,
         6

In [27]:
ne_tokenized

tensor([[108,  22,  21,  29,  20,  23,   1,  58,  90,  68,  74,  63,  49,  79,
           1,  67,   1,  36,  64,  90,  62,  88,   1,  63,  78,  62,  80,   1,
          61,  86,  69,  87,   1,  72,  81,  60,  90,  61,   1,  75,  81,  62,
           1,  45,  69,  90,  62,  78,  70,  86,  15,   1,  69,   1,  36,  64,
          90,  62,  88,   1,  62,  60,  80,  75,  69,  82,   1,  58,  86,  70,
           1,  50,  74,  90,  58,  87,   1,  75,  81,  62,  15,   1,  63,  90,
          69,  66,  81,   1,  63,  69,  67,  86,  72,  90,  71,  69,   1,  66,
          62,  90,  62,  81,  75,  81,  62,  90,  49,  15, 108, 108, 108, 108,
         108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
         108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
         108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
         108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
         108, 108, 108, 108, 108, 108, 108, 108, 108

In [28]:
NEG_INFTY = -1e9

# There will be the usual mask in decoder that only looks at current and previous words

# There will also be a padding mask in both encoder and decoder that will ignore the padding and everything associated with it

def create_masks(eng_batch, ne_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, ne_sentence_length = len(eng_batch[idx]), len(ne_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      ne_chars_to_padding_mask = np.arange(ne_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, ne_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, ne_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, ne_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: {encoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: {decoder_self_attention_mask[0, :10, :10]}")
    print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: {decoder_cross_attention_mask[0, :10, :10]}")
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [29]:
create_masks(batch[0], batch[1])

encoder_self_attention_mask torch.Size([3, 200, 200]): tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
decoder_self_attention_mask torch.Size([3, 200, 200]): tensor([[ 0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
         -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -1.0000e+09, -1.0000e+09,
         -1.0000e

(tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09]],
 
         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.0000e+09,
           -1.0000e+09, -1.0000e+09],
          ...,
    