# Prepare data set for training and test

In [1]:
import os
import sys

In [2]:
current_path = os.path.abspath(os.path.join('.'))
module_path = os.path.abspath(os.path.join('..'))


In [3]:
print(module_path)

In [4]:
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
import data_prep;
import utils;

In [6]:
data_en, data_hi = data_prep.loadParallelCorpus('../../data/parallel/IITB.en-hi.en', '../../data/parallel/IITB.en-hi.hi')

In [7]:
en_rows=data_en.split("\n")
hi_rows=data_hi.split("\n")
(len(en_rows),len(hi_rows))

(1561841, 1561841)

In [8]:
# 3. Shuffle corpora. The sentences in the corpora are not mixed up in the original order
en_rows_all, hi_rows_all = data_prep.shuffle(en_rows, hi_rows)


In [9]:
def prepare_dataset(en_sentences, hi_sentences):
    en_sentences = [ line.lower() for line in en_sentences]
    
    en_vocab_dict = data_prep.buildEngVocab(en_sentences)
    hi_vocab_dict = data_prep.buildHinVocab(hi_sentences)

    en_vocab = list(map(lambda x: x[0], sorted(en_vocab_dict.items(), key=lambda x: -x[1])))
    hi_vocab = list(map(lambda x: x[0], sorted(hi_vocab_dict.items(), key=lambda x: -x[1])))
    
    # Zipf's law
    # https://openreview.net/pdf?id=Bk8N0RLxx - limit vocab to 50k
    if (len(en_vocab) > 50000):
      en_vocab = en_vocab[:50000]
    if (len(hi_vocab) > 50000):
      hi_vocab = hi_vocab[:50000]

    # Build a Word to Index Dictionary for English
    start_idx = 4
    en_word2idx = dict([(word, idx + start_idx) for idx, word in enumerate(en_vocab)])
    en_word2idx['<ukn>'] = 0  # Unknown words
    en_word2idx['<start>'] = 1
    en_word2idx['<end>'] = 2  # End of sentence
    en_word2idx['<pad>'] = 3  # Padding

    en_vocab.append('<ukn>');
    en_vocab.append('<start>');
    en_vocab.append('<end>');
    en_vocab.append('<pad>');
    
    # Build reverse Index to Word Dictionary for English using the already created Word to Index Dictionary
    en_idx2word = dict([(idx, word) for word, idx in en_word2idx.items()])

    # Build a Word to Index Dictionary for Hindi
    start_idx = 4
    hi_word2idx = dict([(word, idx + start_idx) for idx, word in enumerate(hi_vocab)])
    hi_word2idx['<ukn>'] = 0  # Unknown
    hi_word2idx['<start>'] = 1
    hi_word2idx['<end>'] = 2  # End of sentence
    hi_word2idx['<pad>'] = 3  # Padding

    hi_vocab.append('<ukn>');
    hi_vocab.append('<start>');
    hi_vocab.append('<end>');
    hi_vocab.append('<pad>');

    
    # Build the inverse Index to Word Dictionary for Hindi using the already created Word to Index Dictionary
    hi_idx2word = dict([(idx, word) for word, idx in hi_word2idx.items()])

    # Encode words in senteces by their index in Vocabulary
    x = [[en_word2idx.get(word.strip(',." ;:)(][?!'), 3) for word in sentence.split()] for sentence in en_sentences]
    y = [[hi_word2idx.get(word.strip(',." ;:)(।|][?!'), 3) for word in sentence.split()] for sentence in hi_sentences]

    X = []
    Y = []
    for i in range(len(x)):
        n1 = len(x[i])
        n2 = len(y[i])
        n = n1 if n1 < n2 else n2
        if abs(n1 - n2) < 0.3 * n:
            if n1 <= max_sent_length and n2 <= max_sent_length:
                X.append(x[i])
                Y.append(y[i])

    return X, Y, en_word2idx, en_idx2word, en_vocab, hi_word2idx, hi_idx2word, hi_vocab

In [10]:
max_sent_length = 15

In [12]:
dataset_save_location = "out/parallel_trainv1.p"
utils.save_pickle(dataset_save_location, prepare_dataset(en_rows_all, hi_rows_all))

In [13]:
# let us load and check for sanity
X_all, Y_all, en_word2idx_all, en_idx2word_all, en_vocab_all, hi_word2idx_all, hi_idx2word_all, hi_vocab_all = utils.load_pickle_dataset(dataset_save_location)
len(X_all), len(Y_all), len(en_vocab_all), len(hi_vocab_all)

(704153, 704153, 50004, 50004)

In [14]:
for n in range(10):
    data_prep.printSentence(X_all[n], en_idx2word_all)
    print("\n")
    data_prep.printSentence(Y_all[n], hi_idx2word_all)
    print("\n")

linguistics a form of humanity <pad> - linguistics is called 

इसका एक रूप मानवजाति भाषा विज्ञान <pad> कहलाता है 

> 2 - <pad> <pad> subject to minimum net premium of <pad> payable by farmer 

<pad> <pad> <pad> बशर्ते <pad> न्यूनतम निवल प्रीमियम कृषक द्वारा देय हो 

<pad> 

<pad> 

<pad> da <pad> 

<pad> दा <pad> 

united kingdom 

यूनाइटेड <pad> 

utc 

UTC 

& collapse thread 

à¤¯à¤¹à¤¾à¤ à¤à¤¿à¤¸à¤à¤¾à¤à¤ M 

nothing happens unless first a dream ” - carl <pad> 1878 - 1967 poet 

“कोई सपना देखे बिना कुछ नहीं होता ”-कार्ल <pad> <pad> कवि 

a special purpose vehicle has been constituted to take up highway projects 

राजमार्ग परियोजनाओं के निर्माण के लिए एक विशेष प्रयोजन <pad> का सृजन किया गया है 

i end up at five 

मैं ५ पर जाकर <pad> 

