# Prepare data set for training and test

In [1]:
import os
import sys

In [2]:
current_path = os.path.abspath(os.path.join('.'))
module_path = os.path.abspath(os.path.join('..'))


In [3]:
print(module_path)

/Users/prabhat_tripathi/dev/Project/NLP/models


In [4]:
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
import data_prep;
import utils;

In [6]:
data_en, data_hi = data_prep.loadParallelCorpus('../../data/parallel/IITB.en-hi.en', '../../data/parallel/IITB.en-hi.hi')

In [7]:
en_rows=data_en.split("\n")
hi_rows=data_hi.split("\n")
(len(en_rows),len(hi_rows))

(1561841, 1561841)

In [8]:
# 1. remove sentenses with more than half non-alpha numeric characters
to_remove = [i for i, val in enumerate(en_rows) if data_prep.isNonAlpha(val) == True]
print(len(to_remove), " sentences bad. Removing them...")

# process in reverse to avoid recomputing offsets
for index in reversed(to_remove):
    del en_rows[index]
    del hi_rows[index]
    
    

12565  sentences bad. Removing them...


In [9]:
# 3. Shuffle corpora. The sentences in the corpora are not mixed up in the original order
en_rows_all, hi_rows_all = data_prep.shuffle(en_rows, hi_rows)


In [10]:
def prepare_dataset(en_sentences, hi_sentences):
    en_sentences = [ line.lower() for line in en_sentences]
    
    en_vocab_dict = data_prep.buildEngVocab(en_sentences)
    hi_vocab_dict = data_prep.buildHinVocab(hi_sentences)

    en_vocab = list(map(lambda x: x[0], sorted(en_vocab_dict.items(), key=lambda x: -x[1])))
    hi_vocab = list(map(lambda x: x[0], sorted(hi_vocab_dict.items(), key=lambda x: -x[1])))
    
    # Zipf's law
    # https://openreview.net/pdf?id=Bk8N0RLxx - limit vocab to 50k
    if (len(en_vocab) > 50000):
      en_vocab = en_vocab[:50000]
    if (len(hi_vocab) > 50000):
      hi_vocab = hi_vocab[:50000]

    # Build a Word to Index Dictionary for English
    start_idx = 4
    en_word2idx = dict([(word, idx + start_idx) for idx, word in enumerate(en_vocab)])
    en_word2idx['<pad>'] = 0  # Padding
    en_word2idx['<start>'] = 1
    en_word2idx['<end>'] = 2  # End of sentence
    en_word2idx['<ukn>'] = 3  # Unknown words
   

    en_vocab.append('<ukn>');
    en_vocab.append('<start>');
    en_vocab.append('<end>');
    en_vocab.append('<pad>');
    
    # Build reverse Index to Word Dictionary for English using the already created Word to Index Dictionary
    en_idx2word = dict([(idx, word) for word, idx in en_word2idx.items()])

    # Build a Word to Index Dictionary for Hindi
    start_idx = 4
    hi_word2idx = dict([(word, idx + start_idx) for idx, word in enumerate(hi_vocab)])
    hi_word2idx['<pad>'] = 0  # Padding
    hi_word2idx['<start>'] = 1
    hi_word2idx['<end>'] = 2  # End of sentence
    hi_word2idx['<ukn>'] = 3  # Unknown
   

    hi_vocab.append('<ukn>');
    hi_vocab.append('<start>');
    hi_vocab.append('<end>');
    hi_vocab.append('<pad>');

    
    # Build the inverse Index to Word Dictionary for Hindi using the already created Word to Index Dictionary
    hi_idx2word = dict([(idx, word) for word, idx in hi_word2idx.items()])

    # Encode words in senteces by their index in Vocabulary
    x = [[en_word2idx.get(word.strip(',." ;:)(][?!'), 3) for word in sentence.split()] for sentence in en_sentences]
    y = [[hi_word2idx.get(word.strip(',." ;:)(।|][?!'), 3) for word in sentence.split()] for sentence in hi_sentences]

    X = []
    Y = []
    for i in range(len(x)):
        n1 = len(x[i])
        n2 = len(y[i])
        n = n1 if n1 < n2 else n2
        if abs(n1 - n2) < 0.3 * n:
            if n1 <= max_sent_length and n2 <= max_sent_length:
              # ignore single word sentences with only unknown word
              if (n1 > 1 or (x[i][0] != 0 and x[i][0] != en_word2idx[''])) and (n2 > 1 or (y[i][0] != 0 and y[i][0] != hi_word2idx[''])):
                X.append(x[i])
                Y.append(y[i])

    return X, Y, en_word2idx, en_idx2word, en_vocab, hi_word2idx, hi_idx2word, hi_vocab

In [11]:
max_sent_length = 15

In [14]:
dataset_save_location = "out/parallel_trainv3.p"
utils.save_pickle(dataset_save_location, prepare_dataset(en_rows_all, hi_rows_all))

In [15]:
# let us load and check for sanity
X_all, Y_all, en_word2idx_all, en_idx2word_all, en_vocab_all, hi_word2idx_all, hi_idx2word_all, hi_vocab_all = utils.load_pickle_dataset(dataset_save_location)
len(X_all), len(Y_all), len(en_vocab_all), len(hi_vocab_all)

(696695, 696695, 50004, 50004)

In [16]:
for n in range(10):
    data_prep.printSentence(X_all[n], en_idx2word_all)
    print("\n")
    data_prep.printSentence(Y_all[n], hi_idx2word_all)
    print("\n")

jejune 

अपर्याप्त 

add 

जोड़ें 

yea we are able to make complete his very fingertips 

हम इस पर क़ादिर हैं कि हम उसकी पोर पोर दुरूस्त करें 

back up complete 

बैक अप पूर्ण 

information education and communication activities iec 

सूचना शिक्षा और संचार कार्यक्रम आईईसी 


चेतावनीः 

the task was difficult since indians were utterly unfamiliar with modern political work 

क्योंकि भारतीय नये राजनीतिक कार्यकलापों से बिल्कुल अपरिचित थे अतः यह कार्य कठिन था 

that was the level of rhetoric 

इस स्तर पर चीख-पुकार <ukn> गयी थी 

mutton egg butter curds and food prepared with oil should be <ukn> 

माँस अंडे मक्खन दही एवं तेल में पकाये गये खाद्य पदार्थ न दें 

<ukn> singh 

<ukn> सिंह 

