# Prepare data set for training and test

In [29]:
import os
import sys

In [2]:
current_path = os.path.abspath(os.path.join('.'))
module_path = os.path.abspath(os.path.join('..'))


In [3]:
print(module_path)

/Users/prabhat_tripathi/dev/Project/NLP/models


In [4]:
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
INDIC_NLP_LIB_HOME=current_path + "/indic_nlp_lib"
INDIC_NLP_RESOURCES=current_path + "/indic_nlp_resources"

In [6]:
sys.path.insert(0,'{}/src'.format(INDIC_NLP_LIB_HOME))

In [7]:
print(sys.path)

['/Users/prabhat_tripathi/dev/Project/NLP/models/model4/indic_nlp_lib/src', '', '/anaconda3/lib/python36.zip', '/anaconda3/lib/python3.6', '/anaconda3/lib/python3.6/lib-dynload', '/anaconda3/lib/python3.6/site-packages', '/anaconda3/lib/python3.6/site-packages/aeosa', '/anaconda3/lib/python3.6/site-packages/IPython/extensions', '/Users/prabhat_tripathi/.ipython', '/Users/prabhat_tripathi/dev/Project/NLP/models']


In [8]:
import data_prep;
import utils;

In [9]:
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

In [10]:
from indicnlp import loader
loader.load()

  ALL_PHONETIC_VECTORS= ALL_PHONETIC_DATA.ix[:,PHONETIC_VECTOR_START_OFFSET:].as_matrix()
  TAMIL_PHONETIC_VECTORS=TAMIL_PHONETIC_DATA.ix[:,PHONETIC_VECTOR_START_OFFSET:].as_matrix()
  ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.ix[:,PHONETIC_VECTOR_START_OFFSET:].as_matrix()


In [11]:
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
remove_nuktas=True
factory=IndicNormalizerFactory()
normalizer=factory.get_normalizer("hi",remove_nuktas)

In [12]:
input_text=u"\u0958 \u0915\u093c"
output_text=normalizer.normalize(input_text)
print("input:", input_text, " output (clean):", output_text)

input: क़ क़  output (clean): क क


In [13]:
data_en, data_hi = data_prep.loadParallelCorpus('../../data/parallel/IITB.en-hi.en', '../../data/parallel/IITB.en-hi.hi')

In [14]:
en_rows=data_en.split("\n")
hi_rows=data_hi.split("\n")
(len(en_rows),len(hi_rows))

(1561841, 1561841)

In [15]:
# 1. remove sentenses with more than half non-alpha numeric characters
to_remove = [i for i, val in enumerate(en_rows) if data_prep.isNonAlpha(val) == True]
print(len(to_remove), " sentences bad. Removing them...")

# process in reverse to avoid recomputing offsets
for index in reversed(to_remove):
    del en_rows[index]
    del hi_rows[index]
    
    

12565  sentences bad. Removing them...


In [16]:
# 2. Normalize hindi Nukta characters
hi_rows = [normalizer.normalize(x) for x in hi_rows]

In [17]:
# 3. Shuffle corpora. The sentences in the corpora are not mixed up in the original order
en_rows_all, hi_rows_all = data_prep.shuffle(en_rows, hi_rows)


In [18]:
def prepare_dataset(en_sentences, hi_sentences):
    en_sentences = [ line.lower() for line in en_sentences]
    
    en_vocab_dict = data_prep.buildEngVocab(en_sentences)
    hi_vocab_dict = data_prep.buildHinVocab(hi_sentences)

    en_vocab = list(map(lambda x: x[0], sorted(en_vocab_dict.items(), key=lambda x: -x[1])))
    hi_vocab = list(map(lambda x: x[0], sorted(hi_vocab_dict.items(), key=lambda x: -x[1])))
    
    # Zipf's law
    # https://openreview.net/pdf?id=Bk8N0RLxx - limit vocab to 50k
    if (len(en_vocab) > 50000):
      en_vocab = en_vocab[:50000]
    if (len(hi_vocab) > 50000):
      hi_vocab = hi_vocab[:50000]

    # Build a Word to Index Dictionary for English
    start_idx = 4
    en_word2idx = dict([(word, idx + start_idx) for idx, word in enumerate(en_vocab)])
    en_word2idx['<pad>'] = 0  # Padding
    en_word2idx['<start>'] = 1
    en_word2idx['<end>'] = 2  # End of sentence
    en_word2idx['<ukn>'] = 3  # Unknown words
   

    en_vocab.append('<ukn>');
    en_vocab.append('<start>');
    en_vocab.append('<end>');
    en_vocab.append('<pad>');
    
    # Build reverse Index to Word Dictionary for English using the already created Word to Index Dictionary
    en_idx2word = dict([(idx, word) for word, idx in en_word2idx.items()])

    # Build a Word to Index Dictionary for Hindi
    start_idx = 4
    hi_word2idx = dict([(word, idx + start_idx) for idx, word in enumerate(hi_vocab)])
    hi_word2idx['<pad>'] = 0  # Padding
    hi_word2idx['<start>'] = 1
    hi_word2idx['<end>'] = 2  # End of sentence
    hi_word2idx['<ukn>'] = 3  # Unknown
   

    hi_vocab.append('<ukn>');
    hi_vocab.append('<start>');
    hi_vocab.append('<end>');
    hi_vocab.append('<pad>');

    
    # Build the inverse Index to Word Dictionary for Hindi using the already created Word to Index Dictionary
    hi_idx2word = dict([(idx, word) for word, idx in hi_word2idx.items()])

    # Encode words in senteces by their index in Vocabulary
    x = [[en_word2idx.get(word.strip(',." ;:)(][?!'), 3) for word in sentence.split()] for sentence in en_sentences]
    y = [[hi_word2idx.get(word.strip(',." ;:)(।|][?!'), 3) for word in sentence.split()] for sentence in hi_sentences]

    X = []
    Y = []
    for i in range(len(x)):
        n1 = len(x[i])
        n2 = len(y[i])
        n = n1 if n1 < n2 else n2
        if abs(n1 - n2) < 0.3 * n:
            if n1 <= max_sent_length and n2 <= max_sent_length:
              # ignore single word sentences with only unknown word
              if (n1 > 1 or (x[i][0] != 0 and x[i][0] != en_word2idx[''])) and (n2 > 1 or (y[i][0] != 0 and y[i][0] != hi_word2idx[''])):
                X.append(x[i])
                Y.append(y[i])

    return X, Y, en_word2idx, en_idx2word, en_vocab, hi_word2idx, hi_idx2word, hi_vocab

In [19]:
max_sent_length = 15

In [20]:
dataset_save_location = "out/parallel_trainv4.p"
utils.save_pickle(dataset_save_location, prepare_dataset(en_rows_all, hi_rows_all))

In [21]:
# let us load and check for sanity
X_all, Y_all, en_word2idx_all, en_idx2word_all, en_vocab_all, hi_word2idx_all, hi_idx2word_all, hi_vocab_all = utils.load_pickle_dataset(dataset_save_location)
len(X_all), len(Y_all), len(en_vocab_all), len(hi_vocab_all)

(696695, 696695, 50004, 50004)

In [27]:
for n in range(10):
    data_prep.printSentence(X_all[n], en_idx2word_all)
    print("\n")
    data_prep.printSentence(Y_all[n], hi_idx2word_all)
    print("\n")

but this desire is not all bad 

किन्तु यह अभिलाषा पूर्णतः बुरी नहीं है 

see this also 

यह भी देखें 

nitrogen constitutes nearly four - fifths of the air by volume 

वायु में नाइट्रोजन या <ukn> की मात्रा लगभग 4/5 भाग होती है 

memento 

<ukn> 

acoustics 

<ukn> 

substance 

अर्थ 

illiterate 

अनपढ 

<ukn> 

स्वच्छंद 

<ukn> irani 

बोमन ईरानी 

eggplant 

<ukn> 

