In [1]:
import xml.etree.ElementTree as ETree
import pandas as pd

  
# give the path where you saved the xml file
# inside the quotes
xmldata = "NEWS2018_M-EnHi_trn.xml"
prstree = ETree.parse(xmldata)
root = prstree.getroot()
  
print(root)
items = []
all_items = []
  
for storeno in root.iter('Name'):
    
    ID = storeno.attrib.get('ID')
    SourceName = storeno.find('SourceName').text
    TargetName = storeno.find('TargetName').text
  
    items = [ID, SourceName, TargetName]
    all_items.append(items)
xmldata = "NEWS2018_M-EnHi_dev.xml"
prstree = ETree.parse(xmldata)
root = prstree.getroot()
  
print(root)
  
for storeno in root.iter('Name'):
    
    ID = storeno.attrib.get('ID')
    SourceName = storeno.find('SourceName').text
    TargetName = storeno.find('TargetName').text
  
    items = [ID, SourceName, TargetName]
    all_items.append(items)
  
lines = pd.DataFrame(all_items, columns=[
  'source', 'english', 'hindi'])

<Element 'TransliterationCorpus' at 0x00000214E64A9598>
<Element 'TransliterationCorpus' at 0x00000214A9978A48>


In [2]:
len(lines)

13937

In [3]:
import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random

In [4]:
MAX_SEQ_LEN = 20
BATCH_SIZE = 64
#number of samples
N = len(lines)

In [5]:
class Lang:
    def __init__(self, counter, vocab_size):
        self.word2id = {}
        self.id2word = {}
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"
        
        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3
        
        self.word2id[self.pad] = 0
        self.word2id[self.sos] = 1
        self.word2id[self.eos] = 2
        self.word2id[self.unk] = 3
        
        self.id2word[0] = self.pad
        self.id2word[1] = self.sos
        self.id2word[2] = self.eos
        self.id2word[3] = self.unk
        
        curr_id = 4
        for w, c in counter.most_common(vocab_size):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w
            curr_id += 1
            
    def encodeSentence(self, s, max_len=-1):
        wseq = s.lower().strip()
        if max_len == -1:
            return [self.word2id[w] if w in self.word2id else self.iunk for w in wseq]
        else:
            return ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + [self.ieos] + [self.ipad]*max_len)[:max_len]
        
    def encodeSentence2(self, s, max_len=-1):
        wseq = wseq = s.lower().strip()
        return min(max_len, len(wseq)+1), \
            ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + \
                [self.ieos] + [self.ipad]*max_len)[:max_len]
    
    def decodeSentence(self, id_seq):
        id_seq = np.array(id_seq + [self.ieos])
        j = np.argmax(id_seq==self.ieos)
        s = ''.join([self.id2word[x] for x in id_seq[:j]])
        s = s.replace(self.unk, "UNK")
        return s

In [6]:
hi_counter = Counter()
hi_sentences=[]
en_counter = Counter()
en_sentences=[]


hi_sentences=lines['hindi'][:N]
en_sentences=lines['english'][:N]
for line in tqdm_notebook(hi_sentences):
    for w in line.strip():
        hi_counter[w] += 1
for line in tqdm_notebook(en_sentences):
    for w in line.strip():
        en_counter[w] += 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


  0%|          | 0/13937 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


  0%|          | 0/13937 [00:00<?, ?it/s]

In [7]:
# A few sample hindi characters
print("Most common hi characters in dataset:\n", hi_counter.most_common(5))

print("\nTotal (hi)characters gathered from dataset:",len(hi_counter))

# A few sample english characters
print("\nMost common en characters in dataset:\n", en_counter.most_common(5))

print("\nTotal (en)characters gathered from dataset:", len(en_counter))

Most common hi characters in dataset:
 [('ा', 9549), ('र', 9487), ('्', 8129), (' ', 6599), ('न', 5857)]

Total (hi)characters gathered from dataset: 83

Most common en characters in dataset:
 [('a', 23354), ('r', 9809), ('i', 9316), ('n', 8959), ('e', 8951)]

Total (en)characters gathered from dataset: 44


In [8]:
en_lang = Lang(en_counter, len(en_counter))
hi_lang = Lang(hi_counter, len(hi_counter))

In [9]:
print("Test en encoding:", en_lang.encodeSentence("Shukriya"))

print("Test en decoding:", en_lang.decodeSentence(en_lang.encodeSentence("Shukriya", 10)))

print("Test hindi encoding:", hi_lang.encodeSentence("शुक्रिया", 10))

print("Test hindi decoding:", hi_lang.decodeSentence((hi_lang.encodeSentence("शुक्रिया", 10))))

Test en encoding: [11, 9, 17, 18, 5, 6, 23, 4]
Test en decoding: shukriya
Test hindi encoding: [30, 23, 14, 6, 5, 9, 20, 4, 2, 0]
Test hindi decoding: शुक्रिया


In [10]:
VE = len(en_lang.word2id)
VH = len(hi_lang.word2id)

In [11]:
en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (VE, 300), dtype=tf.float32)
hi_word_emb_matrix = tf.get_variable("hi_word_emb_matrix", (VH, 300), dtype=tf.float32)

In [12]:
keep_prob = tf.placeholder(tf.float32)

input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

ph_target_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
target_lens = tf.placeholder(tf.int32, (None, ))

In [13]:
# Add SOS or GO symbol
target_ids = tf.concat([tf.fill([BATCH_SIZE,1], hi_lang.isos), ph_target_ids], -1)

In [14]:
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)
target_emb = tf.nn.embedding_lookup(hi_word_emb_matrix, target_ids[:, :-1])

In [15]:
input_emb.shape

TensorShape([Dimension(None), Dimension(20), Dimension(300)])

In [16]:
encoder_cell = tf.nn.rnn_cell.GRUCell(128) # 128 is the dimension of hidden state
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob) # Adding Dropout for regularization

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


In [17]:
enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, # The encoder GRU cell
    input_emb, # Embedded input sequence
    sequence_length=input_lens, # Sequence lengths of individual inputs in a batch
    initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [18]:
# Confirm the shape of the final hidden state
enc_state.shape

TensorShape([Dimension(64), Dimension(128)])

In [19]:
decoder_cell = tf.nn.rnn_cell.GRUCell(128)
decoder_cell = DropoutWrapper(decoder_cell, output_keep_prob=keep_prob)

In [20]:
output_projection = tf.layers.Dense(len(hi_lang.word2id))

In [21]:
helper = seq2seq.TrainingHelper(target_emb, target_lens)
decoder = seq2seq.BasicDecoder(decoder_cell, helper, enc_state, output_projection)
outputs, _, outputs_lens = seq2seq.dynamic_decode(decoder, maximum_iterations=MAX_SEQ_LEN, 
                                                  impute_finished=False, swap_memory=True)
output_max_len = tf.reduce_max(outputs_lens)

In [22]:
# Using the decoder_cell without dropout here.
infer_helper = seq2seq.GreedyEmbeddingHelper(hi_word_emb_matrix, tf.fill([BATCH_SIZE, ], hi_lang.isos), hi_lang.ieos)
infer_decoder = seq2seq.BasicDecoder(decoder_cell, infer_helper, enc_state, output_projection)
infer_output = seq2seq.dynamic_decode(infer_decoder, maximum_iterations=MAX_SEQ_LEN, swap_memory=True)

In [23]:
# Sequence mask:
# To make sure we don't back-propagate error from output of length positions
masks = tf.sequence_mask(target_lens, output_max_len, dtype=tf.float32, name='masks')

# Loss function - weighted softmax cross entropy
cost = seq2seq.sequence_loss(
    outputs[0],
    target_ids[:, 1:(output_max_len + 1)],
    masks)

# Optimizer
optimizer = tf.train.AdamOptimizer(0.0001)

In [24]:
train_op = optimizer.minimize(cost)

In [25]:
init = tf.global_variables_initializer()

In [26]:
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [27]:
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)

In [28]:
random.seed(41)

In [29]:
parallel = list(zip(en_sentences, hi_sentences))

In [30]:
en_sentences

0              aabhaa
1             aabheer
2           aabhijaat
3               aabid
4             aabshar
             ...     
13932    yuvraj singh
13933    zahoor elahi
13934          zaleel
13935            zayb
13936          zorion
Name: english, Length: 13937, dtype: object

In [31]:
random.shuffle(parallel)

In [32]:
parallel

[('kara-kum', 'काराकुम'),
 ('daler', 'दलेर'),
 ('sulwyn', 'सुल्विन'),
 ('manning', 'मैनिंग'),
 ('shashibala', 'शशिबाला'),
 ('talisma', 'तलीस्मा'),
 ('thirunarayan', 'तिरुनारायण'),
 ('medal of suvorov', 'मेडल ऑफ सुवोरोव'),
 ('errin', 'एरिन'),
 ('gurudwara tibbi sahib', 'गुरूद्वारा टिब्बी साहिब'),
 ('amadhya', 'अमध्य'),
 ('radhika', 'राधिका'),
 ('paul hitchcock', 'पॉल हिचकॉक'),
 ('kush', 'कुश'),
 ('aru', 'अरु'),
 ('fort douaumont', 'फोर्ट डॉमोंट'),
 ('hindu american', 'हिन्दू अमेरिकन'),
 ("saint george's", 'सेंट जॉर्ज'),
 ('abdul haakim', 'अब्दुल हाकिम'),
 ('ebra', 'एब्रा'),
 ('randheer', 'रणधीर'),
 ('roma', 'रोमा'),
 ('dartmoor', 'डार्टमूर'),
 ('moghe', 'मोघे'),
 ('qaylah', 'कयलाह'),
 ('gaon hamara shehar tumhara', 'गाँव हमारा शहर तुम्हारा'),
 ('pamela', 'पामेला'),
 ('corey collymore', 'कॉरे कॉलीमर'),
 ('shankar khan', 'शंकर खान'),
 ('wayne', 'वेन'),
 ('samsherbaz', 'समशेरबाज़'),
 ('pocono', 'पॉकॉनो'),
 ('paramanand', 'परमानन्द'),
 ('kenya', 'केन्या'),
 ('shatrughn', 'शत्रुघ्न'),
 ('sawa

In [33]:
#split data into 95% and 5% for train and validation
train_n = int(0.95*N)
valid_n = N - train_n

In [34]:
train_pairs = parallel[:train_n].copy()
valid_pairs = parallel[train_n:]

In [35]:
def small_test():
    all_bleu = []
    smoothing = nltk.translate.bleu_score.SmoothingFunction().method7
    for m in range(0, valid_n, BATCH_SIZE):
        # print(f"Status: {m}/{N}", end='\r')
        n = m + BATCH_SIZE
        if n > valid_n:
            break

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        for i in range(m, n):
            b,a = en_lang.encodeSentence2(valid_pairs[i][0], MAX_SEQ_LEN)
            input_batch[i-m,:] = a
            input_lens_batch[i-m] = b



        feed_dict={
            input_ids: input_batch,
            input_lens: input_lens_batch,

            keep_prob: 1.0
        }
        pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)
        for k, pred_ in enumerate(pred_batch):
            pred_s = hi_lang.decodeSentence(list(pred_))
            ref = valid_pairs[m+k][1]
            try:
                _bx = nltk.translate.bleu_score.sentence_bleu(
                    [ref],
                    pred_s,
                    weights=[1/4]*4,
                    smoothing_function=smoothing)
            except ZeroDivisionError:
                _bx = 0
            all_bleu.append(_bx)

    print(f"BLEU Score: {np.mean(all_bleu)}")

In [36]:
for _e in range(200):
    # Mix things up a bit.
    random.shuffle(train_pairs)
    pbar = tqdm_notebook(range(0, train_n, BATCH_SIZE))
    batch_loss = 0
    bxi = 0
    for m in pbar:
        n = m + BATCH_SIZE
        if n <= train_n:
            # print("Epoch Complete... \n")

            input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
            input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
            for i in range(m, n):
                b,a = en_lang.encodeSentence2(train_pairs[i][0], MAX_SEQ_LEN)
                input_batch[i-m,:] = a
                input_lens_batch[i-m] = b

            target_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
            target_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
            for i in range(m, n):
                b,a = hi_lang.encodeSentence2(train_pairs[i][1], MAX_SEQ_LEN)
                target_batch[i-m,:] = a
                target_lens_batch[i-m] = b

            feed_dict={
                input_ids: input_batch,
                input_lens: input_lens_batch,
                ph_target_ids: target_batch,
                target_lens: target_lens_batch,
                keep_prob: 0.8 
            }
            sess.run(train_op, feed_dict=feed_dict)
            batch_loss += sess.run(cost, feed_dict=feed_dict)
            pbar.set_description(f"Epoch: {_e} >> Loss: {batch_loss/(bxi+1):2.2F}:")
            bxi += 1
            if (1 + n//BATCH_SIZE) % 100 == 0:
                small_test()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.021706666905198967
BLEU Score: 0.04320409049441479


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.06043606218612607
BLEU Score: 0.0764802138606902


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.09601003604361098
BLEU Score: 0.10447079319702508


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.11630330336657144
BLEU Score: 0.1250719502413918


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.13610636374585697
BLEU Score: 0.1413802502702629


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.14461334565150757
BLEU Score: 0.14779052464792564


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1500913277993179
BLEU Score: 0.15510312501545084


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.15214905756174782
BLEU Score: 0.15379650707184736


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1565878662509907
BLEU Score: 0.16093592042895744


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.161006428728881
BLEU Score: 0.16398295795250406


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.16820424615061333
BLEU Score: 0.16565966331026166


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.16761414335008357
BLEU Score: 0.17023571835992612


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1699734980691659
BLEU Score: 0.1720869564264645


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1717775868158697
BLEU Score: 0.17391708061562355


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1767016983859074
BLEU Score: 0.17805138867676867


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.18286657734889578
BLEU Score: 0.18290757710208919


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.18300130580857615
BLEU Score: 0.18652205249289727


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1882051453127204
BLEU Score: 0.18636216648745854


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.1917874058828584
BLEU Score: 0.19503536893934337


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.19419012524769055
BLEU Score: 0.19671310501756115


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2011052358379025
BLEU Score: 0.20058655106593423


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.20464724156609537
BLEU Score: 0.20718661770093433


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.20814014038459408
BLEU Score: 0.20836174693658155


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2100935420929318
BLEU Score: 0.2105784498893959


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2144042456246252
BLEU Score: 0.21643737100128896


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.21869978462061837
BLEU Score: 0.22458734998658691


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.22055920468826593
BLEU Score: 0.2236090977098578


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.22162603218521423
BLEU Score: 0.22527480319377516


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.22450044320044124
BLEU Score: 0.22709108915344506


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.23025586720484253
BLEU Score: 0.23101556649309987


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.23242908957616795
BLEU Score: 0.2312318705576959


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.23647187919942203
BLEU Score: 0.2370191088383442


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.23806422891317314
BLEU Score: 0.24323422593140714


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.24215293731832785
BLEU Score: 0.24204630547042685


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.24501978962564977
BLEU Score: 0.25073770991705424


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.25267149581930515
BLEU Score: 0.25347649647421655


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2558744381660728
BLEU Score: 0.2585370420501788


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2588078732040781
BLEU Score: 0.25913253216816656


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2596390872993237
BLEU Score: 0.2584774166889593


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.26409222110631764
BLEU Score: 0.262350692514064


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.266562412582603
BLEU Score: 0.2663863629898383


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.266435478482048
BLEU Score: 0.2651700253018155


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.26797849937465346
BLEU Score: 0.2688549648661498


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.27400230861333486
BLEU Score: 0.2758221388031269


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.27448927896818687
BLEU Score: 0.27866342066826155


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2777300638255914
BLEU Score: 0.2836101265227099


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2830895352562771
BLEU Score: 0.2860030088909893


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.29002125118087296
BLEU Score: 0.29248068003608924


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2985585256814463
BLEU Score: 0.2972867411003123


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.302313307931798
BLEU Score: 0.3007713691495085


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.2983963736768619
BLEU Score: 0.30515656434083394


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3041005935031027
BLEU Score: 0.31402757237942425


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.31135351660175453
BLEU Score: 0.3117235302835101


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.311532428061841
BLEU Score: 0.315351259331185


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.32179740436369875
BLEU Score: 0.3166422527479407


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3157826107836449
BLEU Score: 0.32476444627908496


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3211075325326075
BLEU Score: 0.33309737059078437


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.33142311739240654
BLEU Score: 0.33236189114340736


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.33871801752487146
BLEU Score: 0.3295602013850475


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.33780098982154405
BLEU Score: 0.33391215779482897


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3428852429740825
BLEU Score: 0.34678105449513386


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.34083687451050515
BLEU Score: 0.3480586497248422


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.34994043582092493
BLEU Score: 0.3477451991193154


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3542910949868412
BLEU Score: 0.34924802295765667


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.35208245659299126
BLEU Score: 0.34892600874386964


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.35684209826162344
BLEU Score: 0.35522988242010384


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.35808222676909296
BLEU Score: 0.3608467511226604


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3641834454214634
BLEU Score: 0.35545624769963136


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3571452496473154
BLEU Score: 0.3636024668624212


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3650474242088225
BLEU Score: 0.3681184247932948


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3679179450155653
BLEU Score: 0.36471252857123565


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3698116400604216
BLEU Score: 0.3726852695366186


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.36874114018295384
BLEU Score: 0.37464737364810413


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.37582683600779465
BLEU Score: 0.3705051470307751


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.37436957499278284
BLEU Score: 0.3743251118267593


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3777009591822914
BLEU Score: 0.3739284424093468


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.37082565039110205
BLEU Score: 0.3717783376026654


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.37850756730770596
BLEU Score: 0.3812761286280021


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3798927006183031
BLEU Score: 0.38356590604942337


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3811049311504299
BLEU Score: 0.380612781253149


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3828952583094515
BLEU Score: 0.37998084776806423


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.38860492353701614
BLEU Score: 0.38023793447806065


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3879955668642982
BLEU Score: 0.38943959301994696


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.38582466213944167
BLEU Score: 0.38576523783416


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3843708504843001
BLEU Score: 0.39144705955025527


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.38364897399253894
BLEU Score: 0.3833830656798375


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.39738644578483134
BLEU Score: 0.39051752033533416


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.39554801016722796
BLEU Score: 0.3900490671987666


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.39354097306368496
BLEU Score: 0.3959846225179625


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.40577621880420356
BLEU Score: 0.3980432749104245


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.40206516914122686
BLEU Score: 0.4053151512915224


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.3977930625095695
BLEU Score: 0.3988273787567843


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.404628850146736
BLEU Score: 0.4097451223076784


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4097398234187416
BLEU Score: 0.4054513842062111


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4100286970165249
BLEU Score: 0.4000355571137592


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4143368741118194
BLEU Score: 0.40691438779067096


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4065659930405802
BLEU Score: 0.4165325822517669


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4150587720801989
BLEU Score: 0.4141918498213375


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4090848595661395
BLEU Score: 0.41036389743700113


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4131999256015037
BLEU Score: 0.4131308522495705


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4113404217397646
BLEU Score: 0.4144052077641319


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.41175175425020705
BLEU Score: 0.4109546639296376


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4132944866532403
BLEU Score: 0.4153980739058164


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.40690679342304853
BLEU Score: 0.4206641544552566


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.41728981588220526
BLEU Score: 0.4167995506701546


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.41845457082518
BLEU Score: 0.4160699332536838


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.422290788655906
BLEU Score: 0.4296608324622545


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4197200373722609
BLEU Score: 0.42485704766194743


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4207799215686695
BLEU Score: 0.42467256146049903


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4289981923014814
BLEU Score: 0.4223474072281538


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4253639133298173
BLEU Score: 0.42663281137499354


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.42773628991383583
BLEU Score: 0.4240393432681973


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.42146758384282157
BLEU Score: 0.4226273657764411


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4246573823893442
BLEU Score: 0.425234594854059


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4286683178374403
BLEU Score: 0.43031898705712895


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4356426089528407
BLEU Score: 0.42933444302260837


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.42875803631159626
BLEU Score: 0.43272908244450986


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4329448092158363
BLEU Score: 0.4343672237797094


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4283415517808914
BLEU Score: 0.43778166637308813


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4361960367184555
BLEU Score: 0.4386210916798648


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4383450445729379
BLEU Score: 0.43492438288580393


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.44218727544489866
BLEU Score: 0.4334662286618146


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4354213082558
BLEU Score: 0.4349846054893202


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4360427886869228
BLEU Score: 0.44332061168291836


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4445935879178261
BLEU Score: 0.4413859611419705


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4446335235999287
BLEU Score: 0.4415723662553438


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4462121893788571
BLEU Score: 0.4431991097473282


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.44784504467876546
BLEU Score: 0.4406083124556634


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.44326436817127457
BLEU Score: 0.449102767828408


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4491010012413298
BLEU Score: 0.4464979133819654


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4514525633539154
BLEU Score: 0.45264155175752274


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4550490294009113
BLEU Score: 0.45916578134335395


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4577530728629872
BLEU Score: 0.45040946629578293


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.45128286648251825
BLEU Score: 0.4493902763989507


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.45465876782024106
BLEU Score: 0.4534776104282157


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.45915368608592216
BLEU Score: 0.4565028536111996


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4511730043164929
BLEU Score: 0.45972918430276427


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4642785347742488
BLEU Score: 0.46175946224710807


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4566406790847525
BLEU Score: 0.46286761738328497


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4585156703831886
BLEU Score: 0.4639174164475797


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.46201790001534987
BLEU Score: 0.45625014161920563


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.46657696008503813
BLEU Score: 0.46099048490257033


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.467081656405795
BLEU Score: 0.4692553334039907


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4688752241483046
BLEU Score: 0.4641264303344032


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4661976842654808
BLEU Score: 0.4653723297919957


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.46003455597521564
BLEU Score: 0.47224910713745183


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4668285624349628
BLEU Score: 0.4711621525744055


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47046292646203297
BLEU Score: 0.46899672796138747


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4622084442509287
BLEU Score: 0.4779566801943635


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47810013541330143
BLEU Score: 0.4698522390961026


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4718950688812157
BLEU Score: 0.4701560586609805


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47113247885647186
BLEU Score: 0.46900688487870773


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.46929839268189505
BLEU Score: 0.4667572985722618


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47172943643696164
BLEU Score: 0.4784293428151539


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4704831041939114
BLEU Score: 0.4714928954079829


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.46976899328702243
BLEU Score: 0.4718553669372983


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47824444260765764
BLEU Score: 0.4742811978349265


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47318854451021897
BLEU Score: 0.4784743741999611


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4776987458417519
BLEU Score: 0.4797596297057566


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47902812236928227
BLEU Score: 0.47804741505190196


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47237087342851075
BLEU Score: 0.48296340253461045


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4795554156073917
BLEU Score: 0.478033578181258


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4804122859656788
BLEU Score: 0.4802413205587377


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.48494720115905565
BLEU Score: 0.4792358480338459


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4888872322483593
BLEU Score: 0.48119231672374474


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4859487078632635
BLEU Score: 0.48076725465163817


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.47984209323914906
BLEU Score: 0.48523048412039227


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.48839551792338226
BLEU Score: 0.48149352955508923


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4854252130608268
BLEU Score: 0.49184360837923524


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.48666596160503417
BLEU Score: 0.4839554201271932


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.48846476813711626
BLEU Score: 0.48783953811686


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4903204381631706
BLEU Score: 0.4917608454224805


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4887901945035285
BLEU Score: 0.4887260714707624


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.49050636319131546
BLEU Score: 0.48756889747527554


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4821082707297597
BLEU Score: 0.49495237166802797


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4987684017472301
BLEU Score: 0.4941088186030882


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4910804520236403
BLEU Score: 0.5020957768001283


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4955576499769737
BLEU Score: 0.4949548700109382


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.49617427246107937
BLEU Score: 0.4941144454017056


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.49532132119152916
BLEU Score: 0.49983205190719426


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5002173999236119
BLEU Score: 0.4998957871537991


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.49680883645837903
BLEU Score: 0.49191547530961427


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4991032652705975
BLEU Score: 0.5011842959430907


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5057181275036944
BLEU Score: 0.4961004754363885


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5014930073154262
BLEU Score: 0.5025732789029386


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5071431885117212
BLEU Score: 0.508070403936397


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.4968016648039256
BLEU Score: 0.4989494706229179


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.49919489894292085
BLEU Score: 0.5035236669803531


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5030347447508763
BLEU Score: 0.5035288311040894


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5054007016968652
BLEU Score: 0.49873735224738647


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.502281977649518
BLEU Score: 0.5038181074254615


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.506291546929391
BLEU Score: 0.5012337084222253


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5030073167489986
BLEU Score: 0.5025738922753871


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.503314093965348
BLEU Score: 0.4987026126081747


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5069466281879283
BLEU Score: 0.5033347546096854


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5062870417784402
BLEU Score: 0.5071676758829341


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5101541646829473
BLEU Score: 0.505169972524514


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5098421752327245
BLEU Score: 0.5105372743413399


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5042403443223733
BLEU Score: 0.5022238887135737


  0%|          | 0/207 [00:00<?, ?it/s]

BLEU Score: 0.5037039269610649
BLEU Score: 0.5087270435881411


In [37]:
def transliterate(s):
    input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
    input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
    b,a = en_lang.encodeSentence2(s, MAX_SEQ_LEN)
    input_batch[0, :] = a
    input_lens_batch[0] = b
    
    feed_dict={
        input_ids: input_batch,
        input_lens: input_lens_batch,
        #target_ids: target_batch,
        #target_lens: target_lens_batch,
        keep_prob: 1.0
    }
    pred_batch = sess.run(infer_output[0].sample_id, feed_dict=feed_dict)
    pred_ = pred_batch[0]
    pred_s = hi_lang.decodeSentence(list(pred_))
    # ref = valid_pairs[m+k][1]
    return pred_s

some examples:-

In [38]:
transliterate("saya")

'साया'

In [47]:
transliterate('shubham')

'शुभाम'

In [40]:
transliterate('ayushi')

'आयुषी'

In [41]:

#testing XML file
xmldata = "NEWS2018_M-EnHi_tst.xml"
prstree = ETree.parse(xmldata)
root = prstree.getroot()
  
print(root)
store_items = []
all_items = []
  
for storeno in root.iter('Name'):
    
    store_Nr = storeno.attrib.get('ID')
    itemsF = storeno.find('SourceName').text
   
  
    store_items = [store_Nr, itemsF]
    all_items.append(store_items)

  
lines2 = pd.DataFrame(all_items, columns=[
  'source', 'english'])

<Element 'TransliterationCorpus' at 0x00000214C9EF2868>


In [48]:
import random
 
#testing on test XML file 
#example 1
x=random.choice(lines2['english'])
print('english:-',x)
print('hindi:- ',transliterate(x))

english:- kalpataru
hindi pridiction:-  काल्पतुरा


In [49]:
#example 2
x=random.choice(lines2['english'])
print('english:-',x)
print('hindi:- ',transliterate(x))

english:- gyanlata
hindi:-  ग्यानलता


In [50]:
#example 3
x=random.choice(lines2['english'])
print('english:-',x)
print('hindi:- ',transliterate(x))

english:- vandita
hindi:-  वंदिता


In [51]:
#example 4
x=random.choice(lines2['english'])
print('english:-',x)
print('hindi:- ',transliterate(x))

english:- malvin
hindi:-  मल्विन


In [55]:
#example 5
x=random.choice(lines2['english'])
print('english:-',x)
print('hindi:- ',transliterate(x))

english:- herriott
hindi:-  हेरिटो
