# Task 5 - Attention-based Neural Machine Translation
Team Members

*   Libin Kutty
*   Viju Sudhi
*   Ritu Gahir

In [1]:
%tensorflow_version 2.x

In [2]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import pandas as pd
import os
import io
import time

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
print(os.getcwd())
os.chdir("/content/drive/My Drive/Colab Notebook/IDL/")
print(os.getcwd())

/content
/content/drive/My Drive/Colab Notebook/IDL


In [5]:
dataFile = os.getcwd()
path = os.path.join(dataFile, 'hin-eng/hin.txt')

data = pd.read_csv(path, sep="\t", header=None)
data.columns = ["eng", "hin", "info"]

In [None]:
data

Unnamed: 0,eng,hin,info
0,Wow!,वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Help!,बचाओ!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2,Jump.,उछलो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
3,Jump.,कूदो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
4,Jump.,छलांग.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
...,...,...,...
2768,"If you go to that supermarket, you can buy mos...",उस सूपरमार्केट में तुम लगभग कोई भी रोजाने में ...,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
2769,The passengers who were injured in the acciden...,जिन यात्रियों को दुर्घटना मे चोट आई थी उन्हे अ...,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2770,"Democracy is the worst form of government, exc...","लोकतंत्र सरकार का सबसे घिनौना रूप है, अगर बाकी...",CC-BY 2.0 (France) Attribution: tatoeba.org #1...
2771,If my boy had not been killed in the traffic a...,अगर मेरा बेटा ट्रेफ़िक हादसे में नहीं मारा गया...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...


In [None]:
src_data = data['eng']
tar_data = data['hin']

In [None]:
def load_data(src, tar):
  xdataset = []
  ydataset = []

  for sent in src:
    sentence = 'SOS ' + sent + ' EOS'
    xdataset.append(sentence)
  
  for sent in tar:
    sentence = 'SOS ' + sent + ' EOS'
    ydataset.append(sentence)

  return xdataset, ydataset

In [None]:
class Vocabulary:

  def __init__(self, name):
    PAD_token = 0   # Used for padding short sentences
    SOS_token = 1   # Start-of-sentence token
    EOS_token = 2   # End-of-sentence token
    self.name = name
    self.word2index = {"PAD": PAD_token , "SOS": SOS_token, "EOS": EOS_token}
    self.word2count = {"PAD": 0, "SOS": 0, "EOS": 0}
    self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.num_words = 3
    self.num_sentences = 0
    self.longest_sentence = 0

  def tokenize(self, text):
      # obtains tokens with a least 1 alphabet
      pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
      return pattern.findall(text.lower())

  def add_word(self, word):
    if word not in self.word2index:
        # First entry of word into vocabulary
        self.word2index[word] = self.num_words
        self.word2count[word] = 1
        self.index2word[self.num_words] = word
        self.num_words += 1
    else:
        # Word exists; increase word count
        self.word2count[word] += 1
        
  def add_sentence(self, sentence):
    sentence_len = 0
    for word in sentence.split(' '):
        sentence_len += 1
        self.add_word(word)
    if sentence_len > self.longest_sentence:
        # This is the longest sentence
        self.longest_sentence = sentence_len
    # Count the number of sentences
    self.num_sentences += 1

  def to_word(self, index):
    return self.index2word[index]

  def to_index(self, word):
    return self.word2index[word]

In [138]:
xdataset, ydataset  = load_data(src_data, tar_data)

In [139]:
xdataset[0]

'SOS Wow! EOS'

In [140]:
print(len(xdataset))
print(len(ydataset))

2773
2773


In [73]:
xdataset = xdataset[:1000]
ydataset = ydataset[:1000]

In [141]:
voc_input = Vocabulary('Input')
for sent in xdataset:
  voc_input.add_sentence(sent)

In [142]:
voc_output = Vocabulary('Output')
for sent in ydataset:
  voc_output.add_sentence(sent)

Converting text into matrix form

In [143]:
xtrain_voc = []
ytrain_voc = []
for sent in xdataset:
  xtemp= []
  for word in sent.split(' '):
    xtemp.append(voc_input.to_index(word))
  xtrain_voc.append(xtemp)

for sent in ydataset:
  ytemp= []
  for word in sent.split(' '):
    ytemp.append(voc_output.to_index(word))
  ytrain_voc.append(ytemp)

print(xtrain_voc)
print(ytrain_voc)

[[1, 3, 2], [1, 4, 2], [1, 5, 2], [1, 5, 2], [1, 5, 2], [1, 6, 2], [1, 6, 2], [1, 7, 2], [1, 7, 2], [1, 8, 9, 2], [1, 10, 11, 2], [1, 12, 2], [1, 13, 14, 2], [1, 15, 16, 2], [1, 17, 18, 2], [1, 19, 2], [1, 20, 2], [1, 20, 2], [1, 21, 2], [1, 21, 2], [1, 22, 23, 2], [1, 22, 23, 2], [1, 22, 23, 2], [1, 24, 25, 2], [1, 24, 25, 2], [1, 26, 27, 2], [1, 10, 28, 2], [1, 10, 29, 2], [1, 30, 31, 2], [1, 32, 33, 2], [1, 34, 35, 2], [1, 36, 33, 2], [1, 37, 2], [1, 24, 38, 2], [1, 24, 39, 40, 2], [1, 24, 41, 2], [1, 10, 42, 2], [1, 10, 43, 2], [1, 10, 44, 2], [1, 45, 46, 2], [1, 47, 48, 2], [1, 49, 50, 2], [1, 49, 50, 2], [1, 49, 50, 2], [1, 49, 50, 2], [1, 51, 2], [1, 34, 52, 2], [1, 13, 53, 14, 2], [1, 54, 2], [1, 55, 56, 2], [1, 57, 58, 2], [1, 59, 60, 2], [1, 24, 61, 62, 2], [1, 24, 61, 62, 2], [1, 24, 63, 64, 2], [1, 24, 63, 64, 2], [1, 24, 63, 64, 2], [1, 24, 63, 64, 2], [1, 24, 63, 64, 2], [1, 24, 65, 66, 2], [1, 10, 67, 2], [1, 10, 68, 2], [1, 10, 68, 2], [1, 69, 70, 14, 2], [1, 69, 70, 14

Padding

In [144]:
X_temp = np.zeros([len(xtrain_voc),len(max(xtrain_voc,key = lambda x: len(x)))])
for i,j in enumerate(xtrain_voc):
    X_temp[i][0:len(j)] = j

Y_temp = np.zeros([len(ytrain_voc),len(max(ytrain_voc,key = lambda x: len(x)))])
for i,j in enumerate(ytrain_voc):
    Y_temp[i][0:len(j)] = j

X_temp = X_temp.astype(np.int32)
Y_temp = Y_temp.astype(np.int32)

print(X_temp)
print(Y_temp)

[[   1    3    2 ...    0    0    0]
 [   1    4    2 ...    0    0    0]
 [   1    5    2 ...    0    0    0]
 ...
 [   1 3434  126 ...    0    0    0]
 [   1 2301  179 ...    0    0    0]
 [   1  550   24 ...  193  598    2]]
[[   1    3    2 ...    0    0    0]
 [   1    4    2 ...    0    0    0]
 [   1    5    2 ...    0    0    0]
 ...
 [   1 3087 2663 ...    0    0    0]
 [   1 1722   44 ...    0    0    0]
 [   1 1125   15 ...  241  164    2]]


In [145]:
print(X_temp.shape,Y_temp.shape)

(2773, 24) (2773, 27)


In [146]:
input_train, input_test, output_train, output_test = train_test_split(X_temp, Y_temp, test_size=0.2)

# Show length
print(len(input_train), len(output_train), len(input_test), len(output_test))

2218 2218 555 555


In [147]:
BUFFER_SIZE = len(input_train)
BATCH_SIZE = 32
steps_per_epoch = len(input_train)//BATCH_SIZE
embedding_dim = 128
units = 256

vocab_inp_size = voc_input.num_words+1
vocab_out_size = voc_output.num_words+1

dataset_train = tf.data.Dataset.from_tensor_slices((input_train, output_train)).shuffle(BUFFER_SIZE)
dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)

In [148]:
input_batch, target_batch = next(iter(dataset_train))
input_batch.shape, target_batch.shape

(TensorShape([32, 24]), TensorShape([32, 27]))

In [149]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')   ##GRU

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [83]:
class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    # query hidden state shape == (batch_size, hidden size)
    # query_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
    query_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
    score = self.V(tf.nn.tanh(
        self.W1(query_with_time_axis) + self.W2(values)))

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

attention_layer = BahdanauAttention(10)

In [150]:
class DotProdAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(DotProdAttention, self).__init__()

  def call(self, query, values):

    query_with_time_axis = tf.expand_dims(query, 1)

    score = np.dot(query_with_time_axis.transpose(), values)

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

attention_layer = DotProdAttention(10)

In [125]:
class LuongMultiplicativeAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(LuongMultiplicativeAttention, self).__init__()
    self.W = tf.keras.layers.Dense(units)

  def call(self, query, values):

    query_with_time_axis = tf.expand_dims(query, 1)

    score = query_with_time_axis.transpose() * self.W * values

    # attention_weights shape == (batch_size, max_length, 1)
    attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

attention_layer = LuongMultiplicativeAttention(10)

In [151]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')  ##GRU
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [152]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_out_size, embedding_dim, units, BATCH_SIZE)

In [153]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [154]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [155]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([voc_output.word2index['SOS']] * BATCH_SIZE, 1)
    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [156]:
EPOCHS = 100

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset_train.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)
    
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.3723
Epoch 1 Loss 1.8958
Time taken for 1 epoch 61.06557559967041 sec

Epoch 2 Batch 0 Loss 1.8383
Epoch 2 Loss 1.6949
Time taken for 1 epoch 37.871031761169434 sec

Epoch 3 Batch 0 Loss 1.4792
Epoch 3 Loss 1.6127
Time taken for 1 epoch 37.868489503860474 sec

Epoch 4 Batch 0 Loss 1.5345
Epoch 4 Loss 1.5535
Time taken for 1 epoch 41.14142107963562 sec

Epoch 5 Batch 0 Loss 1.6234
Epoch 5 Loss 1.5080
Time taken for 1 epoch 37.85800647735596 sec

Epoch 6 Batch 0 Loss 1.4226
Epoch 6 Loss 1.4641
Time taken for 1 epoch 38.112282514572144 sec

Epoch 7 Batch 0 Loss 1.6131
Epoch 7 Loss 1.4160
Time taken for 1 epoch 38.03775906562805 sec

Epoch 8 Batch 0 Loss 1.3791
Epoch 8 Loss 1.3703
Time taken for 1 epoch 37.949124813079834 sec

Epoch 9 Batch 0 Loss 1.2106
Epoch 9 Loss 1.3254
Time taken for 1 epoch 37.84076738357544 sec

Epoch 10 Batch 0 Loss 1.2126
Epoch 10 Loss 1.2808
Time taken for 1 epoch 39.52230715751648 sec

Epoch 11 Batch 0 Loss 1.1855
Epoch 11 Loss 1.2422
Time

In [157]:
max_length_targ, max_length_inp = Y_temp.shape[1], X_temp.shape[1]

In [158]:
voc_output.index2word

{0: 'PAD',
 1: 'SOS',
 2: 'EOS',
 3: 'वाह!',
 4: 'बचाओ!',
 5: 'उछलो.',
 6: 'कूदो.',
 7: 'छलांग.',
 8: 'नमस्ते।',
 9: 'नमस्कार।',
 10: 'वाह-वाह!',
 11: 'चियर्स!',
 12: 'समझे',
 13: 'कि',
 14: 'नहीं?',
 15: 'मैं',
 16: 'ठीक',
 17: 'हूँ।',
 18: 'बहुत',
 19: 'बढ़िया!',
 20: 'अंदर',
 21: 'आ',
 22: 'जाओ।',
 23: 'बाहर',
 24: 'निकल',
 25: 'जाओ!',
 26: 'चले',
 27: 'ख़ुदा',
 28: 'हाफ़िज़।',
 29: 'उत्तम!',
 30: 'सही!',
 31: 'आपका',
 32: 'स्वागत',
 33: 'है।',
 34: 'स्वागतम्।',
 35: 'मज़े',
 36: 'करना।',
 37: 'मौज',
 38: 'करो।',
 39: 'भूल',
 40: 'गया।',
 41: 'गई।',
 42: 'पैसे',
 43: 'दूंगा।',
 44: 'मेरा',
 45: 'पेट',
 46: 'भर',
 47: 'गया',
 48: 'चलो',
 49: 'चलें!',
 50: 'मुझे',
 51: 'जवाब',
 52: 'दो।',
 53: 'पंछी',
 54: 'उड़ते',
 55: 'हैं।',
 56: 'माफ़',
 57: 'कीजिए।',
 58: 'ख़ूब!',
 59: 'बेहोश',
 60: 'हो',
 61: 'खेद',
 62: 'की',
 63: 'बात',
 64: 'है,',
 65: 'लेकिन',
 66: 'वैसा',
 67: 'ही',
 68: 'हँसा।',
 69: 'बोर',
 70: 'रहा',
 71: 'दीवालिया',
 72: 'चुका',
 73: 'थक',
 74: 'ठंड',
 75: 'रही',
 76: '

In [92]:
from nltk.translate.bleu_score import sentence_bleu

In [159]:
def evaluate(inputs):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

#  sentence = preprocess_sentence(sentence)

#  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
#  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
#                                                         maxlen=max_length_inp,
#                                                         padding='post')
#  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([voc_output.word2index['SOS']], 0)
  #print(dec_input)
  p = []
  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    predicted_id = tf.argmax(predictions[0]).numpy()
    p.append(predicted_id)

    if voc_output.index2word[predicted_id] != 'EOS':
      result += voc_output.index2word[predicted_id] + ' '

    if voc_output.index2word[predicted_id] == 'EOS':
      return result, attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, attention_plot

In [160]:
def plot_attention(attention, sentence, predicted_sentence):
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis' ,clim=[0,1])

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  plt.show()

In [161]:
def translate(input, output_txt, input_txt):
  result, attention_plot = evaluate(input)

  print('Actual translation: ',output_txt)
#  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

  score = 0.0
  score = sentence_bleu(test, result)

#  attention_plot = attention_plot[:len(result.split(' ')), :len(input_txt.split(' '))]
#  plot_attention(attention_plot, input_txt.split(' '), result.split(' '))

  return score


In [162]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc4377ebda0>

In [163]:
dataset_test = tf.data.Dataset.from_tensor_slices((input_test, output_test))
dataset_test = dataset_test.batch(1, drop_remainder=True)
print(dataset_test)
count = 0
accuracy = 0.0
for (batch, (inp, targ)) in enumerate(dataset_test):#.take(steps_per_epoch)):
  test = ''
  for i in targ[0]:
    #print(i.numpy())
    if voc_output.index2word[i.numpy()] != 'EOS' and voc_output.index2word[i.numpy()] != 'SOS':
      test += voc_output.index2word[i.numpy()] + ' '
    
    if voc_output.index2word[i.numpy()] == 'EOS':
      break

  inp_txt = ''
  for i in inp[0]:
    if voc_input.index2word[i.numpy()] != 'EOS' and voc_input.index2word[i.numpy()] != 'SOS':
      inp_txt += voc_input.index2word[i.numpy()] + ' '
    
    if voc_input.index2word[i.numpy()] == 'EOS':
      break

  score = translate(inp, test, inp_txt)
  print('BLEU Score/Accuracy: ',score)
  count +=1
  accuracy += score
  #break

  #if count == 5:
  #  break

accuracy = accuracy / count

print('Overall Average Accuracy: ', accuracy)

<BatchDataset shapes: ((1, 24), (1, 27)), types: (tf.int32, tf.int32)>
Actual translation:  मुझे एक जरूरी कॉल करनी है । 
Predicted translation: मुझे बहुत पसंद है। 
BLEU Score/Accuracy:  0.8055344092731546


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Actual translation:  वह अपने कमरे में रो रही थी। 
Predicted translation: वह चिट्ठी से घर से जल रहा था। 
BLEU Score/Accuracy:  0.7186082239261684
Actual translation:  मुझे लगता है कि यहाँ कुछ ग़लतफ़ैमी हुई है। 
Predicted translation: मेरे पास तुम्हारे लिए समय नहीं है। 
BLEU Score/Accuracy:  0.7806724026248424
Actual translation:  घूमफिरकर बात मत करो, सीधे-सीधे बोलो। 
Predicted translation: इस महीने में रहते हैं। 
BLEU Score/Accuracy:  0.7679634266158699
Actual translation:  मैंने प्रथम पुरस्कार जीत लिया। 
Predicted translation: मुझे परिस्तिथि के साथ आने से पहले देखा था। 
BLEU Score/Accuracy:  0.7810213065790322
Actual translation:  मेरे लिए तैरना आसान है। 
Predicted translation: यह मेरे लिए बहुत ज़्यादा है। 
BLEU Score/Accuracy:  0.8020396005825877
Actual translation:  मैं मेट्रो लेकर स्कूल जाती हूँ। 
Predicted translation: मैं बहुत थक गया हूँ। 
BLEU Score/Accuracy:  0.8507331335123524
Actual translation:  मैं पानी के बिना इन गोलियों को निगल नहीं सकती। 
Predicted translation: मुझे ठीक स

I just tried the BLEU score as it is used to check the translation accuracy but I am not sure if the scores are correct. Even for perfect translation the score was not maximum.

    Actual translation:  चिड़ियाँ घोसले बनातीं हैं। 
    Predicted translation: पक्षी घोसले बनातें हैं। 
    Actual translation:  बाद में मिलेंगे। 
    Predicted translation: फिर मिलेंगे।

    The above translated text amaze me as they have one word different but still the meaning is same. There are many such examples.

# Results

---
* Which parts of the sentence are used as a token? Each character, each word, or are some words split up? - Each word is used as a token. Every unique word is represented by unique token.
* Do the same tokens in different language have the same ID?
e.g. Would the same token index map to the German word die and to the English word die? - Yes, There's possiblity of getting same token ID for two same words in different language. Depending on the position of the word in both the languages. 
* What is the relation between the encoder output and the encoder hidden state which is used to initialize the decoder hidden state?
(for the architecture used in the tutorial) - It is used to generate the attention weights
* Is the decoder attending to all previous positions, including the previous decoder predictions? - Yes
* Does the Encoder output change in different decoding steps? - No it remains the same.
* Does the context vector change in different decoding steps? - Yes it does
* The decoder uses teacher forcing. Does this mean the time steps can be computed in parallel? - Yes, as we already know what the output will be, so while training it can be computed parallelly
* Why is a mask applied to the loss function? - As the input are padded to make them equal size, we need to apply mask to avoid the padded input during calculation of loss


# Conclusion

---
I found dot product to be the better attention mechanism , considering this language. As I got better result with dot product compare to other mechanism.
The model was able to attend to correct token most of the time.
