In [None]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds
from keras.preprocessing.text import Tokenizer


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

Using TensorFlow backend.


In [None]:
from google.colab import drive

# Mount Google Drive; every data and checkpoint path used in this notebook lives under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Every line is split on whitespace: the first field is the word and the
# remaining fields are its coefficients.
embeddings_index = {}
# The context manager closes the handle even if a malformed line makes the
# float conversion below raise part-way through the file.
with open('/content/drive/My Drive/glove.6B.300d.txt', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word field; skip it instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [None]:
# Fit the English tokenizer on the GloVe vocabulary itself (one "document" per GloVe key).
docs = embeddings_index.keys()
tokenizer_en = Tokenizer(num_words=400000)
tokenizer_en.fit_on_texts(docs)

# Three rows more than the tokenizer's vocabulary: evaluate() uses
# len(word_index)+1 and len(word_index)+2 as the start / end ids.
vocab_size = 3 + len(tokenizer_en.word_index)

# Row `row` holds the GloVe vector of the token whose id is `row`;
# ids without a GloVe entry keep their all-zero row.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
for token, row in tokenizer_en.word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [None]:
# Training corpus: one sentence per line, English and Tamil in separate files.
TRAIN_EN_PATH = '/content/drive/My Drive/DEEP LEARNING ASSIGNMENTS/A4_set2/train_en.txt'
TRAIN_TA_PATH = '/content/drive/My Drive/DEEP LEARNING ASSIGNMENTS/A4_set2/train_ta.txt'

train_dataset_ta = tf.data.TextLineDataset(TRAIN_TA_PATH)

# The English side is read eagerly so each line can be decoded and have a
# space inserted before every apostrophe, then wrapped back into a dataset.
train_lines_en = []
for raw_line in tf.data.TextLineDataset(TRAIN_EN_PATH).as_numpy_iterator():
    train_lines_en.append(str(raw_line.decode('utf-8')).replace('\'',' \''))
train_dataset_en = tf.data.Dataset.from_tensor_slices(train_lines_en)

In [None]:
# Build the Tamil subword tokenizer from the raw training lines,
# asking for a vocabulary of about 10000 entries.
tamil_corpus = (line for line in train_dataset_ta.as_numpy_iterator())
tokenizer_ta = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    tamil_corpus,
    target_vocab_size=10000,
)

In [None]:
# Source vocabulary: tokenizer ids plus the start / end ids used by evaluate().
vocab_size = len(tokenizer_en.word_index) + 3

# Target vocabulary size. evaluate() feeds `vocab_tar_size-2` as the decoder
# start id and stops on `vocab_tar_size-1`, so the two highest ids are reserved.
# tokenizer_ta is built above as a SubwordTextEncoder, which exposes
# `vocab_size` and has no `word_index`; the original expression is kept for a
# word-level tokenizer that does provide `word_index`.
# NOTE(review): this value must equal the output size stored in the decoder
# checkpoint loaded below -- confirm against the training notebook.
if hasattr(tokenizer_ta, 'word_index'):
    vocab_tar_size = len(tokenizer_ta.word_index) + 3
else:
    vocab_tar_size = tokenizer_ta.vocab_size + 2

units = 1024         # GRU width shared by encoder and decoder
BATCH_SIZE = 64
embedding_dim = 256  # decoder embedding width


In [None]:
class Encoder(tf.keras.Model):
  """GRU encoder on top of a frozen, pre-initialised embedding table.

  Args:
    vocab_size: number of rows of the embedding table.
    embedding_matrix: initial embedding values; its last dimension gives the
      embedding width (300 for the GloVe matrix built in this notebook).
    enc_units: number of GRU units.
    batch_sz: batch size used by `initialize_hidden_state`.
  """

  def __init__(self, vocab_size, embedding_matrix, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    # Read the width from the matrix instead of hard-coding 300, so a matrix
    # of another width does not clash with the constant initializer.
    embedding_width = int(np.shape(embedding_matrix)[-1])
    self.embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Embed `x` and run the GRU starting from `hidden`.

    Returns:
      (output, state): the GRU's per-step outputs and its final state.
    """
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_sz, enc_units)."""
    return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
class Decoder(tf.keras.Model):
  """GRU decoder without attention.

  Args:
    vocab_size: size of the target embedding table and of the output layer.
    embedding_dim: width of the target embeddings.
    dec_units: number of GRU units.
    batch_sz: batch size (stored only; not read by `call`).
  """

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden, enc_output):
    """Run the decoder on `x`, starting from state `hidden`.

    `enc_output` is accepted but never read: the decoder is conditioned on
    the encoder only through `hidden`.

    Returns:
      (logits, state): output-layer scores, one row per (batch, step)
      position, and the GRU's final state.
    """
    embedded = self.embedding(x)
    gru_steps, state = self.gru(embedded, initial_state=hidden)

    # Fold the step axis into the batch axis before the output layer.
    flat_steps = tf.reshape(gru_steps, (-1, gru_steps.shape[2]))
    logits = self.fc(flat_steps)

    return logits, state

In [None]:
ENCODER_WEIGHTS_PATH = '/content/drive/My Drive/DEEP LEARNING ASSIGNMENTS/A4_set2/Copy of encoder_new.h5'

encoder = Encoder(vocab_size, embedding_matrix, units, BATCH_SIZE)

# One forward pass on an all-zero dummy batch before loading the checkpoint
# (presumably so the layers' variables exist when load_weights runs).
dummy_source_batch = tf.zeros((BATCH_SIZE,20))
sample_output, sample_hidden = encoder(dummy_source_batch,
                                       encoder.initialize_hidden_state())
encoder.load_weights(ENCODER_WEIGHTS_PATH)

In [None]:
DECODER_WEIGHTS_PATH = '/content/drive/My Drive/DEEP LEARNING ASSIGNMENTS/A4_set2/Copy of decoder_new.h5'

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# One forward pass on a dummy single-step batch, reusing the encoder's sample
# output and state, before loading the checkpoint.
dummy_target_step = tf.random.uniform((BATCH_SIZE, 1))
sample_decoder_output, _ = decoder(dummy_target_step, sample_hidden, sample_output)
decoder.load_weights(DECODER_WEIGHTS_PATH)

In [None]:
def evaluate(sentence,max_length_targ=100):
  """Greedily decode target ids for one English sentence.

  The sentence is lower-cased, tokenised with `tokenizer_en`, wrapped in the
  start / end ids (`len(word_index)+1` and `len(word_index)+2`) and encoded.
  The decoder is then fed its own arg-max prediction one step at a time,
  starting from id `vocab_tar_size-2`, until it emits `vocab_tar_size-1` or
  `max_length_targ` steps have been produced.

  Returns:
    (result, sentence): the list of predicted ids (including the end id when
    it was produced) and the unchanged input sentence.
  """
  last_word_id = len(tokenizer_en.word_index)
  sequences = tokenizer_en.texts_to_sequences([sentence.lower()])
  token_ids = [tok for seq in sequences for tok in seq]
  inp_sentence = [last_word_id + 1] + token_ids + [last_word_id + 2]
  encoder_input = tf.expand_dims(inp_sentence, 0)

  # Batch of one, zero initial state.
  enc_out, enc_hidden = encoder(encoder_input, [tf.zeros((1, units))])

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([vocab_tar_size-2] , 0)

  result = []
  for _ in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
    predicted_id = tf.argmax(predictions[0]).numpy()
    result.append(predicted_id)

    if predicted_id == vocab_tar_size-1:
      break

    # The prediction becomes the next decoder input.
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [None]:
def translate(sentence, plot=''):
  """Translate one English sentence and return the decoded target text.

  Args:
    sentence: raw English sentence; a space is inserted before every
      apostrophe before it is passed to `evaluate`.
    plot: unused; kept so existing calls keep working.

  Returns:
    The predicted sentence as a single string.
  """
  result, sentence = evaluate(sentence.replace('\'',' \''))
  # `result` is a plain Python list of ids, so it has no `.numpy()`.
  # Only ids strictly between 0 and tokenizer_ta.vocab_size are handed to the
  # subword decoder: evaluate() uses ids at the top of the decoder's output
  # range as start / end markers, and 0 is presumably padding -- neither
  # corresponds to a subword.
  subword_ids = [int(i) for i in result if 0 < i < tokenizer_ta.vocab_size]
  return tokenizer_ta.decode(subword_ids)


In [None]:
# Print the model's translation, then a hard-coded Tamil sentence to compare it
# with (presumably the reference translation). The return value of translate()
# must be printed explicitly because it is not the cell's last expression.
print(translate("That's where we're going."))
print ("நாம் எங்கே போகிறோம் என்று.")

எங்காவது எங்கும் நாம் எங்கே.
நாம் எங்கே போகிறோம் என்று.


In [None]:
# Dev corpus: one sentence per line, English and Tamil in separate files.
DEV_EN_PATH = '/content/drive/My Drive/DEEP LEARNING ASSIGNMENTS/A4_set2/dev_en.txt'
DEV_TA_PATH = '/content/drive/My Drive/DEEP LEARNING ASSIGNMENTS/A4_set2/dev_ta.txt'

test_dataset_ta = tf.data.TextLineDataset(DEV_TA_PATH)

# The English side is read eagerly so each line can be decoded and have a
# space inserted before every apostrophe, then wrapped back into a dataset.
test_lines_en = []
for raw_line in tf.data.TextLineDataset(DEV_EN_PATH).as_numpy_iterator():
    test_lines_en.append(str(raw_line.decode('utf-8')).replace('\'',' \''))
test_dataset_en = tf.data.Dataset.from_tensor_slices(test_lines_en)

In [None]:
# Pair every English dev line with the Tamil line at the same position.
dev_sides = (test_dataset_en, test_dataset_ta)
test_dataset = tf.data.Dataset.zip(dev_sides)

In [None]:
from nltk.translate.bleu_score import sentence_bleu
def bleu_score_function(reference,candidate,smoothing_function=None):
  """Compute sentence-level BLEU-1 to BLEU-4 for one candidate.

  Args:
    reference: list of reference tokens (a single reference).
    candidate: list of candidate tokens.
    smoothing_function: optional smoothing callable forwarded to
      `sentence_bleu` (e.g. a method of nltk's `SmoothingFunction()`).
      The default `None` leaves the scores unsmoothed.

  Returns:
    (bleu1, bleu2, bleu3, bleu4)
  """
  # Uniform weights over the first n n-gram orders. BLEU-3 uses exact thirds
  # so the weights sum to 1 (0.33 * 3 only sums to 0.99).
  weight_sets = (
      (1, 0, 0, 0),
      (0.5, 0.5, 0, 0),
      (1/3, 1/3, 1/3, 0),
      (0.25, 0.25, 0.25, 0.25),
  )
  bleu1, bleu2, bleu3, bleu4 = (
      sentence_bleu([reference], candidate, weights=weights,
                    smoothing_function=smoothing_function)
      for weights in weight_sets
  )
  return bleu1,bleu2,bleu3,bleu4



In [None]:
# Number of dev pairs to score. 1 reproduces the previous behaviour, where the
# loop stopped after its first iteration; set to None to score the whole set.
MAX_EVAL_SENTENCES = 1

scores1=[]
scores2=[]
scores3=[]
scores4=[]

eval_pairs = test_dataset if MAX_EVAL_SENTENCES is None else test_dataset.take(MAX_EVAL_SENTENCES)
for (inp, tar) in eval_pairs:
  prediction=translate(inp.numpy().decode('utf-8').lower().replace('\'',' \''))
  print(inp.numpy())
  # BLEU is computed on whitespace-separated tokens.
  reference=tar.numpy().decode('utf-8').split(' ')
  candidate=prediction.split(' ')
  print(reference)
  print(candidate)
  bleu1,bleu2,bleu3,bleu4 = bleu_score_function(reference, candidate)
  print(bleu1)
  print(bleu2)
  print(bleu3)
  print(bleu4)
  scores1.append(bleu1)
  scores2.append(bleu2)
  scores3.append(bleu3)
  scores4.append(bleu4)

# Mean BLEU-1..4 over the scored sentences; an empty dev set (or a limit of 0)
# would otherwise divide by zero.
if scores1:
  for scores in (scores1, scores2, scores3, scores4):
    print(sum(scores)/len(scores))
else:
  print('No sentences were scored.')


b"You will tell us Cobra 's endgame, or die by the same sword you once used to kill our master."
['நீங்கள்', 'எங்களுக்கு', 'கோப்ரா', 'என்ற', 'எண்ட்கேமை', 'சொல்கிறேன்,', 'அல்லது', 'Die', 'நீ', 'நம்', 'மாஸ்டர்', 'கொல்ல', 'பயன்படுத்தப்பட்ட', 'அதே', 'வாள்.']
['நீங்கள்', 'எங்கள்', 'சில', 'முடிவுக்ள', 'அடியை', 'பார்க்க', 'உதவியும்', 'கொல்லவில்லை', 'உள்ளது?']
0.05704634655917688
2.552826540650872e-155
2.212165637760313e-204
5.400301927028362e-232
0.05704634655917688
2.552826540650872e-155
2.212165637760313e-204
5.400301927028362e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
