In [46]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [47]:
path_to_file="hin-eng/hin.txt"

In [48]:
#to convert unicode to ascii
def unicode_to_ascii(s):
    text = s.encode('utf-8').decode('utf-8')
    return text
#preprcessing with turning specific characters
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w    

In [49]:
en_sentence = u"May I borrow this book?"
hn_sentence = u"क्या मैं यह पुस्तक उधार ले सकता हूँ?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(hn_sentence))

<start> may i borrow this book ? <end>
<start> क्या मैं यह पुस्तक उधार ले सकता हूँ ? <end>


In [50]:
"""
1.Remove the accents
2.clean the sentences
3.Return the sentense in this seq[English,hindi]
"""
def create_dataset(path,num_examples):
    lines=io.open(path,encoding='UTF-8').read().strip().split('\n')
    
    
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]
    
    return zip(*word_pairs)

In [51]:
en,hn,un = create_dataset(path_to_file,None)
print(en[-1])
print(hn[-1])

<start> when i was a kid , touching bugs didn't bother me a bit . now i can hardly stand looking at pictures of them . <end>
<start> जब मैं बच्चा था , मुझे कीड़ों को छूने से कोई परेशानी नहीं होती थी , पर अब मैं उनकी तस्वीरें देखना भी बर्दाश्त नहीं कर सकता। <end>


In [52]:
def tokenize(lang):
    lang_tokenizer=tf.keras.preprocessing.text.Tokenizer(
        filters=''
    )
    lang_tokenizer.fit_on_texts(lang)
    tensors = lang_tokenizer.texts_to_sequences(lang)
    tensors = tf.keras.preprocessing.sequence.pad_sequences(tensors,padding='post')
    
    return tensors,lang_tokenizer

In [53]:
#defning the load datase function that will carry all the above three functions:)
def load_dataset(path,num_examples=None):
    inp_lang,targ_lang,unwanted = create_dataset(path,num_examples)
    
    input_tensor,input_language_tokenizer = tokenize(inp_lang)
    target_tensor,targ_lang_tokenizer = tokenize(targ_lang)
    
    return input_tensor,target_tensor,input_language_tokenizer,targ_lang_tokenizer

In [54]:
"""
limiting the examples so that training can be faster
there are >100000 sentences in data set to compile we are selecting
40000 but compromising the the quality 
TODO change the num example to None in releasing patterns
"""

num_examples=30000
input_tensor,target_tensor,inp_lang,targ_lang = load_dataset(path_to_file,num_examples)

#max length of target tensors
max_length_targ,max_length_inp = target_tensor.shape[1],input_tensor.shape[1]

In [55]:
BUFFER_SIZE=len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedin_dim = 256
units=1024
vocab_inp_size=len(inp_lang.word_index)+1
vocab_tar_size=len(targ_lang.word_index)+1

In [56]:
class Encoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,enc_units,batch_sz):
        super(Encoder,self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedin_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        
    def call(self,x,hidden):
        #x is our input
        x=self.embedding(x)
        output,state = self.gru(x,initial_state=hidden)
        return output,state
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz,self.enc_units))

In [57]:
encoder = Encoder(vocab_inp_size,embedin_dim,units,BATCH_SIZE)


In [58]:
encoder.load_weights("encoder_weights/encoder_weights2")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1df46822288>

# I am Using the Bahdanau Attention for encoding the parameters are:-
* FC = Fully connected (dense) layer
* EO = Encoder output
* H = hidden state
* X = input to the decoder
* The pseudo code for each is:-
* 1)score = FC(tanh(FC(EO) + FC(H)))
* 2)attention weights = softmax(score, axis = 1)
* 3)context vector = sum(attention weights * EO, axis = 1)
* 4)embedding output = It is got from the input passed to the Decoder Embedding Layer.
* 5)merged vector = concat(embedding output, context vector)
* note:-
* This merged vector is then passed to gru layer as hidden state
* note:-
* axis=1 is there because we have to make change accross the max_len field so we took axis 1

In [59]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self,units):
        super(BahdanauAttention,self).__init__()
        self.w1=tf.keras.layers.Dense(units)
        self.w2=tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self,query,values):
        #we will be doing it to get addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query,1)
        
        #shape of the score will be(batchsize,maxlength,1)
        #1 is because it is passing through the final dense layer having units ==1
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.w1(query_with_time_axis)+self.w2(values)))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score,axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector=attention_weights * values
        context_vector=tf.reduce_sum(context_vector,axis=1)
        
        return context_vector,attention_weights

In [60]:
class Decoder(tf.keras.Model):
    def __init__(self,vocab_size,embedding_dim,dec_units,batch_sz):
        super(Decoder,self).__init__()
        self.batch_sz=batch_sz
        self.dec_units=dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
        self.gru=tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )
        self.fc=tf.keras.layers.Dense(
            vocab_size
        )
        
        #used for attention
        self.attention = BahdanauAttention(self.dec_units)
    def call(self,x,hidden,enc_output):
        context_vector,attention_weights=self.attention(hidden,enc_output)
        
        x=self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output,state = self.gru(x)
        output=tf.reshape(output,(-1,output.shape[2]))
        
        x = self.fc(output)
        
        return x,state,attention_weights

In [61]:
decoder = Decoder(vocab_tar_size, embedin_dim, units, BATCH_SIZE)
decoder.load_weights("decoder_weights/decoder_weights2")



<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1df4683c888>

In [62]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ,max_length_inp))
    
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(" ")]
    inputs=tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=max_length_inp,padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
        #storing the attention weights
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    
    return result, sentence, attention_plot
    

In [63]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [64]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    result = preprocess_sentence(result)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [65]:
translate(u'He came running.')

Input: <start> he came running . <end>
Predicted translation: <start> वह भागते हुए आया। <end> <end>


In [37]:
translate(u'I love you.')

Input: <start> i love you . <end>
Predicted translation: <start> मैं आपसे प्यार करता हूँ। <end> <end>


In [38]:
translate(u'Did you miss me?')

Input: <start> did you miss me ? <end>
Predicted translation: <start> मेरी याद आई क्या ? <end> <end>


In [39]:
translate(u'A man must work.')

Input: <start> a man must work . <end>
Predicted translation: <start> एक आदमी के लिए काम करना ज़रूरी है। <end> <end>


In [40]:
translate(u'He reads Arabic')


Input: <start> he reads arabic <end>
Predicted translation: <start> वह अरबी पढ़ सकता है। <end> <end>


In [41]:
translate("I feel nauseous.")

Input: <start> i feel nauseous . <end>
Predicted translation: <start> मुझे उल्टी आ रही है। <end> <end>


In [42]:
translate(u'Awesome!')

Input: <start> awesome ! <end>
Predicted translation: <start> बहुत बढ़िया ! <end> <end>


In [43]:
translate(u'Flowers bloom.')

Input: <start> flowers bloom . <end>
Predicted translation: <start> फूल खिलते हैं। <end> <end>


In [44]:
translate(u'Come in.')

Input: <start> come in . <end>
Predicted translation: <start> अंदर आ जाओ। <end> <end>


In [45]:
translate(u'Fantastic!')

Input: <start> fantastic ! <end>
Predicted translation: <start> बहुत ख़ूब ! <end> <end>


In [138]:
translate(u'Mother Teresa was a Catholic nun who lived and worked in Calcutta, India.')

Input: <start> mother teresa was a catholic nun who lived and worked in calcutta , india . <end>
Predicted translation: <start> मदर टेरेसा एक कैथोलिक नन थीं जो कलकत्ता में रहतीं और काम करतीं थीं। <end> <end>


In [139]:
translate(u'In 1951, Sister Teresa was sent to Calcutta, then the largest city in India.')

Input: <start> in 1951 , sister teresa was sent to calcutta , then the largest city in india . <end>
Predicted translation: <start> १९५१ में सिस्टर टेरेसा को कलकत्ता भेजा गया था , जो उस समय भारत का सबसे बड़ा शहर था। <end> <end>


In [35]:
translate(u'I bought two bottles of milk.')

Input: <start> i bought two bottles of milk . <end>
Predicted translation: <start> मैंने रातभर काम किया। <end> <end>


In [145]:
translate(u'I am pleased with the result.')

Input: <start> i am pleased with the result . <end>
Predicted translation: <start> मैं नतीजे से खुश हूँ। <end> <end>
