In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [None]:
# Tab-separated sentence-pair corpus, given relative to the working directory.
path_to_file = "hin.txt"

In [None]:
def unicode_to_ascii(s):
    """Return ``s`` after a UTF-8 encode/decode round trip.

    Despite its name, this helper neither strips accents nor converts the
    text to ASCII: the round trip yields the same string it was given, which
    is what keeps the Devanagari (Hindi) sentences intact. The name is kept
    because other cells call it.

    Args:
        s: text to pass through.

    Returns:
        The text, unchanged.
    """
    return s.encode('utf-8').decode('utf-8')


def preprocess_sentence(w):
    """Lower-case a sentence, isolate its punctuation and add boundary tokens.

    Args:
        w: raw sentence (English or Hindi).

    Returns:
        The cleaned sentence wrapped as ``'<start> ... <end>'`` so the model
        knows where to start and stop predicting.
    """
    w = unicode_to_ascii(w.lower().strip())

    # Put a space on both sides of every punctuation mark so that it becomes
    # its own token, e.g. "he is a boy." => "he is a boy ."
    # The Devanagari danda "।" (Hindi full stop) is included; without it the
    # last word of a Hindi sentence stays glued to the danda and ends up as a
    # separate vocabulary entry from the same word used mid-sentence.
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿।])", r" \1 ", w)

    # Collapse every run of spaces (and double quotes, which are part of the
    # character class) into a single space.
    w = re.sub(r'[" "]+', " ", w)

    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    return '<start> ' + w + ' <end>'

In [None]:
# Sanity-check the preprocessing on one English and one Hindi sentence.
en_sentence = u"May I borrow this book?"
hn_sentence = u"क्या मैं यह पुस्तक उधार ले सकता हूँ?"
for sample_sentence in (en_sentence, hn_sentence):
    print(preprocess_sentence(sample_sentence))

<start> may i borrow this book ? <end>
<start> क्या मैं यह पुस्तक उधार ले सकता हूँ ? <end>


In [None]:
"""
1.Remove the accents
2.clean the sentences
3.Return the sentences in this order: [English, Hindi, third column (unused)]
"""
def create_dataset(path, num_examples):
    """Read a tab-separated corpus and return its preprocessed columns.

    Every line of the file is split on tabs and each field is run through
    ``preprocess_sentence``.

    Args:
        path: location of the UTF-8 encoded corpus file.
        num_examples: number of leading lines to keep; ``None`` keeps all.

    Returns:
        A ``zip`` iterator that yields one tuple per column of the file, each
        tuple holding that column's preprocessed sentences. This notebook
        unpacks it into three columns.
    """
    # The context manager closes the file handle as soon as the text is read;
    # a bare io.open(...).read() chain leaves it open until garbage collection.
    with io.open(path, encoding='UTF-8') as corpus_file:
        lines = corpus_file.read().strip().split('\n')

    word_pairs = [
        [preprocess_sentence(field) for field in line.split('\t')]
        for line in lines[:num_examples]
    ]

    # Transpose rows of fields into per-column tuples.
    return zip(*word_pairs)

In [None]:
# Load every line of the corpus and show the last sentence of the first two columns.
en, hn, un = create_dataset(path_to_file, None)
for column in (en, hn):
    print(column[-1])

<start> when i was a kid , touching bugs didn't bother me a bit . now i can hardly stand looking at pictures of them . <end>
<start> जब मैं बच्चा था , मुझे कीड़ों को छूने से कोई परेशानी नहीं होती थी , पर अब मैं उनकी तस्वीरें देखना भी बर्दाश्त नहीं कर सकता। <end>


In [None]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and return its padded id sequences.

    Args:
        lang: iterable of preprocessed sentences of one language.

    Returns:
        ``(tensors, lang_tokenizer)``: the integer sequences padded at the end
        ('post') to a common length, and the fitted tokenizer.
    """
    # filters='' disables the tokenizer's own character filtering, so the
    # tokens produced by the preprocessing step are kept as they are.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
    return padded, tokenizer

In [None]:
def load_dataset(path, num_examples=None):
    """Load the corpus at ``path`` and tokenize its first two columns.

    Chains ``create_dataset`` and ``tokenize``.

    Args:
        path: location of the corpus file.
        num_examples: number of leading lines to use; ``None`` uses all.

    Returns:
        ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``.
    """
    # The third column of the file is not needed for training.
    source_sentences, target_sentences, _ = create_dataset(path, num_examples)

    source_tensor, source_tokenizer = tokenize(source_sentences)
    target_tensor, target_tokenizer = tokenize(target_sentences)

    return source_tensor, target_tensor, source_tokenizer, target_tokenizer

In [None]:
"""
Limiting the number of examples makes training faster.
A subset (e.g. 40000 of the >100000 sentences in a large data set) can be selected
via num_examples, but this compromises the quality of the translations.
TODO: set num_examples to None for release runs so that every sentence is used.
"""

# None means: do not truncate the corpus.
num_examples = None
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)

# Padded sequence length (second axis) of the target and input tensors.
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]

In [None]:
# Hold out 20% of the sentence pairs for validation.
# A fixed random_state makes the split identical on every run; without it the
# validation set changes each time the notebook is executed, so results are
# not comparable and a restored checkpoint may have been trained on sentences
# that now land in the validation set.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42
)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

2338 2338 585 585


In [None]:
def convert(lang, tensor):
    """Print the ``id ----> word`` mapping for every non-zero id in ``tensor``.

    Args:
        lang: object exposing an ``index_word`` mapping from id to word.
        tensor: iterable of integer token ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [None]:
# Show how the first training pair is encoded, id by id.
first_input_ids = input_tensor_train[0]
first_target_ids = target_tensor_train[0]

print("Input Language; index to word mapping")
convert(inp_lang, first_input_ids)
print("\nTarget Language; index to word mapping")
convert(targ_lang, first_target_ids)

Input Language; index to word mapping
1 ----> <start>
19 ----> do
7 ----> you
17 ----> have
718 ----> rice
8 ----> ?
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
56 ----> तुम्हारे
45 ----> पास
811 ----> चावल
12 ----> है
11 ----> क्या
4 ----> ?
2 ----> <end>


In [None]:
# Training hyper-parameters.
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer spans the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedin_dim = 256
units = 1024

# One extra slot on top of the tokenizer's word index (id 0 is the padding value).
vocab_inp_size = len(inp_lang.word_index) + 1
vocab_tar_size = len(targ_lang.word_index) + 1

# Shuffled, fixed-size batches; the final partial batch is dropped.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Pull a single batch from the pipeline and display the shapes of its two tensors.
example_input_batch, example_target_batch = next(iter(dataset))
(example_input_batch.shape, example_target_batch.shape)

(TensorShape([64, 27]), TensorShape([64, 29]))

In [None]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder for the source sentence.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        enc_units: number of GRU units.
        batch_sz: batch size used when building the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # Use the constructor argument. Reading the notebook-level global
        # `embedin_dim` here would silently ignore the value passed in and
        # would fail if the class were used without that global defined.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform'
        )

    def call(self, x, hidden):
        """Encode a batch of token ids.

        Args:
            x: batch of input token ids.
            hidden: initial GRU state.

        Returns:
            ``(output, state)``: the GRU output for every time step and the
            final GRU state.
        """
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_sz, enc_units)``."""
        return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
# Source-language encoder built from the hyper-parameters defined above.
encoder = Encoder(
    vocab_size=vocab_inp_size,
    embedding_dim=embedin_dim,
    enc_units=units,
    batch_sz=BATCH_SIZE,
)

In [None]:
# Smoke test: run the example batch through the encoder and report the shapes.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

for shape_template, sample_tensor in (
    ('Encoder output shape: (batch size, sequence length, units) {}', sample_output),
    ('Encoder Hidden state shape: (batch size, units) {}', sample_hidden),
):
    print(shape_template.format(sample_tensor.shape))

Encoder output shape: (batch size, sequence length, units) (64, 27, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


# I am using Bahdanau attention for the decoder. The notation used is:-
* FC = Fully connected (dense) layer
* EO = Encoder output
* H = hidden state
* X = input to the decoder
* The pseudo code for each is:-
* 1)score = FC(tanh(FC(EO) + FC(H)))
* 2)attention weights = softmax(score, axis = 1)
* 3)context vector = sum(attention weights * EO, axis = 1)
* 4)embedding output = the decoder input X passed through the decoder's embedding layer.
* 5)merged vector = concat(context vector, embedding output)
* note:-
* This merged vector is then passed to the GRU layer as its input
* note:-
* axis=1 is used because the score has shape (batch_size, max_length, 1), so the softmax and the sum have to run across the max_length axis

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention: score = V(tanh(w1(query) + w2(values))).

    Args:
        units: width of the two projection layers ``w1`` and ``w2``.
    """

    def __init__(self, units):
        super().__init__()
        self.w1 = tf.keras.layers.Dense(units)
        self.w2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over ``values`` using ``query``.

        Args:
            query: decoder hidden state, (batch_size, hidden_size).
            values: encoder outputs, (batch_size, max_length, hidden_size).

        Returns:
            ``(context_vector, attention_weights)`` with shapes
            (batch_size, hidden_size) and (batch_size, max_length, 1).
        """
        # Insert a time axis so the query broadcasts against every encoder step.
        query_with_time_axis = tf.expand_dims(query, 1)

        projected_query = self.w1(query_with_time_axis)
        projected_values = self.w2(values)

        # (batch_size, max_length, units) before self.V, (batch_size, max_length, 1) after.
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        # Normalise across the time axis.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs over the time axis.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [None]:
# Smoke test: a throw-away attention layer (10 units) applied to the encoder sample.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 27, 1)


In [None]:
class Decoder(tf.keras.Model):
    """One-step GRU decoder that attends over the encoder outputs.

    Args:
        vocab_size: size of the target vocabulary (embedding rows and logits).
        embedding_dim: size of each target embedding vector.
        dec_units: number of GRU units, also used as the attention width.
        batch_sz: batch size (stored on the instance).
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Predict the next token for every sequence in the batch.

        Args:
            x: current decoder input ids, one time step per sequence.
            hidden: previous decoder state, used as the attention query.
            enc_output: encoder outputs to attend over.

        Returns:
            ``(logits, state, attention_weights)``.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)

        embedded = self.embedding(x)

        # Prepend the context vector to the embedding along the feature axis:
        # (batch_size, 1, hidden_size + embedding_dim).
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE: `hidden` only feeds the attention; the GRU is called without
        # an explicit initial state.
        gru_output, state = self.gru(gru_input)

        # Drop the length-1 time axis before the output projection.
        flattened = tf.reshape(gru_output, (-1, gru_output.shape[2]))
        logits = self.fc(flattened)

        return logits, state, attention_weights

In [None]:
# Target-language decoder, plus a smoke test on a random single-step input.
decoder = Decoder(vocab_tar_size, embedin_dim, units, BATCH_SIZE)

random_decoder_input = tf.random.uniform((BATCH_SIZE, 1))
sample_decoder_output, _, _ = decoder(random_decoder_input, sample_hidden, sample_output)

print('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 3086)


In [None]:
optimizer = tf.keras.optimizers.Adam()

# reduction='none' keeps one loss value per example so that padded positions
# can be masked out before the values are combined.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)
def loss_function(real, pred):
    """Masked sparse categorical cross-entropy for one decoding step.

    Args:
        real: target token ids for this step; id 0 marks padding.
        pred: logits predicted for this step.

    Returns:
        A scalar: the mean of the per-example losses, with padded positions
        contributing zero.
    """
    not_padding = tf.math.logical_not(tf.math.equal(real, 0))
    per_example_loss = loss_object(real, pred)

    # Zero the loss wherever the target is padding.
    per_example_loss *= tf.cast(not_padding, dtype=per_example_loss.dtype)

    # Reduce to a scalar. Returning the per-example vector makes the reported
    # batch/epoch "loss" an array with one entry per example and scales the
    # gradients by the batch size.
    return tf.reduce_mean(per_example_loss)

In [None]:
# Checkpoints bundle the optimizer state with both models.
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        inp: batch of padded input token ids.
        targ: batch of padded target token ids; column 0 is the start token.
        enc_hidden: initial hidden state for the encoder.

    Returns:
        The accumulated step losses divided by the target sequence length.
    """
    loss = 0
    start_id = targ_lang.word_index['<start>']

    # Only the forward pass has to be recorded on the tape.
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden

        # First decoder input: one <start> id per sequence. The batch size is
        # taken from the data instead of the global BATCH_SIZE so this line
        # does not break when the batch has a different size.
        dec_input = tf.fill([tf.shape(targ)[0], 1], start_id)

        # using teacher forcing method to feed the target as next input
        for t in range(1, targ.shape[1]):
            # passing the enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])

    # Gradient computation and the optimizer update run after the tape has
    # stopped recording, so they are not traced as part of the forward pass.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    # Every epoch starts from a fresh all-zero encoder state.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        # Progress line every 100 batches.
        if not batch % 100:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy()}')

    # Save a checkpoint after every second epoch.
    if not (epoch + 1) % 2:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print(f'Epoch {epoch + 1} Loss {total_loss / steps_per_epoch}')
    print(f'Time taken for 1 epoch {time.time() - start} sec\n')

Epoch 1 Batch 0 Loss [2.7707522  2.493564   1.9399737  1.6622988  2.2166378  1.384514
 1.6622847  2.2168763  1.3853357  1.3855157  3.3245633  2.2165074
 1.1085348  2.7705598  2.4942806  2.4934049  1.3857857  1.6621665
 2.2165315  3.6007357  1.9398619  3.3250663  0.83098847 3.324812
 1.6616551  2.216163   2.7706997  2.4931977  3.8789527  1.3858458
 2.4941764  2.216215   4.154664   3.3243818  2.2159696  2.7703588
 4.43243    2.493514   1.6619762  1.9393227  1.6616756  2.4935462
 1.9392676  2.215973   2.4936655  1.9393702  2.4929953  3.601253
 2.7708766  1.9393842  3.0474362  1.9395077  2.4932873  1.9389938
 2.4935644  1.6621485  2.2164943  1.6620538  1.3857011  3.3245587
 1.9391993  2.4939022  4.4322352  2.4932673 ]
Epoch 1 Loss [2.0150216 1.8196346 1.8931372 1.797775  1.7791157 1.7250419 1.9107745
 1.8059281 1.6422907 1.9229444 1.9472661 1.8743371 1.8137025 1.7421769
 1.9484313 1.886737  1.7773424 1.8563423 1.9572616 1.8851657 1.792235
 2.116726  1.8671862 1.9431593 1.9000729 1.9510716 

In [None]:
def evaluate(sentence):
    """Greedily translate one sentence with the trained encoder/decoder.

    Args:
        sentence: raw input-language sentence.

    Returns:
        ``(result, sentence, attention_plot)``: the predicted words joined by
        spaces (with a trailing space), the preprocessed input sentence, and a
        ``(max_length_targ, max_length_inp)`` array holding the attention
        weights of every decoding step.

    Raises:
        KeyError: if the sentence contains tokens that are not in the input
            vocabulary; the message lists all of them.
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)
    tokens = sentence.split(" ")

    # Report every out-of-vocabulary token at once instead of failing on the
    # first one with a bare KeyError that only shows the word.
    unknown = [tok for tok in tokens if tok not in inp_lang.word_index]
    if unknown:
        raise KeyError(
            "words not in the input vocabulary: "
            + ", ".join(repr(tok) for tok in unknown)
        )

    inputs = [inp_lang.word_index[tok] for tok in tokens]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post'
    )
    inputs = tf.convert_to_tensor(inputs)
    result = ''

    # Batch of one: a single all-zero initial encoder state.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        # storing the attention weights
        attention_plot[t] = tf.reshape(attention_weights, (-1, )).numpy()

        # Greedy decoding: take the highest-scoring token.
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = targ_lang.index_word[predicted_id]
        result += predicted_word + ' '
        if predicted_word == '<end>':
            break

        # The prediction becomes the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot
    

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention weights as a heat map.

    Args:
        attention: 2-D array of attention weights to draw.
        sentence: list of labels for the x axis.
        predicted_sentence: list of labels for the y axis.

    Both label arguments are concatenated with ``['']`` and must therefore be
    lists.
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    label_font = {'fontsize': 14}

    # A leading empty label is added in front of each list of labels.
    ax.set_xticklabels([''] + sentence, fontdict=label_font, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=label_font)

    # One tick per cell on both axes.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def translate(sentence):
    """Translate ``sentence`` and print the preprocessed input and the prediction.

    Args:
        sentence: raw input-language sentence.
    """
    # The attention matrix returned by evaluate() is not used here.
    result, sentence, _ = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [None]:
# Restore the models and optimizer from the most recent checkpoint.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
print(latest_checkpoint_path)
checkpoint.restore(latest_checkpoint_path)

training_checkpoints/ckpt-25


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4c92a172b0>

In [None]:
# Example translation.
translate("i love you.")

Input: <start> i love you . <end>
Predicted translation: मैं आपसे प्यार करता हूँ। <end> 


In [None]:
# Example translation.
translate("Flowers bloom.")

Input: <start> flowers bloom . <end>
Predicted translation: फूल खिलते हैं। <end> 


In [None]:
# Example translation.
translate("A man must work.")

Input: <start> a man must work . <end>
Predicted translation: एक आदमी के लिए काम करना ज़रूरी है। <end> 


In [None]:
# Example translation (question).
translate("Did you miss me?")

Input: <start> did you miss me ? <end>
Predicted translation: मेरी याद आई क्या ? <end> 


In [None]:
# Example translation.
translate("He came running.")

Input: <start> he came running . <end>
Predicted translation: वह भागते हुए आया। <end> 


In [None]:
# Write the encoder weights under the "encoder_weights2" prefix.
encoder.save_weights(filepath="encoder_weights2")

In [None]:
# Write the decoder weights under the "decoder_weights2" prefix.
decoder.save_weights(filepath="decoder_weights2")

In [None]:
from google.colab import files

In [None]:
# Download the saved decoder weight shard from the Colab runtime.
decoder_weights_file = "decoder_weights2.data-00000-of-00001"
files.download(decoder_weights_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>