<a href="https://colab.research.google.com/github/praveenjune17/English_Tamil_parallel_corpus/blob/master/eng_tam_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow-gpu==2.0.0-beta1

# Upgrade the beam_search script to TF2 in tensor2tensor since the exsisting is not compatible with TF2
!tf_upgrade_v2 \
  --infile /usr/local/lib/python3.6/dist-packages/tensor2tensor/utils/beam_search.py \
  --outfile /usr/local/lib/python3.6/dist-packages/tensor2tensor/utils/beam_search.py

from __future__ import absolute_import, division, print_function, unicode_literals
from tensor2tensor.utils.beam_search import beam_search
from collections import defaultdict
import os
import shutil

Collecting tensorflow-gpu==2.0.0-beta1
[?25l  Downloading https://files.pythonhosted.org/packages/2b/53/e18c5e7a2263d3581a979645a185804782e59b8e13f42b9c3c3cfb5bb503/tensorflow_gpu-2.0.0b1-cp36-cp36m-manylinux1_x86_64.whl (348.9MB)
[K     |████████████████████████████████| 348.9MB 54kB/s 
Collecting tb-nightly<1.14.0a20190604,>=1.14.0a20190603 (from tensorflow-gpu==2.0.0-beta1)
[?25l  Downloading https://files.pythonhosted.org/packages/a4/96/571b875cd81dda9d5dfa1422a4f9d749e67c0a8d4f4f0b33a4e5f5f35e27/tb_nightly-1.14.0a20190603-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 30.4MB/s 
Collecting tf-estimator-nightly<1.14.0.dev2019060502,>=1.14.0.dev2019060501 (from tensorflow-gpu==2.0.0-beta1)
[?25l  Downloading https://files.pythonhosted.org/packages/32/dd/99c47dd007dcf10d63fd895611b063732646f23059c618a373e85019eb0e/tf_estimator_nightly-1.14.0.dev2019060501-py2.py3-none-any.whl (496kB)
[K     |████████████████████████████████| 501kB 43.5MB/s 
Installing c

In [2]:
# Clone the patch scripts 
if not os.path.exists('Neural-Machine-Translation-English-Tamil-model'):
  !git clone https://github.com/praveenjune17/Neural-Machine-Translation-English-Tamil-model
  !unzip Neural-Machine-Translation-English-Tamil-model/tfds_patch_scripts/Transformer_en_tam_2.zip
  
#set the path to the where tfds is installed
path = '/usr/local/lib/python3.6/dist-packages/tensorflow_datasets'
shutil.copy('../content/Transformer_en_tam_2/__init__.py', os.path.join(path, 'translate/__init__.py'))
shutil.copy('../content/Transformer_en_tam_2/en_tam_parallel_text.py', os.path.join(path, 'translate/en_tam_parallel_text.py'))
shutil.copy('../content/Transformer_en_tam_2/en_tam_parallel_text_test.py', os.path.join(path, 'translate/en_tam_parallel_text_test.py'))
shutil.copy('../content/Transformer_en_tam_2/en_tam_parallel_text.txt', os.path.join(path, 'url_checksums/en_tam_parallel_text.txt'))

Cloning into 'Neural-Machine-Translation-English-Tamil-model'...
remote: Enumerating objects: 84, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 299 (delta 39), reused 0 (delta 0), pack-reused 215[K
Receiving objects: 100% (299/299), 125.97 MiB | 38.94 MiB/s, done.
Resolving deltas: 100% (147/147), done.
Archive:  Neural-Machine-Translation-English-Tamil-model/tfds_patch_scripts/Transformer_en_tam_2.zip
  inflating: Transformer_en_tam_2/__init__.py  
  inflating: Transformer_en_tam_2/en_tam_parallel_text.py  
  inflating: Transformer_en_tam_2/en_tam_parallel_text.txt  
  inflating: Transformer_en_tam_2/en_tam_parallel_text_test.py  


'/usr/local/lib/python3.6/dist-packages/tensorflow_datasets/url_checksums/en_tam_parallel_text.txt'

In [0]:
# Import tfds only after the patch is copied.
import tensorflow_datasets as tfds
import tensorflow as tf
tf.random.set_seed(100)

import time
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Download the weights of English to tamil Translation model(pre-trained) and vocab file from my Gdrive
ckpt_path = './downloaded_ckpts'
if not os.path.exists(ckpt_path):
  os.mkdir(ckpt_path)
dl_manager = tfds.download.DownloadManager(download_dir = os.getcwd(), extract_dir=ckpt_path, dataset_name='en_tam_parallel_text',register_checksums=True)
checkpoint_link = 'https://drive.google.com/uc?export=download&id=1-1ZUoHBfqn9PTZCGRRdoPZEJVdQfcXWG'
vocab_link =      'https://drive.google.com/uc?export=download&id=0B_L4yAn2bWYtWXFDaWp3REdYdnFuWUt3Q1Y1bDRPOEtPcGtR'
vocab_path,checkpoint_path  = dl_manager.download_and_extract([vocab_link, checkpoint_link])
en_vocab_path = os.path.join(vocab_path, 'vocab_en_cleaned653791')
ta_vocab_path = os.path.join(vocab_path, 'vocab_ta_cleaned653791')

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Completed...', max=1, style=ProgressStyl…

HBox(children=(IntProgress(value=1, bar_style='info', description='Dl Size...', max=1, style=ProgressStyle(des…

HBox(children=(IntProgress(value=1, bar_style='info', description='Extraction completed...', max=1, style=Prog…









In [0]:
tokenizer_en = tfds.features.text.SubwordTextEncoder.load_from_file(en_vocab_path)
tokenizer_ta = tfds.features.text.SubwordTextEncoder.load_from_file(ta_vocab_path)

In [8]:
def encode(lang1, lang2):
  lang1 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang1.numpy()) + [tokenizer_en.vocab_size+1]

  lang2 = [tokenizer_ta.vocab_size] + tokenizer_ta.encode(
      lang2.numpy()) + [tokenizer_ta.vocab_size+1]
  
  return lang1, lang2

def filter_max_length(x, y, max_length=MAX_LENGTH):
  
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

def tf_encode(en, ta):
  return tf.py_function(encode, [en, ta], [tf.int64, tf.int64])

NameError: ignored

In [0]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  sines = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  cosines = np.cos(angle_rads[:, 1::2])
  
  pos_encoding = np.concatenate([sines, cosines], axis=-1)
  
  pos_encoding = pos_encoding[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [0]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions so that we can add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  #(1 - lower_triangular_matrix)
  return mask  # (seq_len, seq_len)

def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights
  
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [0]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [0]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    
    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  #(refer last line of 3.4 Embeddings and Softmax)
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

In [0]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, 
               rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [0]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [16]:
num_layers = 4  #denoted as 'L' in BERT , no.of blocks
d_model = 256   #ll sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel=512.
dff = 1024      #denoted as 'H' in BERT 
num_heads = 4   #denoted as 'A' in BERT
input_vocab_size = tokenizer_en.vocab_size + 2
target_vocab_size = tokenizer_ta.vocab_size + 2
dropout_rate = 0.3
print('english vocab size is {} '.format(input_vocab_size))
print('tamil vocab size is {}'.format(target_vocab_size))

english vocab size is 8155 
tamil vocab size is 7152


In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [0]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

In [0]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)  #output 1 if padded 0 is present else 0
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)
  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.allows decoder to attend to all positions in the decoder up to and including that position(refer architecture)
  dec_target_padding_mask = create_padding_mask(tar)
  
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

In [29]:
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.

_=ckpt.restore(os.path.join(checkpoint_path, 'ckpt-411')).expect_partial()


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f70354ea630>

In [0]:
# beam search with beam size 3

def bs(inp_sentence):
  inp_sentence = inp_sentence
  beam_size = 4
  start_token = [tokenizer_en.vocab_size]
  end_token = [tokenizer_en.vocab_size + 1]
  inp_sentence = start_token + tokenizer_en.encode(inp_sentence) + end_token
  encoder_input = tf.expand_dims(inp_sentence, 0)
  encoder_input = tf.concat([encoder_input]*beam_size, axis=0)
  start = tokenizer_ta.vocab_size
  end = tokenizer_ta.vocab_size+1
  def transformer_query(output):

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
          encoder_input, output)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)


    return (predictions[:,-1:,:])
  return beam_search(transformer_query, [start], beam_size, 30, target_vocab_size, 1, stop_early=True, eos_id=[tokenizer_ta.vocab_size+1])

In [45]:
start = time.time()
text = input('Enter a english sentence:- ')
translated_output = bs(text)[0][0]
print(tokenizer_ta.decode([j for j in translated_output[0] if j < tokenizer_ta.vocab_size]))
print('time to process {}'.format(time.time()-start))

Enter a english sentence:- I'm not a bad coder
குறைந்தபட்சம் இல்லை
time to process 12.996816158294678
