<a href="https://colab.research.google.com/github/praveenjune17/English_Tamil_parallel_corpus/blob/master/Train_eng_tam_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**a)**  Please refer  https://www.tensorflow.org/beta/tutorials/text/transformer#create_the_transformer to gain indepth knowledge about transformers .

**b)**     Change runtime to GPU before running this script

In [1]:
!pip install tensorflow-gpu==2.0.0-beta1

# Upgrade the beam_search script to TF2 in tensor2tensor since the exsisting is not compatible with TF2
!tf_upgrade_v2 \
  --infile /usr/local/lib/python3.6/dist-packages/tensor2tensor/utils/beam_search.py \
  --outfile /usr/local/lib/python3.6/dist-packages/tensor2tensor/utils/beam_search.py

from __future__ import absolute_import, division, print_function, unicode_literals
from tensor2tensor.utils.beam_search import beam_search
from collections import defaultdict
import os
import shutil

TensorFlow 2.0 Upgrade Script
-----------------------------
Converted 1 files
Detected 0 issues that require attention
--------------------------------------------------------------------------------


Make sure to read the detailed log 'report.txt'



### Patch script to add eng-tam parallel corpus into tensorflow datasets

In [2]:
if not os.path.exists('Neural-Machine-Translation-English-Tamil-model'):
  !git clone https://github.com/praveenjune17/Neural-Machine-Translation-English-Tamil-model
  !unzip Neural-Machine-Translation-English-Tamil-model/tfds_patch_scripts/Transformer_en_tam_2.zip
  
#set the path to the where tfds is installed
path = '/usr/local/lib/python3.6/dist-packages/tensorflow_datasets'
shutil.copy('../content/Transformer_en_tam_2/__init__.py', os.path.join(path, 'translate/__init__.py'))
shutil.copy('../content/Transformer_en_tam_2/en_tam_parallel_text.py', os.path.join(path, 'translate/en_tam_parallel_text.py'))
shutil.copy('../content/Transformer_en_tam_2/en_tam_parallel_text_test.py', os.path.join(path, 'translate/en_tam_parallel_text_test.py'))
shutil.copy('../content/Transformer_en_tam_2/en_tam_parallel_text.txt', os.path.join(path, 'url_checksums/en_tam_parallel_text.txt'))

'/usr/local/lib/python3.6/dist-packages/tensorflow_datasets/url_checksums/en_tam_parallel_text.txt'

In [0]:
# Import tfds only after the patch is copied.
import tensorflow_datasets as tfds
import tensorflow as tf
tf.random.set_seed(100)

import time
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# using a single sample(github_joshua_en_ta) for training 
# this dataset contains 110K lines of en, ta parallel text
examples, meta_data  = tfds.load('en_tam_parallel_text/github_joshua_en_ta', with_info=True, as_supervised=True)

W0812 05:26:28.934224 140573774956416 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/util/random_seed.py:58: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
BATCH_SIZE = 64
# Drop examples of token size greater than 50 
MAX_LENGTH = 50                                                 


In [0]:
# Subword generation API # Took 180secs to complete in the last run

train_examples = examples['train']
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
  (en.numpy() for  en, _ in train_examples), target_vocab_size=2**13)
tokenizer_ta = tfds.features.text.SubwordTextEncoder.build_from_corpus(
  (ta.numpy() for _, ta in train_examples), target_vocab_size=2**13)

In [7]:
sample_string = 'Transformer is awesome.'

tokenized_string_eng = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string_eng))

original_string = tokenizer_en.decode(tokenized_string_eng)
print ('The original string: {}'.format(original_string))

assert original_string == sample_string

Tokenized string is [8123, 1227, 5516, 297, 2308, 8071, 8, 7984, 94, 3439, 8085]
The original string: Transformer is awesome.


In [8]:
sample_string = 'நீங்கள் இங்கு அரட்டை அடிக்க அனுமதி இல்லை'
tokenized_string_ta = tokenizer_ta.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string_ta))

original_string = tokenizer_ta.decode(tokenized_string_ta)
print ('The original string: {}'.format(original_string))
assert original_string == sample_string

Tokenized string is [21, 32, 52, 1, 19, 4, 258, 1, 5, 20, 174, 11, 1, 11, 24, 84, 2, 5, 1, 33, 109, 3, 104, 29, 133, 1, 12, 10]
The original string: நீங்கள் இங்கு அரட்டை அடிக்க அனுமதி இல்லை


In [9]:
for ts in tokenized_string_eng:
  print ('{} ----> {}'.format(ts, tokenizer_en.decode([ts])))
for ts in tokenized_string_ta:
  print ('{} ----> {}'.format(ts, tokenizer_ta.decode([ts])))

8123 ----> T
1227 ----> ran
5516 ----> sf
297 ----> or
2308 ----> mer
8071 ---->  
8 ----> is 
7984 ----> aw
94 ----> es
3439 ----> ome
8085 ----> .
21 ----> ந
32 ----> ீ
52 ----> ங
1 ----> ்
19 ----> கள
4 ----> ் 
258 ----> இங
1 ----> ்
5 ----> க
20 ----> ு 
174 ----> அர
11 ----> ட
1 ----> ்
11 ----> ட
24 ----> ை 
84 ----> அட
2 ----> ி
5 ----> க
1 ----> ்
33 ----> க 
109 ----> அன
3 ----> ு
104 ----> மத
29 ----> ி 
133 ----> இல
1 ----> ்
12 ----> ல
10 ----> ை


In [0]:
def encode(lang1, lang2):
  lang1 = [tokenizer_en.vocab_size] + tokenizer_en.encode(
      lang1.numpy()) + [tokenizer_en.vocab_size+1]

  lang2 = [tokenizer_ta.vocab_size] + tokenizer_ta.encode(
      lang2.numpy()) + [tokenizer_ta.vocab_size+1]
  
  return lang1, lang2

def filter_max_length(x, y, max_length=MAX_LENGTH):
  
  return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

def tf_encode(en, ta):
  return tf.py_function(encode, [en, ta], [tf.int64, tf.int64])

In [11]:
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
BUFFER_SIZE = 160000
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE, seed=100).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
en_batch, ta_batch = next(iter(train_dataset))
en_batch, ta_batch

W0812 05:29:13.175656 140571801765632 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0812 05:29:13.177540 140571801765632 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0812 05:29:13.179733 140571818551040 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0812 05:29:13.180990 140571818551040 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0812 05:29:13.192764 140571818551040 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


(<tf.Tensor: id=1428288, shape=(64, 13), dtype=int64, numpy=
 array([[8295, 2745,   81, 8296,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 1247, 8296,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 7783, 3357, 1050, 8296,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 4905, 5910, 1548, 8136, 8296,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 5245,   52, 6608, 8296,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 2161, 1385, 8296,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 2485, 8296,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 7259,  804, 8296,    0,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 5183, 3848, 8154, 8296,    0,    0,    0,    0,    0,    0,
            0,    0],
        [8295, 6696,  120, 1634, 8296,    0,    0,    0,    0,    

In [0]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  sines = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  cosines = np.cos(angle_rads[:, 1::2])
  
  pos_encoding = np.concatenate([sines, cosines], axis=-1)
  
  pos_encoding = pos_encoding[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [0]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions so that we can add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [0]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  #(1 - lower_triangular_matrix)
  return mask  # (seq_len, seq_len)

In [0]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [0]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights

In [0]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [0]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [0]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    
    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  #(refer last line of 3.4 Embeddings and Softmax)
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

In [0]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, 
               rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(target_vocab_size, self.d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [0]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [23]:
num_layers = 4  #denoted as 'L' in BERT , no.of blocks
d_model = 256   #ll sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel=512.
dff = 1024      #denoted as 'H' in BERT 
num_heads = 4   #denoted as 'A' in BERT
input_vocab_size = tokenizer_en.vocab_size + 2
target_vocab_size = tokenizer_ta.vocab_size + 2
dropout_rate = 0.3
print('english vocab size is {} '.format(input_vocab_size))
print('tamil vocab size is {}'.format(target_vocab_size))

english vocab size is 8297 
tamil vocab size is 4703


In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [0]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [0]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [0]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_mean(loss_)

In [0]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [0]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, dropout_rate)

In [0]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)  #output 1 if padded 0 is present else 0
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)
  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.allows decoder to attend to all positions in the decoder up to and including that position(refer architecture)
  dec_target_padding_mask = create_padding_mask(tar)
  
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

In [0]:
checkpoint_path = './Checkpoints/Train_from_scratch'

In [0]:
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if (ckpt_manager.latest_checkpoint):
  ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
  print ('Latest checkpoint restored is {} !!'.format(ckpt_manager.latest_checkpoint.split('/')[-1]))
  print ('size of the checkpoint directory is {}MB '.format(sum(os.path.getsize(os.path.join(checkpoint_path,f)) for f in os.listdir(checkpoint_path))/(1024*1024)))

In [0]:
EPOCHS = 10

In [0]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)
    
  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(tar_real, predictions)

In [0]:
for epoch in range(EPOCHS): #train for 4 hours
  
  start = time.time()  
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  # inp -> english, tar -> tamil
  for (batch, (inp, tar)) in enumerate(train_dataset):
    train_step(inp, tar)
    

    if batch % 50 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
    # save every 1000 batch
    if batch % 1000 == 0:
      ckpt_save_path = ckpt_manager.save()
      
      print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                            ckpt_save_path))
      

  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))
  print ('Time taken for {} epoch : {} secs\n'.format(epoch + 1, time.time() - start))
  print('predicted translation {}'.format(tokenizer_ta.decode([j for j in bs('hello')[0][0][0] if j < tokenizer_ta.vocab_size])))
  ckpt_save_path = ckpt_manager.save()

Epoch 1 Batch 0 Loss 0.4291 Accuracy 0.1569
Saving checkpoint for epoch 1 at ./Checkpoints/Train_from_scratch/ckpt-4
Epoch 1 Batch 50 Loss 0.5097 Accuracy 0.1593
Epoch 1 Batch 100 Loss 0.5027 Accuracy 0.1570
Epoch 1 Batch 150 Loss 0.4975 Accuracy 0.1555
Epoch 1 Batch 200 Loss 0.4973 Accuracy 0.1557
Epoch 1 Batch 250 Loss 0.4949 Accuracy 0.1555
Epoch 1 Batch 300 Loss 0.4952 Accuracy 0.1555
Epoch 1 Batch 350 Loss 0.4941 Accuracy 0.1554
Epoch 1 Batch 400 Loss 0.4928 Accuracy 0.1555
Epoch 1 Batch 450 Loss 0.4914 Accuracy 0.1554
Epoch 1 Batch 500 Loss 0.4911 Accuracy 0.1554
Epoch 1 Batch 550 Loss 0.4908 Accuracy 0.1550
Epoch 1 Batch 600 Loss 0.4905 Accuracy 0.1547
Epoch 1 Batch 650 Loss 0.4911 Accuracy 0.1546
Epoch 1 Batch 700 Loss 0.4928 Accuracy 0.1551
Epoch 1 Batch 750 Loss 0.4929 Accuracy 0.1552
Epoch 1 Batch 800 Loss 0.4930 Accuracy 0.1550
Epoch 1 Batch 850 Loss 0.4930 Accuracy 0.1551
Epoch 1 Batch 900 Loss 0.4931 Accuracy 0.1552
Epoch 1 Batch 950 Loss 0.4937 Accuracy 0.1553
Epoch 1 Ba

In [0]:
# beam search with beam size 3

def bs(inp_sentence):
  inp_sentence = inp_sentence
  beam_size = 4
  start_token = [tokenizer_en.vocab_size]
  end_token = [tokenizer_en.vocab_size + 1]
  inp_sentence = start_token + tokenizer_en.encode(inp_sentence) + end_token
  encoder_input = tf.expand_dims(inp_sentence, 0)
  encoder_input = tf.concat([encoder_input]*beam_size, axis=0)
  start = tokenizer_ta.vocab_size
  end = tokenizer_ta.vocab_size+1
  def transformer_query(output):

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
          encoder_input, output)
    predictions, attention_weights = transformer(encoder_input, 
                                                 output,
                                                 False,
                                                 enc_padding_mask,
                                                 combined_mask,
                                                 dec_padding_mask)


    return (predictions[:,-1:,:])
  return beam_search(transformer_query, [start], beam_size, 30, target_vocab_size, 1, stop_early=True, eos_id=[tokenizer_ta.vocab_size+1])

In [0]:
start = time.time()
text = input('Enter a english sentence:- ')
translated_output = bs(text)[0][0]
print('probable outputs')
for dec_ids in translated_output:
  print(tokenizer_ta.decode([j for j in dec_ids if j < tokenizer_ta.vocab_size]))

print('time to process {}'.format(time.time()-start))

In [0]:
# Add more data
#https://github.com/praveenjune17/Neural-Machine-Translation-English-Tamil-model/blob/master/tfds_patch_scripts/Transformer_en_tam_2.zip


# The dataset used here contains 100K lines (even lesser after applying filter_max_length) so with more data the translation
  #quality will be better. Have trained a model with 450Klines of parallel text , data were downloaded from the below links
 
  #If you are willing to contribute additional data that is not in
  #https://github.com/praveenjune17/English_Tamil_parallel_corpus/blob/master/Available_download_links then Please add.
  
  
# increase MAX_LENGTH to 100 or 120 to train long sentences but make sure the training  is carried out on high end GPUs.
  # In Colab GPU crashes when MAX_LENGTH > 51

# train tamil to english model
  
# Please refer to https://github.com/praveenjune17/datasets/blob/master/tensorflow_datasets/translate/en_tam_parallel_text.py
  # for the preprocessing steps carried out.Open for suggestions