In [None]:
# TensorFlow supplies the Keras layers, tf.data pipeline and autodiff used throughout this notebook.
import tensorflow as tf
# Show the installed version so the recorded outputs can be tied to a specific release.
print(tf.__version__)

2.7.0


In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used below
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the dataset on the mounted Drive. It is read below as UTF-8 text
# with one example per line and tab-separated fields.
path_to_file = "/content/drive/My Drive/datasets/deu.txt"

In [None]:
import io

# Read the whole dataset into memory as a list of lines. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open until the object happens to be garbage-collected.
with io.open(path_to_file, encoding='UTF-8') as dataset_file:
    lines = dataset_file.read().strip().split('\n')


In [None]:
# Total number of lines read from the dataset file.
len(lines)

248311

In [None]:
# Upper bound on how many leading lines of the file are turned into sentence pairs.
num_samples = 10000

In [None]:
# Parallel lists: input_texts[i] and target_texts[i] form one sentence pair.
input_texts = []
target_texts = []

In [None]:
# Start from empty lists so re-running this cell does not append duplicate pairs.
input_texts.clear()
target_texts.clear()

# The file content was already stripped of its trailing newline, so slice by
# num_samples directly; subtracting one from len(lines) would silently drop
# the last real example whenever num_samples >= len(lines).
for line in lines[:num_samples]:
    fields = line.split("\t")
    if len(fields) < 2:
        # No source/target pair on this row (blank or malformed line): skip it
        # rather than aborting the whole load with an unpacking error.
        continue
    # First column is the source sentence, second the target sentence; any
    # further columns are ignored.
    input_text, target_text = fields[0], fields[1]
    input_texts.append(input_text)
    target_texts.append(target_text)



In [None]:
# Number of sentence pairs actually loaded.
len(input_texts)

10000

In [None]:
# Wrap every sentence in explicit '<start>' / '<end>' markers so the decoder
# knows where a sequence begins and when to stop.
# The lists are updated in place; the guard makes the cell idempotent, so
# re-running it does not wrap already-marked sentences a second time.

for texts in (input_texts, target_texts):
  for i, sentence in enumerate(texts):
    already_marked = sentence.startswith('<start> ') and sentence.endswith(' <end>')
    if not already_marked:
      texts[i] = '<start> ' + sentence + ' <end>'

In [None]:
# Turn sentences into integer sequences.
# The tokenizer builds its own vocabulary from the texts it is fitted on.

def tokenize(texts):
  """Fit a word-level tokenizer on `texts` and encode them as a padded id matrix.

  Args:
    texts: list of sentences (strings) to build the vocabulary from and encode.

  Returns:
    A pair `(padded_ids, tokenizer)`: a 2-D integer array with one row per
    sentence, right-padded with zeros to the longest sentence, and the fitted
    Keras `Tokenizer` holding the word <-> index mappings.
  """
  # filters='' turns off character filtering, so punctuation stays attached to
  # words and the '<start>' / '<end>' markers are kept as single tokens.
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

  # Build the vocabulary, then map every sentence to its list of word ids.
  tokenizer.fit_on_texts(texts)
  sequences = tokenizer.texts_to_sequences(texts)

  # Sentences differ in length; pad at the end so all rows have equal length.
  padded_ids = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded_ids, tokenizer


In [None]:
# Separate vocabularies for the two sides: each call fits its own tokenizer and
# returns the padded id matrix together with that tokenizer.
inp_vec, inp_vocab = tokenize(input_texts)
targ_vec, targ_vocab = tokenize(target_texts)

In [None]:
# First encoded input sentence: word ids followed by zero padding.
inp_vec[0]

array([ 1, 56,  2,  0,  0,  0,  0], dtype=int32)

In [None]:
# Dump the full index -> word mapping of the input vocabulary (large output).
print(inp_vocab.index_word)

{1: '<start>', 2: '<end>', 3: 'i', 4: 'tom', 5: "i'm", 6: 'is', 7: 'a', 8: 'you', 9: 'it', 10: 'it.', 11: "it's", 12: 'was', 13: 'he', 14: 'tom.', 15: 'we', 16: 'me.', 17: 'do', 18: 'can', 19: "don't", 20: 'are', 21: 'to', 22: "i'll", 23: 'go', 24: 'get', 25: 'you.', 26: 'my', 27: "you're", 28: 'have', 29: 'the', 30: 'be', 31: 'she', 32: 'like', 33: 'come', 34: 'who', 35: "we're", 36: 'not', 37: "tom's", 38: 'up.', 39: 'keep', 40: 'this', 41: 'love', 42: 'they', 43: 'did', 44: 'that', 45: 'take', 46: 'am', 47: 'let', 48: 'here.', 49: 'need', 50: 'me', 51: "that's", 52: 'no', 53: 'how', 54: "he's", 55: "let's", 56: 'go.', 57: 'stop', 58: 'want', 59: 'home.', 60: 'that.', 61: 'saw', 62: 'him.', 63: 'this.', 64: 'got', 65: 'it?', 66: 'may', 67: 'us.', 68: 'now.', 69: 'has', 70: 'just', 71: 'in.', 72: "they're", 73: 'on', 74: "can't", 75: 'hate', 76: 'will', 77: 'you?', 78: "we'll", 79: 'try', 80: 'in', 81: 'look', 82: 'see', 83: 'please', 84: 'so', 85: 'know', 86: 'what', 87: 'too', 88: '

In [None]:
# Word stored under index 1 of the target vocabulary.
targ_vocab.index_word[1]

'<start>'

In [None]:
# Rank of the encoded input array (sentences x padded sentence length).
inp_vec.ndim

2

Data Preprocessing completed.



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs for evaluation. Inputs and targets are
# split together so the pairs stay aligned. A fixed random_state makes the
# split identical on every Restart & Run All, so results are reproducible.
input_train, input_test, target_train, target_test = train_test_split(
    inp_vec, targ_vec, test_size=0.2, random_state=42)

In [None]:
# Number of training examples after the split.
len(input_train)

8000

In [None]:
# Model and training hyperparameters

BUFFER_SIZE = len(input_train)  # shuffle buffer spans the whole training set
BATCH_SIZE = 64  # training batch size
# Batches are built with drop_remainder=True, so one epoch consists of
# floor(N / BATCH_SIZE) full batches. Derive the count from BATCH_SIZE (not a
# repeated literal 64) and keep it an integer so it is a true step count.
iter_per_epoch = len(input_train) // BATCH_SIZE
embedding_dim = 256
units = 1024  # hidden state size of the encoder and decoder GRUs
# +1 because tokenizer word indices start at 1 and index 0 is the padding value.
vocab_inp_size = len(inp_vocab.word_index) + 1
vocab_tar_size = len(targ_vocab.word_index) + 1

In [None]:
# Input pipeline: slice the aligned (input, target) arrays into examples,
# shuffle over the full buffer, then group into fixed-size batches. Incomplete
# trailing batches are dropped so every batch has exactly BATCH_SIZE rows.
dataset = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Show the batched dataset's element shapes and dtypes.
dataset

<BatchDataset shapes: ((64, 7), (64, 11)), types: (tf.int32, tf.int32)>

In [None]:
# Pull a single batch from the pipeline; it is reused below to sanity-check
# the encoder's output shapes.
input_batch, target_batch = next(iter(dataset))
input_batch.shape, target_batch.shape

(TensorShape([64, 7]), TensorShape([64, 11]))

In [None]:
# Second sentence of the sampled batch, as padded word ids.
input_batch[1]

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([   1,   19, 1214,   10,    2,    0,    0], dtype=int32)>

Encoder Class

In [None]:
class Encoder(tf.keras.Model):
  """Source-side network: an embedding lookup followed by a single GRU layer."""

  def __init__(self, vocab_size, embedding_dim, enc_hidden_units, batch_sz):
    """Create the encoder's layers.

    Args:
      vocab_size: number of entries in the embedding table.
      embedding_dim: width of each embedding vector.
      enc_hidden_units: number of GRU units (size of the hidden state).
      batch_sz: batch size used when building the initial hidden state.
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_hidden_units = enc_hidden_units

    # Token ids -> dense vectors.
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # The GRU returns both the per-time-step outputs (consumed by attention)
    # and the final state (handed on to the decoder).
    self.gru = tf.keras.layers.GRU(
        self.enc_hidden_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform',
    )

  def call(self, input, hidden):
    """Encode a batch of token-id sequences.

    Args:
      input: integer tensor of token ids, one row per sentence.
      hidden: initial GRU state for the batch.

    Returns:
      `(output, state)`: the GRU outputs for every time step and the final
      GRU state.
    """
    embedded = self.embedding(input)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape (batch_sz, enc_hidden_units)."""
    state_shape = (self.batch_sz, self.enc_hidden_units)
    return tf.zeros(state_shape)

In [None]:
# Instantiate the encoder and push the sampled batch through it as a shape check.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# Start from the all-zero state; the name is then rebound to the encoder's final state.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(input_batch, sample_hidden)

sample_output.shape, sample_hidden.shape

#sample output shape: [batch size, input seq length, hidden units]
#sample hidden state shape: [batch size, hidden units]

(TensorShape([64, 7, 1024]), TensorShape([64, 1024]))

Attention Mechanism

In [None]:
# Bahdanau (additive) attention

class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: scores each encoder time step against a decoder state."""

  def __init__(self, window_size):
    """Create the scoring layers.

    Args:
      window_size: number of units in the two projection layers.
    """
    super().__init__()
    self.L1 = tf.keras.layers.Dense(window_size)  # projects the decoder hidden state
    self.L2 = tf.keras.layers.Dense(window_size)  # projects the encoder outputs
    # Collapses the tanh of the summed projections to one score per time step.
    self.final_layer = tf.keras.layers.Dense(1)

  def call(self, decoder_hidden, encoder_output):
    """Compute attention weights and the resulting context vector.

    Args:
      decoder_hidden: state used as the query, one row per batch element.
      encoder_output: encoder outputs for every time step.

    Returns:
      `(attention_weights, context_vector1, context_vector)`: the softmax
      weights over the time axis, the encoder outputs scaled by those weights,
      and their sum over the time axis.
    """
    # Insert a time axis so the query broadcasts against every encoder step.
    query = tf.expand_dims(decoder_hidden, 1)

    # tanh is applied to the sum of the two projections; final_layer then
    # reduces the result to a single score per time step.
    projected = self.L1(query) + self.L2(encoder_output)
    score = self.final_layer(tf.nn.tanh(projected))

    # Normalise the scores across time steps (axis 1).
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weight each encoder output, then sum over time to get the context vector.
    context_vector1 = attention_weights * encoder_output
    context_vector = tf.reduce_sum(context_vector1, axis=1)

    return attention_weights, context_vector1, context_vector

In [None]:
# Stand-alone attention layer with 10 units in its projection layers, applied
# to the encoder's sample state and outputs as a shape check.
attention = BahdanauAttention(10)
weights, context_vec_intermediate, context_vect = attention(sample_hidden, sample_output)

In [None]:
# context_vec_intermediate is weights * sample_output: the weights' last axis
# of size 1 broadcasts across the hidden units of the encoder output.
weights.shape, sample_output.shape, context_vec_intermediate.shape

(TensorShape([64, 7, 1]),
 TensorShape([64, 7, 1024]),
 TensorShape([64, 7, 1024]))

In [None]:
# The final context vector has the time axis summed away.
context_vect.shape

TensorShape([64, 1024])

Decoder Class

In [None]:
# Decoder step: attend over the encoder outputs, embed the input token,
# concatenate the context vector with the embedding, run the GRU, and project
# the GRU output to vocabulary logits.

class Decoder(tf.keras.Model):
  """Single-step attention decoder: embedding + Bahdanau attention + GRU + Dense."""

  def __init__(self, vocab_size, embedding_dim, dec_hidden_units, batch_sz):
    """Create the decoder's layers.

    Args:
      vocab_size: size of the target vocabulary (embedding rows and logits).
      embedding_dim: width of each embedding vector.
      dec_hidden_units: number of GRU units (size of the decoder state).
      batch_sz: batch size; stored on the instance.
    """
    super(Decoder, self).__init__()
    self.dec_hidden_units = dec_hidden_units
    self.batch_sz = batch_sz

    # Token ids -> dense vectors.
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # GRU layer; returns the per-step outputs and the final state.
    self.gru = tf.keras.layers.GRU(self.dec_hidden_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

    # Projects GRU outputs to one logit per target-vocabulary entry
    # (no activation, so the outputs are raw logits).
    self.fully_connected = tf.keras.layers.Dense(vocab_size)

    # Produces the context vector from the encoder outputs.
    self.attention = BahdanauAttention(self.dec_hidden_units)

  def call(self, input, hidden, enc_output):
    """Run one decoding step.

    Args:
      input: token ids for the current step, shape (batch, 1).
      hidden: previous decoder state (the encoder's final state for the first
        step). It is the attention query and the GRU's initial state, so its
        width must equal `dec_hidden_units`.
      enc_output: encoder outputs for every source time step.

    Returns:
      `(final_output, states)`: vocabulary logits of shape (batch, vocab_size)
      and the new decoder state.
    """
    _, _, context_vec = self.attention(hidden, enc_output)

    # Embed the current input token.
    input = self.embedding(input)

    # Prepend the context vector to the embedding along the feature axis.
    input = tf.concat([tf.expand_dims(context_vec, 1), input], axis=-1)

    # Seed the GRU with the incoming state. Without initial_state the GRU
    # would start from zeros on every call, and because each call processes a
    # single time step the decoder would carry no memory at all.
    output, states = self.gru(input, initial_state=hidden)  # output: [batch, 1, hidden_units]
    output = tf.reshape(output, (-1, output.shape[2]))

    # Project to vocabulary logits.
    final_output = self.fully_connected(output)

    return final_output, states





In [None]:
# Instantiate the decoder and run one step on a dummy input of shape
# (BATCH_SIZE, 1), reusing the encoder's sample state and outputs, to check shapes.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

decoder_output, hidden_state = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

In [None]:
# The decoder emits one logit per target-vocabulary entry for each batch element.
vocab_tar_size, decoder_output.shape

(4997, TensorShape([64, 4997]))

Loss function

In [None]:
# Adam optimizer, constructed with no arguments (library defaults).

optimizer = tf.keras.optimizers.Adam()

# from_logits=True: the decoder's final Dense layer has no activation, so it emits raw logits.
# reduction='none': keep one loss value per example so padded positions can be masked out.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

# Masked loss for one decoding step
def loss_function(real, pred):
  """Cross-entropy for one time step, with padded positions contributing zero.

  Args:
    real: target token ids for this step, one per batch element; id 0 marks padding.
    pred: decoder logits for this step.

  Returns:
    Scalar: mean over the batch of the per-example losses after padded
    positions have been zeroed (they still count in the denominator).
  """
  per_example_loss = loss_object(real, pred)

  # 1.0 where the target is a real token, 0.0 where it is padding.
  is_real_token = tf.math.not_equal(real, 0)
  per_example_loss *= tf.cast(is_real_token, dtype=per_example_loss.dtype)

  return tf.reduce_mean(per_example_loss)

Training the model

In [None]:
def train_step(inp, targ, enc_hidden):
  """Run one optimisation step on a single batch using teacher forcing.

  Args:
    inp: batch of padded input token ids.
    targ: batch of padded target token ids; column 0 holds the '<start>' token.
    enc_hidden: initial hidden state for the encoder.

  Returns:
    The batch loss: the summed per-step loss divided by the target length.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_state = encoder(inp, enc_hidden)

    # The decoder starts from the encoder's final state and then carries its
    # own state forward. Feeding the encoder state at every step would throw
    # away the state the decoder returns and leave it with no memory of the
    # tokens it has already produced.
    dec_hidden = enc_state

    # <start> is the first decoder input
    dec_input = tf.expand_dims([targ_vocab.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):

      # Pass the previous decoder state and enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)

      # Accumulate the masked loss for this time step
      loss += loss_function(targ[:, t], predictions)

      # Use teacher forcing: the ground-truth token becomes the next input
      dec_input = tf.expand_dims(targ[:, t], 1)

  # As this function is called per batch, compute the batch_loss
  batch_loss = (loss / int(targ.shape[1]))

  # Get the model's variables
  variables = encoder.trainable_variables + decoder.trainable_variables

  # Compute the gradients of the summed loss
  gradients = tape.gradient(loss, variables)

  # Update the variables of the model/network
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Dataset view limited to one epoch's worth of batches (same element shapes as `dataset`).
dataset.take(int(iter_per_epoch))

<TakeDataset shapes: ((64, 7), (64, 11)), types: (tf.int32, tf.int32)>

In [None]:
# Number of batches processed per epoch.
int(iter_per_epoch)

125

In [None]:
import time

EPOCHS = 10

# Train for EPOCHS passes over the batched dataset, reporting loss and timing.
for epoch in range(EPOCHS):
  start = time.time()

  # Each epoch starts from a fresh all-zero encoder state and a zeroed loss total.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for batch, (inp, targ) in enumerate(dataset.take(int(iter_per_epoch))):
    # One optimisation step; returns the loss for this batch.
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress report every 100 batches.
    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

  # Mean batch loss over the epoch, then the wall-clock time it took.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / iter_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.4233
Epoch 1 Batch 100 Loss 1.9615
Epoch 1 Loss 2.0434
Time taken for 1 epoch 441.9445195198059 sec

Epoch 2 Batch 0 Loss 1.6276
Epoch 2 Batch 100 Loss 1.4435
Epoch 2 Loss 1.5594
Time taken for 1 epoch 411.04077196121216 sec

Epoch 3 Batch 0 Loss 1.3184
Epoch 3 Batch 100 Loss 1.3920
Epoch 3 Loss 1.3385
Time taken for 1 epoch 441.9105978012085 sec

Epoch 4 Batch 0 Loss 1.1486
Epoch 4 Batch 100 Loss 1.1733
Epoch 4 Loss 1.1728
Time taken for 1 epoch 413.1242368221283 sec

Epoch 5 Batch 0 Loss 1.0115
Epoch 5 Batch 100 Loss 1.0362
Epoch 5 Loss 1.0003
Time taken for 1 epoch 441.91834902763367 sec

Epoch 6 Batch 0 Loss 0.7807
Epoch 6 Batch 100 Loss 0.8846
Epoch 6 Loss 0.8322
Time taken for 1 epoch 408.433123588562 sec

Epoch 7 Batch 0 Loss 0.7400
Epoch 7 Batch 100 Loss 0.6545
Epoch 7 Loss 0.6811
Time taken for 1 epoch 409.8346116542816 sec

Epoch 8 Batch 0 Loss 0.4465
Epoch 8 Batch 100 Loss 0.6049
Epoch 8 Loss 0.5540
Time taken for 1 epoch 409.4323625564575 sec

Epoch 9