In [1]:
import tensorflow as tf

# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

In [2]:
import sys
sys.path.append('../')

In [3]:
from tf_seq2seq.base import Encoder, Decoder
from tf_seq2seq.utils import dataset_download
from tf_seq2seq.text_preprocessing import load_and_tokenize

In [4]:
# may be omitted, provided we remove tests
from tf_seq2seq.text_preprocessing import (
  preprocess_sentence, unicode_to_ascii, create_dataset
)

In [5]:
import tensorflow
print(tensorflow.__version__)

2.3.0


# Dataset download
Resources: http://www.manythings.org/anki/

In [6]:
path_to_file = dataset_download('anki-spa-eng')

## Test dataset functions

In [7]:
target_sentences, input_sentences = create_dataset(path_to_file, None)
print(target_sentences[-1])
print(input_sentences[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [47]:
num_examples = 30000
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_and_tokenize(path_to_file, num_examples)
test_word = 'go'
test_symbols = np.arange(20)

print('A target sentence: {}'.format(target_sentences[0]))
print('A tokenized word: original: {}, tokenized:{}'.format(
        test_word, 
        targ_lang_tokenizer.texts_to_sequences([[test_word]])
    )
)

for symbol, word in zip(test_symbols, targ_lang_tokenizer.sequences_to_texts([test_symbols])[0].split(' ')):
    print('symbol: {}, word: {}'.format(symbol, word))

max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
print('Sentences input length: {}'.format(max_length_targ))
print('Sentences target length: {}'.format(max_length_inp))

A target sentence: <start> go . <end>
A tokenized word: original: go, tokenized:[[36]]
symbol: 0, word: <start>
symbol: 1, word: <end>
symbol: 2, word: .
symbol: 3, word: i
symbol: 4, word: tom
symbol: 5, word: you
symbol: 6, word: ?
symbol: 7, word: is
symbol: 8, word: a
symbol: 9, word: it
symbol: 10, word: s
symbol: 11, word: t
symbol: 12, word: the
symbol: 13, word: he
symbol: 14, word: to
symbol: 15, word: we
symbol: 16, word: me
symbol: 17, word: m
symbol: 18, word: this
Sentences input length: 11
Sentences target length: 16


# Configuration

In [9]:
num_examples = 200

BATCH_SIZE = 64
embedding_dim = 256
units = 1024


# Dataset creation

In [10]:
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_and_tokenize(path_to_file, num_examples)
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

In [11]:
BUFFER_SIZE = len(input_tensor_train)

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [12]:
vocab_inp_size = len(inp_lang_tokenizer.word_index)+1
vocab_tar_size = len(targ_lang_tokenizer.word_index)+1

steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

# Encoder

In [13]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [14]:
sample_hidden = encoder.initialize_hidden_state()

# sample input
example_input_batch, example_target_batch = next(iter(dataset))
print((example_input_batch.shape, example_target_batch.shape))

sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

(TensorShape([64, 7]), TensorShape([64, 6]))
Encoder output shape: (batch size, sequence length, units) (64, 7, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [15]:
output_np = sample_output.numpy()
state_np = sample_hidden.numpy()

In [16]:
np.linalg.norm(output_np.transpose([1,0,2])[-1] - state_np)

0.0

# Decoder

In [17]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [18]:
sample_decoder_output, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 111)


# Optimizer and loss function

In [19]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [20]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [21]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    # call input embedding
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # NO passing enc_output to the decoder
      predictions, dec_hidden = decoder(dec_input, dec_hidden)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [22]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.1157
Epoch 1 Loss 3.0421
Time taken for 1 epoch 23.558788061141968 sec

Epoch 2 Batch 0 Loss 2.9244
Epoch 2 Loss 2.8468
Time taken for 1 epoch 3.1116487979888916 sec

Epoch 3 Batch 0 Loss 2.2922
Epoch 3 Loss 2.2209
Time taken for 1 epoch 1.6587862968444824 sec

Epoch 4 Batch 0 Loss 1.9501
Epoch 4 Loss 1.9082
Time taken for 1 epoch 2.5115628242492676 sec

Epoch 5 Batch 0 Loss 1.8580
Epoch 5 Loss 1.8237
Time taken for 1 epoch 1.3530480861663818 sec

Epoch 6 Batch 0 Loss 1.7142
Epoch 6 Loss 1.7619
Time taken for 1 epoch 2.722677230834961 sec

Epoch 7 Batch 0 Loss 1.7264
Epoch 7 Loss 1.6830
Time taken for 1 epoch 1.3232486248016357 sec

Epoch 8 Batch 0 Loss 1.6749
Epoch 8 Loss 1.6443
Time taken for 1 epoch 2.1947288513183594 sec

Epoch 9 Batch 0 Loss 1.5846
Epoch 9 Loss 1.6057
Time taken for 1 epoch 1.2567236423492432 sec

Epoch 10 Batch 0 Loss 1.5750
Epoch 10 Loss 1.5805
Time taken for 1 epoch 1.8516712188720703 sec



# Translate

In [23]:
def evaluate(sentence):
  sentence = preprocess_sentence(sentence)

  inputs = [inp_lang_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden = decoder(dec_input,
                                      dec_hidden)

    predicted_id = tf.argmax(predictions[0]).numpy()

    predicted_word = targ_lang_tokenizer.index_word[predicted_id]
    result += predicted_word + ' '

    if predicted_word == '<end>':
      return result.strip(), sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [24]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

# Restore checkpoint

In [25]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f3276026128>

In [26]:
translate(u'Buenos días.')

KeyError: 'buenos'

In [27]:
translate(u'hace mucho frio aqui.')

KeyError: 'hace'

In [28]:
translate(u'esta es mi vida.')

KeyError: 'es'

In [29]:
translate(u'¿todavia estan en casa?')

KeyError: 'todavia'

In [30]:
translate(u'trata de averiguarlo.')

KeyError: 'trata'

In [31]:
type(target_tensor_train)

numpy.ndarray

In [32]:
testset = tf.data.Dataset.from_tensor_slices((input_tensor_val , target_tensor_val)).shuffle(BUFFER_SIZE)
testset = testset.batch(1, drop_remainder=True)

In [33]:
 for (batch, (inp, targ)) in enumerate(testset.take(10)):
   inp_sentence = ' '.join(inp_lang_tokenizer.sequences_to_texts(inp.numpy())[0].split(' ')[1:-1])
   targ_sentence = ' '.join(targ_lang_tokenizer.sequences_to_texts(targ.numpy())[0].split(' ')[1:-1])

   predicted_sentence, _ = evaluate(inp_sentence)
   predicted_sentence = ' '.join(predicted_sentence.split(' ')[:-1])
   print('\ninput: {}'.format(inp_sentence))
   print('target: {}'.format(targ_sentence))
   print('predicted: {}'.format(predicted_sentence))


input: me traslade .
target: i moved .
predicted: i .

input: imposible !
target: no way !
predicted: i .

input: largo !
target: go away .
predicted: i .

input: agarra a tom .
target: get tom .
predicted: i .

input: perfecto !
target: perfect !
predicted: i .

input: ¿ quien corria ?
target: who ran ?
predicted: i .

input: corre !
target: run !
predicted: i .

input: ayudame .
target: help me .
predicted: i .

input: espera un momento !
target: hang on !
predicted: i .

input: decime .
target: tell me .
predicted: i .
