In [180]:
import nltk
import numpy as np
import re
import shutil
import tensorflow as tf
import os
import unicodedata

from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction

In [181]:
def preprocess_sentence(sent):
  """Reduce a raw sentence to lowercase ASCII letters plus ``!``, ``.`` and ``?``.

  Steps, in order:
    1. Decompose to NFD and drop combining marks (Unicode category ``Mn``),
       which removes accents while keeping the base letter.
    2. Replace every run of characters other than ASCII letters and ``!.?``
       with a single space (this also removes digits, apostrophes, hyphens).
    3. Collapse whitespace runs and lowercase.

  Leading/trailing spaces are NOT stripped, and punctuation stays attached to
  the preceding word.

  Args:
    sent: raw sentence string.

  Returns:
    The cleaned sentence string.
  """
  decomposed = unicodedata.normalize('NFD', sent)
  base_chars = []
  for ch in decomposed:
    if unicodedata.category(ch) == 'Mn':
      # Combining mark (accent, cedilla, ...) left over from the NFD split.
      continue
    base_chars.append(ch)
  cleaned = "".join(base_chars)
  cleaned = re.sub(r"[^a-zA-Z!.?]+", r" ", cleaned)
  cleaned = re.sub(r"\s+", " ", cleaned)
  return cleaned.lower()

In [182]:
def download_and_read(local_file='/content/drive/MyDrive/Colab Notebooks/livroDeDeepLearning/cap-8-recurrent-neural-networks/datasets/fra.txt',
                      max_pairs=30002):
  """Read tab-separated English/French sentence pairs and tokenize them.

  Despite the name, nothing is downloaded: the file must already exist at
  ``local_file``. Each line is expected to hold the English sentence and the
  French sentence in its first two tab-separated columns; any further columns
  are ignored.

  Args:
    local_file: path of the tab-separated corpus. Defaults to the notebook's
      original Google Drive location.
    max_pairs: maximum number of sentence pairs to read, or ``None`` for the
      whole file. The default of 30002 reproduces the original cutoff
      (``i > 30000`` was checked after the append).

  Returns:
    A tuple ``(en_sents, fr_sents_in, fr_sents_out)`` of parallel lists of
    token lists: the English tokens, the French tokens prefixed with ``BOS``
    (decoder input) and the French tokens suffixed with ``EOS`` (decoder
    target).
  """
  en_sents,fr_sents_in,fr_sents_out = [],[],[]
  # Explicit UTF-8: the French side contains accented characters, and the
  # default encoding of open() depends on the platform locale.
  # NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
  with open(local_file,'r',encoding='utf-8') as fin:
    # Iterate lazily instead of readlines() so only the needed prefix of the
    # file is ever read.
    for line in fin:
      if max_pairs is not None and len(en_sents) >= max_pairs:
        break

      fields = line.strip().split('\t')
      if len(fields) < 2:
        # Blank or malformed line: skip it instead of crashing on unpacking.
        continue

      en_sent = preprocess_sentence(fields[0])
      fr_sent = preprocess_sentence(fields[1])

      en_sents.append(en_sent.split())
      fr_sents_in.append(("BOS "+fr_sent).split())
      fr_sents_out.append((fr_sent + " EOS").split())

  return en_sents,fr_sents_in, fr_sents_out

In [183]:
# Load the corpus as three parallel lists of token lists: English sentences,
# French decoder inputs ("BOS"-prefixed) and French decoder targets
# ("EOS"-suffixed).
sents_en, sents_fr_in, sents_fr_out = download_and_read()

In [184]:
# English vocabulary. filters='' and lower=False keep the tokenizer from
# altering the tokens; the sentences are passed in as lists of tokens.
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(lower=False, filters='')
tokenizer_en.fit_on_texts(sents_en)
# Convert tokens to integer ids, then pad at the end of each sequence so all
# rows share the length of the longest sentence.
data_en = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer_en.texts_to_sequences(sents_en),
    padding='post',
)

In [185]:
# A single French tokenizer is fitted on both the decoder inputs and the
# decoder targets so that "BOS", "EOS" and every word share one id space.
tokenizer_fr = tf.keras.preprocessing.text.Tokenizer(lower=False, filters='')
tokenizer_fr.fit_on_texts(sents_fr_in)
tokenizer_fr.fit_on_texts(sents_fr_out)

# Integer-encode each side and pad at the end of every sequence.
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer_fr.texts_to_sequences(sents_fr_in),
    padding='post',
)
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer_fr.texts_to_sequences(sents_fr_out),
    padding='post',
)

In [186]:
# word -> id tables built by the tokenizers.
word2idx_en = tokenizer_en.word_index
word2idx_fr = tokenizer_fr.word_index

# id -> word tables (inverse of the above), used to turn predictions back
# into text.
idx2word_en = dict(zip(word2idx_en.values(), word2idx_en.keys()))
idx2word_fr = dict(zip(word2idx_fr.values(), word2idx_fr.keys()))

# Number of distinct words per language. Id 0 is not in word_index, so it is
# not counted here.
vocab_size_en = len(word2idx_en)
vocab_size_fr = len(word2idx_fr)

print("vocab size (en): {:d}, vocab size (fr): {:d}".format(vocab_size_en, vocab_size_fr))

vocab size (en): 6258, vocab size (fr): 9835


In [187]:
NUM_SENT_PAIRS = len(sents_en)
batch_size = 64
dataset = tf.data.Dataset.from_tensor_slices((data_en,data_fr_in,data_fr_out))
# Shuffle exactly ONCE before splitting. By default shuffle() reshuffles every
# time the dataset is iterated, so take()/skip() would select a different
# test/train partition on every epoch and test examples would leak into
# training. reshuffle_each_iteration=False freezes the order; the fixed seed
# makes the split reproducible across kernel restarts.
# The buffer covers the whole corpus so the split is a uniform sample rather
# than being biased toward the first rows of the file.
dataset = dataset.shuffle(NUM_SENT_PAIRS, seed=42, reshuffle_each_iteration=False)
test_size = NUM_SENT_PAIRS // 4
test_dataset = dataset.take(test_size).batch(batch_size,drop_remainder=True)
# Only the training split is reshuffled per epoch, after the split is fixed.
# drop_remainder=True keeps every batch at exactly batch_size rows.
train_dataset = dataset.skip(test_size).shuffle(10000).batch(batch_size,drop_remainder=True)

In [188]:
class Encoder(tf.keras.Model):
  """Embedding + GRU encoder for the source sentence.

  The GRU is built with ``return_sequences=False`` and ``return_state=True``,
  so calling the model yields the last-step output together with the final
  hidden state.
  """

  def __init__(self,vocab_size,num_timesteps,embedding_dim,encoder_dim,**kwargs):
    """Create the embedding and recurrent layers.

    Args:
      vocab_size: number of rows in the embedding table.
      num_timesteps: forwarded to the embedding layer as ``input_length``.
      embedding_dim: size of each token embedding.
      encoder_dim: number of GRU units (also the state size).
      **kwargs: forwarded to ``tf.keras.Model``.
    """
    super().__init__(**kwargs)

    self.encoder_dim = encoder_dim

    self.embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=num_timesteps,
    )

    self.rnn = tf.keras.layers.GRU(
        encoder_dim,
        return_state=True,
        return_sequences=False,
    )

  def call(self,x,state):
    """Encode a batch of token-id sequences.

    Args:
      x: batch of integer token ids.
      state: initial GRU state (see ``init_state``).

    Returns:
      ``(output, final_state)`` as produced by the GRU.
    """
    embedded = self.embedding(x)
    last_output, final_state = self.rnn(embedded, initial_state=state)
    return last_output, final_state

  def init_state(self,batch_size):
    """Return an all-zero initial state of shape ``(batch_size, encoder_dim)``."""
    return tf.zeros(shape=(batch_size, self.encoder_dim))

In [189]:
class Decoder(tf.keras.Model):
  """Embedding + GRU + Dense decoder that emits per-step vocabulary logits.

  The GRU returns the full output sequence, and the dense layer projects each
  step to ``vocab_size`` scores (no activation is applied).
  """

  def __init__(self, vocab_size, embedding_dim, num_timesteps, decoder_dim, **kwargs):
    """Create the embedding, recurrent and projection layers.

    Note the parameter order differs from ``Encoder``: here ``embedding_dim``
    comes before ``num_timesteps``.

    Args:
      vocab_size: embedding rows and width of the output projection.
      embedding_dim: size of each token embedding.
      num_timesteps: forwarded to the embedding layer as ``input_length``.
      decoder_dim: number of GRU units (also the state size).
      **kwargs: forwarded to ``tf.keras.Model``.
    """
    super().__init__(**kwargs)

    self.decoder_dim = decoder_dim
    self.embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        input_length=num_timesteps,
    )

    self.rnn = tf.keras.layers.GRU(
        decoder_dim,
        return_state=True,
        return_sequences=True,
    )
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self,x,state):
    """Run the decoder over a batch of target-side token ids.

    Args:
      x: batch of integer token ids fed to the decoder.
      state: initial GRU state.

    Returns:
      ``(logits, final_state)``: the dense projection of every GRU step and
      the GRU's final state.
    """
    embedded = self.embedding(x)
    step_outputs, final_state = self.rnn(embedded, initial_state=state)
    logits = self.dense(step_outputs)
    return logits, final_state

In [190]:
# Model hyperparameters.
embedding_dim = 256
encoder_dim = 1024
decoder_dim = 1024

# Padded sequence lengths of the source and target sides.
maxlen_en = data_en.shape[1]
maxlen_fr = data_fr_out.shape[1]

# The vocabulary sizes are increased by one to leave room for id 0.
# Keyword arguments are used because Encoder and Decoder take
# num_timesteps/embedding_dim in a different positional order.
encoder = Encoder(
    vocab_size=vocab_size_en+1,
    num_timesteps=maxlen_en,
    embedding_dim=embedding_dim,
    encoder_dim=encoder_dim,
)
decoder = Decoder(
    vocab_size=vocab_size_fr+1,
    embedding_dim=embedding_dim,
    num_timesteps=maxlen_fr,
    decoder_dim=decoder_dim,
)

In [191]:
# Smoke test: push a single training batch through the untrained models and
# print the tensor shapes at each stage.
for encoder_in, decoder_in, decoder_out in train_dataset.take(1):
  encoder_state = encoder.init_state(batch_size)
  encoder_out, encoder_state = encoder(encoder_in, encoder_state)
  # The encoder's final state seeds the decoder.
  decoder_state = encoder_state
  decoder_pred, decoder_state = decoder(decoder_in, decoder_state)

print("encoder input :", encoder_in.shape)
print("encoder output :", encoder_out.shape, "state:",encoder_state.shape)
print(
    "decoder output (logits):", decoder_pred.shape, "state:",
    decoder_state.shape,
)
print("decoder output (labels):", decoder_out.shape)

encoder input : (64, 7)
encoder output : (64, 1024) state: (64, 1024)
decoder output (logits): (64, 15, 9836) state: (64, 1024)
decoder output (labels): (64, 15)


In [192]:
def loss_fn(ytrue,ypred):
  """Sparse categorical cross-entropy that gives label 0 a weight of zero.

  Args:
    ytrue: integer labels; positions equal to 0 are weighted 0.
    ypred: unnormalized scores (logits) for each label position.

  Returns:
    The loss value computed by ``SparseCategoricalCrossentropy`` with the
    0/1 mask passed as ``sample_weight``.
  """
  # 1 where the label is a real token, 0 where it equals 0.
  # NOTE(review): 0 is presumably the padding id - confirm against the caller.
  is_real_token = tf.math.not_equal(ytrue, 0)
  weights = tf.cast(is_real_token, dtype=tf.int64)
  scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  return scce(ytrue, ypred, sample_weight=weights)

In [193]:
@tf.function
def train_step(encoder_in, decoder_in, decoder_out, encoder_state):
  """Run one optimization step on a single batch.

  Uses the module-level ``encoder``, ``decoder`` and ``optimizer``.

  Args:
    encoder_in: batch of source token ids.
    decoder_in: batch of decoder input token ids.
    decoder_out: batch of decoder target token ids.
    encoder_state: initial encoder state.

  Returns:
    The loss for this batch.
  """
  with tf.GradientTape() as tape:
    # Only the encoder's final state is used; it initializes the decoder.
    _, context_state = encoder(encoder_in, encoder_state)
    logits, _ = decoder(decoder_in, context_state)
    loss = loss_fn(decoder_out, logits)

  trainable = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))

  return loss

In [194]:
def predict(encoder, decoder, sents_en,
            data_en, sents_fr_out, word2idx_fr, idx2word_fr, max_len=50):
  """Greedily translate one randomly chosen sentence and print the result.

  The sentence is encoded, then the decoder is run one token at a time,
  starting from ``BOS`` and feeding each argmax prediction back in, until it
  emits ``EOS``, predicts an id with no vocabulary entry, or ``max_len``
  tokens have been produced.

  Args:
    encoder: encoder model exposing ``init_state(batch_size)``.
    decoder: decoder model returning ``(logits, state)``.
    sents_en: tokenized English sentences (used for sampling and display).
    data_en: integer-encoded English sentences, parallel to ``sents_en``.
    sents_fr_out: tokenized French reference sentences (display only).
    word2idx_fr: French word -> id mapping; must contain ``'BOS'``.
    idx2word_fr: French id -> word mapping.
    max_len: upper bound on generated tokens. Guards against an endless loop
      when the model never produces ``EOS`` (e.g. early in training).

  Returns:
    None. The source, reference and predicted sentences are printed.
  """
  random_id = np.random.choice(len(sents_en))
  encoder_in = tf.expand_dims(data_en[random_id],axis=0)

  encoder_state = encoder.init_state(1)
  # Only the final encoder state is needed; it seeds the decoder.
  _, decoder_state = encoder(encoder_in,encoder_state)
  decoder_in = tf.expand_dims(tf.constant([word2idx_fr['BOS']]),axis=0)

  pred_sent_fr = []
  for _ in range(max_len):
    decoder_pred, decoder_state = decoder(decoder_in,decoder_state)
    decoder_pred = tf.argmax(decoder_pred,axis=-1)
    # .get() instead of [] because the decoder can output ids that have no
    # word, e.g. id 0, which Keras' Tokenizer never assigns to a token.
    pred_word = idx2word_fr.get(int(decoder_pred.numpy()[0][0]))
    if pred_word is None:
      break
    pred_sent_fr.append(pred_word)
    if pred_word == 'EOS':
      break
    # Feed the prediction back as the next decoder input.
    decoder_in = decoder_pred

  print('inglês: '," ".join(sents_en[random_id]))
  print('true: '," ".join(sents_fr_out[random_id]))
  print("predicted: ", " ".join(pred_sent_fr))

In [195]:
# Adam with its default hyperparameters.
optimizer = tf.keras.optimizers.Adam()

# Checkpoint object tracking both models and the optimizer state.
# NOTE(review): none of the visible cells call checkpoint.save(), so
# checkpoint_prefix looks unused here - confirm whether saving was intended.
checkpoint_prefix = os.path.join(".", "ckpt")
checkpoint = tf.train.Checkpoint(
    encoder=encoder,
    decoder=decoder,
    optimizer=optimizer,
)

In [None]:
num_epochs = 250
# NOTE(review): eval_scores is never appended to in this cell.
eval_scores = []

for e in range(num_epochs):
  # The same zero initial encoder state is reused for every batch of the epoch.
  encoder_state = encoder.init_state(batch_size)

  for batch,data in enumerate(train_dataset):
    encoder_in, decoder_in, decoder_out = data
    loss = train_step(encoder_in, decoder_in, decoder_out, encoder_state)

  # Reports the loss of the LAST batch of the epoch, not an epoch average.
  print("Epoch: {}, Loss: {:.4f}".format(e + 1, loss.numpy()))

  # Qualitative check: translate one random sentence after each epoch.
  predict(
      encoder=encoder,
      decoder=decoder,
      sents_en=sents_en,
      data_en=data_en,
      sents_fr_out=sents_fr_out,
      word2idx_fr=word2idx_fr,
      idx2word_fr=idx2word_fr,
  )

Epoch: 1, Loss: 1.3098
inglês:  i paid my taxes.
true:  j ai paye mes impots. EOS
predicted:  j aime les bandes EOS
Epoch: 2, Loss: 1.0246
inglês:  i haven t started.
true:  je n ai pas commence. EOS
predicted:  je ne suis pas si fatigue. EOS
Epoch: 3, Loss: 0.8095
inglês:  i built a shelter.
true:  j ai construit un abri. EOS
predicted:  j ai eu un arbre. EOS
Epoch: 4, Loss: 0.6207
inglês:  my name is tom.
true:  mon nom est tom. EOS
predicted:  j ai les mains froides. EOS
Epoch: 5, Loss: 0.4270
inglês:  is that a wig?
true:  est ce une perruque ? EOS
predicted:  est ce une urgence ? EOS
Epoch: 6, Loss: 0.3754
inglês:  tom seems worried.
true:  tom semble inquiet. EOS
predicted:  tom a l air d etre cote. EOS
Epoch: 7, Loss: 0.2970
inglês:  was anybody hurt?
true:  qui que ce soit a t il ete blesse ? EOS
predicted:  qui a ete tue ? EOS
Epoch: 8, Loss: 0.2752
inglês:  what woke you up?
true:  qu est ce qui vous a reveillees ? EOS
predicted:  qu est ce qui vous a reveille ? EOS
Epoch: 9,