In [0]:
import tensorflow as tf
import numpy as np
import unicodedata 
import re

In [0]:
# Toy parallel corpus: 20 (English, French) sentence pairs. The same pairs are
# used to build the training dataset and to pick sentences for predict().
raw_data = (
    ('What a ridiculous concept!', 'Quel concept ridicule !'),
    ('Your idea is not entirely crazy.', "Votre idée n'est pas complètement folle."),
    ("A man's worth lies in what he is.", "La valeur d'un homme réside dans ce qu'il est."),
    ('What he did is very wrong.', "Ce qu'il a fait est très mal."),
    ("All three of you need to do that.", "Vous avez besoin de faire cela, tous les trois."),
    ("Are you giving me another chance?", "Me donnez-vous une autre chance ?"),
    ("Both Tom and Mary work as models.", "Tom et Mary travaillent tous les deux comme mannequins."),
    ("Can I have a few minutes, please?", "Puis-je avoir quelques minutes, je vous prie ?"),
    ("Could you close the door, please?", "Pourriez-vous fermer la porte, s'il vous plaît ?"),
    ("Did you plant pumpkins this year?", "Cette année, avez-vous planté des citrouilles ?"),
    ("Do you ever study in the library?", "Est-ce que vous étudiez à la bibliothèque des fois ?"),
    ("Don't be deceived by appearances.", "Ne vous laissez pas abuser par les apparences."),
    ("Excuse me. Can you speak English?", "Je vous prie de m'excuser ! Savez-vous parler anglais ?"),
    ("Few people know the true meaning.", "Peu de gens savent ce que cela veut réellement dire."),
    ("Germany produced many scientists.", "L'Allemagne a produit beaucoup de scientifiques."),
    ("Guess whose birthday it is today.", "Devine de qui c'est l'anniversaire, aujourd'hui !"),
    ("He acted like he owned the place.", "Il s'est comporté comme s'il possédait l'endroit."),
    ("Honesty will pay in the long run.", "L'honnêteté paye à la longue."),
    ("How do we know this isn't a trap?", "Comment savez-vous qu'il ne s'agit pas d'un piège ?"),
    ("I can't believe you're giving up.", "Je n'arrive pas à croire que vous abandonniez."),
)

In [0]:
def unicode_to_ascii(s):
  """Remove combining marks (accents) from ``s``.

  The text is NFD-decomposed, so an accented letter becomes a base character
  followed by combining marks; every character whose Unicode category is
  ``Mn`` is then dropped. Characters that do not decompose are left as-is.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def normalize_string(s):
  """Clean a sentence so that it can be tokenised by splitting on spaces.

  Steps, in order: strip accents with ``unicode_to_ascii``; put a space on
  each side of every ``.``, ``!`` and ``?``; replace each run of characters
  other than ASCII letters and those three marks with one space; collapse
  whitespace runs. Leading/trailing spaces are NOT stripped.
  """
  cleaned = unicode_to_ascii(s)
  substitutions = (
      (r'([!.?])', r' \1 '),      # isolate sentence punctuation
      (r'[^a-zA-Z.!?]+', r' '),   # apostrophes, hyphens, digits, ... -> space
      (r'\s+', r' '),             # squeeze repeated whitespace
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

In [0]:
# Transpose the (en, fr) pairs into one tuple of English and one of French sentences.
raw_data_en, raw_data_fr = zip(*raw_data)

In [0]:
# Copy both sentence collections into plain lists.
raw_data_en, raw_data_fr = [*raw_data_en], [*raw_data_fr]

In [28]:
# Sanity check: show the normalised form of the first English sentence.
normalize_string(raw_data_en[0])

'What a ridiculous concept ! '

In [0]:
# Normalise the English source sentences.
raw_data_en = list(map(normalize_string, raw_data_en))

# The French side is prepared twice: the decoder input begins with '<start>',
# the decoder target ends with '<end>'.
raw_data_fr_in = ['<start> {}'.format(normalize_string(sentence)) for sentence in raw_data_fr]
raw_data_fr_out = ['{} <end>'.format(normalize_string(sentence)) for sentence in raw_data_fr]

In [35]:
# Show the first decoder-input / decoder-target pair to check the markers.
print('FR Input : ', raw_data_fr_in[0])
print('FR Output:', raw_data_fr_out[0])

FR Input :  <start> Quel concept ridicule ! 
FR Output: Quel concept ridicule !  <end>


In [0]:
# filters='' turns off the tokenizer's character filtering, so the '.', '!' and
# '?' that normalize_string split off are kept as tokens of their own.
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

# Build the English vocabulary from the normalised sentences.
en_tokenizer.fit_on_texts(raw_data_en)

In [0]:
# Map every normalised English sentence to its list of integer word ids.
data_en = en_tokenizer.texts_to_sequences(raw_data_en)

In [0]:
# padding='post' appends zeros at the end of the shorter sequences so that all
# rows have the same length.
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en, padding='post')

In [40]:
# Display the padded id matrix (one row per sentence, 0 = padding).
data_en

array([[ 8,  5, 21, 22, 23,  0,  0,  0,  0,  0],
       [24, 25,  6, 26, 27, 28,  1,  0,  0,  0],
       [ 5, 29, 30, 31, 32,  9,  8,  7,  6,  1],
       [ 8,  7, 13,  6, 33, 34,  1,  0,  0,  0],
       [35, 36, 37,  2, 38, 39, 10, 40,  1,  0],
       [41,  2, 14, 15, 42, 43,  3,  0,  0,  0],
       [44, 45, 46, 47, 48, 49, 50,  1,  0,  0],
       [11, 16, 51,  5, 17, 52, 18,  3,  0,  0],
       [53,  2, 54,  4, 55, 18,  3,  0,  0,  0],
       [13,  2, 56, 57, 19, 58,  3,  0,  0,  0],
       [10,  2, 59, 60,  9,  4, 61,  3,  0,  0],
       [62, 12, 63, 64, 65, 66,  1,  0,  0,  0],
       [67, 15,  1, 11,  2, 68, 69,  3,  0,  0],
       [17, 70, 20,  4, 71, 72,  1,  0,  0,  0],
       [73, 74, 75, 76,  1,  0,  0,  0,  0,  0],
       [77, 78, 79, 80,  6, 81,  1,  0,  0,  0],
       [ 7, 82, 83,  7, 84,  4, 85,  1,  0,  0],
       [86, 87, 88,  9,  4, 89, 90,  1,  0,  0],
       [91, 10, 92, 20, 19, 93, 12,  5, 94,  3],
       [16, 11, 12, 95,  2, 96, 14, 97,  1,  0]], dtype=int32)

In [0]:
fr_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

# The vocabulary must cover both French variants so that '<start>' and '<end>'
# each get an id; fitting once on the concatenated corpora accumulates the
# same counts as fitting on each corpus in turn.
fr_tokenizer.fit_on_texts(raw_data_fr_in + raw_data_fr_out)

# Decoder inputs ('<start> ...'), zero-padded at the end.
data_fr_in = tf.keras.preprocessing.sequence.pad_sequences(
    fr_tokenizer.texts_to_sequences(raw_data_fr_in), padding='post')

# Decoder targets ('... <end>'), zero-padded at the end.
data_fr_out = tf.keras.preprocessing.sequence.pad_sequences(
    fr_tokenizer.texts_to_sequences(raw_data_fr_out), padding='post')

In [0]:
# Yield (source, decoder input, decoder target) triples, shuffled with a buffer
# of 20 and grouped into batches of 5. The training loops build the encoder's
# initial state with BATCH_SIZE = 5, so the two values have to stay in sync.
dataset = tf.data.Dataset.from_tensor_slices((data_en, data_fr_in, data_fr_out)).shuffle(20).batch(5)

## Seq2Seq without Attention

In [0]:
class Encoder(tf.keras.Model):
  """Embedding followed by an LSTM, used to encode the source sentence."""

  def __init__(self, vocab_size, embedding_size, lstm_size):
    super().__init__()
    self.lstm_size = lstm_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    # Return the per-step outputs as well as the final (h, c) state.
    self.lstm = tf.keras.layers.LSTM(
        lstm_size, return_sequences=True, return_state=True)

  def call(self, sequence, states):
    """Encode ``sequence`` starting from the LSTM state ``states``.

    Returns the tuple ``(output, state_h, state_c)`` produced by the LSTM.
    """
    embedded = self.embedding(sequence)
    per_step_output, final_h, final_c = self.lstm(embedded, states)
    return per_step_output, final_h, final_c

  def init_states(self, batch_size):
    """Return an all-zero ``(state_h, state_c)`` pair for ``batch_size`` rows."""
    state_shape = [batch_size, self.lstm_size]
    return tf.zeros(state_shape), tf.zeros(state_shape)

In [0]:
class Decoder(tf.keras.Model):
  """Embedding + LSTM decoder that projects each step onto the vocabulary."""

  def __init__(self, vocab_size, embedding_size, lstm_size):
    super().__init__()
    self.lstm_size = lstm_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.lstm = tf.keras.layers.LSTM(
        lstm_size, return_sequences=True, return_state=True)

    # Linear projection to one logit per vocabulary entry (no softmax here).
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, sequence, state):
    """Decode ``sequence`` starting from the LSTM state ``state``.

    Returns ``(logits, state_h, state_c)`` where ``logits`` is the dense
    projection of the LSTM's per-step outputs.
    """
    hidden_seq, next_h, next_c = self.lstm(self.embedding(sequence), state)
    return self.dense(hidden_seq), next_h, next_c

In [0]:
# Hyper-parameters used for the encoder and for every decoder in this notebook.
EMBEDDING_SIZE = 32
LSTM_SIZE = 64

# One extra slot on top of the tokenizer vocabulary: id 0 is the padding id.
en_vocab_size = 1 + len(en_tokenizer.word_index)
fr_vocab_size = 1 + len(fr_tokenizer.word_index)

encoder = Encoder(en_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
decoder = Decoder(fr_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

# Smoke test: push one hand-written, zero-padded id sequence through each
# model; the resulting shapes are printed in the next cell.
source_input = tf.constant([[1, 3, 5, 7, 2, 0, 0, 0]])
target_input = tf.constant([[1, 4, 6, 9, 2, 0, 0]])
initial_state = encoder.init_states(1)

encoder_output, en_state_h, en_state_c = encoder(source_input, initial_state)

# The decoder starts from the encoder's final (h, c) state.
decoder_output, de_state_h, de_state_c = decoder(target_input, (en_state_h, en_state_c))

In [72]:
# Encoder side of the smoke test: input ids, per-step outputs and final state.
print('Source sequences', source_input.shape)
print('Encoder outputs', encoder_output.shape)
print('Encoder state_h', en_state_h.shape)
print('Encoder state_c', en_state_c.shape)

# Decoder side: the last axis of the decoder outputs should equal the French
# vocabulary size.
print('\nDestination vocab size', fr_vocab_size)
print('Destination sequences', target_input.shape)
print('Decoder outputs', decoder_output.shape)
print('Decoder state_h', de_state_h.shape)
print('Decoder state_c', de_state_c.shape)

Source sequences (1, 8)
Encoder outputs (1, 8, 64)
Encoder state_h (1, 64)
Encoder state_c (1, 64)

Destination vocab size 110
Destination sequences (1, 7)
Decoder outputs (1, 7, 110)
Decoder state_h (1, 64)
Decoder state_c (1, 64)


In [0]:
def loss_func(targets, logits):
  """Sparse categorical cross-entropy that gives zero weight to padding.

  Positions where ``targets`` equals 0 receive sample weight 0, all other
  positions weight 1. ``logits`` are unnormalised scores
  (``from_logits=True``).
  """
  # 1 where the target is a real token, 0 where it is the padding id 0.
  non_pad = tf.cast(tf.math.not_equal(targets, 0), dtype=tf.int64)

  scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  return scce(targets, logits, sample_weight=non_pad)

In [0]:
# Adam with its default settings; both train_step definitions in this notebook
# apply their gradients through this one optimizer object.
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
  """Run one teacher-forced optimisation step and return the batch loss.

  The encoder's final (state_h, state_c) initialise the decoder, which is fed
  the whole ``target_seq_in`` in a single call; its logits are compared with
  ``target_seq_out`` by ``loss_func``. Encoder and decoder weights are updated
  through the module-level ``optimizer``.
  """
  with tf.GradientTape() as tape:
    # The encoder returns (output, state_h, state_c); only the states are used.
    _, en_state_h, en_state_c = encoder(source_seq, en_initial_states)

    logits, _, _ = decoder(target_seq_in, (en_state_h, en_state_c))
    loss = loss_func(target_seq_out, logits)

  trainable = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))

  return loss

In [0]:
def predict(test_source_text=None):
  """Greedily translate one English sentence and print the result.

  Args:
    test_source_text: English sentence to translate; it is handed to
      ``en_tokenizer`` as-is. When omitted, a random sentence from
      ``raw_data_en`` is used.

  Prints the source text, its id sequence and the decoded French words.
  Decoding stops at '<end>' or after 20 words.
  """
  if test_source_text is None:
    test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
  print(test_source_text)

  test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])
  print(test_source_seq)

  en_initial_states = encoder.init_states(1)
  en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)

  # Decoding starts from the '<start>' token and the encoder's final state.
  de_input = tf.constant([[fr_tokenizer.word_index['<start>']]])
  de_state_h, de_state_c = en_outputs[1:]

  out_words = []
  while True:
    de_output, de_state_h, de_state_c = decoder(de_input, (de_state_h, de_state_c))

    # Greedy choice; the chosen id is also the next decoder input.
    de_input = tf.argmax(de_output, -1)
    token_id = int(de_input.numpy()[0][0])

    # The decoder has one logit more than the tokenizer has words (id 0 is
    # the padding id), so the argmax can be an id without an index_word
    # entry. Show a placeholder instead of raising KeyError.
    out_words.append(fr_tokenizer.index_word.get(token_id, '<pad>'))

    if out_words[-1] == '<end>' or len(out_words) >= 20:
      break

  print(' '.join(out_words))
  print()

In [89]:
NUM_EPOCHS = 250
BATCH_SIZE = 5  # must equal the batch size the dataset was built with

for e in range(NUM_EPOCHS):
  # Zero initial encoder state, reused for every batch of the epoch.
  en_initial_states = encoder.init_states(BATCH_SIZE)

  # Iterating the dataset directly visits every batch (take(-1) was a no-op).
  for source_seq, target_seq_in, target_seq_out in dataset:
    loss = train_step(source_seq, target_seq_in, target_seq_out, en_initial_states)

  # Loss of the last batch of the epoch.
  print('Epoch {} Loss {:.4f}'.format(e+1, loss.numpy()))

  # The sample translation is best-effort: a failure must not stop training,
  # but it is reported instead of being silently discarded.
  try:
    predict()
  except Exception as err:
    print('Prediction failed: {!r}'.format(err))

Epoch 1 Loss 0.0212
Are you giving me another chance ? 
[[41, 2, 14, 15, 42, 43, 3]]
me donnez vous une autre chance ? <end>

Epoch 2 Loss 0.0185
He acted like he owned the place . 
[[7, 82, 83, 7, 84, 4, 85, 1]]
il s est comporte comme s il possedait l endroit . <end>

Epoch 3 Loss 0.0174
Could you close the door please ? 
[[53, 2, 54, 4, 55, 18, 3]]
pourriez vous fermer la porte s il <end>

Epoch 4 Loss 0.0206
Did you plant pumpkins this year ? 
[[13, 2, 56, 57, 19, 58, 3]]
cette annee avez vous plante des citrouilles ? <end>

Epoch 5 Loss 0.0115
Do you ever study in the library ? 
[[10, 2, 59, 60, 9, 4, 61, 3]]
est ce que vous etudiez a la bibliotheque des fois ? <end>

Epoch 6 Loss 0.0163
I can t believe you re giving up . 
[[16, 11, 12, 95, 2, 96, 14, 97, 1]]
je n arrive pas a croire que vous abandonniez . <end>

Epoch 7 Loss 0.0188
Guess whose birthday it is today . 
[[77, 78, 79, 80, 6, 81, 1]]
devine de qui c est l anniversaire aujourd hui ! <end>

Epoch 8 Loss 0.0143
Do you ev

## Seq2Seq with Attention

In [0]:
class LuongAttention(tf.keras.Model):
  """Attention that scores decoder outputs against projected encoder outputs."""

  def __init__(self, rnn_size):
    super().__init__()
    # Learned projection applied to the encoder outputs before scoring.
    self.wa = tf.keras.layers.Dense(rnn_size)

  def call(self, decoder_output, encoder_output):
    """Return ``(context, alignment)``.

    score     = decoder_output @ wa(encoder_output)^T
    alignment = softmax(score) along axis 2
    context   = alignment @ encoder_output
    """
    projected = self.wa(encoder_output)
    score = tf.matmul(decoder_output, projected, transpose_b=True)

    alignment = tf.nn.softmax(score, axis=2)
    return tf.matmul(alignment, encoder_output), alignment

In [0]:
class Decoder(tf.keras.Model):
  """LSTM decoder that attends over the encoder outputs.

  NOTE: this definition reuses the name ``Decoder`` and therefore shadows the
  attention-free decoder defined earlier in the notebook.
  """

  def __init__(self, vocab_size, embedding_size, rnn_size):
    super().__init__()
    self.attention = LuongAttention(rnn_size)
    self.rnn_size = rnn_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.lstm = tf.keras.layers.LSTM(
        rnn_size, return_sequences=True, return_state=True)

    # wc mixes [context; lstm output] through tanh, ws maps it to vocab logits.
    self.wc = tf.keras.layers.Dense(rnn_size, activation='tanh')
    self.ws = tf.keras.layers.Dense(vocab_size)

  def call(self, sequence, state, encoder_output):
    """Decode one time step.

    Axis 1 of both the context and the LSTM output is squeezed away, so
    ``sequence`` has to contain exactly one step per row.

    Returns ``(logits, state_h, state_c, alignment)``.
    """
    embedded = self.embedding(sequence)
    step_out, next_h, next_c = self.lstm(embedded, initial_state=state)

    context, alignment = self.attention(step_out, encoder_output)
    merged = tf.concat([tf.squeeze(context, 1), tf.squeeze(step_out, 1)], 1)
    logits = self.ws(self.wc(merged))

    return logits, next_h, next_c, alignment

In [0]:
# Rebind `decoder` to a new instance of the attention decoder. `encoder` is
# not re-created in this cell, so it keeps whatever weights it already has.
decoder = Decoder(fr_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

In [0]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
  """Run one teacher-forced optimisation step with the attention decoder.

  The decoder is called once per target position with a single-step input,
  carrying its (state_h, state_c) forward and attending over the encoder
  outputs each time. The per-step losses are summed for the gradient; the
  returned value is that sum divided by the number of target positions.
  """
  loss = 0

  # Only the forward pass needs to be recorded by the tape.
  with tf.GradientTape() as tape:
    en_outputs = encoder(source_seq, en_initial_states)
    en_states = en_outputs[1:]

    de_state_h, de_state_c = en_states

    for i in range(target_seq_out.shape[1]):
      # Column i of the decoder input, restored to two dimensions.
      decoder_in = tf.expand_dims(target_seq_in[:, i], 1)
      logits, de_state_h, de_state_c, _ = decoder(decoder_in, (de_state_h, de_state_c), en_outputs[0])

      loss += loss_func(target_seq_out[:, i], logits)

  # Gradient and update happen after leaving the tape context, as in the
  # attention-free train_step.
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss / target_seq_out.shape[1]

In [0]:
def predict(test_source_text=None):
    """Greedily translate one sentence with the attention decoder.

    Args:
        test_source_text: English sentence to translate; it is handed to
            ``en_tokenizer`` as-is. When omitted, a random sentence from
            ``raw_data_en`` is used.

    Returns:
        A tuple ``(alignments, source_words, out_words)``: the attention
        alignments of every decoding step stacked into one array, the source
        text split on single spaces, and the decoded French words.
        Decoding stops at '<end>' or after 20 words.
    """
    if test_source_text is None:
        test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
    print(test_source_text)
    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])
    print(test_source_seq)

    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)

    # Decoding starts from the '<start>' token and the encoder's final state.
    de_input = tf.constant([[fr_tokenizer.word_index['<start>']]])
    de_state_h, de_state_c = en_outputs[1:]
    out_words = []
    alignments = []

    while True:
        de_output, de_state_h, de_state_c, alignment = decoder(
            de_input, (de_state_h, de_state_c), en_outputs[0])

        # Greedy choice, reshaped so it can be fed back as the next input.
        de_input = tf.expand_dims(tf.argmax(de_output, -1), 0)
        token_id = int(de_input.numpy()[0][0])

        # The decoder has one logit more than the tokenizer has words (id 0
        # is the padding id), so the argmax can be an id without an
        # index_word entry. Use a placeholder instead of raising KeyError.
        out_words.append(fr_tokenizer.index_word.get(token_id, '<pad>'))

        alignments.append(alignment.numpy())

        if out_words[-1] == '<end>' or len(out_words) >= 20:
            break

    print(' '.join(out_words))
    return np.array(alignments), test_source_text.split(' '), out_words

In [101]:
NUM_EPOCHS = 250
BATCH_SIZE = 5  # must equal the batch size the dataset was built with

for e in range(NUM_EPOCHS):
  # Zero initial encoder state, reused for every batch of the epoch.
  en_initial_states = encoder.init_states(BATCH_SIZE)

  # Iterating the dataset directly visits every batch (take(-1) was a no-op).
  for source_seq, target_seq_in, target_seq_out in dataset:
    loss = train_step(source_seq, target_seq_in, target_seq_out, en_initial_states)

  # Loss of the last batch of the epoch.
  print('Epoch {} Loss {:.4f}'.format(e+1, loss.numpy()))

  # The sample translation is best-effort: a failure must not stop training,
  # but it is reported instead of being silently discarded.
  try:
    predict()
  except Exception as err:
    print('Prediction failed: {!r}'.format(err))

Epoch 1 Loss 3.8407
He acted like he owned the place . 
[[7, 82, 83, 7, 84, 4, 85, 1]]
<end>
Epoch 2 Loss 3.6584
Few people know the true meaning . 
[[17, 70, 20, 4, 71, 72, 1]]
vous vous vous vous <end>
Epoch 3 Loss 3.0123
Germany produced many scientists . 
[[73, 74, 75, 76, 1]]
l . <end>
Epoch 4 Loss 2.5829
Don t be deceived by appearances . 
[[62, 12, 63, 64, 65, 66, 1]]
vous pas pas pas . <end>
Epoch 5 Loss 2.7396
Don t be deceived by appearances . 
[[62, 12, 63, 64, 65, 66, 1]]
pas pas pas pas ? <end>
Epoch 6 Loss 2.9235
He acted like he owned the place . 
[[7, 82, 83, 7, 84, 4, 85, 1]]
il il il il il il il <end>
Epoch 7 Loss 2.2152
Do you ever study in the library ? 
[[10, 2, 59, 60, 9, 4, 61, 3]]
est il il il il il il il <end>
Epoch 8 Loss 2.1821
Honesty will pay in the long run . 
[[86, 87, 88, 9, 4, 89, 90, 1]]
honnetete honnetete paye a . <end>
Epoch 9 Loss 1.8679
Excuse me . Can you speak English ? 
[[67, 15, 1, 11, 2, 68, 69, 3]]
vous vous vous vous vous vous vous ? ? <end