In [0]:
import tensorflow as tf
import numpy as np
import unicodedata 
import re
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [0]:
path_to_zip = tf.keras.utils.get_file('spa-eng.zip', origin = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', extract=True)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [0]:
path_to_file = os.path.dirname(path_to_zip) + '/spa-eng/spa.txt'

## Data Preprocess

In [0]:
data = pd.read_csv(path_to_file, sep='\t', header=None)
data.columns = ['eng', 'spa']

In [0]:
data.head()

Unnamed: 0,eng,spa
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [0]:
def unicode_to_ascii(word):
  return ''.join(char for char in unicodedata.normalize('NFD', word) if unicodedata.category(char) != 'Mn')

In [0]:
def preprocess_text(word):
  word = unicode_to_ascii(word)
  word = re.sub(r'([?.!,¿])', r' \1 ', word)
  word = re.sub(r'\s+', r' ', word)
  word = re.sub(r'[^a-zA-Z?.!,¿]+', r' ', word)
  word = word.strip()
  return word

In [0]:
data['eng'] = data['eng'].apply(preprocess_text)

data['spa'] = data['spa'].apply(preprocess_text)

In [0]:
data.head()

Unnamed: 0,eng,spa
0,Go .,Ve .
1,Go .,Vete .
2,Go .,Vaya .
3,Go .,Vayase .
4,Hi .,Hola .


In [0]:
def create_inp_tar(x, mode):
  if mode == 'input':
    input_word = '<start> ' + x
    return input_word
  else:
    target_word = x + ' <end>'
    return target_word

In [0]:
data['spa_inp'] = data['spa'].apply(create_inp_tar, mode='input')
data['spa_tar'] = data['spa'].apply(create_inp_tar, mode='target')

In [0]:
data.head()

Unnamed: 0,eng,spa,spa_inp,spa_tar
0,Go .,Ve .,<start> Ve .,Ve . <end>
1,Go .,Vete .,<start> Vete .,Vete . <end>
2,Go .,Vaya .,<start> Vaya .,Vaya . <end>
3,Go .,Vayase .,<start> Vayase .,Vayase . <end>
4,Hi .,Hola .,<start> Hola .,Hola . <end>


In [0]:
# split train & validation

In [0]:
train, valid = train_test_split(data, test_size=0.2, random_state=1228)

In [0]:
train.shape

(95171, 4)

In [0]:
valid.shape

(23793, 4)

In [0]:
train.head()

Unnamed: 0,eng,spa,spa_inp,spa_tar
34713,You have butterfingers .,Tienes las manos aguadas .,<start> Tienes las manos aguadas .,Tienes las manos aguadas . <end>
3291,That s weird .,Es raro .,<start> Es raro .,Es raro . <end>
26436,You can use this pen .,Puedes usar este lapiz .,<start> Puedes usar este lapiz .,Puedes usar este lapiz . <end>
25476,Tom broke Mary s mug .,Tomas le rompio la taza a Maria .,<start> Tomas le rompio la taza a Maria .,Tomas le rompio la taza a Maria . <end>
75187,You don t need to worry about it .,No necesitas preocuparte por eso .,<start> No necesitas preocuparte por eso .,No necesitas preocuparte por eso . <end>


In [0]:
# English

tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters='')
tokenizer_en.fit_on_texts(train['eng'])


# Spa 
tokenizer_spa = tf.keras.preprocessing.text.Tokenizer(filters='')
tokenizer_spa.fit_on_texts(train['spa_inp'])
tokenizer_spa.fit_on_texts(train['spa_tar'])


In [0]:
# Convert to array

eng_data = tokenizer_en.texts_to_sequences(train['eng'])

spa_inp = tokenizer_spa.texts_to_sequences(train['spa_inp'])
spa_tar = tokenizer_spa.texts_to_sequences(train['spa_tar'])

In [0]:
# Pad Sequence

eng_data = tf.keras.preprocessing.sequence.pad_sequences(eng_data, padding='post')

spa_inp = tf.keras.preprocessing.sequence.pad_sequences(spa_inp, padding='post')
spa_tar = tf.keras.preprocessing.sequence.pad_sequences(spa_tar, padding='post')

In [0]:
# Prepare Validation data

# Convert to array
eng_data_val = tokenizer_en.texts_to_sequences(valid['eng'])

spa_inp_val = tokenizer_spa.texts_to_sequences(valid['spa_inp'])
spa_tar_val = tokenizer_spa.texts_to_sequences(valid['spa_tar'])

# Pad Sequence
eng_data_val = tf.keras.preprocessing.sequence.pad_sequences(eng_data_val, padding='post')

spa_inp_val = tf.keras.preprocessing.sequence.pad_sequences(spa_inp_val, padding='post')
spa_tar_val = tf.keras.preprocessing.sequence.pad_sequences(spa_tar_val, padding='post')

In [0]:
eng_data.shape

(95171, 49)

In [0]:
spa_inp.shape

(95171, 52)

In [0]:
spa_tar.shape

(95171, 52)

In [0]:
train_dataset = tf.data.Dataset.from_tensor_slices((eng_data, spa_inp, spa_tar))
train_dataset = train_dataset.shuffle(20000).batch(36).prefetch(tf.data.experimental.AUTOTUNE)

In [0]:
valid_dataset = tf.data.Dataset.from_tensor_slices((eng_data_val, spa_inp_val, spa_tar_val)).batch(36)

## Model

In [0]:
EMBEDDING_DIM = 256
RNN_SIZE = 512
BATCH_SIZE = 32
NUM_EPOCHS = 15

In [0]:
# Enocder Build

class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_size):
    super(Encoder, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.rnn_size = rnn_size

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(rnn_size, return_sequences=True, return_state=True)

  def call(self, sequence, states):
    embed = self.embedding(sequence)
    output, state_h, state_c = self.lstm(embed, initial_state=states)

    return output, state_h, state_c

  def init_states(self, batch_size):
    return (tf.zeros([batch_size, self.rnn_size]),
            tf.zeros([batch_size, self.rnn_size]))

In [0]:
# Define Encoder

en_vocab_size = len(tokenizer_en.word_index) + 1

encoder = Encoder(en_vocab_size, EMBEDDING_DIM, RNN_SIZE)

In [0]:
tmp_input = tf.constant([[1,2,3,4,0,0,0]])
tmp_state = encoder.init_states(1)

test_enc_result = encoder(tmp_input, tmp_state)

test_enc_result

(<tf.Tensor: shape=(1, 7, 512), dtype=float32, numpy=
 array([[[-0.00614513, -0.00414992, -0.00221396, ..., -0.00130516,
          -0.00183235,  0.00191142],
         [-0.0080321 ,  0.00216335,  0.00371047, ...,  0.0026473 ,
          -0.0067199 , -0.00182101],
         [-0.01260014, -0.00033205,  0.00433422, ..., -0.00069088,
          -0.00261538,  0.00041561],
         ...,
         [-0.01252446,  0.00583516, -0.00823903, ...,  0.00520807,
          -0.00517339,  0.00315527],
         [-0.0149942 ,  0.01029188, -0.01603903, ...,  0.00770738,
          -0.0039962 ,  0.00605606],
         [-0.01674292,  0.01418889, -0.02155618, ...,  0.00977241,
          -0.00254354,  0.007801  ]]], dtype=float32)>,
 <tf.Tensor: shape=(1, 512), dtype=float32, numpy=
 array([[-1.67429186e-02,  1.41888913e-02, -2.15561762e-02,
          1.20317480e-02,  7.08915992e-03, -3.37243988e-03,
         -2.85808626e-03,  6.40990119e-03,  2.30774898e-02,
          1.59785186e-03,  8.51194840e-03, -4.58410848e-03

In [0]:
class BahdunauAttention(tf.keras.layers.Layer):
  def __init__(self, rnn_units):
    super(BahdunauAttention, self).__init__()
    self.rnn_units = rnn_units
    self.w1 = tf.keras.layers.Dense(rnn_units)
    self.w2 = tf.keras.layers.Dense(rnn_units)
    self.v = tf.keras.layers.Dense(1)

  def call(self, enc_output, dec_output):
    dec_output_expand = tf.expand_dims(dec_output, 1)

    score = self.v(tf.nn.tanh(self.w1(enc_output) + self.w2(dec_output_expand)))

    alignment = tf.nn.softmax(score, axis=1)

    context = alignment * enc_output
    context = tf.reduce_sum(context, axis=1)
    return context, alignment

In [0]:
class LuongAttention(tf.keras.layers.Layer):
  def __init__(self, rnn_units, score_type):
    super(LuongAttention, self).__init__()
    self.rnn_units = rnn_units
    self.score_type = score_type

    if self.score_type not in ['dot', 'scaled_dot', 'general', 'concat']:
      raise ValueError('The score type should be [dot, scaled_dot, general or concat]')

    if self.score_type == 'general':
      self.w = tf.keras.layers.Dense(rnn_units, use_bias=False)
    
    elif self.score_type == 'concat':
      self.w1 = tf.keras.layers.Dense(rnn_units, use_bias=False)
      self.w2 = tf.keras.layers.Dense(rnn_units, use_bias=False)
      self.v = tf.keras.layers.Dense(1, use_bias=False)
  
  def call(self, enc_output, dec_state):
    dec_state_expand = tf.expand_dims(dec_state, axis=1)                          # b, 1, rnn_unit
    print(dec_state_expand)

    if self.score_type == 'dot':
      score = tf.keras.layers.Dot([2,2])([enc_output, dec_state_expand])

    elif self.score_type == 'scaled_dot':
      score = tf.keras.layers.Dot([2,2])([enc_output, dec_state_expand])
      score /= tf.math.sqrt(tf.cast(self.rnn_units, tf.float32))
    
    elif self.score_type == 'general':
      weighted_score = self.w(enc_output)                                         # b, seq_len, rnn_unit
      # score = tf.matmul(dec_state_expand, weighted_score, transpose_a=True)
      score = tf.keras.layers.Dot([2,2])([weighted_score, dec_state_expand])

    elif self.score_type == 'concat':
      score = self.v(tf.nn.tanh(self.w1(enc_output) + self.w2(dec_state_expand)))
    
    alignment = tf.nn.softmax(score, axis=1)

    context = enc_output * alignment

    return context, alignment

In [0]:
temp_encoder_output = tf.random.normal(shape=[1, 7, 10])

In [110]:
temp_encoder_output

<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
array([[[-1.5333682 ,  0.9687358 ,  0.51973623,  1.0118471 ,
         -0.56730044,  0.5504088 ,  0.20109397, -1.4487932 ,
          0.94162154, -0.28442284],
        [ 0.04542429, -1.5500308 ,  0.4460409 , -1.0035542 ,
          0.2564453 ,  0.03552574,  0.7308755 , -0.14459497,
          0.7672014 , -0.06879083],
        [ 1.3462104 , -0.85520595, -0.45659062, -0.89591306,
         -0.32333207,  0.5324416 , -0.49169323, -0.6201448 ,
         -0.5825952 , -1.1989889 ],
        [ 1.2993677 ,  0.4300618 ,  1.0791647 , -0.65238297,
         -0.94279146, -0.59405845,  0.8469603 , -1.1827168 ,
         -1.0424469 , -0.12601969],
        [ 0.5149404 ,  0.05975062, -0.17946713, -1.5837878 ,
          0.54801875, -0.13831331, -0.1036338 ,  0.43188828,
          0.36808008,  0.02365669],
        [-0.6780803 ,  1.3068374 , -1.6513115 ,  1.3070569 ,
         -0.40296808, -0.40382203, -0.89380467, -2.1603436 ,
          0.09640348, -1.7490631 ],


In [111]:
tf.nn.softmax(temp_encoder_output, axis=1)

<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
array([[[0.01854449, 0.26352257, 0.18362737, 0.2741743 , 0.0916503 ,
         0.25623035, 0.15278368, 0.04623095, 0.26479352, 0.13120048],
        [0.08992404, 0.02122909, 0.17058152, 0.03653836, 0.20887293,
         0.1531157 , 0.25951242, 0.17034876, 0.2224117 , 0.16277331],
        [0.33021745, 0.04252947, 0.069171  , 0.04069087, 0.11697366,
         0.25166774, 0.07641934, 0.10587911, 0.05766979, 0.05257078],
        [0.3151059 , 0.15377109, 0.32128772, 0.05191117, 0.06295936,
         0.08158208, 0.2914561 , 0.06032375, 0.0364114 , 0.15371954],
        [0.14380834, 0.10618192, 0.09125932, 0.02045299, 0.2795831 ,
         0.12868336, 0.11265108, 0.30318135, 0.14921808, 0.17853881],
        [0.0436176 , 0.3695332 , 0.02094418, 0.36832798, 0.1080196 ,
         0.09867643, 0.05111737, 0.02269397, 0.1137192 , 0.03032845],
        [0.05878223, 0.04323263, 0.14312884, 0.20790437, 0.13194104,
         0.03004436, 0.05605993, 0.2913421 , 

In [112]:
tf.nn.softmax(temp_encoder_output, axis=-1)

<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
array([[[0.01503061, 0.18349595, 0.1171193 , 0.19157971, 0.0394943 ,
         0.1207673 , 0.08516162, 0.01635712, 0.17858745, 0.05240667],
        [0.09063957, 0.01838318, 0.13530175, 0.03175063, 0.11193428,
         0.0897468 , 0.1798894 , 0.07495378, 0.18654418, 0.08085649],
        [0.39430162, 0.04362803, 0.06499531, 0.04188773, 0.0742601 ,
         0.17474844, 0.06275339, 0.05518886, 0.05730057, 0.03093588],
        [0.27198747, 0.1140287 , 0.21823077, 0.03862908, 0.02889293,
         0.0409491 , 0.17300987, 0.02272968, 0.02615241, 0.06538996],
        [0.1482676 , 0.09405023, 0.07404043, 0.01817943, 0.15325408,
         0.07715105, 0.07987354, 0.13645115, 0.12801641, 0.09071612],
        [0.0452206 , 0.32913584, 0.01708706, 0.3292081 , 0.059541  ,
         0.05949017, 0.03644589, 0.01027064, 0.09810483, 0.01549582],
        [0.06430531, 0.04063119, 0.12321325, 0.19607651, 0.07673959,
         0.01911265, 0.04217535, 0.13912864, 

In [0]:
temp_encoder_output = tf.random.normal(shape=[1, 7, 10])
temp_decoder_output = tf.random.normal(shape=[1, 10])

In [129]:
temp_att = BahdunauAttention(10)

temp_att(temp_encoder_output, temp_decoder_output)

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[-0.5855036 ,  0.38170958, -0.9046085 ,  0.06162816,  0.05167503,
          0.15797682,  0.25347024, -0.00974671,  0.08748729, -0.10125525]],
       dtype=float32)>, <tf.Tensor: shape=(1, 7, 1), dtype=float32, numpy=
 array([[[0.0598629 ],
         [0.13104557],
         [0.05593685],
         [0.19833282],
         [0.20575988],
         [0.19707207],
         [0.15198989]]], dtype=float32)>)

In [134]:
temp_att = LuongAttention(10, 'dot')

temp_att(temp_encoder_output, temp_decoder_output)

tf.Tensor(
[[[ 1.7199415  -1.2444223  -0.04586986  0.14021657  1.1259062
   -0.06523275 -1.1580428  -1.2161112   0.25881335 -0.6251855 ]]], shape=(1, 1, 10), dtype=float32)


(<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
 array([[[ 3.0800587e-01,  1.7826103e-01,  8.0980396e-01,  8.7565696e-01,
          -2.4563229e-02, -8.0329192e-01,  1.6416353e-01, -6.2656856e-01,
           3.0390871e-01, -5.1775593e-01],
         [ 1.9294964e-01,  4.5962423e-01, -1.9713746e-01, -3.2736632e-01,
           3.6716688e-02, -1.6814189e-01, -4.3211734e-01,  1.3663816e-01,
           2.9262275e-01, -2.5147009e-01],
         [-5.0168834e-04,  1.3870597e-04,  2.3442586e-05, -1.8393168e-04,
           2.6076325e-04, -4.0779795e-04,  3.7179794e-04,  1.2460988e-04,
          -1.1472677e-04, -1.6985616e-05],
         [-3.7481950e-03,  3.3612747e-03, -2.2990910e-02,  8.1233755e-03,
           7.6570306e-03,  9.4703706e-03, -3.7942128e-03,  1.3686546e-02,
          -1.5358163e-02,  8.1786364e-03],
         [-3.3824854e-02, -8.4468648e-03, -1.9113198e-02,  3.4687147e-03,
          -1.3516012e-02,  2.0175783e-03,  5.3623375e-03, -2.4989169e-02,
           3.4980271e-02,  6.981321

In [131]:
temp_att = LuongAttention(10, 'scaled_dot')

temp_att(temp_encoder_output, temp_decoder_output)

tf.Tensor(
[[[ 1.7199415  -1.2444223  -0.04586986  0.14021657  1.1259062
   -0.06523275 -1.1580428  -1.2161112   0.25881335 -0.6251855 ]]], shape=(1, 1, 10), dtype=float32)


(<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
 array([[[ 1.56400844e-01,  9.05183181e-02,  4.11206514e-01,
           4.44645733e-01, -1.24728465e-02, -4.07899797e-01,
           8.33598226e-02, -3.18162292e-01,  1.54320374e-01,
          -2.62908816e-01],
         [ 1.77821651e-01,  4.23587918e-01, -1.81681126e-01,
          -3.01699549e-01,  3.38379592e-02, -1.54958904e-01,
          -3.98237705e-01,  1.25925213e-01,  2.69679993e-01,
          -2.31753856e-01],
         [-5.88358194e-02,  1.62668303e-02,  2.74924422e-03,
          -2.15707049e-02,  3.05811781e-02, -4.78247665e-02,
           4.36028428e-02,  1.46137048e-02, -1.34546552e-02,
          -1.99199910e-03],
         [-3.11198067e-02,  2.79073566e-02, -1.90884590e-01,
           6.74452260e-02,  6.35733530e-02,  7.86288008e-02,
          -3.15018743e-02,  1.13634072e-01, -1.27512857e-01,
           6.79040402e-02],
         [-1.60336852e-01, -4.00399007e-02, -9.06005427e-02,
           1.64424311e-02, -6.40687123e-02

In [132]:
temp_att = LuongAttention(10, 'general')

temp_att(temp_encoder_output, temp_decoder_output)

tf.Tensor(
[[[ 1.7199415  -1.2444223  -0.04586986  0.14021657  1.1259062
   -0.06523275 -1.1580428  -1.2161112   0.25881335 -0.6251855 ]]], shape=(1, 1, 10), dtype=float32)


(<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
 array([[[ 3.03207338e-03,  1.75483827e-03,  7.97187723e-03,
           8.62014852e-03, -2.41805494e-04, -7.90777151e-03,
           1.61605969e-03, -6.16807025e-03,  2.99174013e-03,
          -5.09689609e-03],
         [ 6.38376415e-01,  1.52067280e+00, -6.52231872e-01,
          -1.08309579e+00,  1.21477656e-01, -5.56299627e-01,
          -1.42966604e+00,  4.52069163e-01,  9.68146265e-01,
          -8.31992090e-01],
         [-8.61470867e-03,  2.38178065e-03,  4.02542850e-04,
          -3.15837096e-03,  4.47767973e-03, -7.00247660e-03,
           6.38430426e-03,  2.13973061e-03, -1.97002338e-03,
          -2.91667413e-04],
         [-1.09540168e-02,  9.82325058e-03, -6.71904236e-02,
           2.37403847e-02,  2.23775022e-02,  2.76769474e-02,
          -1.10885035e-02,  3.99986282e-02, -4.48838919e-02,
           2.39018854e-02],
         [-3.11580058e-02, -7.78088998e-03, -1.76062603e-02,
           3.19523131e-03, -1.24503709e-02

In [133]:
temp_att = LuongAttention(10, 'concat')

temp_att(temp_encoder_output, temp_decoder_output)

tf.Tensor(
[[[ 1.7199415  -1.2444223  -0.04586986  0.14021657  1.1259062
   -0.06523275 -1.1580428  -1.2161112   0.25881335 -0.6251855 ]]], shape=(1, 1, 10), dtype=float32)


(<tf.Tensor: shape=(1, 7, 10), dtype=float32, numpy=
 array([[[ 0.05394077,  0.03121868,  0.14182016,  0.15335295,
          -0.00430173, -0.14067972,  0.0287498 , -0.10973033,
           0.05322323, -0.09067408],
         [ 0.03503969,  0.08346786, -0.0358002 , -0.05944979,
           0.00666776, -0.0305346 , -0.0784726 ,  0.02481352,
           0.05314035, -0.04566702],
         [-0.38080466,  0.10528424,  0.01779401, -0.13961266,
           0.19793138, -0.30953753,  0.28221184,  0.09458467,
          -0.08708292, -0.01289287],
         [-0.01983974,  0.01779171, -0.12169421,  0.0429982 ,
           0.04052977,  0.05012804, -0.02008332,  0.07244487,
          -0.08129298,  0.04329071],
         [-0.31834912, -0.0794993 , -0.17988756,  0.03264648,
          -0.12720855,  0.01898883,  0.05046867, -0.23519039,
           0.32922354,  0.00657061],
         [-0.22825742,  0.07399994, -0.2555065 ,  0.13000178,
           0.03296008,  0.27946538,  0.508204  , -0.00156827,
          -0.08421

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_size, attention_func):
    super(Decoder, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.rnn_size = rnn_size
    self.attention_func = attention_func

    self.embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.lstm = tf.keras.layers.LSTM(rnn_size, return_sequences=True, return_state=True)
    self.attention = LuongAttention(rnn_size, self.attention_func)
    self.wc = tf.keras.layers.Dense(rnn_size, activation='tanh')
    self.ws = tf.keras.layers.Dense(vocab_size)

  def call(self, sequence, state, encoder_output, att_type):
    hidden_state, cell_state = state
    embed = self.embedding(sequence)

    if att_type == 'bahdunau':
      context, alignment = self.attention(encoder_output, state)
      x = tf.concat([embed, context], axis=-1)
      lstm_out, state_h, state_c = self.lstm(x)
      output = tf.reshape(lstm_out, (-1, lstm_out.shape[2]))
      x = self.ws(output)  
      
      return x, alignment, state_h, state_c

    elif att_type == 'luong':
      lstm_out, state_h, state_c = self.lstm(embed, initial_state=state)
      context, alignment = self.attention(encoder_output, state_h)
      lstm_out = tf.concat([tf.squeeze(context, 1), tf.squeeze(lstm_out, 1)], 1)
      lstm_out = self.wc(lstm_out)
      x = self.ws(lstm_out)

      return x, alignment, state_h, state_c

In [0]:
spa_vocab_size = len(tokenizer_spa.word_index) + 1

decoder = Decoder(spa_vocab_size, EMBEDDING_DIM, RNN_SIZE, 'concat')