In [0]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import tensorflow as tf
import numpy as np
import re
import time
import os
import random
import spacy
import unicodedata
import pandas as pd

In [0]:
SEED = 1228

random.seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [0]:
# Download the file
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"

In [0]:
def unicode_to_ascii(sentence):
  return ''.join(c for c in unicodedata.normalize('NFD', sentence) if unicodedata.category(c) != 'Mn')


def preprocess_sentence(sentence):
  sentence = unicode_to_ascii(sentence.lower().strip())
  sentence = re.sub(r"([?.!,¿])", r' \1 ', sentence)
  sentence = re.sub(r'[ ]+', " ", sentence)
  sentence = re.sub(r'[^a-zA-Z?.!,¿]+', ' ', sentence)
  sentence = sentence.rstrip().strip()
  sentence = '<start> ' + sentence + ' <end>'
  return sentence

In [0]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


In [0]:
df = pd.read_csv(path_to_file, sep='\t', header=None)

In [0]:
df.head()

Unnamed: 0,0,1
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [0]:
df[0] = df[0].apply(preprocess_sentence)
df[1] = df[1].apply(preprocess_sentence)

In [0]:
df.head()

Unnamed: 0,0,1
0,<start> go . <end>,<start> ve . <end>
1,<start> go . <end>,<start> vete . <end>
2,<start> go . <end>,<start> vaya . <end>
3,<start> go . <end>,<start> vayase . <end>
4,<start> hi . <end>,<start> hola . <end>


In [0]:
eng = df[0].tolist()
sp = df[1].tolist()

In [0]:
def tokenizer(lang):
  token = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='UNK')
  token.fit_on_texts(lang)
  seq = token.texts_to_sequences(lang)
  return token, seq

In [0]:
eng_token, eng_seq = tokenizer(eng)
sp_token, sp_seq = tokenizer(sp)

In [0]:
def max_length_checker(seq):
  return max([len(s) for s in seq])

In [0]:
eng_maxlen = max_length_checker(eng_seq)
sp_maxlen = max_length_checker(sp_seq)

print('eng :', eng_maxlen)
print('sp :', sp_maxlen)

eng : 51
sp : 53


In [0]:
def pad_seq(seq, maxlen):
  return tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen, padding='post')

In [0]:
eng_seq = pad_seq(eng_seq, eng_maxlen)
sp_seq = pad_seq(sp_seq, sp_maxlen)

In [0]:
print(eng_seq.shape)
print(sp_seq.shape)

(118964, 51)
(118964, 53)


In [0]:
sp_seq

array([[   2,  365,    4, ...,    0,    0,    0],
       [   2, 1322,    4, ...,    0,    0,    0],
       [   2,  501,    4, ...,    0,    0,    0],
       ...,
       [   2,   23, 5874, ...,    0,    0,    0],
       [   2,   38, 1306, ...,    0,    0,    0],
       [   2,   45,  122, ..., 2101,    4,    3]], dtype=int32)

In [0]:
sp_seq_target = np.zeros(shape=sp_seq.shape, dtype=np.int)
sp_seq_target[:, :-1] = sp_seq[:, 1:]

In [0]:
sp_seq_target

array([[ 365,    4,    3, ...,    0,    0,    0],
       [1322,    4,    3, ...,    0,    0,    0],
       [ 501,    4,    3, ...,    0,    0,    0],
       ...,
       [  23, 5874,    6, ...,    0,    0,    0],
       [  38, 1306,  141, ...,    0,    0,    0],
       [  45,  122, 1441, ...,    4,    3,    0]])

In [0]:
class config:
  batch_size = 128

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout):
    super(Encoder, self).__init__()
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers
    self.dropout = dropout

    self.emb = tf.keras.layers.Embedding(vocab_size, emb_dim)
    
    self.lstm = []
    for _ in range(self.n_layers):
      lstm = tf.keras.layers.LSTM(hid_dim, return_sequences=True, return_state=True, dropout=dropout, recurrent_dropout=dropout)
      self.lstm.append(lstm)
  
  def call(self, inputs):
    x = self.emb(inputs)

    for lstm_layer in self.lstm:
      x = lstm_layer(x)
    
    output, h_state, c_state = x
    return output, h_state, c_state

In [0]:
enc_input = tf.keras.layers.Input(shape=(100,))
enc_out, enc_h, enc_c = Encoder(100, 50, 200, 2, 0.2)(enc_input)

In [0]:
enc_out

<tf.Tensor 'encoder_10/Identity:0' shape=(None, 100, 200) dtype=float32>

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, emb_dim, hid_dim, n_layers, dropout):
    super(Decoder, self).__init__()
    self.vocab_size = vocab_size
    self.emb_dim = emb_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers
    self.dropout = dropout


    self.emb = tf.keras.layers.Embedding(vocab_size, emb_dim)

    self.lstm = []
    for _ in range(self.n_layers):
      lstm = tf.keras.layers.LSTM(hid_dim, return_sequences=True, return_state=True, dropout=dropout, recurrent_dropout=dropout)
      self.lstm.append(lstm)
  
    self.fc = tf.keras.layers.Dense(vocab_size, activation='softmax')

  def call(self, inputs, h_state, c_state):
    x = self.emb(inputs)

    for lstm_layer in self.lstm:
      x = lstm_layer(x, initial_state=[h_state, c_state])
    
    output, h_state, c_state = x

    x = self.fc(output)

    return x, h_state, c_state

In [0]:
encoder_input = tf.keras.layers.Input(shape = (51,))
encoder_output, encoder_h, encoder_c = Encoder(len(eng_token.word_index)+1, 128, 128, 1, 0.3)(encoder_input)

decoder_init_state = [encoder_h, encoder_c]

decoder_input = tf.keras.layers.Input(shape=(53,))
decoder_output, h_state, c_state = Decoder(len(sp_token.word_index)+1, 128, 128, 1, 0.3)(decoder_input, encoder_h, encoder_c)

In [0]:
encoder_h

<tf.Tensor 'encoder_11/Identity_1:0' shape=(None, 128) dtype=float32>

In [0]:
model = tf.keras.models.Model([encoder_input, decoder_input], decoder_output)

In [0]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 51)]         0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 53)]         0                                            
__________________________________________________________________________________________________
encoder_11 (Encoder)            ((None, 51, 128), (N 1787264     input_19[0][0]                   
__________________________________________________________________________________________________
decoder_7 (Decoder)             ((None, 53, 24795),  6503899     input_20[0][0]                   
Total params: 8,291,163
Trainable params: 8,291,163
Non-trainable params: 0
________________

In [0]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [69]:
hist = model.fit([eng_seq, sp_seq], sp_seq_target, batch_size=128, epochs=50)

Train on 118964 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
  8448/118964 [=>............................] - ETA: 10:35 - loss: 0.3130 - acc: 0.9358

KeyboardInterrupt: ignored

In [71]:
model.layers[2]

<__main__.Encoder at 0x7f8ea9a9c9b0>

In [145]:
# Inference Stage
sp_idx_word = {v:k for k, v in sp_token.word_index.items()}

input_sentence = 'hi'

preprocessed_sentence = preprocess_sentence(input_sentence)

inf_input = [eng_token.word_index[word] for word in preprocessed_sentence.split(' ')]

convert_token_pad = pad_seq([inf_input], eng_maxlen)

encoder_output, enc_h_state, enc_c_state = model.layers[2](convert_token_pad)

# state initializer
dec_h_state, dec_c_state = enc_h_state, enc_c_state

dec_input_initalizer = np.expand_dims(np.array([sp_token.word_index['<start>']]), 0)  # (1,1) shape

result = ''
for timestep in range(sp_maxlen):
  predicted, dec_h, dec_c = model.layers[3](dec_input_initalizer, dec_h_state, dec_c_state)
  predicted_id = np.argmax(predicted[0])
  result += sp_idx_word[predicted_id] + ' '

  if predicted_id == sp_token.word_index['<end>']:
    break

  dec_h_state, dec_c_state = dec_h, dec_c  
  dec_input_initializer = np.expand_dims(np.asarray(predicted_id), 0)

print(result)

hola ! ! de de de por por por <end> 
