In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [None]:
# 1. preprocessing data
# 2. build model
# 2.1 encoder
# 2.2 attention
# 2.3 decoder
# 2.4 loss & optimizer
# 2.5 train
# 3. evaluation
# 3.1 given sentence ,return translated results
# 3.2 visualize results (attention)

In [39]:
en_spa_file_path = './spa-eng/spa.txt'

import unicodedata

def unicode2ascii(seq):
    return "".join(c for c in unicodedata.normalize('NFD', seq) if unicodedata.category(c) != 'Mn')

import re
def preprocess_sequence(seq):
    seq = unicode2ascii(seq.lower().strip())
    seq = re.sub(r"([?¿:,.!])", r" \1 ", seq)
    seq = re.sub(" +", " ", seq)
    seq = re.sub(r'[^a-zA-Z?.!,¿]', " ", seq)
    seq = seq.rstrip().strip()
    seq = "<start> " + seq + " <end>"
    return seq

t='If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.	Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.'
print(preprocess_sequence(t))

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo .  si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [48]:
lines = open(en_spa_file_path, encoding="UTF-8").read().strip().split('\n')
seq_pairs = [line.split('\t') for line in lines]
en_spa_pairs = [(preprocess_sequence(en), preprocess_sequence(spa)) for (en, spa) in seq_pairs]
en_dataset, sp_dataset = zip(*en_spa_pairs)
print(en_dataset[-1])
print(sp_dataset[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [51]:
def tokenizer(lang_dataset):
    toker = keras.preprocessing.text.Tokenizer(num_words=None, filters='', split=' ')
    toker.fit_on_texts(lang_dataset)
    tensor = toker.texts_to_sequences(lang_dataset)
    tensor = keras.preprocessing.sequence.pad_sequences(tensor)
    return tensor, toker

output_tensor, output_tokenizer = tokenizer(en_dataset[0:30000])
input_tensor, input_tokenizer = tokenizer(sp_dataset[0:30000])

def max_length(tensor):
    return max(len(i) for i in tensor)

from sklearn.model_selection import train_test_split
input_train, input_eval, output_train, output_eval = train_test_split(input_tensor, output_tensor, test_size=0.2)

max_length_input = max_length(input_tensor)
max_length_output = max_length(output_tensor)
buffer_size = 30000
batch_size = 64
epochs = 20
def make_tf_dataset(input_tensor, output_tensor, batch_size, epochs, shuffle):
    tf_dataset = tf.data.Dataset.from_tensor_slices((input_tensor, output_tensor))
    if shuffle:
        tf_dataset = tf_dataset.shuffle(buffer_size)
    return tf_dataset.repeat(epochs).batch(batch_size, drop_remainder=True)

train_set = make_tf_dataset(input_train, output_train, batch_size, epochs, True)
eval_set = make_tf_dataset(input_eval, output_eval, batch_size, 1, False)

for x, y in train_set.take(1):
    print(x.shape)
    print(y.shape)

(64, 16)
(64, 11)


In [69]:
embedding_units = 256
units = 1024
input_vocab_size = len(input_tokenizer.word_index) + 1
output_vocab_size = len(output_tokenizer.word_index) + 1

In [86]:
class Encoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.rnn_units = rnn_units
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(rnn_units, return_state=True, return_sequences=True, recurrent_initializer="glorot_uniform")
        
    def call(self, x, hidden_state):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden_state)
        return output, state
    
    def initial_state(self):
        return tf.zeros((batch_size, self.rnn_units))

In [85]:
class BahdanauAttention(keras.Model):
    def __init__(self, units):
        super().__init__()
        self.W1 = keras.layers.Dense(units)
        self.W2 = keras.layers.Dense(units)
        self.V = keras.layers.Dense(1)
        
    def call(self, hidden_state, encoder_output):
        hidden = tf.expand_dims(hidden_state, axis=1)
        score = self.V(tf.nn.tanh(self.W1(hidden) + self.W2(encoder_output)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context = tf.reduce_sum(attention_weights*encoder_output, axis=1)
        return context, attention_weights

In [91]:
class Decoder(keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.fc = keras.layers.Dense(vocab_size)
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.rnn_units = rnn_units
        self.gru = keras.layers.GRU(rnn_units, return_sequences=True, return_state=True, recurrent_initializer="glorot_uniform")
        self.attention = BahdanauAttention(self.rnn_units)
        
    def call(self, x, hidden_state, encoder_output):
        # context.shape: (batch_size, units)
        context, attention_weights = self.attention(hidden_state, encoder_output)
        # batch_size, 1, embedding_dim
        x = self.embedding(x)
        combine_x = tf.concat([tf.expand_dims(context, axis=1), x], axis=-1)
        # output: batch_size, 1, rnn_units
        # state:  batch_size, rnn_units
        output, state = self.gru(combine_x)
        #output = tf.reshape(output, (-1, output.shape[2]))
        output = tf.squeeze(output, 1)
        output = self.fc(output)
        return output, state, attention_weights

In [70]:
t = tf.constant([[[1., 2., 3.]], [[4., 5., 3.]]])
v=tf.reshape(t, (-1, t.shape[2]))
print(v[:,2])
print(t.shape)
print(t.shape[2])
print(tf.reshape(t, (-1, t.shape[2])))
print(tf.squeeze(t, 1))
a = tf.constant([1,2,0,3,0,5])
print(tf.not_equal(a, 0))

tf.Tensor([3. 3.], shape=(2,), dtype=float32)
(2, 1, 3)
3
tf.Tensor(
[[1. 2. 3.]
 [4. 5. 3.]], shape=(2, 3), dtype=float32)
tf.Tensor(
[[1. 2. 3.]
 [4. 5. 3.]], shape=(2, 3), dtype=float32)
tf.Tensor([ True  True False  True False  True], shape=(6,), dtype=bool)


In [88]:
optimizer = keras.optimizers.Adam()
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_func(real, pred):
    mask = tf.not_equal(real, 0)
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [92]:
encoder = Encoder(input_vocab_size, embedding_units, units, batch_size)
decoder = Decoder(output_vocab_size, embedding_units, units, batch_size)

@tf.function
def train_step(input, target, hidden):
    loss = 0
    with tf.GradientTape() as tape:
        encode_output, hidden = encoder(input, hidden)
        for t in range(target.shape[1] - 1):
            decode_input = tf.expand_dims(target[:, t], axis=-1)
            pred, hidden_state, _ = decoder(decode_input, hidden, encode_output)
            loss += loss_func(target[:, t+1], pred)
    batch_loss = loss / int(target.shape[0])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [94]:
epochs = 10
steps_per_epoch = len(input_tensor) // batch_size

for epoch in range(epochs):
    time_start = time.time()
    total_loss = 0
    hidden = encoder.initial_state()
    
    for (batch, (input, targ)) in enumerate(train_set.take(steps_per_epoch)):
        batch_loss = train_step(input, targ, hidden)
        total_loss += batch_loss
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
            
    print('Epoch {} Loss {:.4f}'.format(epoch+1, total_loss / steps_per_epoch))
    
    print('Time take for 1 epoch {} sec\n'.format(time.time() - time_start))

Epoch 1 Batch 0 Loss 0.2992
Epoch 1 Batch 100 Loss 0.3037
Epoch 1 Batch 200 Loss 0.2985
Epoch 1 Batch 300 Loss 0.2725
Epoch 1 Batch 400 Loss 0.2706
Epoch 1 Loss 0.2960
Time take for 1 epoch 1258.712735414505 sec

Epoch 2 Batch 0 Loss 0.2683
Epoch 2 Batch 100 Loss 0.2761
Epoch 2 Batch 200 Loss 0.2608
Epoch 2 Batch 300 Loss 0.2459
Epoch 2 Batch 400 Loss 0.2504
Epoch 2 Loss 0.2635
Time take for 1 epoch 1202.2063212394714 sec

Epoch 3 Batch 0 Loss 0.2356
Epoch 3 Batch 100 Loss 0.2421
Epoch 3 Batch 200 Loss 0.2439
Epoch 3 Batch 300 Loss 0.2325
Epoch 3 Batch 400 Loss 0.2265
Epoch 3 Loss 0.2410
Time take for 1 epoch 1436.0191569328308 sec

Epoch 4 Batch 0 Loss 0.2411
Epoch 4 Batch 100 Loss 0.2178
Epoch 4 Batch 200 Loss 0.2337
Epoch 4 Batch 300 Loss 0.2244
Epoch 4 Batch 400 Loss 0.2196
Epoch 4 Loss 0.2232
Time take for 1 epoch 1165.0504508018494 sec

Epoch 5 Batch 0 Loss 0.2106
Epoch 5 Batch 100 Loss 0.2217
Epoch 5 Batch 200 Loss 0.1993
Epoch 5 Batch 300 Loss 0.2147
Epoch 5 Batch 400 Loss 0.19