In [1]:
import tensorflow as tf 
import tensorflow.keras as keras
from tensorflow.keras.layers import GRU, LSTM, Dense, Input, Lambda
import os, io
import shutil 
import tqdm 
import math
import pathlib
import numpy as np 
import matplotlib.pyplot as plt
from typing import *
import unicodedata
import re



In [2]:
# URL of the zipped Spanish-English sentence-pair archive fetched by the download cell.
SPANISH_DATASET_URL = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip'

In [3]:
# Download the Spanish-English dataset archive; extract=True also asks Keras to unpack it.
# The value returned by get_file is kept in path_to_zip and used to locate spa.txt.
path_to_zip = tf.keras.utils.get_file(
      'spa-eng.zip',
      origin=SPANISH_DATASET_URL,
      extract=True
    )

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [101]:
# The sentence-pair text file is expected at spa-eng/spa.txt inside the directory
# that holds the downloaded archive.
path_to_file = os.path.dirname(path_to_zip)+"/spa-eng/spa.txt"
# Equivalent pathlib spelling, kept for reference:
#path_to_file = pathlib.Path(path_to_zip).parent/'spa-eng/spa.txt'


In [102]:
#1. convert unicode files to ascii
def unicode_to_ascii(s):
    """
    Strip combining accent marks from a string.

    The text is decomposed with NFD normalization and every character whose
    Unicode category is 'Mn' (nonspacing mark) is dropped. Characters that have
    no decomposition (for example '¿') are kept, so the result is not
    guaranteed to be pure ASCII.

    Params:
      s(dtype: str): input string
    Return(dtype: str)
      the input string without accent marks.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """
    Normalize one sentence and wrap it in start/end markers.

    Steps: lowercase and strip the text, remove accent marks with
    unicode_to_ascii, put spaces around the punctuation marks . , ! ? ¿ so each
    becomes its own token, collapse whitespace, replace every other
    non-letter character with a space, and finally add the '<start>' and
    '<end>' tokens.

    Example: '¿Todavía están en casa?' -> '<start> ¿ todavia estan en casa ? <end>'

    Params:
      w(dtype: str): string that needs to be preprocessed.
    Return(dtype: str)
      the preprocessed string.
    """
    w = unicode_to_ascii(w.lower().strip())

    # Surround each punctuation mark with spaces so it is tokenized separately.
    w = re.sub(r"([.,!?¿])", r" \1 ", w)
    # Raw string: '\s' in a plain string literal is an invalid escape sequence.
    w = re.sub(r'\s{2,}', ' ', w)

    # Anything that is not a letter or one of the kept punctuation marks
    # (including whitespace runs) collapses to a single space.
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()

    w = '<start> ' + w + ' <end>'
    return w

In [103]:
def read_data(path: str) -> Tuple:
    """
    Read the tab-separated sentence pairs from `path` and preprocess them.

    Every line is split on tabs; the first field is treated as the target
    sentence and the second field as the context sentence. Both are passed
    through preprocess_sentence. Any further columns on a line are ignored.

    Params:
      path(type: str): Input path of the text data.

    Return(type: (np.ndarray, np.ndarray))
      returns the array of context sentences and the array of target
      sentences, in that order.
    """
    # Context manager so the file handle is closed as soon as the text is read.
    with io.open(path, encoding='UTF-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Keep only the first two fields so a line with extra columns does not
    # break the (target, context) unpacking below.
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')[:2]] for l in lines]

    context = np.array([context for target, context in word_pairs])
    target = np.array([target for target, context in word_pairs])

    return context, target

In [104]:
# Load and preprocess every sentence pair, then display how many context sentences were read.
context_text, target_text = read_data(path_to_file)
context_text.shape

(118964,)

In [105]:
# Peek at one preprocessed context sentence.
context_text[1]

'<start> vete . <end>'

In [106]:
# Peek at the matching preprocessed target sentence.
target_text[1]

'<start> go . <end>'

In [107]:
def get_vectorized_value(text):
    """
    Fit a Keras Tokenizer on `text` and turn the sentences into padded integer sequences.

    Params:
      text(type: np.ndarray): numpy array containing the context or target sentences (text).
    Return(type: (np.ndarray, tf.keras.preprocessing.text.Tokenizer))
      returns the padded integer sequences (one row per sentence, padded at the
      end of each row) and the fitted tokenizer.
    """
    # filters='' disables the tokenizer's character filtering, so punctuation
    # and the '<start>' / '<end>' markers survive as tokens.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(text)

    text_tensor = lang_tokenizer.texts_to_sequences(text)

    # padding='post' appends the padding after the real tokens.
    text_tensor = tf.keras.preprocessing.sequence.pad_sequences(text_tensor,
                                                                padding='post')
    return text_tensor, lang_tokenizer

In [108]:
# Vectorize both languages independently (each gets its own tokenizer and its own
# padded length), then display the shape of the target sequences.
context_tensor, context_tokenizer = get_vectorized_value(context_text)
target_tensor, target_tokenizer = get_vectorized_value(target_text)
target_tensor.shape

(118964, 51)

In [109]:
def split_dataset(context_data: np.array, target_data: np.array,
                  is_test: bool, train_split: float, val_split: float,
                  test_split=0.0) -> Tuple:
    """
    Randomly split paired context/target arrays into train, validation and
    (optionally) test subsets. The subsets never share a row.

    Params:
      context_data(dtype: np.ndarray): context rows; indexed along axis 0.
      target_data(dtype: np.ndarray): target rows, paired with context_data by position.
      is_test(dtype: Bool): whether a test subset should be produced.
      train_split(dtype: float): fraction of rows used for training.
      val_split(dtype: float): fraction of rows used for validation.
      test_split(dtype: float): fraction of rows used for testing; must be 0
        when is_test is False and greater than 0 when is_test is True.

    Return(type: Tuple)
      ((train_context, train_target), (val_context, val_target)) when is_test
      is False, with a third (test_context, test_target) pair appended when
      is_test is True. The test subset receives every row that is in neither
      the train nor the validation subset.

    Raises:
      AssertionError: if the split arguments are inconsistent.
    """
    # The original single assert (`is_test and test_split > 0`) rejected every
    # call with is_test=False; the two directions are checked separately here.
    assert is_test or test_split == 0, "You cannot create a testing split, by specifying is_test to False"
    assert not is_test or test_split > 0, "test_split should be greater than 0 when is_test is True"
    assert train_split <= 1.0, 'Train Split value should be float, and should be lesser than 1.0'
    assert val_split <= 1.0, 'val Split value should be float, and should be lesser than 1.0'
    assert test_split <= 1.0, 'test Split value should be float, and should be lesser than 1.0'
    # Tolerant comparison: sums such as 0.7 + 0.1 + 0.2 are not exactly 1.0 in floating point.
    assert math.isclose(train_split + test_split + val_split, 1.0), "Sum of train, test and val split, does't add up to 1.0"

    len_data = context_data.shape[0]
    n_train = int(train_split * len_data)
    n_val = int(val_split * len_data)

    # One shuffled index vector, sliced into disjoint pieces, replaces the
    # quadratic `i not in train_inds` scans over numpy arrays.
    shuffled_inds = np.random.permutation(len_data)
    train_inds = shuffled_inds[:n_train]
    val_inds = shuffled_inds[n_train:n_train + n_val]

    train_pair = (context_data[train_inds], target_data[train_inds])
    val_pair = (context_data[val_inds], target_data[val_inds])

    if not is_test:
        return train_pair, val_pair

    # Everything left over goes to the test subset, in ascending index order.
    test_inds = np.sort(shuffled_inds[n_train + n_val:])
    return train_pair, val_pair, (context_data[test_inds], target_data[test_inds])

In [110]:
# 80% train / 10% validation / 10% test split of the vectorized sentence pairs.
train_data, val_data, test_data = split_dataset(context_tensor, target_tensor, True, 0.8, 0.1, 0.1)

In [111]:
# Shape of the training context sequences (first element of the train pair).
train_data[0].shape

(95171, 53)

In [112]:
def create_tensorflow_dataset(data: Tuple, batch_size: int) -> tf.data.Dataset:
    """
    Wrap a (context, target) pair of arrays in a shuffled, batched, prefetching tf.data.Dataset.

    Params:
      data(type: Tuple): It is a tuple of data(train_X, train_y)
      batch_size(dtype: int): number of examples per batch.
    Return(type: tf.data.Dataset)
      returns the data converted to a tf.data.Dataset. Elements are shuffled
      with a buffer of 1024 and grouped into batches of exactly `batch_size`;
      a final incomplete batch is dropped (drop_remainder=True).
    """
    tensorflow_dataset = tf.data.Dataset.from_tensor_slices(data)
    tensorflow_dataset = (
        tensorflow_dataset.shuffle(1024)
        .batch(batch_size, drop_remainder=True)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    return tensorflow_dataset

In [113]:
# Build batched datasets (batch size 64) for each split. The same helper is used for
# all three, so validation and test data are also shuffled and lose their last partial batch.
train_ds = create_tensorflow_dataset(train_data, 64)
val_ds = create_tensorflow_dataset(val_data, 64)
test_ds = create_tensorflow_dataset(test_data, 64)

In [114]:
# Display the dataset's element spec (batch shapes and dtypes).
train_ds

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 53), dtype=tf.int32, name=None), TensorSpec(shape=(64, 51), dtype=tf.int32, name=None))>

In [115]:
# Pull one (context, target) batch to use as example input in later cells.
eg_in, eg_de = next(iter(train_ds))

In [116]:
# Model hyper-parameters.
# Vocabulary sizes are the tokenizer's word count plus one extra index
# (presumably for the 0 value used as padding, which loss_function masks out — confirm).
INPUT_VOCAB_SIZE = len(context_tokenizer.word_index) + 1
TARGET_VOCAB_SIZE = len(target_tokenizer.word_index) + 1
EMB_DIMS = 256 
HIDDEN_DIMS = 1024
# Padded sequence lengths, read off the example batch.
INPUT_SEQ_SIZE = eg_in.shape[1] 
TARGET_SEQ_SIZE = eg_de.shape[1]

In [117]:
class Encoder(tf.keras.Model):
    """
    GRU-based encoder: an embedding layer followed by a single GRU layer.

    Methods:
      __init__: constructor.
      call: embeds the input ids and runs them through the GRU.
      initialize_hidden_state: builds the all-zero initial hidden state (h0).
    Params:
      vocab_size(dtype: int): size of the input vocabulary.
      embedding_dim(dtype: int): number of units in the embedding layer.
      hidden_units(dtype: int): number of hidden units in the GRU layer.
      bt_size(dtype: int): Batch Size.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_units: int, bt_size: int):
        super(Encoder, self).__init__()
        self.bt_size = bt_size
        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, name="embedding_layr")
        # NOTE: the layer is a GRU; the "lstm_layer" name string is left
        # untouched so nothing that refers to the layer by name changes.
        self.rnn = tf.keras.layers.GRU(self.hidden_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform',
                                       name="lstm_layer"
                                       )

    def call(self, x, hidden):
        """
        Params:
          x: batch of token-id sequences.
          hidden: initial hidden state for the GRU.
        Return:
          (output, state): the GRU output for every time step and the final state.
        """
        x = self.embedding(x)
        output, state = self.rnn(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return a zero tensor of shape (bt_size, hidden_units)."""
        # Sized from the configured hidden_units instead of a hard-coded 1024,
        # so the state matches the GRU for any hidden size.
        return tf.zeros((self.bt_size, self.hidden_units))

In [118]:
# Encoder instance for the shape check in the next cell (batch size 64).
# Despite the variable name, the Encoder class wraps a GRU layer.
lstm_encoder = Encoder(INPUT_VOCAB_SIZE, EMB_DIMS, HIDDEN_DIMS, 64)

In [119]:
# Smoke-test the encoder on the example batch, starting from an all-zero hidden state.
sample_hidden = lstm_encoder.initialize_hidden_state()
sample_output, sample_hidden_ = lstm_encoder(eg_in, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
# Report the state returned by the encoder rather than the zero state that was fed in.
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden_.shape))

Encoder output shape: (batch size, sequence length, units) (64, 53, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [120]:
class Decoder(tf.keras.Model):
    """
    GRU-based decoder: embedding layer -> GRU -> dense projection onto the vocabulary.

    Methods:
      __init__: constructor.
      call: runs the decoder on the given token ids and hidden state.
    Params:
      vocab_size(dtype: int): size of the target vocabulary; also the width of the output layer.
      embedding_dim(dtype: int): number of units in the embedding layer.
      hidden_units(dtype: int): number of hidden units in the GRU layer.
      bt_size(dtype: int): Batch Size (stored on the instance only).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units, bt_size):
        super().__init__()
        self.bt_size = bt_size
        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, name="embedding_layr"
        )
        self.rnn = tf.keras.layers.GRU(
            self.hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        # No activation: the layer emits raw scores per vocabulary entry.
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden_state):
        """
        Params:
          x: batch of token ids fed to the decoder.
          hidden_state: initial state for the GRU.
        Return:
          (scores, state): the dense-layer output and the final GRU state.
        """
        embedded = self.embedding(x)
        rnn_out, new_state = self.rnn(embedded, initial_state=hidden_state)
        # Merge the leading axes so the dense layer sees one row per (example, step).
        flattened = tf.reshape(rnn_out, (-1, rnn_out.shape[2]))
        scores = self.fc(flattened)
        return scores, new_state

In [121]:
# Decoder instance for the shape check in the next cells (batch size 64).
# Despite the variable name, the Decoder class wraps a GRU layer.
lstm_decoder = Decoder(TARGET_VOCAB_SIZE, EMB_DIMS, HIDDEN_DIMS, 64)

In [122]:
# Smoke-test the decoder with a dummy (64, 1) input of random values and the
# all-zero state created for the encoder check; only the output shape matters here.
sample_decoder_output, _ = lstm_decoder(tf.random.uniform((64, 1)), sample_hidden)

In [123]:
# Display the decoder output shape.
sample_decoder_output.shape

TensorShape([64, 12934])

In [27]:
# A (64, 1) column holding the '<start>' token id once per example: the first decoder input for a batch of 64.
dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * 64, 1)

In [188]:
# Adam optimizer with default settings; train_step reads it as a module-level global.
optimizer = tf.keras.optimizers.Adam()

def train_step(inp, targ, encoder,
               decoder, input_tok, target_tok, batch_size):
    """
    Run one teacher-forced training step on a batch and update both models.

    The encoder's final state seeds the decoder. The decoder is first fed a
    column of '<start>' ids, then at every later step the ground-truth token
    of the previous position. Gradients of the summed step losses are applied
    through the module-level `optimizer`.

    Params:
      inp: batch of context token ids.
      targ: batch of target token ids.
      encoder: encoder model (must provide initialize_hidden_state).
      decoder: decoder model.
      input_tok: context tokenizer (not used by this function).
      target_tok: target tokenizer, used to look up the '<start>' id.
      batch_size(dtype: int): number of examples in the batch.
    Return:
      the summed loss divided by targ.shape[1].
    """
    start_id = target_tok.word_index["<start>"]
    initial_state = encoder.initialize_hidden_state()
    summed_loss = 0

    with tf.GradientTape() as tape:
        _, state = encoder(inp, initial_state)
        step_input = tf.expand_dims([start_id] * batch_size, 1)

        for t in range(1, targ.shape[1]):
            step_scores, state = decoder(step_input, state)
            expected = targ[:, t]
            summed_loss += loss_function(expected, step_scores)
            # Teacher forcing: the true token becomes the next decoder input.
            step_input = tf.expand_dims(expected, 1)

    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(summed_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return summed_loss / int(targ.shape[1])


def test_step(inp, targ, encoder,
              decoder, input_tok, target_tok, batch_size):
    """
    Compute the teacher-forced loss of one batch without updating any weights.

    Mirrors the forward pass of train_step: the encoder's final state seeds
    the decoder, which is fed '<start>' first and the ground-truth token of
    the previous position afterwards.

    Params:
      inp: batch of context token ids.
      targ: batch of target token ids.
      encoder: encoder model (must provide initialize_hidden_state).
      decoder: decoder model.
      input_tok: context tokenizer (not used by this function).
      target_tok: target tokenizer, used to look up the '<start>' id.
      batch_size(dtype: int): number of examples in the batch.
    Return:
      the summed loss divided by targ.shape[1].
    """
    start_id = target_tok.word_index["<start>"]
    initial_state = encoder.initialize_hidden_state()
    _, state = encoder(inp, initial_state)

    step_input = tf.expand_dims([start_id] * batch_size, 1)
    summed_loss = 0
    for t in range(1, targ.shape[1]):
        step_scores, state = decoder(step_input, state)
        expected = targ[:, t]
        summed_loss += loss_function(expected, step_scores)
        step_input = tf.expand_dims(expected, 1)

    return summed_loss / int(targ.shape[1])

In [189]:
# Per-example sparse cross-entropy on raw scores (from_logits=True). reduction='none'
# keeps one loss value per example so loss_function can mask padding before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """
    Masked cross-entropy for one decoding step.

    Positions whose true id is 0 contribute a loss of zero; the mean is then
    taken over all positions, masked ones included.

    Params:
      real: true token ids.
      pred: predicted scores, passed to `loss_object` together with `real`.
    Return:
      scalar mean of the masked per-example losses.
    """
    per_example = loss_object(real, pred)
    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.math.logical_not(is_padding), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)

In [190]:
import time
def training_seq2seq(encoder, decoder, train_dataset,
                     val_dataset, input_tok, target_tok, epochs, batch_size):
    """
    Train the encoder/decoder pair for `epochs` passes and track the losses.

    Each epoch runs train_step over every batch of `train_dataset` (with a
    tqdm progress bar) and test_step over every batch of `val_dataset`, then
    prints the summed batch losses.

    Params:
      encoder: encoder model.
      decoder: decoder model.
      train_dataset: iterable of (inp, targ) training batches.
      val_dataset: iterable of (inp, targ) validation batches.
      input_tok: context tokenizer, forwarded to the step functions.
      target_tok: target tokenizer, forwarded to the step functions.
      epochs(dtype: int): number of passes over the training data.
      batch_size(dtype: int): batch size, forwarded to the step functions.
    Return(type: (model, model, List[float], List[float]))
      the encoder, the decoder, and the per-epoch summed training and
      validation losses (the same values that are printed).
    """
    training_loss = []
    validation_loss = []

    for epoch in range(epochs):
        total_loss = 0
        total_loss_val = 0

        for inp, targ in tqdm.tqdm(train_dataset):
            total_loss += train_step(inp, targ,
                                     encoder, decoder, input_tok, target_tok, batch_size)

        for inp, targ in val_dataset:
            total_loss_val += test_step(inp, targ,
                                        encoder, decoder, input_tok, target_tok, batch_size)

        # Record the epoch results; previously these lists were returned empty.
        training_loss.append(float(total_loss))
        validation_loss.append(float(total_loss_val))

        print(f"Epoch {epoch} train_loss: {total_loss} val_loss: {total_loss_val}")

    return encoder, decoder, training_loss, validation_loss

In [None]:
# Build fresh models and train them for 10 epochs. The batch size 64 is passed to the
# models and to the training loop; the datasets were also batched with 64.
encoder = Encoder(INPUT_VOCAB_SIZE, EMB_DIMS, HIDDEN_DIMS, 64)
decoder = Decoder(TARGET_VOCAB_SIZE, EMB_DIMS, HIDDEN_DIMS, 64)
encoder_t, decoder_t, training_loss, validation_loss = training_seq2seq(encoder, decoder, 
                            train_ds, val_ds, context_tokenizer, target_tokenizer, 10, 64)

# Persist both trained models.
encoder_t.save("seq_seq_gru_encoder")
decoder_t.save("seq_seq_gru_decoder")
# Reference: https://cnvrg.io/seq2seq-model/

 15%|█▌        | 230/1487 [03:22<17:07,  1.22it/s]

In [None]:
def translate(sentence, encoder, decoder):
    """
    Greedily translate one sentence with a trained encoder/decoder pair.

    The sentence is preprocessed, mapped to ids with the module-level
    `context_tokenizer`, and padded to INPUT_SEQ_SIZE. Decoding starts from
    '<start>' and feeds each predicted token back in, until '<end>' is
    produced or TARGET_SEQ_SIZE steps have run.

    Params:
      sentence(dtype: str): raw sentence to translate.
      encoder: encoder model exposing `hidden_units`.
      decoder: decoder model.
    Return(type: (str, str))
      the space-separated predicted tokens (including '<end>' when it was
      produced) and the preprocessed input sentence.
    Raises:
      KeyError: if the sentence contains a word that is not in the context vocabulary.
    """
    sentence = preprocess_sentence(sentence)

    inputs = [context_tokenizer.word_index[i] for i in sentence.split(' ')]
    # Pad to the context length, not to the (shorter) target length.
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=INPUT_SEQ_SIZE,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Single-example zero state sized from the encoder itself.
    hidden = [tf.zeros((1, encoder.hidden_units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)

    for t in range(TARGET_SEQ_SIZE):
        predictions, dec_hidden = decoder(dec_input, dec_hidden)

        predicted_id = int(tf.argmax(predictions[0]).numpy())
        predicted_word = target_tokenizer.index_word.get(predicted_id)

        # An id with no vocabulary entry (such as 0) has no word to emit: stop decoding.
        if predicted_word is None:
            break

        result += predicted_word + ' '

        # '<end>' marks the end of the translation.
        if predicted_word == '<end>':
            break

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence
     

In [None]:
# Try the trained models on a sample Spanish sentence.
translate(u'¿todavia estan en casa?', encoder, decoder)

In [185]:
# Alternative seq2seq model built with the Keras functional API, using LSTM layers.
# NOTE(review): this rebinds the name `encoder`, which the training cell uses for the
# Encoder model — running cells out of order can pick up the wrong object.

# Encoder: variable-length id sequence -> embedding -> LSTM (50 units) returning its states.
encoder_inputs = Input(shape=(None,))
en_x=  keras.layers.Embedding(INPUT_VOCAB_SIZE, 256)(encoder_inputs)
encoder = LSTM(50, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Decoder: variable-length id sequence -> embedding -> LSTM seeded with the encoder states.
decoder_inputs = Input(shape=(None,))
dex=  keras.layers.Embedding(TARGET_VOCAB_SIZE, 256)
final_dex= dex(decoder_inputs)

decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)
# Softmax over the target vocabulary at every time step.
decoder_dense = Dense(TARGET_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [186]:
# Training model: takes the encoder and decoder inputs, outputs the decoder's per-step softmax.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [None]:
# Compile and train the functional LSTM model.
# The optimizer name uses plain ASCII quotes; typographic quotes are a SyntaxError.
# NOTE(review): encoder_input_data, decoder_input_data and decoder_target_data are not
# defined in any cell shown here — define them before running this cell.
# NOTE(review): 'categorical_crossentropy' expects one-hot targets; if
# decoder_target_data holds integer token ids, 'sparse_categorical_crossentropy'
# is the matching loss — confirm.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=128,
          epochs=10,
          validation_split=0.05)