## This notebook resumes training across multiple runs. The model needed 50+ epochs, so it is saved after each run and reloaded here for further training.

In [1]:
# Standard imports
import pandas as pd
import numpy as np
import re
import pickle
import string

# Visualization library
import seaborn as sns
import matplotlib.pyplot as plt

# NLP library
from nltk.tokenize import word_tokenize

In [2]:
from tensorflow import keras
import tensorflow as tf

from tensorflow.keras.layers import Input, LSTM, Dense, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# NOTE(review): this imports from the standalone `keras` package while every other
# import in this notebook uses `tensorflow.keras`. Presumably a workaround for
# weights not being restored by load_model — confirm it is still needed with the
# installed TensorFlow/Keras versions.
from keras.backend import manual_variable_initialization
manual_variable_initialization(True)

In [3]:
# Setting the same NumPy and TensorFlow seeds to continue with a similar model build
from numpy.random import seed

RANDOM_SEED = 42
seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [4]:
# Dimensions carried over from the initial model run. They size the one-hot
# arrays built later in this notebook.
# NOTE(review): presumably these must match the saved model and the pickled
# token dictionaries — confirm against the Step 2 notebook.
num_encoder_tokens = 5044  # size of the last axis of the encoder one-hot array
num_decoder_tokens = 8089  # size of the last axis of the decoder one-hot arrays

max_encoder_seq_length = 8   # number of timesteps in the encoder array
max_decoder_seq_length = 17  # number of timesteps in the decoder arrays

latent_dim = 256  # width of the decoder state inputs created for inference

In [5]:
# Load the English/Spanish sentence pairs and preview the first rows
df = pd.read_csv('./data/EngSpa.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,eng,spa,eng_len,spa_len
0,15828,a dog has four legs,un perro tiene cuatro patas,19,27
1,15829,a lion is an animal,un león es un animal,19,20
2,15830,a noise woke her up,un ruido la despertó,19,20
3,15831,a nurse wears white,una enfermera se viste de blanco,19,32
4,15832,a truck hit the dog,un camión atropelló al perro,19,28


In [6]:
# Remove the helper columns that are not needed for training.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution no longer raises KeyError
# for columns that were already removed.
df = df.drop(columns=['Unnamed: 0', 'eng_len', 'spa_len'], errors='ignore')
df.head()

Unnamed: 0,eng,spa
0,a dog has four legs,un perro tiene cuatro patas
1,a lion is an animal,un león es un animal
2,a noise woke her up,un ruido la despertó
3,a nurse wears white,una enfermera se viste de blanco
4,a truck hit the dog,un camión atropelló al perro


In [92]:
# Convert the dataframe into a list of row lists so it can be used for model building
data = df.to_numpy().tolist()

# Examine the first 8 rows
data[:8]

[['a bird has wings', 'los pájaros tienen alas'],
 ['a bird has wings', 'un pájaro tiene alas'],
 ['a cab is waiting', 'hay un taxi esperando'],
 ['a deal is a deal', 'un trato es un trato'],
 ['a dog is barking', 'un perro está ladrando'],
 ['a fox came along', 'un zorro se acercó'],
 ['a girl phoned me', 'una chica me llamó por teléfono'],
 ['a lion is strong', 'el león es fuerte']]

In [93]:
# Containers for the input sentences and the tagged target sentences
input_docs, target_docs = [], []
# Vocabulary sets (unique tokens) for the input and target sides
input_tokens, target_tokens = set(), set()

# Number of data points to use
n = 17500

# Matches either a word (apostrophes allowed) or a single punctuation character
token_pattern = re.compile(r"[\w']+|[^\s\w]")

for pair in data[:n]:
    # Each row holds the input sentence first and the target sentence second
    input_doc, raw_target = pair[0], pair[1]
    input_docs.append(input_doc)

    # Separate words from punctuation, then wrap the target in <START>/<END> tags
    target_doc = '<START> ' + " ".join(token_pattern.findall(raw_target)) + ' <END>'
    target_docs.append(target_doc)

    # Grow both vocabularies; set.update ignores tokens that are already present
    input_tokens.update(token_pattern.findall(input_doc))
    target_tokens.update(target_doc.split())

In [94]:
# Show the first input sentence and its tagged target sentence, one per line
print(input_docs[0], target_docs[0], sep='\n')

a bird has wings
<START> los pájaros tienen alas <END>


## Importing dictionaries that were built in Step 2

In [95]:

def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code while deserializing;
    # only load files that were produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Token <-> index lookup tables for the target and input sides
reverse_target_features_dict = _load_pickle('./data/rtfd.p')
target_features_dict = _load_pickle('./data/tfd.p')
reverse_input_features_dict = _load_pickle('./data/rifd.p')
input_features_dict = _load_pickle('./data/ifd.p')

In [96]:
# Allocate zero-filled one-hot arrays: (sentence, timestep, token index)
num_samples = len(input_docs)
encoder_shape = (num_samples, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_samples, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(encoder_shape, dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

In [97]:
# Fill the one-hot arrays for the network using the loaded token dictionaries

for row, (source_text, target_text) in enumerate(zip(input_docs, target_docs)):

    # Encoder input: one 1.0 per (sentence, timestep, token index)
    source_words = re.findall(r"[\w']+|[^\s\w]", source_text)
    for step, word in enumerate(source_words):
        encoder_input_data[row, step, input_features_dict[word]] = 1.0

    for step, word in enumerate(target_text.split()):
        word_index = target_features_dict[word]
        # Decoder input holds the token at its own timestep
        decoder_input_data[row, step, word_index] = 1.0
        if step > 0:
            # Decoder target is the same sequence shifted one timestep earlier
            decoder_target_data[row, step - 1, word_index] = 1.0

In [98]:
# Reload the previously saved training model so training can continue
from tensorflow.keras.models import Model, load_model

checkpoint_path = 'endspan15.h5'
training_model = load_model(checkpoint_path)

In [99]:
# Build separate encoder and decoder inference models that reuse the layers of
# the reloaded training model.
# NOTE(review): layers are selected by position (layers[2], layers[3], layers[4]);
# presumably the encoder LSTM, decoder LSTM and output Dense layer — confirm
# with training_model.summary().

# Encoder: first training-model input -> the two state tensors of layers[2]
encoder_inputs = training_model.input[0]  # first input of the training model
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output  # unpacks into output + two states
encoder_states = [state_h_enc, state_c_enc]

encoder_model = Model(encoder_inputs, encoder_states)

latent_dim = 256  # width of the two decoder state inputs created below

# Decoder token input: second input of the training model
decoder_inputs = training_model.input[1]

# New inputs through which the previous step's states are fed to the decoder
decoder_state_input_hidden = Input(shape=(latent_dim,))
decoder_state_input_cell = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]

# Reuse the layer at position 3 of the training model as the decoder recurrent layer
decoder_lstm = training_model.layers[3]

# Call the reused layer with the externally supplied states as its initial state
decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]

# Reuse the layer at position 4 of the training model to produce the final outputs
decoder_dense = training_model.layers[4]

decoder_outputs = decoder_dense(decoder_outputs)

# Inference decoder: (token input + states) -> (outputs + updated states)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [100]:
# Continue training the reloaded model
epochs = 1
batch_size = 128

# Save the full model (not only the weights) at the end of every epoch.
# The deprecated `period=1` argument is replaced by `save_freq='epoch'`, which
# requests the same once-per-epoch saving without relying on a removed keyword.
# NOTE(review): no initial_epoch is passed to fit(), so the {epoch} placeholder
# presumably restarts at 01 on every run and overwrites 'endspan01.h5' — rename
# the file or pass initial_epoch if cumulative epoch numbers are wanted.
model_checkpoint = ModelCheckpoint(
    'endspan{epoch:02d}.h5',
    save_freq='epoch',
    save_weights_only=False,
)

# 20% of the samples are held out for validation
history = training_model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[model_checkpoint],
    validation_split=0.2,
)



In [101]:
def string_to_matrix(user_input):
    '''Convert a sentence into a one-hot matrix for the encoder.

    The sentence is split into words and single punctuation characters.
    Each known token sets a 1.0 at [0, timestep, token index].

    Parameters:
        user_input (str): sentence to encode.

    Returns:
        numpy.ndarray: float32 array of shape
        (1, max_encoder_seq_length, num_encoder_tokens).

    Tokens missing from input_features_dict leave their timestep all zeros.
    Tokens beyond max_encoder_seq_length are dropped, because the matrix has
    no timestep for them (indexing past it would raise IndexError).
    '''
    tokens = re.findall(r"[\w']+|[^\s\w]", user_input)
    user_input_matrix = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    # Only the first max_encoder_seq_length tokens fit in the matrix
    for timestep, token in enumerate(tokens[:max_encoder_seq_length]):
        token_index = input_features_dict.get(token)
        if token_index is not None:
            user_input_matrix[0, timestep, token_index] = 1.
    return user_input_matrix

In [102]:
def decode_sequence(test_input):
    '''Translate a sentence by greedy decoding with the inference models.

    The sentence is encoded into state vectors. The decoder is then run one
    token at a time, starting from '<START>', always choosing the most
    probable next token and feeding it back in with the updated states.

    Parameters:
        test_input (str): sentence to translate.

    Returns:
        str: the decoded tokens, each preceded by a single space. The string
        includes '<END>' when the decoder produced it.

    Decoding stops when '<END>' is produced or after max_decoder_seq_length
    tokens. The limit is counted in tokens, not characters, so long words no
    longer cut the output short.
    '''
    # Encode the input as state vectors.
    states_value = encoder_model.predict(string_to_matrix(test_input))

    # Target sequence of length 1 holding only the start token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_features_dict['<START>']] = 1.

    decoded_tokens = []
    while True:
        # Run the decoder to get next-token probabilities and the updated states
        output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)

        # Choose the token with the highest probability
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = reverse_target_features_dict[sampled_token_index]
        decoded_tokens.append(sampled_token)

        # Exit condition: stop token found or maximum number of tokens reached
        if sampled_token == '<END>' or len(decoded_tokens) >= max_decoder_seq_length:
            break

        # Feed the chosen token back in as the next decoder input
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder states over to the next step
        states_value = [hidden_state, cell_state]

    # Keep the original output format: every token is preceded by one space
    return ''.join(' ' + token for token in decoded_tokens)

In [103]:
# Example: decode a sample English sentence with the inference models
decode_sequence('how are you')

' qué están de <END>'

In [104]:
# Example: decode a second sample English sentence
decode_sequence('it is hot today')

' hoy es <END>'

In [105]:
# Example: decode a very short English sentence
decode_sequence('i am')

' estoy de <END>'