Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [7]:
# Standard imports
import pandas as pd
import numpy as np
import re
import pickle
import string
import json

# Visualization library
import seaborn as sns
import matplotlib.pyplot as plt

# NLP library
from nltk.tokenize import word_tokenize
from tensorflow import keras
import tensorflow as tf

from tensorflow.keras.layers import Input, LSTM, Dense, Masking
from tensorflow.keras.models import Model
from tensorflow.keras.backend import manual_variable_initialization

manual_variable_initialization(True)

In [8]:
from numpy.random import seed
seed(42)
tf.random.set_seed(42)

In [9]:
# Encoder training setup
num_encoder_tokens = 5134
num_decoder_tokens = 8263

max_encoder_seq_length = 8
max_decoder_seq_length = 17

latent_dim = 256

In [10]:

with open('./data/rtfd.p', 'rb') as fp:
    reverse_target_features_dict = pickle.load(fp)
    
with open('./data/tfd.p', 'rb') as fp:
    target_features_dict = pickle.load(fp)
    
with open('./data/rifd.p', 'rb') as fp:
    reverse_input_features_dict = pickle.load(fp)

with open('./data/ifd.p', 'rb') as fp:
    input_features_dict = pickle.load(fp)

In [11]:
# Python 3
first2pairs = {k: input_features_dict[k] for k in list(input_features_dict)[:2]}
first2pairs

{'a': 0, 'aback': 1}

In [12]:
# Python 3
first2pairs = {k: reverse_input_features_dict[k] for k in list(reverse_input_features_dict)[:2]}
first2pairs

{0: 'a', 1: 'aback'}

In [13]:
# Python 3
first2pairs = {k: target_features_dict[k] for k in list(target_features_dict)[:10]}
first2pairs

{'<END>': 0,
 '<START>': 1,
 'a': 2,
 'abajo': 3,
 'abandona': 4,
 'abandonada': 5,
 'abandonado': 6,
 'abandonamos': 7,
 'abandonar': 8,
 'abandonaron': 9}

In [14]:
# Python 3
first2pairs = {k: reverse_target_features_dict[k] for k in list(reverse_target_features_dict)[:10]}
first2pairs

{0: '<END>',
 1: '<START>',
 2: 'a',
 3: 'abajo',
 4: 'abandona',
 5: 'abandonada',
 6: 'abandonado',
 7: 'abandonamos',
 8: 'abandonar',
 9: 'abandonaron'}

In [15]:
#TRANSLATING UNSEEN TEXT
# Building Encoder model
from tensorflow.keras.models import Model, load_model

training_model = load_model('./models/training_model_gcp.h5')

encoder_inputs = training_model.input[0] #input1
encoder_outputs, state_h_enc, state_c_enc = training_model.layers[2].output #lstm1
encoder_states = [state_h_enc, state_c_enc]

encoder_model = Model(encoder_inputs, encoder_states)

latent_dim = 256

## Building decoder model
decoder_inputs = training_model.input[1] #input2

##
decoder_state_input_hidden = Input(shape=(latent_dim,), name="input_3")
decoder_state_input_cell = Input(shape=(latent_dim,), name="input_4")
decoder_states_inputs = [decoder_state_input_hidden, decoder_state_input_cell]

## NEW
decoder_lstm = training_model.layers[3]
##

decoder_outputs, state_hidden, state_cell = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_hidden, state_cell]

## NEW
decoder_dense = training_model.layers[4]
##

decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [89]:
def string_to_matrix(user_input):
    '''This function takes in a string and outputs the corresponding matrix'''
    tokens = re.findall(r"[\w']+|[^\s\w]", user_input)
    user_input_matrix = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    for timestep, token in enumerate(tokens):
        if token in input_features_dict:
            user_input_matrix[0, timestep, input_features_dict[token]] = 1.
            print(timestep, token)
    return user_input_matrix

In [101]:
def decode_sequence(test_input):
    '''This function takes in a sentence and returns the decoded sentence'''
    
    # Encode the input as state vectors.
    states_value = encoder_model.predict(string_to_matrix(test_input))
    print(f' num decoder tokens is {num_decoder_tokens}')

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    
    # Populate the first token of target sequence with the start token.
    target_seq[0, 0, target_features_dict['<START>']] = 1.

    # Sampling loop for a batch of sequences
    decoded_sentence = ''

    stop_condition = False
    while not stop_condition:
        # Run the decoder model to get possible output tokens (with probabilities) & states
        output_tokens, hidden_state, cell_state = decoder_model.predict([target_seq] + states_value)

        # Choose token with highest probability and append it to decoded sentence
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        print(sampled_token_index, np.amax(output_tokens[0, -1, :]))
        
        sampled_token = reverse_target_features_dict[sampled_token_index]
        print(sampled_token)
        #decoded_sentence += " " + sampled_token

        # Exit condition: either hit max length or find stop token.
        #if (sampled_token == '<END>' or len(decoded_sentence) > max_decoder_seq_length):
        if (len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
        else:
            decoded_sentence += " " + sampled_token
            print(decoded_sentence)

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [hidden_state, cell_state]

    decoded_sentence.replace('<END>','')
    return decoded_sentence

In [108]:
# Example decoded sentence
decode_sequence('my wife is here')

0 my
1 wife
2 is
3 here
 num decoder tokens is 8263
5015 0.97886306
mi
 mi
3241 0.34943423
esposa
 mi esposa
3104 0.6781973
es
 mi esposa es
0 0.05695426
<END>
 mi esposa es <END>
0 0.079956956
<END>


' mi esposa es <END>'

In [31]:
# Example decoded sentence
decode_sequence('it is hot today')

' hoy hace'

In [32]:
# Example decoded sentence
decode_sequence('what is your name')

' qué es tu'

In [35]:
# Example decoded sentence
decode_sequence('can this translate longer sentences?')

' esto puede el'

In [110]:
training_model.summary