In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras
import numpy as np
from sklearn.externals import joblib

class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` that fits itself on a corpus when constructed.

    Besides fitting the vocabulary, the constructor converts the corpus to
    integer sequences and stores a padded/truncated matrix of them.

    Attributes set by ``__init__``:
        index_to_word: inverse of ``word_index`` (integer token -> word).
        tokens: one list of integer tokens per input text (each list is
            reversed when ``reverse`` is true).
        num_tokens: length of every entry in ``tokens``.
        max_tokens: ``int(mean + 2 * std)`` of ``num_tokens``; used as the
            ``maxlen`` for padding.
        tokens_padded: result of ``pad_sequences`` applied to ``tokens``.
    """

    def __init__(self, texts, padding, reverse=False, num_words = None):
        """Fit on ``texts`` and pre-compute the padded token matrix.

        Args:
            texts: corpus handed to ``fit_on_texts`` / ``texts_to_sequences``.
            padding: forwarded unchanged as ``pad_sequences(padding=...)``.
            reverse: when true, every token list is reversed and sequences
                longer than ``max_tokens`` are truncated at the front
                (``'pre'``) instead of the back (``'post'``).
            num_words: forwarded to the ``Tokenizer`` constructor.
        """
        super().__init__(num_words = num_words)
        self.fit_on_texts(texts)

        # Invert the word -> index mapping built by fit_on_texts.
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        sequences = self.texts_to_sequences(texts)
        if reverse:
            sequences = [seq[::-1] for seq in sequences]
        self.tokens = sequences
        truncating = 'pre' if reverse else 'post'

        self.num_tokens = list(map(len, self.tokens))
        # Length cut-off: mean plus two standard deviations, truncated to int.
        self.max_tokens = int(np.mean(self.num_tokens)
                              + 2 * np.std(self.num_tokens))

        self.tokens_padded = pad_sequences(self.tokens,
                                           maxlen = self.max_tokens,
                                           padding = padding,
                                           truncating = truncating)

    def token_to_word(self, token):
        """Return the word for ``token``; token 0 maps to a single space.

        Raises:
            KeyError: if a non-zero ``token`` is not in ``index_to_word``.
        """
        if token == 0:
            return " "
        return self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Join the words for ``tokens`` with spaces, skipping zero tokens.

        Raises:
            KeyError: if a non-zero token is not in ``index_to_word``.
        """
        return " ".join(self.index_to_word[token]
                        for token in tokens
                        if token != 0)

    def text_to_tokens(self, text, reverse = False, padding = False):
        """Convert one text into a NumPy array of integer tokens.

        Args:
            text: the string to tokenize (wrapped in a one-element list
                before being passed to ``texts_to_sequences``).
            reverse: when true, flip the token order along axis 1 and
                truncate over-long sequences at the front.
            padding: when true, pad/truncate to ``self.max_tokens`` with
                ``padding='pre'``; when false, return the raw array.

        Returns:
            The token array, optionally reversed and padded.
        """
        token_array = np.array(self.texts_to_sequences([text]))

        if reverse:
            token_array = np.flip(token_array, axis = 1)

        if not padding:
            return token_array

        return pad_sequences(token_array,
                             maxlen = self.max_tokens,
                             padding = 'pre',
                             truncating = 'pre' if reverse else 'post')

def load_inputs(src_path = 'tokenizer_src.pkl', dest_path = 'tokenizer_dest.pkl'):
    """Load the source- and destination-language tokenizers from disk.

    Args:
        src_path: path of the pickled source tokenizer
            (default ``'tokenizer_src.pkl'``).
        dest_path: path of the pickled destination tokenizer
            (default ``'tokenizer_dest.pkl'``).

    Returns:
        A ``(tokenizer_src, tokenizer_dest)`` tuple holding whatever objects
        the two files contain, in that order.
    """
    # SECURITY: joblib.load is pickle-based and can execute arbitrary code
    # while unpickling -- only load files from a trusted source.
    # NOTE(review): presumably these are TokenizerWrap pickles, in which case
    # the class must be defined before this call -- confirm.
    tokenizer_src = joblib.load(src_path)
    tokenizer_dest = joblib.load(dest_path)

    return (tokenizer_src, tokenizer_dest)

Using TensorFlow backend.


In [5]:
# Load both tokenizers once at module level. `translate` binds these two
# objects as the default values of its tokenizer parameters, so this cell has
# to run before the cell that defines `translate`.
tokenizer_src, tokenizer_dest = load_inputs()

In [1]:
def create_model(num_words = 10000, embedding_size = 128, state_size = 512,
                 weights_path = '21_checkpoint.keras'):
    """Build the GRU encoder-decoder network plus its inference sub-models.

    Three ``Model`` objects are created on top of the *same* layer instances,
    so the weights loaded into the combined model are also the weights used
    by the stand-alone encoder and decoder.

    Args:
        num_words: vocabulary size; used as ``input_dim`` of both embedding
            layers and as the width of the decoder's dense output layer.
        embedding_size: ``output_dim`` of both embedding layers.
        state_size: number of units in every GRU layer and the width of the
            ``decoder_initial_state`` input.
        weights_path: checkpoint passed to ``model.load_weights``. Pass
            ``None`` to skip loading and keep freshly initialised weights.
            NOTE(review): the size arguments presumably have to match the
            architecture the checkpoint was saved from -- confirm.

    Returns:
        ``(model, model_encoder, model_decoder)`` where

        * ``model`` maps ``[encoder_input, decoder_input]`` to the decoder
          output,
        * ``model_encoder`` maps ``encoder_input`` to the output of the last
          encoder GRU,
        * ``model_decoder`` maps ``[decoder_input, decoder_initial_state]``
          to the decoder output.
    """
    from tensorflow.python.keras.models import Model
    from tensorflow.python.keras.layers import Input, Dense, GRU, Embedding

    # ----- Encoder layers -------------------------------------------------
    encoder_input = Input(shape = (None, ), name ='encoder_input')
    encoder_embedding = Embedding(input_dim = num_words, output_dim = embedding_size,
                              name = 'encoder_embedding')
    encoder_gru1 = GRU(state_size, name = 'encoder_gru1', return_sequences = True)
    encoder_gru2 = GRU(state_size, name = 'encoder_gru2', return_sequences = True)
    # Only the last GRU drops the time axis (return_sequences = False); its
    # output is what the decoder GRUs receive as initial state.
    encoder_gru3 = GRU(state_size, name = 'encoder_gru3', return_sequences = False)

    def connect_encoder():
        """Chain embedding -> gru1 -> gru2 -> gru3 on ``encoder_input``."""
        net = encoder_input
        net = encoder_embedding(net)
        net = encoder_gru1(net)
        net = encoder_gru2(net)
        net = encoder_gru3(net)
        encoder_output = net
        return encoder_output

    encoder_output = connect_encoder()

    # ----- Decoder layers -------------------------------------------------
    # Separate input that lets the stand-alone decoder be fed a state
    # computed earlier by model_encoder.
    decoder_initial_state = Input(shape = (state_size,), name  = 'decoder_initial_state')
    decoder_input = Input(shape=(None, ), name = 'decoder_input')
    decoder_embedding = Embedding(input_dim = num_words, output_dim = embedding_size,
                                  name = 'decoder_embedding')
    decoder_gru1 = GRU(state_size, name = 'decoder_gru1', return_sequences = True)
    decoder_gru2 = GRU(state_size, name = 'decoder_gru2', return_sequences = True)
    decoder_gru3 = GRU(state_size, name = 'decoder_gru3', return_sequences = True)
    # Linear activation: the layer emits one unnormalised score per word.
    decoder_dense = Dense(num_words, activation = 'linear', name = 'decoder_output')

    def connect_decoder(initial_state):
        """Chain the decoder layers; every GRU starts from ``initial_state``."""
        net = decoder_input
        net = decoder_embedding(net)
        net = decoder_gru1(net, initial_state = initial_state)
        net = decoder_gru2(net, initial_state = initial_state)
        net = decoder_gru3(net, initial_state = initial_state)
        decoder_output = decoder_dense(net)

        return decoder_output

    # Combined model: the decoder is initialised directly from the encoder.
    decoder_output = connect_decoder(initial_state = encoder_output)

    model = Model(inputs = [encoder_input, decoder_input],
                    outputs = [decoder_output])
    if weights_path is not None:
        model.load_weights(weights_path)

    model_encoder = Model(inputs = [encoder_input],
                      outputs = [encoder_output])

    # Re-connect the same decoder layers, this time driven by the explicit
    # state input instead of the encoder output.
    decoder_output = connect_decoder(initial_state = decoder_initial_state)

    model_decoder = Model(inputs = [decoder_input, decoder_initial_state],
                      outputs = [decoder_output])

    return (model, model_encoder, model_decoder)

In [2]:
# Build the three models with the checkpoint weights loaded. `translate`
# reads `model_encoder` and `model_decoder` as module-level globals.
# NOTE(review): this cell's execution count (In [2]) is lower than that of the
# cells above it (In [4], In [5]), so the notebook was run out of order --
# confirm it still works with Restart Kernel -> Run All.
model, model_encoder, model_decoder = create_model()

  from ._conv import register_converters as _register_converters


In [10]:
def translate(input_text, tokenizer_src = tokenizer_src, tokenizer_dest = tokenizer_dest,
              token_start = 3, token_end = 4):
    """Translate ``input_text`` by greedy decoding with the global models.

    The text is tokenized (reversed and padded) with ``tokenizer_src`` and
    encoded by the module-level ``model_encoder``. The module-level
    ``model_decoder`` is then called repeatedly; at each step the
    highest-scoring token is taken and fed back as the next decoder input.

    Args:
        input_text: the text to translate.
        tokenizer_src: tokenizer providing ``text_to_tokens`` for the input.
        tokenizer_dest: tokenizer providing ``max_tokens`` and
            ``token_to_word`` for the output.
        token_start: integer token used as the first decoder input.
        token_end: integer token that stops decoding when predicted.
            NOTE(review): 3 and 4 are presumably the indices of the start/end
            marker words in ``tokenizer_dest`` -- confirm.

    Returns:
        The predicted words joined by single spaces, without the end token's
        word. Decoding stops after ``tokenizer_dest.max_tokens`` steps if the
        end token is never predicted.
    """
    input_tokens = tokenizer_src.text_to_tokens(text = input_text,
                                                reverse = True,
                                                padding = True)
    initial_state = model_encoder.predict(input_tokens)

    max_tokens = tokenizer_dest.max_tokens
    # Builtin int instead of np.int: the alias was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    decoder_input_data = np.zeros(shape = (1, max_tokens), dtype = int)

    token_int = token_start
    words = []
    count_tokens = 0
    while token_int != token_end and count_tokens < max_tokens:
        # Feed everything predicted so far; later positions are still zero.
        decoder_input_data[0, count_tokens] = token_int
        x_data = \
        {
            'decoder_initial_state': initial_state,
            'decoder_input': decoder_input_data
        }
        decoder_output = model_decoder.predict(x_data)

        # Scores for the current position; the arg-max is the greedy choice.
        token_scores = decoder_output[0, count_tokens, :]
        token_int = np.argmax(token_scores)

        # The end token's word is never appended, so the result needs no
        # fixed-width trimming and output that stops at max_tokens is
        # returned intact.
        if token_int != token_end:
            words.append(tokenizer_dest.token_to_word(token_int))
        count_tokens += 1

    return " ".join(words)

In [11]:
translate('welcome to the parliament')

'aprobación de la gestión de la comisión'

In [12]:
translate('hope you feel good today')

'el parlamento rechaza la solicitud de voto'

In [13]:
translate('I would like to introduce you..')

'quisiera hacer una observación'

In [14]:
translate('what is you name?')

'¿hay alguna observación'

In [15]:
translate('people of the country')

'el deporte de la carne'

In [16]:
translate('the number of lost people is increasing')

'el sistema de reacción rápida es un ejemplo'

In [17]:
translate('the number of countries')

'el parlamento aprueba la resolución'

In [34]:
translate('i am going back home')

'me gustaría que esto'

In [19]:
translate('I am a good person')

'me refiero a la cuestión de la'

In [22]:
translate('Mr. President')

'señor presidente'

In [24]:
translate('I am leaving')

'me gustaría que'

In [27]:
translate('the session is over, thank you')

'la votación tendrá lugar mañana a las 12 00 horas'

In [29]:
translate('our proposal is to do')

'la enmienda nº 1'

In [30]:
translate('how are you')

'aplausos'

In [31]:
translate('i want to sleep')

'me gustaría que'

In [32]:
translate('what is going on')

'¿qué se'

In [33]:
translate('she lost a book')

'el sr ha sido'

In [35]:
translate('where is my car')

'¿hay alguna observación'

In [36]:
translate('she likes to write')

'el sr ha hecho'