In [1]:
from sklearn.externals import joblib
import numpy as np
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the full corpus plus its train/test splits from their pickles.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
dataset, train, test = (
    joblib.load(pickle_path)
    for pickle_path in ('data/dataset.pkl', 'data/train.pkl', 'data/test.pkl')
)

In [3]:
def _fit_tokenizer(lines):
    """Fit a fresh Tokenizer on `lines`.

    Returns a tuple of (fitted tokenizer, largest whitespace-separated token
    count of any line, number of distinct words counted by the tokenizer + 1).
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    longest = max(len(line.split()) for line in lines)
    # +1 presumably leaves room for the reserved padding id 0 -- TODO confirm.
    vocab_size = len(tokenizer.word_counts) + 1
    return tokenizer, longest, vocab_size


# Column 0 of `dataset` feeds the English tokenizer, column 1 the target one.
eng_tokenizer, eng_length, eng_vocab_size = _fit_tokenizer(dataset[:,0])
targ_tokenizer, targ_length, targ_vocab_size = _fit_tokenizer(dataset[:,1])

In [4]:
# Encode the English column of the training pairs as integer ids and pad each
# row on the right up to eng_length.
trainX = pad_sequences(
    eng_tokenizer.texts_to_sequences(train[:,0]),
    maxlen=eng_length,
    padding='post',
)
trainX

array([[   2,   14,    8,   69,    0],
       [  74,  176,    0,    0,    0],
       [   1,   41,  118,  126,    0],
       ...,
       [   6, 1418,    0,    0,    0],
       [  74,  301,    0,    0,    0],
       [  18,  660,   30,    0,    0]], dtype=int32)

In [5]:
# Encode the English column of the test pairs the same way as the training
# input: integer ids, padded on the right up to eng_length.
testX = pad_sequences(
    eng_tokenizer.texts_to_sequences(test[:,0]),
    maxlen=eng_length,
    padding='post',
)

In [6]:
# Restore the saved model from its .h5 file.
MODEL_PATH = 'model_31_07.h5'
model = load_model(MODEL_PATH)

## Evaluate model

In [7]:
def word_for_id(integer, tokenizer):
    """Reverse-lookup a word by its integer id.

    Walks ``tokenizer.word_index`` (word -> id) in iteration order and returns
    the first word whose id equals ``integer``; returns None when no word
    carries that id.
    """
    matches = (
        candidate
        for candidate, candidate_id in tokenizer.word_index.items()
        if candidate_id == integer
    )
    return next(matches, None)
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for one encoded source sequence into text.

    Args:
        model: object whose ``predict(source, verbose=0)`` returns a batch of
            per-timestep score vectors; only the first batch item is decoded.
        tokenizer: object exposing ``word_index``, a mapping of word -> integer id.
        source: encoded input, passed unchanged to ``model.predict``.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the first
        timestep whose highest-scoring id has no word in
        ``tokenizer.word_index`` (presumably the padding id 0).
    """
    prediction = model.predict(source, verbose=0)[0]

    # Invert the vocabulary once instead of rescanning it for every timestep.
    # setdefault keeps the first word seen for an id, which is what a
    # front-to-back scan of word_index would return.
    id_to_word = {}
    for word, index in tokenizer.word_index.items():
        id_to_word.setdefault(index, word)

    target = []
    for vector in prediction:
        word = id_to_word.get(argmax(vector))
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus-level BLEU scores.

    Args:
        model: model handed to ``predict_sequence`` for each source.
        tokenizer: target-language tokenizer used to turn predicted ids back
            into words.
        sources: iterable of encoded source rows; each row must support
            ``reshape`` and ``shape`` and is reshaped to ``(1, row_length)``
            before prediction.
        raw_dataset: indexable collection where ``raw_dataset[i]`` unpacks to
            the ``(source_text, target_text)`` pair matching ``sources[i]``.

    Prints the first 10 source/target/prediction triples, then BLEU-1 to
    BLEU-4. Returns None.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        # Decode with the tokenizer passed in by the caller rather than a
        # global, so the function works with whichever tokenizer it is given.
        translation = predict_sequence(model, tokenizer, source)
        raw_src, raw_target = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of reference token lists per hypothesis.
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9, not 1 (1/3 each is the usual
    # cumulative BLEU-3); left unchanged so scores stay comparable with
    # earlier runs -- confirm before changing.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 

In [None]:
print('train')
# Score the training split; predictions are decoded with the target-language
# tokenizer fitted on dataset[:,1].
evaluate_model(model, targ_tokenizer, trainX, train)

In [None]:
print('test')
# Score the held-out split; predictions are decoded with the target-language
# tokenizer fitted on dataset[:,1].
evaluate_model(model, targ_tokenizer, testX, test)

## Translate single phrase

In [8]:
def translate(model, token):
    """Print the model's translation for each encoded row in ``token``.

    Every row is reshaped to ``(1, row_length)`` and decoded through
    ``predict_sequence`` with the module-level ``targ_tokenizer``. Nothing is
    returned; each translation is printed on its own line.
    """
    for row in token:
        single_batch = row.reshape((1, row.shape[0]))
        print(predict_sequence(model, targ_tokenizer, single_batch))

In [9]:
def encode_input(input_text, tokenizer=eng_tokenizer, leng=eng_length):
    """Encode one phrase as a padded batch of integer ids.

    Args:
        input_text: the phrase to encode.
        tokenizer: tokenizer providing ``texts_to_sequences``; defaults to the
            ``eng_tokenizer`` that exists when this cell is run.
        leng: ``maxlen`` given to ``pad_sequences``; defaults to the
            ``eng_length`` that exists when this cell is run.

    Returns:
        The result of ``pad_sequences`` (padding='post') applied to the
        single-phrase batch.
    """
    sequences = tokenizer.texts_to_sequences([input_text])
    return pad_sequences(sequences, maxlen=leng, padding='post')

In [25]:
# Encode a sample English phrase and print the model's translation of it.
encoded_phrase = encode_input('it is a second')
translate(model, encoded_phrase)

es un turno
