In [1]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from numpy.random import shuffle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_clean_sentences(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object. The rest of this notebook indexes the result
        as a 2-D array of sentence pairs (``data[:, 0]`` / ``data[:, 1]``).
    """
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # files that were produced by a trusted preprocessing step.
    # The context manager guarantees the handle is closed even if load() fails.
    with open(filename, 'rb') as handle:
        return load(handle)
 
# fit a tokenizer
def create_tokenizer(lines):
    """Build a Keras ``Tokenizer`` and fit it on the given texts.

    Args:
        lines: Collection of text lines passed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
 
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The number of words in the longest line.

    Raises:
        ValueError: If ``lines`` is empty (propagated from ``max``).
    """
    word_counts = [len(text.split()) for text in lines]
    return max(word_counts)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode text lines and pad them to a fixed length.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        lines: Collection of text lines to encode.

    Returns:
        The result of ``pad_sequences`` applied with ``padding='post'``,
        i.e. padding is appended after each sequence.
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
 
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` equals ``integer``.

    Args:
        integer: Index to search for.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word mapped to ``integer``, or ``None`` when no entry matches.
    """
    matches = (
        token
        for token, token_id in tokenizer.word_index.items()
        if token_id == integer
    )
    # next() with a default yields the first hit or None, like the scan-and-return loop.
    return next(matches, None)
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for ``source`` into a space-joined string.

    Args:
        model: Object whose ``predict(source, verbose=0)`` result is indexed
            with ``[0]`` to obtain a sequence of score vectors.
        tokenizer: Tokenizer used by ``word_for_id`` to map indices to words.
        source: Encoded input handed to ``model.predict`` unchanged.

    Returns:
        The decoded words joined by single spaces. Decoding stops at the
        first index that ``word_for_id`` cannot map to a word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    # pick the highest-scoring index at every position
    token_ids = [argmax(scores) for scores in step_scores]
    words = []
    for token_id in token_ids:
        token = word_for_id(token_id, tokenizer)
        if token is None:
            # unmapped index (presumably the 0 used for padding) ends the sentence
            break
        words.append(token)
    return ' '.join(words)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every encoded source and print corpus-level BLEU-1..4 scores.

    Args:
        model: Model handed to ``predict_sequence`` for each source.
        tokenizer: Tokenizer used to decode predictions into words.
        sources: Iterable of 1-D encoded source arrays; each one is reshaped
            to ``(1, length)`` before prediction.
        raw_dataset: Indexable collection where ``raw_dataset[i]`` unpacks to
            the ``(source_text, target_text)`` pair matching ``sources[i]``.

    Returns:
        None. The first ten translations and the four BLEU scores are printed.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # add a leading axis of size 1 so a single sequence is passed to predict
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer argument rather than a notebook-level global,
        # so the function works for whichever tokenizer the caller supplies
        translation = predict_sequence(model, tokenizer, source)
        raw_src, raw_target = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu takes a list of reference lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9, not 1; kept as-is so reported
    # numbers stay comparable -- confirm whether (1/3, 1/3, 1/3, 0) was intended.
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 

In [3]:
# Load the three pickled sentence-pair files. Judging by the file names these
# are the combined set plus its train and test splits -- confirm against the
# preprocessing step that wrote them.
dataset = load_clean_sentences('english-spanish-both.pkl')
train = load_clean_sentences('english-spanish-train.pkl')

test = load_clean_sentences('english-spanish-test.pkl')


In [4]:
# Fit the English tokenizer on column 0 of the combined dataset.
eng_tokenizer = create_tokenizer(dataset[:, 0])
# Number of entries in word_index plus one; presumably the extra slot accounts
# for index 0, which encode_sequences uses as the padding value -- confirm.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
# Word count of the longest English sentence; used below as the padded length.
eng_length = max_length(dataset[:, 0])

In [5]:
# Fit the Spanish tokenizer on column 1 of the combined dataset.
spa_tokenizer = create_tokenizer(dataset[:, 1])
# Number of entries in word_index plus one; presumably the extra slot accounts
# for index 0, which encode_sequences uses as the padding value -- confirm.
spa_vocab_size = len(spa_tokenizer.word_index) + 1
# Word count of the longest Spanish sentence.
spa_length = max_length(dataset[:, 1])

In [6]:
# Encode the English side (column 0) of each split with the English tokenizer,
# padded to eng_length, to produce the model inputs.
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

In [7]:
# Load a saved Keras model from model1.h5; the model is not trained in this notebook.
model = load_model('model1.h5')

In [8]:
# Evaluate on the training split: prints ten sample translations and BLEU-1..4.
print('train')
evaluate_model(model, spa_tokenizer, trainX, train)

train
src=[keep away from the fire], target=[mantengase lejos del fuego], predicted=[tom que]
src=[wait until i sit down], target=[espera a que me siente], predicted=[tom no que]
src=[how do you know that tom has never been to boston], target=[como sabes que tom nunca ha estado en boston], predicted=[tom no que que de de de]
src=[i want to know the facts], target=[quiero conocer los hechos], predicted=[tom que que]
src=[do you like playing sports], target=[te gusta hacer deporte], predicted=[tom que que]
src=[i will help you all i can], target=[te ayudare en todo lo que pueda], predicted=[tom no que de]
src=[it was fun playing in the park], target=[fue divertido jugar en el parque], predicted=[tom no que]
src=[choose one person], target=[elija una persona], predicted=[tom el]
src=[he raised his hand], target=[el alzo la mano], predicted=[tom que]
src=[can you give me some money], target=[puedes darme algo de dinero], predicted=[tom no que]


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.101536
BLEU-2: 0.021002
BLEU-3: 0.000000
BLEU-4: 0.000000


In [None]:
# Evaluate on the test split: prints ten sample translations and BLEU-1..4.
print('test')
evaluate_model(model, spa_tokenizer, testX, test)