In [5]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [10]:
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [11]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

In [46]:
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

In [13]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

In [47]:
filename = 'spa.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'english-spanish.pkl')
# spot check
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

Saved: english-spanish.pkl
[go] => [ve]
[go] => [vete]
[go] => [vaya]
[go] => [vayase]
[hi] => [hola]
[run] => [corre]
[run] => [corred]
[who] => [quien]
[fire] => [fuego]
[fire] => [incendio]
[fire] => [disparad]
[help] => [ayuda]
[help] => [socorro auxilio]
[help] => [auxilio]
[jump] => [salta]
[jump] => [salte]
[stop] => [parad]
[stop] => [para]
[stop] => [pare]
[wait] => [espera]
[wait] => [esperen]
[go on] => [continua]
[go on] => [continue]
[hello] => [hola]
[i ran] => [corri]
[i ran] => [corria]
[i try] => [lo intento]
[i won] => [he ganado]
[oh no] => [oh no]
[relax] => [tomatelo con soda]
[smile] => [sonrie]
[attack] => [al ataque]
[attack] => [atacad]
[get up] => [levanta]
[go now] => [ve ahora mismo]
[got it] => [lo tengo]
[got it] => [lo pillas]
[got it] => [entendiste]
[he ran] => [el corrio]
[hop in] => [metete adentro]
[hug me] => [abrazame]
[i fell] => [me cai]
[i know] => [yo lo se]
[i left] => [sali]
[i lied] => [menti]
[i lost] => [perdi]
[i quit] => [dimito]
[i quit

In [28]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
 
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)
 
# load dataset
raw_dataset = load_clean_sentences('english-spanish.pkl')
 
# reduce dataset size
n_sentences = 10000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:9000], dataset[9000:]
# save
save_clean_data(dataset, 'english-spanish-both.pkl')
save_clean_data(train, 'english-spanish-train.pkl')
save_clean_data(test, 'english-spanish-test.pkl')

Saved: english-spanish-both.pkl
Saved: english-spanish-train.pkl
Saved: english-spanish-test.pkl


In [25]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [26]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
 
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
 
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
 
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y
 
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model


In [29]:
dataset = load_clean_sentences('english-spanish-both.pkl')
train = load_clean_sentences('english-spanish-train.pkl')
test = load_clean_sentences('english-spanish-test.pkl')

In [55]:
train[7345]

array(['it was all gone', 'todo termino'], dtype='<U275')

In [31]:
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
spa_tokenizer = create_tokenizer(dataset[:, 1])
spa_vocab_size = len(spa_tokenizer.word_index) + 1
spa_length = max_length(dataset[:, 1])
print('Spanish Vocabulary Size: %d' % spa_vocab_size)
print('Spanish Max Length: %d' % (spa_length))

English Vocabulary Size: 2343
English Max Length: 5
Spanish Vocabulary Size: 4520
Spanish Max Length: 8


In [32]:
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(spa_tokenizer, spa_length, train[:, 1])
trainY = encode_output(trainY, spa_vocab_size)
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(spa_tokenizer, spa_length, test[:, 1])
testY = encode_output(testY, spa_vocab_size)

In [33]:
model = define_model(eng_vocab_size, spa_vocab_size, eng_length, spa_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [34]:
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 256)            599808    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 8, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 8, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 4520)           1161640   
Total params: 2,812,072
Trainable params: 2,812,072
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 50s - loss: 3.4791 - val_loss: 2.6582

Epoch 00001: val_loss improved from inf to 2.65816, saving model to model.h5
Epoch 2/30
 - 49s - loss: 2.4941 - val_loss: 2.5594

Epoch 00002: val_loss improved from 2.65816 to 2.55937, saving model to model.h5
Epoch 3/30
 - 52s - loss: 2.4046 - val_loss: 2.5306

Epoch 00003: val_loss improved from 2.55937 to 2.53065, saving model to model.h5
Epoch 4/30
 - 51s - loss: 2.3446 - val_loss: 2.4983

Epoch 00004: val_loss improved from 2.53065 to 2.49827, saving model to model.h5
Epoch 5/30
 - 56s - loss: 2.2939 - val_loss: 2.4749

Epoch 00005: val_loss improved from 2.49827 to 2.47489, saving model to model.h5
Epoch 6/30
 - 50s - loss: 2.2369 - val_loss: 2.4394

Epoch 00006: val_loss improved from 2.47489 to 2.43937, saving model to model.h5
Epoch 7/30
 - 53s - loss: 2.1657 - val_loss: 2.3949

Epoch 00007: val_loss improved from 2.43937 to 2.39493, saving model to model.h5
Epoch 8/30
 - 52s 

<keras.callbacks.History at 0x11b489940>

In [113]:
from pickle import load
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from numpy.random import shuffle

In [109]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))
 
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
 
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
 
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
 
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)
 
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, spa_tokenizer, source)
        raw_src, raw_target = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
 

In [230]:
dataset = load_clean_sentences('english-spanish-both.pkl')

train = load_clean_sentences('english-spanish-train.pkl')
shuffle(train)
test = load_clean_sentences('english-spanish-test.pkl')
shuffle(test)
new_test = array([['how are you']])


In [231]:
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

In [232]:
spa_tokenizer = create_tokenizer(dataset[:, 1])
spa_vocab_size = len(spa_tokenizer.word_index) + 1
spa_length = max_length(dataset[:, 1])

In [233]:
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

In [234]:
model = load_model('model.h5')

array([[1, 2, 3]], dtype=int32)

In [235]:
print('train')
evaluate_model(model, spa_tokenizer, trainX, train)

train
src=[i cut myself], target=[me corto], predicted=[me corte]
src=[we love you], target=[os amamos], predicted=[te amamos]
src=[show your cards], target=[muestra tus cartas], predicted=[muestra tus cartas]
src=[i saw the news], target=[vi las noticias], predicted=[yo la noticias]
src=[we broke up], target=[lo dejamos], predicted=[nos dejamos]
src=[who ran], target=[quien corria], predicted=[quien corrio]
src=[youre fired], target=[estan despedidos], predicted=[estas despedidos]
src=[get out], target=[sal], predicted=[salte]
src=[arent you happy], target=[no estas feliz], predicted=[no tan feliz]
src=[here take this], target=[toma coge esto], predicted=[toma esto esto]
BLEU-1: 0.586463
BLEU-2: 0.456608
BLEU-3: 0.336658
BLEU-4: 0.145743


In [236]:
print('test')
evaluate_model(model, spa_tokenizer, testX, test)

test
src=[its christmas], target=[es navidad], predicted=[es nublado]
src=[we forgot], target=[nos olvidamos], predicted=[lo entendemos]
src=[youre restless], target=[eres nervioso], predicted=[eres ingenuo]
src=[answer tom], target=[respondale a tomas], predicted=[respondanle a tomas]
src=[it is up to you], target=[depende de ti], predicted=[es lo bien]
src=[good luck tom], target=[buena suerte tom], predicted=[es es tom]
src=[sign this], target=[firme esto], predicted=[firma esto]
src=[dont open it], target=[no lo abra], predicted=[no lo]
src=[i am busy], target=[estoy ocupada], predicted=[estoy llena]
src=[they helped tom], target=[ellos ayudaron a tom], predicted=[ellos a a tom]
BLEU-1: 0.381686
BLEU-2: 0.253028
BLEU-3: 0.160687
BLEU-4: 0.044933


In [112]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 256)            599808    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 8, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 8, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 8, 4520)           1161640   
Total params: 2,812,072
Trainable params: 2,812,072
Non-trainable params: 0
_________________________________________________________________


In [193]:
src = array([['how are you']])

In [194]:
src[0,:]

array(['how are you'], dtype='<U11')

In [195]:
src = src.reshape((1, src.shape[0]))

In [196]:
src.shape

(1, 1)

In [197]:
out_tokenizer = create_tokenizer(src[0,:])

In [198]:
predict_sequence(model, out_tokenizer, src)

ValueError: Error when checking input: expected embedding_1_input to have shape (5,) but got array with shape (1,)

In [163]:
prediction = model.predict(src, verbose=2)

ValueError: could not convert string to float: 'go on'