# Neural Machine Translation using Encoder - Decoder Model in LSTM

In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

from pickle import load
from numpy.random import rand
from numpy.random import shuffle

from tensorflow.python.keras.preprocessing.text import Tokenizer

from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.models import Model
from keras.layers import LSTM, Input
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint


from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

Using TensorFlow backend.


In [2]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

In [4]:
# load datasets
dataset = load_clean_sentences('english-french-both.pkl')
train = load_clean_sentences('english-french-train.pkl')
test = load_clean_sentences('english-french-test.pkl')

In [5]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [6]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [7]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
fra_tokenizer = create_tokenizer(dataset[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1
fra_length = max_length(dataset[:, 1])
print('french Vocabulary Size: %d' % fra_vocab_size)
print('french Max Length: %d' % (fra_length))

English Vocabulary Size: 2125
English Max Length: 5
french Vocabulary Size: 4397
french Max Length: 10


In [8]:
eng_tokenizer.index_to_word = dict(zip(eng_tokenizer.word_index.values(), eng_tokenizer.word_index.keys()))
fra_tokenizer.index_to_word = dict(zip(fra_tokenizer.word_index.values(), fra_tokenizer.word_index.keys()))

In [9]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

In [10]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [11]:
# prepare training data
trainX = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])

In [12]:
trainY = encode_output(trainY, eng_vocab_size)

In [13]:
print(trainX[9])
print(train[0])

[2185    0    0    0    0    0    0    0    0    0]
['i will return' 'je reviendrai']


In [14]:
print(trainX.shape)
print(trainY.shape)
fra_vocab_size

(9000, 10)
(9000, 5, 2125)


4397

In [15]:
# prepare validation data
testX = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

In [16]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    X_input = Input((src_timesteps,))
    X = Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True)(X_input)
    X = LSTM(n_units)(X)
    X = RepeatVector(tar_timesteps)(X)
    X = LSTM(n_units, return_sequences=True)(X)
    X = TimeDistributed(Dense(tar_vocab, activation='softmax'))(X)
    model = Model(inputs = X_input, outputs = X)
    
    return model

In [17]:
# define model
model = define_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [18]:
# fit model
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Train on 9000 samples, validate on 1000 samples
Epoch 1/30
 - 43s - loss: 4.2561 - val_loss: 3.4952

Epoch 00001: val_loss improved from inf to 3.49524, saving model to model.h5
Epoch 2/30
 - 40s - loss: 3.3083 - val_loss: 3.3091

Epoch 00002: val_loss improved from 3.49524 to 3.30911, saving model to model.h5
Epoch 3/30
 - 42s - loss: 3.1434 - val_loss: 3.2105

Epoch 00003: val_loss improved from 3.30911 to 3.21049, saving model to model.h5
Epoch 4/30
 - 43s - loss: 3.0153 - val_loss: 3.1099

Epoch 00004: val_loss improved from 3.21049 to 3.10988, saving model to model.h5
Epoch 5/30
 - 41s - loss: 2.8434 - val_loss: 2.9636

Epoch 00005: val_loss improved from 3.10988 to 2.96361, saving model to model.h5
Epoch 6/30
 - 39s - loss: 2.6658 - val_loss: 2.8375

Epoch 00006: val_loss improved from 2.96361 to 2.83755, saving model to model.h5
Epoch 7/30
 - 39s - loss: 2.5026 - val_loss: 2.7410

Epoch 00007: val_loss improved from 2.83755 to 2.74101, saving model to model.h5
Epoch 8/30
 - 39s 

<keras.callbacks.History at 0xc2d63a1cf8>

model.load_weights('model.h5')

In [19]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [20]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [21]:
def tokens_to_string(self, tokens):
    words = [self.index_to_word[token] for token in tokens if token != 0]

    text = " ".join(words)

    return text

In [22]:
def print_trans(sources, model, tokenizer):
    for i, source in enumerate(sources):
        source_ = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source_)
        print(tokens_to_string(fra_tokenizer, list(source)),"==>",translation)

In [23]:
example1 = testX[0:10, :]
print_trans(example1, model, eng_tokenizer)

cest gros ==> its stupid
je vous ai contrariees ==> ive upset you
viens bientot ==> come soon
jai arrete de fumer ==> i feel smoking
attendeznous ==> take yourself
estce un elan ==> is it a wolf
vous vous souvenez ==> you you remember
cest amusant ==> thats funny
elle marche ==> it works
viens vite ==> come quickly


In [24]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [25]:
# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.087014
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
test
BLEU-1: 0.083169
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000
