In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array

In [None]:
def load_doc(filename):
    """Read a whole text file into memory and return it as one string.

    Args:
        filename: path of the text file to read.

    Returns:
        The complete file contents as a ``str``.
    """
    # The context manager closes the handle even if read() raises.
    # The encoding is pinned to UTF-8 so accented characters decode the same
    # way on every platform instead of depending on the locale default.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [None]:
def to_pairs(doc):
    """Split a loaded document into one list of tab-separated fields per line.

    Leading/trailing whitespace of the whole document is stripped first, then
    the text is cut on newlines and each line is cut on tab characters.
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

In [None]:
def clean_pairs(lines):
    """Normalise every sentence in a list of sentence groups.

    Each sentence is reduced to ASCII, split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and filtered so
    that only purely alphabetic tokens remain.

    Args:
        lines: iterable of sentence groups (each an iterable of strings).

    Returns:
        A NumPy array holding the cleaned sentences, grouped as in the input.
    """
    # Built once and shared by every sentence.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(sentence):
        # Decompose accented characters, then drop whatever is not ASCII.
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            token = token.lower().translate(drop_punctuation)
            token = non_printable.sub('', token)
            # Tokens containing digits (or left empty) are discarded.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_clean(sentence) for sentence in pair] for pair in lines])

In [None]:
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: the object to serialise.
        filename: destination path; the file is overwritten if it exists.
    """
    # The context manager flushes and closes the file deterministically,
    # so the pickle is complete on disk before the confirmation is printed.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)

In [None]:
filename = 'spa.txt'
doc = load_doc(filename)
# split into english-spanish pairs
pairs = to_pairs(doc)
# clean sentences; the result gets its own name so the clean_pairs function
# is not overwritten and this cell can be re-run safely
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-spanish.pkl')
# spot check (at most 100 rows, fewer if the corpus is smaller)
for i in range(min(100, len(cleaned_pairs))):
    print('[%s] => [%s]' % (cleaned_pairs[i,0], cleaned_pairs[i,1]))

In [1]:
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
 
# load a clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: unpickling can execute arbitrary code; only use this on pickle
    files produced by this notebook or another trusted source.
    """
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as file:
        return load(file)
 
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: the object to serialise.
        filename: destination path; the file is overwritten if it exists.
    """
    # Close the file deterministically so the pickle is fully written
    # before anything tries to read it back.
    with open(filename, 'wb') as file:
        dump(sentences, file)
    print('Saved: %s' % filename)
 
# load dataset
raw_dataset = load_clean_sentences('english-spanish.pkl')
shuffle(raw_dataset)
# reduce dataset size
n_sentences = 8000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:7000], dataset[7000:]
# save
save_clean_data(dataset, 'english-spanish-both.pkl')
save_clean_data(train, 'english-spanish-train.pkl')
save_clean_data(test, 'english-spanish-test.pkl')

Saved: english-spanish-both.pkl
Saved: english-spanish-train.pkl
Saved: english-spanish-test.pkl


In [2]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    NOTE: unpickling can execute arbitrary code; only use this on pickle
    files produced by this notebook or another trusted source.
    """
    # Use a context manager so the handle is released even on failure.
    with open(filename, 'rb') as file:
        return load(file)
 
# fit a tokenizer
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` fitted on ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted
 
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: iterable of strings.

    Returns:
        The token count of the longest line, or 0 when ``lines`` is empty
        (a bare ``max()`` would raise ValueError in that case).
    """
    return max((len(line.split()) for line in lines), default=0)
 
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad them to ``length``.

    Padding uses zeros appended after the encoded tokens (``padding='post'``).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
 
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every row of ``sequences`` over ``vocab_size`` classes.

    ``sequences`` must expose a ``.shape`` with at least two entries; the
    result is reshaped to (shape[0], shape[1], vocab_size).
    """
    one_hot = array([
        to_categorical(sequence, num_classes=vocab_size)
        for sequence in sequences
    ])
    return one_hot.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
 
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) sequence-to-sequence Keras model.

    The stack is: a zero-masking embedding over the source vocabulary, an
    LSTM whose final output is repeated ``tar_timesteps`` times, a second
    LSTM returning the full sequence, and a per-timestep softmax over the
    target vocabulary.
    """
    stack = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model


In [4]:
# reload the three pickled splits written by the preparation step
dataset = load_clean_sentences('english-spanish-both.pkl')
train = load_clean_sentences('english-spanish-train.pkl')
test = load_clean_sentences('english-spanish-test.pkl')
# reorder the rows of each split in place (train first, then test)
shuffle(train)
shuffle(test)

In [11]:
# Display the test pairs; each row holds [english sentence, spanish sentence].
test

array([['variety is the spice of life',
        'la variedad es la salsa de la vida'],
       ['i do not feel like drinking beer tonight',
        'esta noche no tengo ganas de beber cerveza'],
       ['she pushed him out the window', 'ella le empujo por la ventana'],
       ...,
       ['ive always loved you', 'siempre te he querido'],
       ['tom is thorough', 'tom es meticuloso'],
       ['everything weve told you is true',
        'todo lo que te hemos contado es cierto']], dtype='<U275')

In [5]:
# fit one tokenizer per language on the combined dataset
# (column 0 holds the English sentences, column 1 the Spanish ones)
eng_tokenizer = create_tokenizer(dataset[:, 0])
spa_tokenizer = create_tokenizer(dataset[:, 1])

# vocabulary sizes: +1 because the tokenizer's word indices start at 1
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
spa_vocab_size = 1 + len(spa_tokenizer.word_index)

# longest sentence, in tokens, for each language
eng_length = max_length(dataset[:, 0])
spa_length = max_length(dataset[:, 1])

print('English Vocabulary Size: %d' % (eng_vocab_size,))
print('English Max Length: %d' % (eng_length,))
print('Spanish Vocabulary Size: %d' % (spa_vocab_size,))
print('Spanish Max Length: %d' % (spa_length,))

English Vocabulary Size: 4440
English Max Length: 26
Spanish Vocabulary Size: 6633
Spanish Max Length: 30


In [6]:
# model inputs: padded integer sequences of the English sentences
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])

# model targets: padded Spanish sequences, then one-hot encoded
trainY = encode_output(
    encode_sequences(spa_tokenizer, spa_length, train[:, 1]),
    spa_vocab_size,
)
# validation targets
testY = encode_output(
    encode_sequences(spa_tokenizer, spa_length, test[:, 1]),
    spa_vocab_size,
)

In [7]:
# English is the source side, Spanish the target side; 256 units per layer
model = define_model(
    src_vocab=eng_vocab_size,
    tar_vocab=spa_vocab_size,
    src_timesteps=eng_length,
    tar_timesteps=spa_length,
    n_units=256,
)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [8]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the table
model.summary()
# write a diagram of the layer graph (with tensor shapes) to model.png
plot_model(model, to_file='model.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 26, 256)           1136640   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 30, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 6633)          1704681   
Total params: 3,891,945
Trainable params: 3,891,945
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# keep only the weights of the epoch with the lowest validation loss
filename = 'model1.h5'
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
model.fit(
    trainX,
    trainY,
    epochs=20,
    batch_size=64,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=2,
)


Train on 7000 samples, validate on 1000 samples
Epoch 1/20
 - 126s - loss: 2.5294 - val_loss: 1.5582

Epoch 00001: val_loss improved from inf to 1.55817, saving model to model1.h5
Epoch 2/20
 - 122s - loss: 1.5021 - val_loss: 1.4912

Epoch 00002: val_loss improved from 1.55817 to 1.49118, saving model to model1.h5
Epoch 3/20
 - 121s - loss: 1.4568 - val_loss: 1.4850

Epoch 00003: val_loss improved from 1.49118 to 1.48502, saving model to model1.h5
Epoch 4/20
 - 119s - loss: 1.4296 - val_loss: 1.4715

Epoch 00004: val_loss improved from 1.48502 to 1.47153, saving model to model1.h5
Epoch 5/20
 - 123s - loss: 1.4046 - val_loss: 1.4692

Epoch 00005: val_loss improved from 1.47153 to 1.46915, saving model to model1.h5
Epoch 6/20
 - 124s - loss: 1.3905 - val_loss: 1.4678

Epoch 00006: val_loss improved from 1.46915 to 1.46780, saving model to model1.h5
Epoch 7/20
 - 128s - loss: 1.3779 - val_loss: 1.4728

Epoch 00007: val_loss did not improve from 1.46780
Epoch 8/20
 - 134s - loss: 1.3675 -

<keras.callbacks.History at 0xb3795f128>