# Machine Translation
This notebook trains an encoder-decoder GRU network that translates English sentences into French.
Starter code: https://github.com/udacity/aind2-nlp-capstone

In [1]:
import numpy as np
import os
import codecs

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from keras.layers import GRU, Input, Dense, TimeDistributed
from keras.models import Model
from keras.layers import Activation, Bidirectional, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [2]:
def load_data(path):
    """
    Read a UTF-8 encoded text file and return its contents line by line.

    :param path: Path of the file to read
    :return: List of strings obtained by splitting the file contents on
        newline characters (a trailing newline yields a final empty string)
    """
    with codecs.open(os.path.join(path), mode="r", encoding="utf-8") as handle:
        return handle.read().split('\n')

In [3]:
# Load both sides of the corpus; each call returns a list with one entry per line of the file.
english_sentences = load_data(path='data/small_vocab_en')
french_sentences = load_data(path='data/small_vocab_fr')

In [4]:
def tokenize(x):
    """
    Fit a fresh Keras Tokenizer on x and encode x with it
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    fitted = Tokenizer()
    # The vocabulary is learned from x itself, then x is encoded with that vocabulary.
    fitted.fit_on_texts(x)
    return fitted.texts_to_sequences(x), fitted

def pad(x, length=None):
    """
    Pad every sequence in x at its end so that all sequences share one length
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # padding='post' appends the fill value after the real tokens instead of before them.
    return pad_sequences(x, maxlen=length, padding='post')

def preprocess(x, y):
    """
    Tokenize and pad the feature and label sentences
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions,
    # so a trailing axis of size 1 is appended to the padded label array.
    y_padded = np.expand_dims(pad(y_ids), axis=-1)

    return x_padded, y_padded, x_tk, y_tk

# English is the model input (x); French is the target (y).
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

In [5]:
# Vocabulary sizes count the entries of word_index only; the padding index 0 is not included.
print('\n'.join([
    '- English sentences preprocessed: {}'.format(preproc_english_sentences.shape),
    '- English vocabulary size: {}'.format(len(english_tokenizer.word_index)),
    '- French sentences preprocessed: {}'.format(preproc_french_sentences.shape),
    '- French vocabulary size: {}'.format(len(french_tokenizer.word_index)),
]))

- English sentences preprocessed: (137861, 15)
- English vocabulary size: 199
- French sentences preprocessed: (137861, 21, 1)
- French vocabulary size: 344


In [6]:
def logits_to_text(logits, tokenizer):
    """
    Decode a matrix of per-position word scores into a space-separated string
    :param logits: Logits from a neural network; the highest-scoring column of each row is decoded
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    word_index = tokenizer.word_index
    # Invert the word -> index mapping so predicted indices can be looked up.
    index_to_words = dict(zip(word_index.values(), word_index.keys()))
    # Index 0 is rendered as an explicit padding marker.
    index_to_words[0] = '<PAD>'

    best_indices = np.argmax(logits, axis=1)
    return ' '.join(index_to_words[index] for index in best_indices)

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


# Model

In [16]:
def build_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
                learning_rate=0.005):
    """
    Build and compile (but do not train) a model that incorporates embedding,
    encoder-decoder, and bidirectional RNN layers
    :param input_shape: Tuple of input shape; the first (sample) dimension is dropped
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Size of the English vocabulary (used as the embedding's
        input dimension and also as its output width)
    :param french_vocab_size: Size of the French vocabulary (width of the softmax output)
    :param learning_rate: Learning rate passed to the Adam optimizer
    :return: Keras model built and compiled, but not trained
    """
    inputs = Input(shape=input_shape[1:])

    # Encoder: embed the word ids, then summarise the whole sentence into one vector.
    embedded = Embedding(english_vocab_size, english_vocab_size, mask_zero=False)(inputs)
    encoded = Bidirectional(GRU(256, dropout=0.5, recurrent_dropout=0.5))(embedded)

    # Decoder: feed the sentence vector at every output time step and emit a sequence.
    repeated = RepeatVector(output_sequence_length)(encoded)
    decoded = Bidirectional(GRU(256, dropout=0.5, recurrent_dropout=0.5,
                                return_sequences=True))(repeated)

    # Per-time-step classifier over the French vocabulary.
    hidden = TimeDistributed(Dense(4 * french_vocab_size, activation='relu'))(decoded)
    hidden = TimeDistributed(Dense(2 * french_vocab_size, activation='relu'))(hidden)
    outputs = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy']
    )
    # summary() prints the table itself and returns None, so it must not be wrapped
    # in print() (that would emit a stray "None" after the table).
    model.summary()
    return model


In [17]:
# Pad the English word ids to the French sequence length. pad() already returns a
# 2-D (samples, length) array, so no further reshape is needed.
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
print(tmp_x.shape)

# Build and compile the network (training happens in the next cell).
# Vocabulary sizes are len(word_index) + 1 to leave room for the padding index 0.
model = build_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index) + 1,
    len(french_tokenizer.word_index) + 1)

(137861, 21)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 21)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 21, 200)           40000     
_________________________________________________________________
bidirectional_6 (Bidirection (None, 512)               701952    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 21, 512)           0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 21, 512)           1181184   
_________________________________________________________________
time_distributed_4 (TimeDist (None, 21, 1380)          707940    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 21, 690)           952890  

In [18]:
# Train for 10 epochs, holding out 20% of the samples for validation.
model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=10,
    validation_split=0.2,
)

Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fac7c48bb00>

In [19]:
# Persist the trained model to the path 'model' in the working directory.
model.save(filepath='model')

In [23]:
# Print prediction(s)
# predict() returns one score matrix per input sentence; [0] selects the only one,
# which is then decoded into French words.
print(
    logits_to_text(
        logits=model.predict(tmp_x[:1])[0],
        tokenizer=french_tokenizer,
    )
)

new jersey est jamais parfois en en et il il il est est en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [24]:
# Show the padded English word ids of the first sentence (the input used for the prediction above).
tmp_x[0:1]

array([[17, 23,  1,  8, 67,  4, 39,  7,  3,  1, 55,  2, 44,  0,  0,  0,
         0,  0,  0,  0,  0]], dtype=int32)