<a href="https://colab.research.google.com/github/raphaelreinauer/NMT/blob/master/Neural_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Implementing a Neural Machine Translation Model**

We will use the encoder/decoder architecture. With GRU layers for both encoder and decoder as well as a Dense layer, a RepeatVector layer, and a TimeDistributed layer.

Compilation Time: 6 days on 96 GPUs

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Dense, TimeDistributed, RepeatVector, Input
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.preprocessing.text import Tokenizer

**Preprocessing**


en_text: english text

fr_text: french translation of english text

In [0]:
# load files
#!wget https://assets.datacamp.com/production/repositories/4609/datasets/3459f954752fb2fce7c0b29e25f067e9784b69fb/vocab_en.txt
#!wget https://assets.datacamp.com/production/repositories/4609/datasets/644e461abb0910edb038e8b2c4ce7071b5aeca12/vocab_fr.txt

#!wget https://raw.githubusercontent.com/udacity/deep-learning/master/language-translation/data/small_vocab_en
#!wget https://raw.githubusercontent.com/udacity/deep-learning/master/language-translation/data/small_vocab_fr



with open('small_vocab_en', 'r') as en_text_file:
    en_text=en_text_file.readlines()

with open('small_vocab_fr', 'r') as fr_text_file:
    fr_text=fr_text_file.readlines()


en_len = 15
fr_len = 25
en_vocab = 100
fr_vocab = 125


# english tokenizer
en_tok = Tokenizer(num_words=100) #only consider num_words most comman words the other words will be considered as out-of-vocabulary (OOV)

en_tok.fit_on_texts(en_text)

# french tokenizer

fr_tok = Tokenizer(num_words=125)

fr_tok.fit_on_texts(fr_text)

'''
Tokenizer class as functions:
index_word
word_index
texts_to_sequences: sentence is converted to a list of IDs
'''

# Padding: make all sentences to be of the same length

from tensorflow.keras.preprocessing.sequence import pad_sequences



def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    '''
    Converts the sentences to a list of sequence of IDs,
    Pad the sentences so that they have equal length and,
    Optionally convert the IDs to onehot vectors.
    '''
    assert input_type in ["source", "target"]
    if input_type == 'source':
      tokenizer = en_tok
      pad_length = en_len
      vocab_size = en_vocab
    elif input_type == 'target':
      tokenizer = fr_tok
      pad_length = fr_len
      vocab_size = fr_vocab
    
    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=pad_length)
    if reverse:
      preproc_text = preproc_text[:,::-1]
      
    if onehot:
        assert vocab_size is not None, "Cannot do to_categorical without num_classes for safety"
        preproc_text = to_categorical(preproc_text, num_classes=vocab_size)
    return preproc_text



In [3]:
en_vocab

100

**Define the Model**

In [4]:
'''
# Constants
fr_vocab: size of the french vocabulary
en_vocab: size of the english vocabulary
hsize: size of the hidden state

# Different layers
en_inputs:encoder input layer
en_out and en_state: encoder GRU output
de_out decoder: GRU output
de_pred decoder: prediction

'''

hsize = 48


# Define the encoder
en_inputs = Input(shape=(en_len, en_vocab))
en_gru = GRU(hsize, return_state=True)
en_out, en_state = en_gru(en_inputs)

# Define the encoder model
#encoder = Model(inputs=en_inputs, outputs=en_state)

# Define the decoder
de_inputs = RepeatVector(fr_len)(en_state)
de_gru = GRU(hsize, return_sequences=True)
de_out = de_gru(de_inputs, initial_state=en_state)

# prediction layer
de_dense = Dense(fr_vocab, activation='softmax')
de_dense_time = TimeDistributed(de_dense)

de_pred = de_dense_time(de_out)

# Define the complete model
nmt = Model(inputs=en_inputs, outputs=de_pred)

# Compile the model
nmt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

nmt.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 15, 100)]    0                                            
__________________________________________________________________________________________________
gru (GRU)                       [(None, 48), (None,  21600       input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 25, 48)       0           gru[0][1]                        
__________________________________________________________________________________________________
gru_1 (GRU)                     (None, 25, 48)       14112       repeat_vector[0][0]              
                                                                 gru[0][1]                    

**Training the model**

In [5]:
en_text = en_text[:1000]
fr_text = fr_text[:1000]
n_epochs = 5
bsize = 250
data_size = len(en_text)

train_size, valid_size = 800, 200
inds = np.arange(len(en_text))
np.random.shuffle(inds)


train_inds = inds[:train_size]
valid_inds = inds[train_size:train_size+valid_size]

tr_en = [en_text[ti] for ti in train_inds]
tr_fr = [fr_text[ti] for ti in train_inds]
v_en = [en_text[ti] for ti in valid_inds]
v_fr = [fr_text[ti] for ti in valid_inds]

v_en_x = sents2seqs('source', v_en, onehot=True, reverse=True)
v_de_y = sents2seqs('target', v_fr, onehot=True)



for ei in range(n_epochs):
    for i in range(0, data_size, bsize):
        en_x = sents2seqs('source', en_text[i:i+bsize], onehot=True, reverse=True)
        de_y = sents2seqs('target', fr_text[i:i+bsize], onehot=True)
        nmt.train_on_batch(en_x, de_y)

    # Evaluate the trained model on the validation data
    res = nmt.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)

    print("{} => Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))

1 => Loss:4.762917995452881, Val Acc: 47.02000021934509
2 => Loss:4.6717329025268555, Val Acc: 53.56000065803528
3 => Loss:4.514949798583984, Val Acc: 53.53999733924866
4 => Loss:4.225861072540283, Val Acc: 53.53999733924866
5 => Loss:3.6791574954986572, Val Acc: 53.53999733924866


In [0]:
print('Training (EN):\n', tr_en[:3], '\nTraining (FR):\n', tr_fr[:3])
print('\nValid (EN):\n', v_en[:3], '\nValid (FR):\n', v_fr[:3])

In [0]:
# Make Translation

en_st = ['the united states is sometimes chilly during december , but it is sometimes freezing in june .']
print('English: {}'.format(en_st))

# Convert the English sentence to a sequence
en_seq = sents2seqs('source', en_st, onehot=True, reverse=True)

# Predict probabilities of words using en_seq
fr_pred = model.predict(en_seq)

# Get the sequence indices (max argument) of fr_pred
fr_seq = np.argmax(fr_pred, axis=-1)[0]

# Convert the sequence of IDs to a sentence and print
fr_sent = [fr_tok.index_word[i] for i in fr_seq if i != 0]
print("French (Custom): {}".format(' '.join(fr_sent)))
print("French (Google Translate): les etats-unis sont parfois froids en décembre, mais parfois gelés en juin")