<a href="https://colab.research.google.com/github/raphaelreinauer/NMT/blob/master/Neural_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Implementing a Neural Machine Translation Model**

We will use the encoder/decoder architecture, with GRU layers for both the encoder and the decoder, as well as a Dense layer, a RepeatVector layer, and a TimeDistributed layer.

Compilation Time: 6 days on 96 GPUs

In [0]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Dense, TimeDistributed, RepeatVector, Input

from tensorflow.keras.preprocessing.text import Tokenizer

**Preprocessing**


`en_text`: the English text

`fr_text`: the French translation of the English text

In [0]:
# Download the parallel corpus.
# NOTE: `get_file` caches the download and returns the local *path* of the
# file, not its contents, so the files still have to be read explicitly.
en_path = tf.keras.utils.get_file(fname='vocab_en.txt', origin='https://assets.datacamp.com/production/repositories/4609/datasets/3459f954752fb2fce7c0b29e25f067e9784b69fb/vocab_en.txt')
fr_path = tf.keras.utils.get_file(fname='vocab_fr.txt', origin='https://assets.datacamp.com/production/repositories/4609/datasets/644e461abb0910edb038e8b2c4ce7071b5aeca12/vocab_fr.txt')


def _read_sentences(path):
    """Return the lines of the UTF-8 text file at `path` as a list of strings.

    Blank lines are kept on purpose so that line i of the English file stays
    aligned with line i of the French file.
    (Assumes one sentence per line - TODO confirm against the data files.)
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read().splitlines()


# en_text / fr_text: lists of sentences (English and its French translation)
en_text = _read_sentences(en_path)
fr_text = _read_sentences(fr_path)

# Maximum sentence lengths (in tokens) and vocabulary sizes used by the model
en_len = 100
fr_len = 100
en_vocab = 50000
fr_vocab = 50000

# English tokenizer.
# `num_words` keeps only the most common words (IDs 1 .. num_words - 1), which
# guarantees every ID fits into a one-hot vector of width `en_vocab`.
# `oov_token` makes every other word map to a dedicated out-of-vocabulary (OOV)
# ID; without it, unknown words would be silently dropped from the sequences.
en_tok = Tokenizer(num_words=en_vocab, oov_token='UNK')
en_tok.fit_on_texts(en_text)

# French tokenizer, capped at `fr_vocab` for the same reason.
fr_tok = Tokenizer(num_words=fr_vocab, oov_token='UNK')
fr_tok.fit_on_texts(fr_text)

# Useful members of a fitted Tokenizer:
#   index_word         - mapping from ID to word
#   word_index         - mapping from word to ID
#   texts_to_sequences - converts each sentence into a list of IDs

# Padding: bring all sentences to the same length

from tensorflow.keras.preprocessing.sequence import pad_sequences


def sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    '''
    Convert sentences to a fixed-length array of word IDs.

    The sentences are tokenized into ID sequences, padded/truncated to a
    common length and, optionally, reversed along the time axis and/or
    converted to one-hot vectors.

    Args:
        input_type: selects the language-specific tokenizer, sequence length
            and vocabulary size. 'target' or 'fr' selects the French settings
            (fr_tok, fr_len, fr_vocab); any other value selects the English
            settings (en_tok, en_len, en_vocab), which is what every call
            used before.
            NOTE(review): no callers are visible here - confirm that
            'target' is the value used for French sentences.
        sentences: list of sentences (strings) to encode.
        onehot: if True, return one-hot vectors instead of integer IDs.
        pad_type: 'pre' or 'post', where the padding zeros are added.
        reverse: if True, reverse every sequence along the time axis
            (done after padding, so the padding is reversed as well).

    Returns:
        Array of shape (n_sentences, seq_len) holding word IDs, or
        (n_sentences, seq_len, vocab_size) when onehot is True.
    '''
    if input_type in ('target', 'fr'):
        tokenizer, seq_len, vocab_size = fr_tok, fr_len, fr_vocab
    else:
        tokenizer, seq_len, vocab_size = en_tok, en_len, en_vocab

    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=seq_len)
    if reverse:
        # Reverse the text using numpy axis reversing
        preproc_text = preproc_text[:, ::-1]
    if onehot:
        # Reached through `tf`, which is already imported; the bare name
        # `to_categorical` was never imported and raised a NameError.
        preproc_text = tf.keras.utils.to_categorical(preproc_text, num_classes=vocab_size)
    return preproc_text


**Define the Model**

In [24]:
# Builds and compiles the encoder/decoder translation model `nmt`.
# The string below is the author's glossary of the names used in this cell;
# in it, `de_out` is the decoder GRU output and `de_pred` is the decoder
# prediction.
'''
# Constants
fr_vocab: size of the french vocabulary
en_vocab: size of the english vocabulary
hsize: size of the hidden state

# Different layers
en_inputs:encoder input layer
en_out and en_state: encoder GRU output
de_out decoder: GRU output
de_pred decoder: prediction

'''

# Number of units in both GRU layers (encoder and decoder)
hsize = 50


# Define the encoder
# Each input sample is a sequence of en_len vectors of width en_vocab
# (presumably the one-hot output of sents2seqs(..., onehot=True) - confirm).
en_inputs = Input(shape=(en_len, en_vocab))
# return_state=True makes the layer return its final state in addition to
# its output, hence the two values unpacked below.
en_gru = GRU(hsize, return_state=True)
en_out, en_state = en_gru(en_inputs)

# Define the encoder model
# (disabled: a stand-alone model mapping the input to the encoder state)
#encoder = Model(inputs=en_inputs, outputs=en_state)

# Define the decoder
# The encoder state is repeated fr_len times so that the decoder receives
# the same vector at every output timestep.
de_inputs = RepeatVector(fr_len)(en_state)
# return_sequences=True: emit an output for every timestep, not only the last
de_gru = GRU(hsize, return_sequences=True)
# The decoder also starts from the encoder's final state
de_out = de_gru(de_inputs, initial_state=en_state)

# prediction layer
# A softmax over the fr_vocab entries, applied to each timestep separately
# through the TimeDistributed wrapper.
de_dense = Dense(fr_vocab, activation='softmax')
de_dense_time = TimeDistributed(de_dense)

de_pred = de_dense_time(de_out)

# Define the complete model
# Maps the encoder input directly to the per-timestep decoder predictions
nmt = Model(inputs=en_inputs, outputs=de_pred)

# Compile the model
# categorical_crossentropy expects one-hot encoded targets
nmt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Print the layer-by-layer overview (output shapes and parameter counts)
nmt.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 100, 50000)] 0                                            
__________________________________________________________________________________________________
gru_9 (GRU)                     [(None, 50), (None,  7507800     input_7[0][0]                    
__________________________________________________________________________________________________
repeat_vector_4 (RepeatVector)  (None, 100, 50)      0           gru_9[0][1]                      
__________________________________________________________________________________________________
gru_10 (GRU)                    (None, 100, 50)      15300       repeat_vector_4[0][0]            
                                                                 gru_9[0][1]                  