# Machine Language Translation

In this notebook, we will make a `Seq2Seq model` using encoder and decoder RNN. The purpose of the model will be to convert `English language to French Language`.

In [1]:
## Importing Libraries
import numpy as np
from keras.models import Model
from keras.layers import Input,LSTM,Dense
from keras.utils import *
from keras.initializers import *
import tensorflow as tf
import time,random

In [2]:
## Hyperparameter
batch_size = 64        # for training
latent_dim = 256       # for encoding space
num_samples = 10000    #  to train on

## Preprocessing the Input

In [3]:
with open('fra.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
lines

['Go.\tVa !',
 'Run!\tCours\u202f!',
 'Run!\tCourez\u202f!',
 'Wow!\tÇa alors\u202f!',
 'Fire!\tAu feu !',
 "Help!\tÀ l'aide\u202f!",
 'Jump.\tSaute.',
 'Stop!\tÇa suffit\u202f!',
 'Stop!\tStop\u202f!',
 'Stop!\tArrête-toi !',
 'Wait!\tAttends !',
 'Wait!\tAttendez !',
 'I see.\tJe comprends.',
 "I try.\tJ'essaye.",
 "I won!\tJ'ai gagné !",
 "I won!\tJe l'ai emporté !",
 'Oh no!\tOh non !',
 'Attack!\tAttaque !',
 'Attack!\tAttaquez !',
 'Cheers!\tSanté !',
 'Cheers!\tÀ votre santé !',
 'Cheers!\tMerci !',
 'Cheers!\tTchin-tchin !',
 'Get up.\tLève-toi.',
 "Got it!\tJ'ai pigé !",
 'Got it!\tCompris !',
 'Got it?\tPigé\u202f?',
 'Got it?\tCompris\u202f?',
 "Got it?\tT'as capté\u202f?",
 'Hop in.\tMonte.',
 'Hop in.\tMontez.',
 'Hug me.\tSerre-moi dans tes bras !',
 'Hug me.\tSerrez-moi dans vos bras !',
 'I fell.\tJe suis tombée.',
 'I fell.\tJe suis tombé.',
 'I know.\tJe sais.',
 'I left.\tJe suis parti.',
 'I left.\tJe suis partie.',
 "I lost.\tJ'ai perdu.",
 "I'm 19.\tJ'ai 19 ans.",

All lines are seperated by '\n'.<br>
All english inputs are present in starting and French are strored after '\t' in the same line.

In [4]:
## Vectorize the data
input_texts = []
target_texts = []
input_chars = set()
target_chars = set()

for line in lines[: min(num_samples, len(lines) - 1)]:  ## taking only first 10000 records
    input_text, target_text = line.split('\t')
    target_text = '\t' + target_text + '\n'   
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_chars:  ## appending every char(only once) into a set
            input_chars.add(char)
    for char in target_text:
        if char not in target_chars:
            target_chars.add(char)

In [5]:
print('Input texts (only 5) - ',input_texts[:5],'\n'),
print('Target texts (only 5) - ',target_texts[:5],'\n'), 
print('Input chars - ',input_chars,'\n'), 
print('Target chars - ',target_chars,'\n')

Input texts (only 5) -  ['Go.', 'Run!', 'Run!', 'Wow!', 'Fire!'] 

Target texts (only 5) -  ['\tVa !\n', '\tCours\u202f!\n', '\tCourez\u202f!\n', '\tÇa alors\u202f!\n', '\tAu feu !\n'] 

Input chars -  {'h', "'", 'N', 'p', 'z', '4', 'V', 'r', 'B', '5', 'G', 's', 'o', 'C', '2', 'i', 'y', 'U', 'm', 'E', 'L', '-', 'J', 'd', 'F', '’', 'a', '.', 'q', 'x', 'c', ':', 'H', 'n', '!', 'l', 'b', 'f', 'K', 'e', 'w', 'u', 'g', '9', 'Y', 'j', ' ', '1', 'D', 'v', 'O', 'T', ',', 't', 'M', '$', 'W', 'A', 'k', '7', '0', '&', '6', 'Q', 'Z', '3', 'R', 'I', 'S', 'P', '?'} 

Target chars -  {'œ', 'p', 'o', 'C', 'i', 'E', 'm', ':', 'n', 'l', 'K', 'w', ' ', 'D', 'é', 'T', 't', 'M', '6', 'ê', 'S', 'Ç', 'h', 'N', 'r', 'B', 's', '(', 'U', 'y', 'd', '’', '«', '.', 'û', 'c', 'ë', '$', 'k', 'Q', 'I', '\n', 'P', 'â', '\u2009', 'z', 'É', 'V', '\u202f', '5', '»', 'J', '‘', 'x', 'H', 'b', 'ï', '\t', '9', 'j', 'è', 'î', 'ù', "'", 'ô', 'G', 'Ê', '-', 'L', 'À', 'F', 'ç', 'a', 'q', '!', 'f', 'e', 'g', 'u', 'Y', '\xa0', 'à'

In [6]:
input_chars = sorted(list(input_chars))
target_chars = sorted(list(target_chars))
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

#Print size
print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokens: 93
Max sequence length for inputs: 16
Max sequence length for outputs: 59


## One-Hot Encoding

Turn the sentences into 3 Numpy arrays, encoder_input_data, decoder_input_data, decoder_target_data:<br>
- *encoder_input_data* is a 3D array of shape (num_pairs, max_english_sentence_length, num_english_characters) containing a one-hot vectorization of the English sentences.<br>
- *decoder_input_data* is a 3D array of shape (num_pairs, max_french_sentence_length, num_french_characters) containg a one-hot vectorization of the French sentences.<br>
- *decoder_target_data* is the same as decoder_input_data but offset by one timestep. decoder_target_data[:, t, :] will be the same as decoder_input_data[:, t + 1, :].<br>

In [7]:
## Define data for encoder and decoder
input_token_id = dict([(char, i) for i, char in enumerate(input_chars)])
target_token_id = dict([(char, i) for i, char in enumerate(target_chars)])

encoder_in_data = np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')

decoder_in_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')

decoder_target_data = np.zeros((len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype='float32')


## One-Hot Encoding
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_in_data[i, t, input_token_id[char]] = 1.
    for t, char in enumerate(target_text):
        decoder_in_data[i, t, target_token_id[char]] = 1.
        if t > 0:
            decoder_target_data[i, t - 1, target_token_id[char]] = 1.

In [8]:
encoder_in_data

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [9]:
decoder_in_data

array([[[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [10]:
## Define and process the input sequence
encoder_inputs = Input(shape=(None, num_encoder_tokens))  ## 71
encoder = LSTM(latent_dim, return_state=True)   ## 256
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

## We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

## Using `encoder_states` set up the decoder as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

## adding encoder state as encoder states
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [11]:
## Final model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [12]:
## Model Summary
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 335872      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  358400      input_2[0][0]                    
                                                                 lstm[0][1]            

In [13]:
## Model data Shape
print("encoder_in_data shape:",encoder_in_data.shape)
print("decoder_in_data shape:",decoder_in_data.shape)
print("decoder_target_data shape:",decoder_target_data.shape)

encoder_in_data shape: (10000, 16, 71)
decoder_in_data shape: (10000, 59, 93)
decoder_target_data shape: (10000, 59, 93)


In [14]:
#Compiling and training the model
import keras
model.compile(optimizer=keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.001), loss='categorical_crossentropy')
model.fit([encoder_in_data, decoder_in_data], decoder_target_data, batch_size = batch_size, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1d2ba4e96a0>

In [18]:
## Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

The below part of codes will define the decode sequence for the text that we will pass to the model as the input sequence. This could be understood as the module for translating the input language into the target language. In this part, the input sequence is encoded into the state vectors. The state vector and the target sequence is passed to the decoder and it produces the prediction for the next character. Using these predictions, the next character is sampled and it is appended to the target sequence. This process is repeated to generate until the end of the sequence.

In [19]:
reverse_input_char_index = dict((i, char) for char, i in input_token_id.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_id.items())

#Define Decode Sequence
def decode_sequence(input_seq):
    #Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    #Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    #Get the first character of target sequence with the start character.
    target_seq[0, 0, target_token_id['\t']] = 1.

    #Sampling loop for a batch of sequences
    #(to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        #Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        #Exit condition: either hit max length
        #or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        #Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        #Update states
        states_value = [h, c]

    return decoded_sentence

In [20]:
for seq_index in range(10):
    input_seq = encoder_in_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Go.
Decoded sentence: Va te faire foutre !

-
Input sentence: Run!
Decoded sentence: Courez !

-
Input sentence: Run!
Decoded sentence: Courez !

-
Input sentence: Wow!
Decoded sentence: Attendez un peu !

-
Input sentence: Fire!
Decoded sentence: Trouve un boulot !

-
Input sentence: Help!
Decoded sentence: Aide-moi à sortir.

-
Input sentence: Jump.
Decoded sentence: Signez juste ici.

-
Input sentence: Stop!
Decoded sentence: Arrête de ronchonner.

-
Input sentence: Stop!
Decoded sentence: Arrête de ronchonner.

-
Input sentence: Stop!
Decoded sentence: Arrête de ronchonner.

