# English to French Translator using Deep Learning
# Neural Machine Translation Using Sequence-to-Sequence Model with LSTM

## Importing necessary modules

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense
import numpy as np

## Defining Hyperparameters and Data Path

In [1]:
#data path
data_path='fra.txt'

batch_size=64  #batch size for training
epochs=100   #no.of epochs to train for
latent_dim=256  #Latent dimensionality of the encoding space
num_samples=10000  #no of samples to train

## Data Preprocessing and Vectorization

In [11]:
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = line.split("\t")
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [13]:
input_characters=sorted(list(input_characters))
target_characters=sorted(list(target_characters))
num_encoder_tokens=len(input_characters)
num_decoder_tokens=len(target_characters)
max_encoder_seq_length=max([len(txt) for txt in input_texts])
max_decoder_seq_length=max([len(txt) for txt in target_texts])

print('No.of samples:',len(input_texts))
print('No.of unique input_tokens:',num_encoder_tokens)
print('No.of unique output_tokens:',num_decoder_tokens)
print('Max sequence length for inputs:',max_encoder_seq_length)
print('Max sequence length for outputs:',max_decoder_seq_length)


No.of samples: 10000
No.of unique input_tokens: 71
No.of unique output_tokens: 94
Max sequence length for inputs: 16
Max sequence length for outputs: 59


## Creating Token Index Dictionaries and Preparing Encoder and Decoder Data

In [15]:
input_token_index=dict([(char,i) for i ,char in enumerate(input_characters)])
target_token_index=dict([(char,i) for i,char in enumerate(target_characters)])

encoder_input_data=np.zeros((len(input_texts),max_encoder_seq_length,num_encoder_tokens),dtype='float32')
decoder_input_data=np.zeros((len(input_texts),max_decoder_seq_length,num_decoder_tokens),dtype='float32')
decoder_target_data=np.zeros((len(input_texts),max_decoder_seq_length,num_decoder_tokens),dtype='float32')


### One hot Encoding

In [17]:
for i,(input_text,target_text) in enumerate(zip(input_texts,target_texts)):
    for t,char in enumerate(input_text):
        encoder_input_data[i,t,input_token_index[char]]=1.
    encoder_input_data[i,t+1:,input_token_index[' ']]=1.
    for t,char in enumerate(target_text):
        #decoder_target_data is ahead of decoder_input_data by ome timestep
        decoder_input_data[i,t,target_token_index[char]]=1.
        if t>0:
            #decoder_target_data will be ahead by one timestep not include start character
            decoder_target_data[i,t-1,target_token_index[char]]=1.
    decoder_input_data[i,t+1:,target_token_index[' ']]=1.
    decoder_target_data[i,t:,target_token_index[' ']]=1.

print("encoder input data shape:",encoder_input_data[0].shape)


encoder input data shape: (16, 71)


## Building Encoder and Decoder Model

In [19]:
#define an input sequence and process it
encoder_inputs=Input(shape=(None,num_encoder_tokens))
encoder=LSTM(latent_dim,return_state=True)
encoder_outputs,state_h,state_c=encoder(encoder_inputs)
#we discard encoder_outputs and only keep states
encoder_states=[state_h,state_c]


#set up decoder using encoder_states as initial state.
decoder_inputs=Input(shape=(None,num_decoder_tokens))
#set up decoder to return full output sequences and return internal states as well
#dont use return states in training model,but we will use them in inference
decoder_lstm=LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_outputs,_,_=decoder_lstm(decoder_inputs,initial_state=encoder_states)
decoder_dense=Dense(num_decoder_tokens,activation='softmax')
decoder_outputs=decoder_dense(decoder_outputs)

## Defining ,Compiling and Training the model

In [21]:
#define model turn encoder_input_data & decoder_input_data into decoder_target_data
model=Model([encoder_inputs,decoder_inputs],decoder_outputs)

model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit([encoder_input_data,decoder_input_data],decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

# Save model
model.save("s2s_model.keras")

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 372ms/step - accuracy: 0.6894 - loss: 1.6267 - val_accuracy: 0.6865 - val_loss: 1.1711
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 359ms/step - accuracy: 0.7330 - loss: 1.0110 - val_accuracy: 0.7005 - val_loss: 1.0784
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 356ms/step - accuracy: 0.7498 - loss: 0.9014 - val_accuracy: 0.7184 - val_loss: 0.9893
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 356ms/step - accuracy: 0.7776 - loss: 0.7975 - val_accuracy: 0.7530 - val_loss: 0.8630
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 362ms/step - accuracy: 0.7963 - loss: 0.7127 - val_accuracy: 0.7590 - val_loss: 0.8160
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 393ms/step - accuracy: 0.8050 - loss: 0.6712 - val_accuracy: 0.7707 - val_loss: 0.7791
Epoc

## Building the Inference Model (Sampling Mode)

In [23]:
#next: interference mode (sampling)
#1.encode input & retrieve initial decoder state
#2.run one step of decoder with this initial state & 'start of sequence' token as target
#output next target token
#3.repeat with current target token & current states

#define sampling models
encoder_model= Model(encoder_inputs,encoder_states)
decoder_state_input_h=Input(shape=(latent_dim,))
decoder_state_input_c=Input(shape=(latent_dim,))
decoder_states_inputs=[decoder_state_input_h,decoder_state_input_c]
decoder_outputs,states_h,state_c=decoder_lstm(decoder_inputs,initial_state=decoder_states_inputs)
decoder_states=[states_h,state_c]
decoder_outputs=decoder_dense(decoder_outputs)
decoder_model=Model([decoder_inputs]+decoder_states_inputs,
                    [decoder_outputs]+decoder_states)

## Creating Reverse Token Lookup Dictionaries

In [25]:
#reverse-look up token index to decode sequence back to readable
reverse_input_char_index=dict((i,char) for char,i in input_token_index.items())
reverse_target_char_index=dict((i,char) for char,i in target_token_index.items())

def decode_sequence(input_seq):
    #encode input as state vectors
    states_value=encoder_model.predict(input_seq)
    #generate empty target sequence of length 1
    target_seq=np.zeros((1,1,num_decoder_tokens))
    #populate first character of target sequences with start character
    target_seq[0,0,target_token_index['\t']]=1.

    #sampling loop for batch of statements ..bath size is 1
    stop_condition=False
    decoded_sentence=' '
    while not stop_condition:
        output_tokens,h,c=decoder_model.predict([target_seq]+states_value)
        #sample a token
         #sample a token
        sampled_token_index=np.argmax(output_tokens[0,-1,:])
        sampled_char=reverse_target_char_index[sampled_token_index]
        decoded_sentence+=sampled_char

        #exit:hit max length or find stop character
        if sampled_char== '\n' or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition=True
        #update target sequence (of length 1)
        target_seq=np.zeros((1,1,num_decoder_tokens))
        target_seq[0,0,sampled_token_index]=1.
        #update sates
        states_value=[h,c]
    return decoded_sentence

## Testing the Model on Sample Inputs

In [None]:
for seq_index in range(100):
    #take 1 sequence (part of training set) trying to decoding
    input_seq=encoder_input_data[seq_index:seq_index+1]
    decoded_sentence=decode_sequence(input_seq)
    print('-')
    print('input sentence:',input_texts[seq_index])
    print('decoded sentence:',decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 367ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 