# English to French Language Translation using Encoders and Decoders

Dataset link: https://www.manythings.org/anki/

## Importing libraries and initializing parameters

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

import numpy as np
import pandas as pd

In [3]:
batch_size = 64 # batch size for training
epochs = 100 # Number of epochs to train for
latemt_dim = 256 # latent dimensionality of the encoding space.
num_samples = 10000 # Number of samples to train on. (10,000)

## Reading Input text and extracting target texts and characters

### Example:

**Input text:** Hi Hello

**Target text:** \tsalut bonjour\n

<u>Note:</u> We have added \t and \n to let model know about the Start and End of String and to know where the words are ending.

\t refers to the start and \n refers to the end

In [4]:
input_texts, target_texts = [], [] # input_texts refer to English and target_texts refer to French

input_characters, target_characters = set(), set() # input_characters refer to English and target_characters refer to French

# reading the lines from the file and storing in lines list
with open('fra.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[: min(num_samples, len(lines)-1)]:

    input_text, target_text, _ = line.split('\t')

    target_text = '\t' + target_text + '\n' # \t tab space to indicate to start of sentence and \n to indicate end of sentence

    # Adding the extracted texts into their respective arrays
    input_texts.append(input_text)
    target_texts.append(target_text)

    # Seperating characters from each text both input and output and storing in respective arrays

    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)

    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

In [17]:
input_texts

['Go.',
 'Go.',
 'Go.',
 'Go.',
 'Hi.',
 'Hi.',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Who?',
 'Wow!',
 'Wow!',
 'Wow!',
 'Duck!',
 'Duck!',
 'Duck!',
 'Fire!',
 'Help!',
 'Hide.',
 'Hide.',
 'Jump!',
 'Jump.',
 'Stop!',
 'Stop!',
 'Stop!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait.',
 'Wait.',
 'Wait.',
 'Wait.',
 'Begin.',
 'Begin.',
 'Go on.',
 'Go on.',
 'Go on.',
 'Hello!',
 'Hello!',
 'Hello.',
 'Hello.',
 'Hello.',
 'Hello.',
 'I see.',
 'I see.',
 'I try.',
 'I won!',
 'I won!',
 'I won.',
 'Oh no!',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Smile.',
 'Smile.',
 'Smile.',
 'Sorry?',
 'Attack!',
 'Attack!',
 'Attack!',
 'Attack!',
 'Buy it.',
 'Buy it.',
 'Buy it.',
 'Buy it.',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Eat it.',
 'Eat it.',
 'Exhale.',
 'Get up.',
 'Get up.',
 'Ge

In [5]:
len(input_characters)

70

In [8]:
len(target_characters)

91

## Setting parameters for Encoders and Decoders

In [6]:
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))

num_encoder_tokens = len(input_characters) # Passing input characters to encoders, hence specifying the tokens
num_decoder_tokens = len(target_characters) # Target tokens set for decoders

max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])


In [7]:
num_encoder_tokens, num_decoder_tokens

(70, 91)

## Adding index

In [8]:
input_token_index = dict([(char,i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char,i) for i, char in enumerate(target_characters)])

In [9]:
input_token_index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 '?': 19,
 'A': 20,
 'B': 21,
 'C': 22,
 'D': 23,
 'E': 24,
 'F': 25,
 'G': 26,
 'H': 27,
 'I': 28,
 'J': 29,
 'K': 30,
 'L': 31,
 'M': 32,
 'N': 33,
 'O': 34,
 'P': 35,
 'Q': 36,
 'R': 37,
 'S': 38,
 'T': 39,
 'U': 40,
 'V': 41,
 'W': 42,
 'Y': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69}

In [10]:
target_token_index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '8': 15,
 '9': 16,
 ':': 17,
 '?': 18,
 'A': 19,
 'B': 20,
 'C': 21,
 'D': 22,
 'E': 23,
 'F': 24,
 'G': 25,
 'H': 26,
 'I': 27,
 'J': 28,
 'K': 29,
 'L': 30,
 'M': 31,
 'N': 32,
 'O': 33,
 'P': 34,
 'Q': 35,
 'R': 36,
 'S': 37,
 'T': 38,
 'U': 39,
 'V': 40,
 'W': 41,
 'Y': 42,
 'a': 43,
 'b': 44,
 'c': 45,
 'd': 46,
 'e': 47,
 'f': 48,
 'g': 49,
 'h': 50,
 'i': 51,
 'j': 52,
 'k': 53,
 'l': 54,
 'm': 55,
 'n': 56,
 'o': 57,
 'p': 58,
 'q': 59,
 'r': 60,
 's': 61,
 't': 62,
 'u': 63,
 'v': 64,
 'w': 65,
 'x': 66,
 'y': 67,
 'z': 68,
 '\xa0': 69,
 '«': 70,
 '»': 71,
 'À': 72,
 'Ç': 73,
 'É': 74,
 'Ê': 75,
 'à': 76,
 'â': 77,
 'ç': 78,
 'è': 79,
 'é': 80,
 'ê': 81,
 'î': 82,
 'ï': 83,
 'ô': 84,
 'ù': 85,
 'û': 86,
 'œ': 87,
 '\u2009': 88,
 '’': 89,
 '\u202f': 90}

## Creating `encoder_input_data, decoder_input_data, decoder_target_data`



In [11]:
# Data inserted to encoder
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens), # Parameters related to English
    dtype=float)

# Output received from encoder - Context vector
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), # Parameters related to French
    dtype=float)

# Output received from decoder
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), # Parameters related to French
    dtype=float)

## Deriving One Hot representation

In [12]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i, t, input_token_index[char]] = 1.
  encoder_input_data[i, t+1:, input_token_index[' ']] = 1.
  for t, char in enumerate(target_text):
    # decoder_target_data is ahead of decoder_input_data by one timestep
    decoder_input_data[i, t, target_token_index[char]] = 1.
    if t>0:
      # decoder_target_data will be ahead by one timestep and will not include the first character
      decoder_target_data[i, t-1, target_token_index[char]] = 1.
  decoder_input_data[i, t+1:, target_token_index[' ']] = 1.
  decoder_target_data[i, t:, target_token_index[' ']] = 1.



In [13]:
encoder_input_data[0].shape

(14, 70)

## Creating LSTM layer

### Step 1: Encoder layer

In [14]:
# Define the input sequence and process it
encoder_inputs = Input(shape=(None, num_encoder_tokens))

# Initializing LSTM layer
encoder = LSTM(latemt_dim, return_state=True)

# Encoder will give three outputs - encoder_outputs, state_h, state_c
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We are only considering state_h and state_c as we are not focusing on the output of each LSTM in the encoders part.
encoder_states = [state_h, state_c]

### Step 2: Decoder layer

In [16]:
# Set up the decoder, using encoder_states as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# Setting up the decoder to return full output sequences, as we will capture and send it to the next LSTM.
# We are also setting up to return the internal states as well, and we dont use the return states in the training model, but we will
# use them in reference.
decoder_lstm = LSTM(latemt_dim, return_sequences=True, return_state=True)

# Mostly focused on the outputs of the decoder
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Dense layer and adding the activation function
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

# Getting the decoder_outputs
decoder_outputs = decoder_dense(decoder_outputs)

## Training the model

This model will turn the `encoder_input_data` and `decoder_input_data` into `decoder_target_data`.

Remember, `encoder_input_data` and `decoder_input_data` both are same, only the the inputs are one step ahead in time.

In [17]:
# Initializing the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [18]:
# Fitting the model with the training data
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.7053 - loss: 1.5557 - val_accuracy: 0.7135 - val_loss: 1.0894
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.7468 - loss: 0.9588 - val_accuracy: 0.7141 - val_loss: 0.9777
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.7613 - loss: 0.8616 - val_accuracy: 0.7507 - val_loss: 0.8737
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.7888 - loss: 0.7620 - val_accuracy: 0.7730 - val_loss: 0.7840
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.7975 - loss: 0.7056 - val_accuracy: 0.7853 - val_loss: 0.7435
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.8129 - loss: 0.6466 - val_accuracy: 0.7972 - val_loss: 0.6974
Epoch 7/100
[1m

<keras.src.callbacks.history.History at 0x7a9f74115890>