## **Machine Translation**
Machine Translation (MT) is the task of automatically converting one natural language into another, preserving the meaning of the input text, and producing fluent text in the output language.

### **Required Libraries**

In [1]:
import os

import numpy as np
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

### **Initializing some parameters**

In [2]:
batch_size = 64      # Batch size for training
epochs = 100         # Number of epochs to train for
latent_dim = 256     # Latent dimensionality of the encoding space
num_samples = 10000  # Maximum number of lines (sentence pairs) read for training

# Path of the tab-separated data txt file.
# The location can be overridden without editing the notebook by setting the
# FRA_ENG_DATA_PATH environment variable; when it is not set, the original
# machine-specific path is used so existing runs behave exactly as before.
data_path = os.environ.get(
    'FRA_ENG_DATA_PATH',
    r'C:\Users\jgaur\Tensorflow_Tut\Machine_Translation\fra-eng\fra.txt',
)

In [3]:
# Vectorize the data: collect the sentence pairs and the set of distinct
# characters seen on each side.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

# Read the whole file and split it into lines.
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Prepare the data for training.
# `len(lines) - 1` skips the final element produced by split('\n'), which is
# an empty string when the file ends with a newline.
for line in lines[: min(num_samples, len(lines) - 1)]:
    # Only the first two tab-separated fields are used. Slicing (instead of
    # unpacking exactly three fields) also accepts files that have no third
    # column or more than three; a line with fewer than two fields still
    # raises ValueError, so malformed data is not silently skipped.
    input_text, target_text = line.split('\t')[:2]
    # '\t' is prepended as the start character and '\n' is appended
    # (presumably as the end-of-sequence marker -- confirm against the
    # inference code).
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character of the string; duplicates are ignored.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [4]:
# Number of distinct characters: source side first, then target side.
for vocabulary in (input_characters, target_characters):
    print(len(vocabulary))

71
94


In [5]:
# Turn the character sets into sorted lists so that every character gets a
# deterministic position (used as its one-hot index further down).
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary sizes for the encoder (source) and decoder (target) sides.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Longest sequence on each side, measured in characters; these fix the time
# dimension of the one-hot arrays.
max_decoder_seq_length = max(len(txt) for txt in target_texts)
max_encoder_seq_length = max(len(txt) for txt in input_texts)

In [6]:
# Summary of the prepared dataset, printed as one "label value" line each.
# NOTE(review): "tokerns" is a typo in the label; it is kept unchanged so the
# printed text matches the recorded output.
dataset_stats = [
    ("Number of samples:", len(input_texts)),
    ("Number of unique input tokens:", num_encoder_tokens),
    ("Number of unique output tokerns:", num_decoder_tokens),
    ("Max sequence length for inputs:", max_encoder_seq_length),
    ("Max sequence length for outputs:", max_decoder_seq_length),
]
for label, value in dataset_stats:
    print(label, value)

Number of samples: 10000
Number of unique input tokens: 71
Number of unique output tokerns: 94
Max sequence length for inputs: 15
Max sequence length for outputs: 59


In [8]:
# Character -> integer index lookups; the index is the character's position
# in the corresponding character list.
input_token_index = {
    char: position for position, char in enumerate(input_characters)
}

output_token_index = {
    char: position for position, char in enumerate(target_characters)
}

In [11]:
# input_token_index

In [12]:
# All three arrays are laid out as
# (number of sentences, max sentence length, number of distinct characters)
# and start out filled with zeros.
num_pairs = len(input_texts)
encoder_shape = (num_pairs, max_encoder_seq_length, num_encoder_tokens)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)

# Source sentences fed to the encoder.
encoder_input_data = np.zeros(encoder_shape, dtype='float32')

# Target sentences fed to the decoder.
decoder_input_data = np.zeros(decoder_shape, dtype='float32')

# Target sentences the decoder is trained to produce.
decoder_output_data = np.zeros(decoder_shape, dtype='float32')

In [14]:
# Sanity check of the allocated array:
# (number of sentences, max_encoder_seq_length, num_encoder_tokens).
encoder_input_data.shape

(10000, 15, 71)

In [19]:
# Index of the space character, which the vectorization loop uses to fill the
# timesteps after the end of each input sentence.
input_token_index[' ']

0

In [21]:
# One-hot encode every sentence pair into the pre-allocated arrays.
# Padding offsets are computed from len() rather than from the inner loop
# variable after the loop has finished: for an empty text that variable would
# be undefined (first pair) or left over from the previous text.
for i, (input_text, output_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1
    # Every timestep after the end of the sentence is marked as a space.
    encoder_input_data[i, len(input_text):, input_token_index[' ']] = 1

    for t, char in enumerate(output_text):
        decoder_input_data[i, t, output_token_index[char]] = 1
        if t > 0:
            # decoder_output_data is ahead of decoder_input_data by one
            # timestep and does not include the start character.
            decoder_output_data[i, t - 1, output_token_index[char]] = 1
    decoder_input_data[i, len(output_text):, output_token_index[' ']] = 1
    # The target holds one character fewer than the input (no start
    # character), so its padding begins one timestep earlier; max() keeps the
    # offset from going negative for an empty text.
    decoder_output_data[i, max(len(output_text) - 1, 0):, output_token_index[' ']] = 1

In [27]:
# Encoder vocabulary size; it is the feature dimension of the encoder Input
# layer defined in the next code cell.
num_encoder_tokens

71

## **Encoder**
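Encoder–decoder models map an input sequence to an output sequence. The same idea also allows a machine learning model to generate a sentence describing an image: it receives the image as the input and outputs a sequence of words (this also works with videos). In this notebook the encoder is an LSTM that reads the one-hot encoded input sentence character by character; only its final states are kept and handed to the decoder.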

In [28]:
# Encoder input: one-hot character sequences of any length (hence None) with
# num_encoder_tokens features per timestep.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
print(encoder_inputs)

# return_state=True makes the LSTM return its final states in addition to
# its output.
encoder = LSTM(units=latent_dim, return_state=True)
encoder_outputs, *encoder_states = encoder(encoder_inputs)

# 'encoder_outputs' is discarded; only the two states are kept.
state_h, state_c = encoder_states

Tensor("input_1:0", shape=(None, None, 71), dtype=float32)


## **Decoder**
`Decoder` means to convert a coded message into intelligible language. In the machine learning model, the role of the decoder is to convert the encoder's state vectors into the output sequence, i.e. the translated sentence in the target language. It is also built with an RNN (LSTM) layer, followed by a dense softmax layer that predicts the next target character at each timestep.

In [30]:
# Decoder input: one-hot target character sequences of any length.
decoder_inputs = Input(shape=(None, num_decoder_tokens))

# The decoder LSTM returns the full output sequence and its internal states.
# The states are not used by the training model, but they will be used for
# inference.
decoder_lstm = LSTM(units=latent_dim, return_state=True, return_sequences=True)

# The decoder starts from 'encoder_states' as its initial state.
decoder_lstm_result = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_outputs, _, _ = decoder_lstm_result

# Softmax over the target vocabulary at every timestep.
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [34]:
# `model` is otherwise only assigned in a later cell, so on a fresh kernel
# (Restart & Run All) calling model.summary() here raises NameError. Build the
# training model from the encoder/decoder tensors defined above before
# summarizing it.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None, 94)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 335872      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  359424      input_3[0][0]                    
                                                                 lstm[0][1]            

In [33]:
# Define the model that will turn
# encoder_input_data & decoder_input_data into decoder_output_data.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model: per-timestep categorical cross-entropy over the one-hot
# targets, optimized with RMSprop.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train. validation_split=0.2 sets aside 20% of the samples for validation;
# the returned History object is kept in `history`.
history = model.fit([encoder_input_data, decoder_input_data], decoder_output_data,
            batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78