<a href="https://colab.research.google.com/github/sanchitkripalani47/Data-Science/blob/main/Eng_To_French.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## English to French Translation

In [1]:
# Import Numpy library
import numpy as np

In [2]:
# Mount the Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the data
# The data is present in the form of txt file with line by line separation

filepath = '/content/drive/MyDrive/Datasets/eng_to_french.txt'

raw_text = open(filepath, encoding='utf-8').read().split('\n')

In [4]:
# Print the number of lines/translations in the dataset
print(len(raw_text))

145437


We will use 10,000 samples for this task. Also, we need to preprocess the text by adding tokens in the english and french sentences.

In [24]:
# Text Preprocessing

num_samples = 10000

# Storing eng and french sentences 
eng_sents = []
fre_sents = []

# Storing eng and french characters
eng_chars = set()
fre_chars  = set()


# Process the sentences
for i in range(num_samples):
    eng_sent = str(raw_text[i].split('\t'))[0]
    
    # The target text must have start and end tokens, 
    # which is needed in the decoder part. '\t' is used for start and '\n' for end. 
    fre_sent = '\t' + str(raw_text[i]).split('\t')[1] + '\n'
    
    # Adding all the sentences in the lists
    eng_sents.append(eng_sent)
    fre_sents.append(fre_sent)
    
    # Adding all unique characters using sets
    for ch in eng_sent:
        eng_chars.add(ch)
        
    for ch in fre_sent:
        fre_chars.add(ch)

In [25]:
# Sort the characters
eng_chars = sorted(list(eng_chars))
fre_chars = sorted(list(fre_chars))

In [26]:
# Using dictionaries to store index to characters and visa versa

# For english index to char
eng_index_to_char = {}

# For english char to index
eng_char_to_index = {}

# For french index to char
fre_char_to_index = {}

# For french char to index
fre_index_to_char = {}

for key, value in enumerate(eng_chars):
    eng_index_to_char[key] = value
    eng_char_to_index[value] = key
    
for key, value in enumerate(fre_chars):
    fre_index_to_char[key] = value
    fre_char_to_index[value] = key

In [27]:
# Get the maximum length of english and french sentences
max_len_eng = max([len(i) for i in eng_sents])
max_len_fre = max([len(i) for i in fre_sents])

print(f'The maximum length of an English sentence here is: {max_len_eng}')
print(f'The maximum length of a French sentence here is: {max_len_fre}')

The maximum length of an English sentence here is: 1
The maximum length of a French sentence here is: 59


In [28]:
# Preparing a one-hot encoding for the sentences 

tokenized_eng_sents = np.zeros(shape=(num_samples, max_len_eng, len(eng_chars)), dtype='float32')
tokenized_fre_sents = np.zeros(shape=(num_samples, max_len_fre, len(fre_chars)), dtype='float32')
target_data = np.zeros(shape=(num_samples, max_len_fre, len(fre_chars)), dtype='float32')

# Here 
# 1D = num_samples, 2D = Max len of language, 3D = Total characters in language

In [29]:
# Vectorize the english and french sentences

for i in range(num_samples):
    for key, char in enumerate(eng_sents[i]):
        tokenized_eng_sents[i, key, eng_char_to_index[char]] = 1
        
    for key, char in enumerate(fre_sents[i]):
        tokenized_fre_sents[i, key, fre_char_to_index[char]] = 1
    
        # The target data for decoder will be ahead by one timestamp 
        # And it will not include the start character.
        if key > 0:
            target_data[i, key-1, fre_char_to_index[char]] = 1

### Modelling

In [30]:
# Import necessary tensorflow libraries for creating the model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

In [31]:
# Encoder model

enc_ip = Input(shape=(None, len(eng_chars)))
# Note here that only the end state is returned and not all the sequences
enc_LSTM = LSTM(256, return_state=True)
enc_op, enc_h, enc_c = enc_LSTM(enc_ip)
enc_states = [enc_h, enc_c]

In [32]:
# Decoder model

dec_ip = Input(shape=(None, len(fre_chars)))
# Note here that we output both, all the sequences as well as the final state
dec_LSTM = LSTM(256, return_sequences=True, return_state=True)
dec_out, _, _ = dec_LSTM(dec_ip, initial_state=enc_states)
dec_dense = Dense(len(fre_chars), activation='softmax')
dec_out = dec_dense(dec_out)

In [33]:
# Combining both the models
model = Model(inputs=[enc_ip, dec_ip], outputs=[dec_out])

# Run training 
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Fit the model
model.fit(x=[tokenized_eng_sents, tokenized_fre_sents],
          y=target_data,
          batch_size=64,
          epochs=10,
          validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3e1d06ab10>

### Testing The Model

In [34]:
# Inference model for testing

# For Encoder
enc_model_inf = Model(enc_ip, enc_states)

# For Decoder
dec_state_ip_h = Input(shape=(256,))
dec_state_ip_c = Input(shape=(256,))
dec_ip_states = [dec_state_ip_h, dec_state_ip_c]

dec_out, dec_h, dec_c = dec_LSTM(dec_ip, initial_state=dec_ip_states)

dec_states = [dec_h, dec_c]
dec_out = dec_dense(dec_out)

dec_model_inf = Model(inputs=[dec_ip]+dec_ip_states,
                      outputs=[dec_out]+dec_states)

In [35]:
def decode_seq(ip_seq):

    # Intial State value will come from encoder
    states_val = enc_model_inf.predict(ip_seq)

    target_seq = np.zeros((1,1,len(fre_chars)))
    target_seq[0,0,fre_char_to_index['\t']] = 1

    translated_sent = ''
    toStop = False

    while not toStop:
        dec_out, dec_h, dec_c = dec_model_inf.predict(x=[target_seq]+states_val)

        max_val_index = np.argmax(dec_out[0,-1,:])
        sampled_fre_char = fre_index_to_char[max_val_index]
        translated_sent += sampled_fre_char

        if ((sampled_fre_char == '\n')  or (len(translated_sent) > max_len_fre)):
            toStop = True

        target_seq = np.zeros((1,1,len(fre_chars)))
        target_seq[0,0,max_val_index] = 1

        states_val = [dec_h, dec_out]

    return translated_sent

In [36]:
for seq_index in range(10):
    ip_seq = tokenized_eng_sents[seq_index:seq_index+1]
    translated_sent = decode_seq(ip_seq)
    print('-')
    print('English Sentence: ', eng_sent[seq_index])
    print('French Sentence: ', translated_sent)

ValueError: ignored