<a class="anchor" id="About_Data"></a>
# <p style="padding:10px;background-color:lightblue;margin:0;color:#c27849;font-family:newtimeroman;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Machine Translation</p> 

<div style="background-color:#d5eae9; padding:20px; font-size:15px">

**This notebook includes the following:**

- Preprocessing
- Data cleaning
- Exploratory data analysis (EDA)
- Preparing the data to train a model
- Training and making predictions using various classification models
- Model evaluation

<a class="anchor" id="About_Data"></a>
# <p style="padding:10px;background-color:lightblue;margin:0;color:#c27849;font-family:newtimeroman;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Dataset importing</p> 

In [3]:
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string
import tensorflow as tf
import warnings
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
warnings.filterwarnings('ignore')

In [4]:
with open("/content/drive/MyDrive/parallel-n/IITB.en-hi.en",'r',encoding='utf-8') as f:
    english_sentences = f.read().split('\n')

with open("/content/drive/MyDrive/parallel-n/IITB.en-hi.hi",'r',encoding='utf-8') as f:
    hindi_sentences = f.read().split('\n')

In [7]:
#Some parameters
vocab_size = 10000
total_sentences = 50000
maxlen = 16
epochs = 70
validation_split = 0.05
max_sentence_length= maxlen

<a class="anchor" id="About_Data"></a>
# <p style="padding:10px;background-color:lightblue;margin:0;color:#c27849;font-family:newtimeroman;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Data Preprocessing</p> 

In [8]:
def preprocess(text):
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = text.lower()
    text = re.sub(r'\d','',text)
    text = re.sub(r'\s+',' ',text)
    text = text.strip()
    return text

In [9]:
eng_sentence = [preprocess(en) for en in english_sentences]

In [11]:
hindi_sentence = [re.sub('[a-zA-Z]','',preprocess(hi))for hi in hindi_sentences]

In [13]:
 len(eng_sentence),len(hindi_sentence)

(1659084, 1659084)

In [14]:
#Remove duplicate sentences
english_unique = set()
eng_sentence_temp = []
hindi_sentence_temp = []
l = len(english_sentences)
for i in range(l):
    if english_sentences[i] not in english_unique:
        english_unique.add(eng_sentence[i])
        eng_sentence_temp.append(eng_sentence[i])
        hindi_sentence_temp.append(hindi_sentence[i])

eng_sentence = eng_sentence_temp
hindi_sentence = hindi_sentence_temp

In [15]:
en_data = []
hi_data = []

cnt = 0

for (en,hi) in zip(eng_sentence, hindi_sentence):
    l = min(len(en.split()), len(hi.split()))
    if l <= maxlen:
        en_data.append(en)
        hi_data.append(hi)
    cnt += 1
    if cnt == total_sentences:
        break

In [16]:
len(en_data), len(hi_data)

(49249, 49249)

In [17]:
hi_data = ['<START> ' + hi + ' <END>' for hi in hi_data]

In [19]:
len(hi_data)

49249

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize English texts
en_tokenizer = Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

# Tokenize Hindi texts
hi_tokenizer = Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)



# Calculate vocabulary sizes
english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1

# Print vocabulary sizes
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  3622
Hindi Vocab Size:  3835


In [23]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [24]:
encoder_inputs

array([[1127,   79,  200, ...,    0,    0,    0],
       [ 984,  590, 1450, ...,    0,    0,    0],
       [   2,  153,   25, ...,    0,    0,    0],
       ...,
       [ 685,    0,    0, ...,    0,    0,    0],
       [ 181,    0,    0, ...,    0,    0,    0],
       [  28, 1190,  281, ...,    0,    0,    0]], dtype=int32)

<a class="anchor" id="About_Data"></a>
# <p style="padding:10px;background-color:lightblue;margin:0;color:#c27849;font-family:newtimeroman;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Split Dataset for training and testing</p> 

In [25]:
#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
    decoder_inputs.append(hi[:-1])
    decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

# Training and Testing split
# 95%, 5%
split = int(0.9 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(45000, 16) (45000, 16) (45000, 16)


In [27]:
import numpy as np
import tensorflow as tf

class LSTMModel(tf.keras.Model):
    def __init__(self, encoder_vocab_size=None, decoder_vocab_size=None, embedding_size=128, num_rnn_units=32, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.encoder_vocab_size = encoder_vocab_size
        self.decoder_vocab_size = decoder_vocab_size
        self.embedding_size = embedding_size
        # encoder
        self.input_1 = tf.keras.layers.InputLayer(input_shape=(None,), name='input_1')
        self.embedding_1 = tf.keras.layers.Embedding(encoder_vocab_size, embedding_size, mask_zero=True, name='embedding_1')
        self.encoder_lstm = tf.keras.layers.LSTM(num_rnn_units, return_state=True, name='encoder_lstm')
        # decoder
        self.input_2 = tf.keras.layers.InputLayer(input_shape=(None,), name='input_2')
        self.embedding_2 = tf.keras.layers.Embedding(decoder_vocab_size, embedding_size, mask_zero=True, name='embedding_2')
        self.decoder_lstm = tf.keras.layers.LSTM(num_rnn_units, activation='relu', return_sequences=True, return_state=True, name='decoder_lstm')
        self.token_layer = tf.keras.layers.Dense(decoder_vocab_size, activation='softmax', name='token_layer')

    def call(self, inputs):
        encoder_input = self.input_1(inputs[0])
        decoder_input = self.input_2(inputs[1])
        # encode the inputs
        encoder_embed = self.embedding_1(encoder_input)
        # run rnn on the encoded sequence
        _, state_h, state_c = self.encoder_lstm(encoder_embed)
        # decode the target
        decoder_embed = self.embedding_2(decoder_input)
        x, _, _ = self.decoder_lstm(decoder_embed, initial_state=[state_h, state_c])
        return self.token_layer(x)

    def predict_sequence(self, text, input_tokenizer, output_tokenizer, max_len):
        if type(text) != list:
            text = [text]
        input_sequence = input_tokenizer.texts_to_sequences(text)
        if type(input_sequence) == list:
            input_sequence = np.array(input_sequence)
        encoder_embed = self.embedding_1(input_sequence)
        # run rnn on the encoded sequence
        _, next_h, next_c = self.encoder_lstm(encoder_embed)
        curr_token = [[output_tokenizer.word_index['<START>']]]

        out_seq = ""
        for _ in range(max_len):
            decoder_embedding = self.embedding_2(np.array(curr_token))
            x, next_h, next_c = self.decoder_lstm(decoder_embedding, initial_state=[next_h, next_c])
            x = self.token_layer(x)
            next_token = np.argmax(x[0, 0, :])
            next_word = output_tokenizer.index_word[next_token]
            if next_word == "<END>":
                break
            curr_token[0][0] = next_token
            out_seq += " " + next_word
        return out_seq.strip()

# Example usage:
# lstm_model = LSTMModel(encoder_vocab_size=5000, decoder_vocab_size=5000)
# result = lstm_model.predict_sequence("example input text", input_tokenizer, output_tokenizer, max_len=20)


In [28]:
en_hi_model = LSTMModel(encoder_vocab_size=english_vocab_size, decoder_vocab_size= hindi_vocab_size ,embedding_size=256, num_rnn_units=64)
en_hi_model(np.array([[[1]*max_sentence_length], [[1]*max_sentence_length]]))
en_hi_model.summary()
# model.load_weights(Weights_DIR+"\\model.h5")
loss = tf.keras.losses.SparseCategoricalCrossentropy()



Model: "lstm_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     multiple                  927232    
                                                                 
 encoder_lstm (LSTM)         multiple                  82176     
                                                                 
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     multiple                  981760    
                                                                 
 decoder_lstm (LSTM)         multiple                  82176     
                                                                 
 token_layer (Dense)         multiple                  2

In [29]:
#Save model after each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="/content/drive/MyDrive/parallel-n/model6",
    monitor='val_accuracy',
    mode='max'
)
en_hi_model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])
en_hi_model.fit(X_train, y_train, epochs=30,batch_size= 32, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7c58b135db40>

In [30]:
en_hi_model.save_weights(r"/content/drive/MyDrive/parallel-n/model6/en_hi_model.h5")

<a class="anchor" id="About_Data"></a>
# <p style="padding:10px;background-color:lightblue;margin:0;color:#c27849;font-family:newtimeroman;font-size:150%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:500">Testing ans Analysis</p> 

In [31]:
# Testing and Analysis

candidates = []
references = []

ctr = 20
i = 0

while ctr > 0:
    l = len(X_test[i].split())
    if l <= maxlen:  # Choose only sentences of length in range [5,15]
        pred_sentence = en_hi_model.predict_sequence(X_test[i], en_tokenizer, hi_tokenizer, max_len=maxlen)
        candidates.append(pred_sentence.split())

        print("Input: ", X_test[i])
        print("Prediction: ", pred_sentence)

        # Google Translated Reference (if using Google Translate API)
        # google_translated_sentence = translate_client.translate(X_test[i], target_language='hi')['translatedText']
        # print("Google Translated Reference: ", google_translated_sentence)

        print("Dataset Reference: ", ' '.join(y_test[i].split()[1:-1]))
        print()
        references.append([y_test[i].split()[1:-1]])

        ctr -= 1
    i += 1

# Printing candidates and references for evaluation
# print("References:", references)

Input:  give your application an accessibility workout
Prediction:  अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
Dataset Reference:  अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें

Input:  accerciser accessibility explorer
Prediction:  एक्सेर्साइसर पहुंचनीयता अन्वेषक
Dataset Reference:  एक्सेर्साइसर पहुंचनीयता अन्वेषक

Input:  the default plugin layout for the bottom panel
Prediction:  नीचे के इंच फ़ाइल
Dataset Reference:  निचले पटल के लिए डिफोल्ट प्लगइन खाका

Input:  the default plugin layout for the top panel
Prediction:  ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Dataset Reference:  ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका

Input:  a list of plugins that are disabled by default
Prediction:  उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है
Dataset Reference:  उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है

Input:  highlight duration
Prediction:  हाइलाइट अवधिः
Dataset Reference:  अवधि को हाइलाइट रकें

Input:  the duration of the highlight box when selecting a