#  English–Hindi Neural Machine Translation

## Importing Libraries

In [48]:
import numpy as np
import pandas as pd
import string
from string import digits
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Loading the Dataset

In [51]:
lines = pd.read_csv(r"C:\Users\Dell\Downloads\Hindi_English_Truncated_Corpus.csv",encoding = "utf-8")

# Filters TED talk sentence pairs, selects only English and Hindi columns, and removes missing or duplicate entries.
lines = lines[lines['source'] == 'ted'][['english_sentence', 'hindi_sentence']].dropna().drop_duplicates()

lines = lines.sample(n=25000, random_state=42) #Randomly selects 25,000 sentence pairs from the filtered dataset.


## Text Cleaning

In [53]:
# The below code remove punctuation and digits and converts text to lowercase and strips whitespace.
def clean_text(text):
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.translate(str.maketrans('', '', digits))
    return text.strip().lower()

In [55]:
# Applies cleaning and adds special tokens to Hindi sentences to mark start and end (start_, _end).
lines['english_sentence'] = lines['english_sentence'].apply(clean_text)
lines['hindi_sentence'] = lines['hindi_sentence'].apply(clean_text)
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'start_' + x + '_end')

## Tokenization

The tokenizer builds a vocabulary from English sentences and then converts each sentence into a sequence of integer word indices.

In [57]:
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(lines['english_sentence'])
eng_sequence = eng_tokenizer.texts_to_sequences(lines['english_sentence'])

In [59]:
eng_sequence

[[11, 203, 68, 54, 56, 139, 516, 19, 56, 95, 9],
 [84, 4936],
 [20, 30, 297, 30, 125, 5, 4937],
 [2, 13, 709, 1353],
 [2, 24, 28, 30, 321, 30, 10, 57, 4938, 1203, 970, 88, 29, 4, 62, 1354],
 [2, 322, 1, 7105, 53, 4939, 7106],
 [13, 668, 9, 1576, 3150],
 [151, 18, 84, 48, 47, 119, 106, 7, 258, 1, 1260],
 [81, 63, 3, 1204, 5, 119, 240, 3, 31, 4940, 4, 765],
 [7107, 2, 7108, 360, 12, 498, 3, 1, 222, 82, 12, 638, 4941],
 [53, 258, 12, 3818],
 [24, 3819, 2311, 971, 847],
 [14, 323, 10, 18, 1014, 4942, 639],
 [3151, 144, 2, 7109],
 [14, 44, 13, 188, 65, 7, 31, 508, 2312, 1712, 8, 15],
 [20, 75, 59, 3, 766, 3152, 38, 10, 68, 18, 487],
 [24, 1447, 53, 46, 2083, 2670],
 [35, 63, 3, 18, 25, 550, 310, 541, 67, 32],
 [2, 361, 407, 1, 848],
 [38, 8, 161, 10, 1, 1871, 7110],
 [9, 3, 170, 91, 140, 1872],
 [39, 125, 1577, 7, 1, 89, 108, 1577, 37],
 [11, 105, 3153, 3, 128, 7, 2, 4943, 13],
 [42, 24, 28, 5, 3820, 2671, 24, 28, 92, 4944],
 [6, 24, 2672, 4945, 4946],
 [6, 8, 175, 812, 16, 31, 1713],
 [4, 

In [61]:
hin_tokenizer = Tokenizer()
hin_tokenizer.fit_on_texts(lines['hindi_sentence'])
hin_sequence = hin_tokenizer.texts_to_sequences(lines['hindi_sentence'])

In [63]:
hin_sequence

[[1, 16, 166, 56, 15, 154, 7, 11, 145, 1274, 384, 7, 62, 384, 3, 2],
 [1, 61, 8166, 15, 2],
 [1, 59, 10, 878, 122, 5, 42, 2],
 [1, 4, 13, 776, 2704, 2],
 [1,
  4,
  575,
  65,
  104,
  3,
  13,
  804,
  457,
  15,
  8167,
  112,
  5648,
  6,
  2165,
  674,
  12,
  5649,
  209,
  2],
 [1, 4, 8168, 8169, 172, 1057, 210, 110, 2],
 [1, 13, 467, 200, 53, 8, 640, 8, 468, 3, 2],
 [1, 91, 24, 134, 12, 1162, 173, 14, 335, 15, 47, 31, 111, 659, 589, 3, 2],
 [1, 273, 38, 172, 844, 194, 344, 5, 4369, 5, 8170, 9, 32, 1721, 1603, 2],
 [1,
  2705,
  104,
  3,
  91,
  22,
  4370,
  1013,
  76,
  1058,
  56,
  469,
  128,
  3,
  41,
  22,
  4371,
  214,
  128,
  3,
  2],
 [1, 77, 2706, 139, 23, 2],
 [1, 126, 38, 879, 193, 7, 8171, 5650, 4, 38, 1108, 3, 40, 2],
 [1, 19, 17, 302, 527, 9, 1275, 5651, 5652, 99, 3, 2],
 [1, 281, 8, 8172, 4372, 64, 2],
 [1, 19, 2166, 87, 8, 41, 13, 38, 660, 1276, 6, 605, 48, 2],
 [1, 21, 17, 1722, 5653, 299, 109, 2401, 69, 17, 6, 845, 117, 30, 2],
 [1, 22, 3637, 107, 3, 67, 

## Padding

Pads sequences to uniform length

In [65]:
max_eng_length = max(len(seq) for seq in eng_sequence)
max_hin_length = max(len(seq) for seq in hin_sequence)

encoder_input = pad_sequences(eng_sequence,maxlen = max_eng_length,padding = 'post')
decoder_input = pad_sequences(hin_sequence,maxlen = max_hin_length,padding = 'post')

In [67]:
encoder_input

array([[  11,  203,   68, ...,    0,    0,    0],
       [  84, 4936,    0, ...,    0,    0,    0],
       [  20,   30,  297, ...,    0,    0,    0],
       ...,
       [  14,   37,    9, ...,    0,    0,    0],
       [   5,  271,   23, ...,    0,    0,    0],
       [   2,   10,  106, ...,    0,    0,    0]])

In [69]:
decoder_input

array([[   1,   16,  166, ...,    0,    0,    0],
       [   1,   61, 8166, ...,    0,    0,    0],
       [   1,   59,   10, ...,    0,    0,    0],
       ...,
       [   1,   19,  113, ...,    0,    0,    0],
       [   1,   10, 8161, ...,    0,    0,    0],
       [   1,    4, 3769, ...,    0,    0,    0]])

In [71]:
# decoder_target is shifted version of decoder_input used for teacher forcing. Like if decoder input id "start_maine dekha" the target is "maine dekha_end".
decoder_target = np.zeros((decoder_input.shape[0], decoder_input.shape[1], 1))
decoder_target[:, 0:-1, 0] = decoder_input[:, 1:]
decoder_target = np.zeros((decoder_input.shape[0], decoder_input.shape[1], 1))
decoder_target[:, 0:-1, 0] = decoder_input[:, 1:]

##  Define Model Architecture

### Encoder:

In [73]:
latent_dim = 256  
eng_vocab_size = len(eng_tokenizer.word_index) + 1
hin_vocab_size = len(hin_tokenizer.word_index) + 1

In [75]:
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(eng_vocab_size, latent_dim)(encoder_inputs)
enc_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

This code defines the encoder of a sequence-to-sequence model: it takes tokenized input, embeds it into vectors, processes it with an LSTM, and captures the context in the form of final hidden and cell states to pass to the decoder.

### Decoder:

In [77]:
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(hin_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(hin_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

Builds the decoder: takes target input, embeds it, uses LSTM initialized with encoder states, and outputs predicted token probabilities via a softmax layer.

## Compile and Train

In [79]:
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)
model.compile(optimizer = 'rmsprop',loss = 'sparse_categorical_crossentropy')
model.fit([encoder_input, decoder_input], decoder_target, batch_size=64, epochs=10, validation_split=0.2)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 537ms/step - loss: 3.2238 - val_loss: 2.0236
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 586ms/step - loss: 2.0097 - val_loss: 1.9928
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 574ms/step - loss: 1.9670 - val_loss: 1.9541
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 566ms/step - loss: 1.9262 - val_loss: 1.9290
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 571ms/step - loss: 1.8956 - val_loss: 1.9006
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 592ms/step - loss: 1.8618 - val_loss: 1.8797
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 547ms/step - loss: 1.8471 - val_loss: 1.8549
Epoch 8/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 546ms/step - loss: 1.8154 - val_loss: 1.8381
Epoch 9/

<keras.src.callbacks.history.History at 0x29d5bd8c230>

Trains on source (encoder_input) and target (decoder_input) with shifted targets and uses RMSProp optimizer and cross-entropy loss.

## Inference Models

### Encoder Inference

In [83]:
encoder_model_inf = Model(encoder_inputs, encoder_states)

### Decoder Inference

In [85]:
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_inf_emb = dec_emb_layer(decoder_inputs)
dec_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(dec_inf_emb, initial_state=decoder_states_inputs)
decoder_outputs_inf = decoder_dense(dec_outputs_inf)
decoder_model_inf = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs_inf, state_h_inf, state_c_inf])

## Reverse Lookup

In [87]:
reverse_eng = {v: k for k, v in eng_tokenizer.word_index.items()}
reverse_hin = {v: k for k, v in hin_tokenizer.word_index.items()}

## Translate Function

In [103]:
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: 'start_ ' + x + ' _end')

In [105]:
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(lines['english_sentence'])
eng_seq = eng_tokenizer.texts_to_sequences(lines['english_sentence'])

In [107]:
hin_tokenizer = Tokenizer(filters='')  
hin_tokenizer.fit_on_texts(lines['hindi_sentence'])
hin_seq = hin_tokenizer.texts_to_sequences(lines['hindi_sentence'])

In [109]:
max_eng_len = max(len(seq) for seq in eng_seq)
max_hin_len = max(len(seq) for seq in hin_seq)

In [111]:
def translate(sentence):
    sentence = clean_text(sentence)
    seq = eng_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_eng_len, padding='post')
    states = encoder_model_inf.predict(padded)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index['start_']

    decoded = []
    while True:
        output, h, c = decoder_model_inf.predict([target_seq] + states)
        token_index = np.argmax(output[0, -1, :])
        word = reverse_hin.get(token_index, '')

        if word == '_end' or len(decoded) >= max_hin_len:
            break

        decoded.append(word)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = token_index
        states = [h, c]

    return ' '.join(decoded)
    
print("English: And")
print("Hindi:", translate("And"))

English: And
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m