In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
english_sentences = [
    "Hello", "How are you?", "I am fine", "What is your name?", "My name is John",
    "Good morning", "Good night", "Thank you", "Welcome", "See you later",
    "I love you", "I like coffee", "Where is the station?", "I am hungry", "I am tired",
    "Please help me", "I am learning Tamil", "Do you speak English?", "Yes", "No",
    "Excuse me", "Sorry", "Congratulations", "Happy birthday", "Good luck",
    "What time is it?", "I need water", "I am thirsty", "I am lost", "Call the police",
    "Open the door", "Close the window", "I am happy", "I am sad", "It is raining",
    "It is hot", "It is cold", "I am going home", "I will come later", "I am busy",
    "Can you help me?", "I don’t understand", "Where are you?", "I am here", "Let’s go",
    "Sit down", "Stand up", "Be quiet", "Don’t worry", "Take care"
]


In [3]:

tamil_sentences = [
    "வணக்கம்", "நீங்கள் எப்படி இருக்கிறீர்கள்?", "நான் நன்றாக இருக்கிறேன்", "உங்கள் பெயர் என்ன?", "என் பெயர் ஜான்",
    "காலை வணக்கம்", "இரவு வணக்கம்", "நன்றி", "வரவேற்கிறோம்", "பின்னர் பார்க்கலாம்",
    "நான் உன்னை காதலிக்கிறேன்", "நான் காபி விரும்புகிறேன்", "நிலையம் எங்கே?", "நான் பசிக்கிறேன்", "நான் சோர்வாக இருக்கிறேன்",
    "தயவு செய்து எனக்கு உதவுங்கள்", "நான் தமிழ் கற்கிறேன்", "நீங்கள் ஆங்கிலம் பேசுகிறீர்களா?", "ஆம்", "இல்லை",
    "மன்னிக்கவும்", "மன்னிக்கவும்", "வாழ்த்துக்கள்", "பிறந்தநாள் வாழ்த்துக்கள்", "நல்ல அதிர்ஷ்டம்",
    "எப்போது?", "எனக்கு தண்ணீர் வேண்டும்", "நான் தாகமாக இருக்கிறேன்", "நான் தொலைந்துவிட்டேன்", "போலீசை அழிக்கவும்",
    "கதவை திறக்கவும்", "ஜன்னலை மூடு", "நான் சந்தோஷமாக இருக்கிறேன்", "நான் கவலைப்படுகிறேன்", "மழை பெய்கிறது",
    "வெப்பம் உள்ளது", "குளிராக உள்ளது", "நான் வீட்டுக்கு போகிறேன்", "நான் பின்னர் வருவேன்", "நான் பிஸியாக இருக்கிறேன்",
    "நீங்கள் எனக்கு உதவ முடியுமா?", "நான் புரிந்துகொள்ளவில்லை", "நீங்கள் எங்கே இருக்கிறீர்கள்?", "நான் இங்கே இருக்கிறேன்", "நாம் போகலாம்",
    "உறங்குங்கள்", "நிலைத்து நிற்கவும்", "மெச்சமாக இருங்கள்", "கவலைப்பட வேண்டாம்", "கவனமாக இருங்கள்"
]




In [4]:
tamil_sentences = ['start ' + sent + ' end' for sent in tamil_sentences]

In [5]:
eng_tokenizer = Tokenizer(filters='')
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
max_eng_len = max(len(seq) for seq in eng_sequences)
eng_sequences = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
num_eng_words = len(eng_tokenizer.word_index) + 1

tam_tokenizer = Tokenizer(filters='')
tam_tokenizer.fit_on_texts(tamil_sentences)
tam_sequences = tam_tokenizer.texts_to_sequences(tamil_sentences)
max_tam_len = max(len(seq) for seq in tam_sequences)
tam_sequences = pad_sequences(tam_sequences, maxlen=max_tam_len, padding='post')
num_tam_words = len(tam_tokenizer.word_index) + 1

In [6]:
decoder_target_data = np.zeros_like(tam_sequences)
decoder_target_data[:, :-1] = tam_sequences[:, 1:]
decoder_target_data[:, -1] = 0

In [8]:
encoder_inputs = Input(shape=(max_eng_len,))
enc_emb = Embedding(num_eng_words, 256, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_tam_len,))
dec_emb = Embedding(num_tam_words, 256, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(num_tam_words, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [9]:
model.fit([eng_sequences, tam_sequences],
          np.expand_dims(decoder_target_data, -1),
          batch_size=16,
          epochs=300,
          validation_split=0.2)

Epoch 1/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 374ms/step - accuracy: 0.1688 - loss: 4.3847 - val_accuracy: 0.5500 - val_loss: 4.3413
Epoch 2/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.6029 - loss: 4.3062 - val_accuracy: 0.5167 - val_loss: 4.2518
Epoch 3/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.5310 - loss: 4.1716 - val_accuracy: 0.4667 - val_loss: 4.0638
Epoch 4/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step - accuracy: 0.4672 - loss: 3.8757 - val_accuracy: 0.4500 - val_loss: 3.6335
Epoch 5/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.4651 - loss: 3.2305 - val_accuracy: 0.4500 - val_loss: 3.1534
Epoch 6/300
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.4810 - loss: 2.9008 - val_accuracy: 0.5667 - val_loss: 3.0637
Epoch 7/300
[1m3/3[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c98c2bcf980>

In [10]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
dec_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = Embedding(num_tam_words, 256, mask_zero=True)(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states_inputs)
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + dec_states_inputs, [decoder_outputs2, state_h2, state_c2])


In [11]:
def translate_sentence(sentence):
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')
    states_value = encoder_model.predict(seq)

    target_seq = np.array([[tam_tokenizer.word_index['start']]])
    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tam_tokenizer.index_word.get(sampled_token_index, '')
        if sampled_word == 'end' or sampled_word == '':
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return decoded_sentence.strip()


In [13]:
test_sentences = ["Hello", "I am fine", "Good morning"]
for s in test_sentences:
    print(f"English: {s}")
    print(f"Tamil: {translate_sentence(s)}\n")

English: Hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Tamil: வணக்கம்

English: I am fine
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Tamil: நான் நன்றாக இருக்கிறேன்

English: Good morning
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Tamil: காலை வணக்கம்

