In [65]:
!pip install tensorflow




In [66]:
# ================== SINGLE CELL: English → Tamil (LSTM Seq2Seq) ==================

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ------------------ DATA ------------------
# Toy parallel corpus stored as (English, Tamil) pairs so that each source
# sentence sits next to its translation. Every Tamil sentence is wrapped in
# START / END marker words.
_sentence_pairs = [
    ("hello", "START வணக்கம் END"),
    ("good morning", "START காலை வணக்கம் END"),
    ("thank you", "START நன்றி END"),
    ("how are you", "START நீங்கள் எப்படி இருக்கிறீர்கள் END"),
    ("i am learning ai", "START நான் ai கற்றுக்கொள்கிறேன் END"),
    ("good night", "START இனிய இரவு END"),
]

# Split the pairs back into the two index-aligned lists used downstream.
eng_data = [english for english, _tamil in _sentence_pairs]
tam_data = [tamil for _english, tamil in _sentence_pairs]

# ------------------ TOKENIZATION ------------------
# English side: fit a tokenizer, convert sentences to integer sequences and
# right-pad them to the longest English sentence.
eng_tok = Tokenizer()
eng_tok.fit_on_texts(eng_data)
eng_seq = eng_tok.texts_to_sequences(eng_data)
max_eng = max(map(len, eng_seq))
enc_input = pad_sequences(eng_seq, maxlen=max_eng, padding="post")

# Tamil side: same steps with its own tokenizer and its own maximum length.
tam_tok = Tokenizer()
tam_tok.fit_on_texts(tam_data)
tam_seq = tam_tok.texts_to_sequences(tam_data)
max_tam = max(map(len, tam_seq))
dec_input = pad_sequences(tam_seq, maxlen=max_tam, padding="post")

# Decoder target = decoder input shifted one step to the left, with a zero
# appended as the final column (the token the decoder should predict next).
dec_target = np.pad(dec_input[:, 1:], ((0, 0), (0, 1)))

# ------------------ MODEL ------------------
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and data
# shuffling (and therefore the trained model) are repeatable across re-runs.
tf.keras.utils.set_random_seed(42)

# Vocabulary sizes: one extra slot because index 0 is used for padding.
eng_vocab = len(eng_tok.word_index) + 1
tam_vocab = len(tam_tok.word_index) + 1

embed_dim = 64    # size of each word embedding vector
latent_dim = 128  # size of the LSTM hidden / cell state

# Encoder: embed the English tokens and keep only the final LSTM states,
# which become the decoder's initial state.
enc_inputs = Input(shape=(None,))
enc_emb = Embedding(eng_vocab, embed_dim)(enc_inputs)
_, h, c = LSTM(latent_dim, return_state=True)(enc_emb)
enc_states = [h, c]

# Decoder: keep an explicit handle on the LSTM layer so the inference graph
# below can reuse the trained weights without relying on a positional lookup
# into `model.layers`.
dec_inputs = Input(shape=(None,))
dec_emb = Embedding(tam_vocab, embed_dim)(dec_inputs)
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_out, _, _ = dec_lstm(dec_emb, initial_state=enc_states)
dec_dense = Dense(tam_vocab, activation="softmax")
dec_out = dec_dense(dec_out)

# Training model: (English tokens, Tamil tokens) -> per-step distribution over
# the Tamil vocabulary.
model = Model([enc_inputs, dec_inputs], dec_out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

model.fit(
    [enc_input, dec_input],
    dec_target[..., np.newaxis],  # trailing axis added for the sparse loss
    epochs=400,
    batch_size=2,
    verbose=0
)

# ------------------ INFERENCE ------------------
# Encoder model: English tokens -> [h, c] states.
encoder_model = Model(enc_inputs, enc_states)

# Decoder model: takes the previous token plus the previous states and returns
# the next-token distribution together with the updated states, so decoding
# can be driven one step at a time.
state_h = Input(shape=(latent_dim,))
state_c = Input(shape=(latent_dim,))
states_inputs = [state_h, state_c]

# Reuse the trained decoder LSTM and Dense layers (shared weights).
dec_out_inf, h_inf, c_inf = dec_lstm(
    dec_emb, initial_state=states_inputs
)
dec_out_inf = dec_dense(dec_out_inf)
decoder_model = Model(
    [dec_inputs] + states_inputs,
    [dec_out_inf, h_inf, c_inf]
)

# Index -> word lookup for turning predicted indices back into Tamil words.
reverse_tam = {v: k for k, v in tam_tok.word_index.items()}

def translate(sentence):
    """Greedily decode a Tamil translation for an English sentence.

    The sentence is tokenized with ``eng_tok``, padded to ``max_eng`` and run
    through ``encoder_model``. ``decoder_model`` is then stepped one token at
    a time, starting from the "start" token and feeding back its own argmax
    prediction together with the updated LSTM states.

    Args:
        sentence: English input text.

    Returns:
        The predicted Tamil words joined by single spaces. Decoding stops at
        the "end" token, at any predicted index that has no vocabulary entry
        (such as the padding index 0), or after ``max_tam`` steps.
    """
    seq = eng_tok.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_eng, padding="post")
    states = encoder_model.predict(seq, verbose=0)

    # Single-token decoder input, seeded with the start marker.
    target = np.zeros((1, 1))
    target[0, 0] = tam_tok.word_index["start"]

    result = []

    for _ in range(max_tam):
        preds, h, c = decoder_model.predict([target] + states, verbose=0)
        idx = int(np.argmax(preds[0, -1]))
        word = reverse_tam.get(idx)

        # Stop on the end marker, and also on padding / unknown indices:
        # appending an empty string for them would leave stray spaces in the
        # joined output and keep decoding past the end of the sentence.
        if word is None or word == "end":
            break

        result.append(word)
        # Feed the prediction and the new states back in for the next step.
        target[0, 0] = idx
        states = [h, c]

    return " ".join(result)

# ------------------ TEST ------------------
# Smoke-test the trained model on two sentences from the training data.
for sample in ("good morning", "hello"):
    print("English:", sample)
    print("Tamil:", translate(sample))


English: good morning
Tamil: காலை வணக்கம்
English: hello
Tamil: வணக்கம்
