In [1]:
!pip install tensorflow




In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Input, TimeDistributed

# Load dataset (English-Telugu translation pairs)
data = pd.read_csv('Q4 MT Eng Tel Dataset.txt', sep='\t', header=None)
data.columns = ['English', 'Telugu', 'Meta']
data = data[['English', 'Telugu']]

# Add <start> and <end> to Telugu sentences for clearer prediction targets
data['Telugu'] = data['Telugu'].apply(lambda x: '<start> ' + x + ' <end>')

# Initialize Tokenizers
eng_tokenizer = Tokenizer()
tel_tokenizer = Tokenizer()

# Fit the tokenizer to the texts and ensure <start> and <end> are in the word_index
eng_tokenizer.fit_on_texts(data['English'])
tel_tokenizer.fit_on_texts(data['Telugu'])

# Ensure the <start> and <end> tokens are included in the word index
tel_tokenizer.word_index['<start>'] = len(tel_tokenizer.word_index) + 1
tel_tokenizer.word_index['<end>'] = len(tel_tokenizer.word_index) + 1
tel_tokenizer.index_word[tel_tokenizer.word_index['<start>']] = '<start>'
tel_tokenizer.index_word[tel_tokenizer.word_index['<end>']] = '<end>'

X = eng_tokenizer.texts_to_sequences(data['English'])
y = tel_tokenizer.texts_to_sequences(data['Telugu'])

# Padding the sequences to ensure equal length
max_len = max(max(len(seq) for seq in X), max(len(seq) for seq in y))
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# One-hot encode target sequences for categorical cross-entropy loss
y = tf.keras.utils.to_categorical(y, num_classes=len(tel_tokenizer.word_index) + 1)

# Define the RNN-based Seq2Seq model
# Encoder
encoder_input = Input(shape=(max_len,))
encoder_emb = Embedding(input_dim=len(eng_tokenizer.word_index) + 1, output_dim=64)(encoder_input)
encoder_rnn = SimpleRNN(128, return_state=True)
encoder_output, state_h = encoder_rnn(encoder_emb)

# Decoder
decoder_input = Input(shape=(max_len, len(tel_tokenizer.word_index) + 1))
decoder_rnn = SimpleRNN(128, return_sequences=True)(decoder_input, initial_state=[state_h])
decoder_output = TimeDistributed(Dense(len(tel_tokenizer.word_index) + 1, activation='softmax'))(decoder_rnn)

# Model definition
model = Model([encoder_input, decoder_input], decoder_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Prepare decoder input data (shift target sequences by 1)
y_decoder_input = np.zeros_like(y)
y_decoder_input[:, 1:] = y[:, :-1]

# Train the model
model.fit([X, y_decoder_input], y, epochs=30, batch_size=32)

# Extract encoder model (for translation)
# Correct way to extract the encoder model
encoder_model = Model(inputs=encoder_input, outputs=[encoder_output, state_h])

# Extract decoder model (for translation)
decoder_state_input_h = Input(shape=(128,))
decoder_input_single = Input(shape=(1, len(tel_tokenizer.word_index) + 1))
decoder_rnn_single = SimpleRNN(128, return_sequences=True)(decoder_input_single, initial_state=[decoder_state_input_h])
decoder_output_single = TimeDistributed(Dense(len(tel_tokenizer.word_index) + 1, activation='softmax'))(decoder_rnn_single)
decoder_model = Model([decoder_input_single, decoder_state_input_h], decoder_output_single)

# Function to translate a sentence
def translate(text):
    # Tokenize and pad the input sentence
    seq = eng_tokenizer.texts_to_sequences([text])
    seq = pad_sequences(seq, maxlen=max_len, padding='post')

    # Get encoder output and state
    encoder_output, state_h = encoder_model.predict(seq)

    # Initialize the decoder input with <start> token
    decoder_input = np.zeros((1, 1, len(tel_tokenizer.word_index) + 1))
    decoder_input[0, 0, tel_tokenizer.word_index['<start>']] = 1

    # List to store translated words
    translated_words = []

    for _ in range(max_len):
        # Get decoder output using the encoder states
        decoder_output_probs = decoder_model.predict([decoder_input, state_h])[0, 0]
        
        # Get the predicted word index
        predicted_index = np.argmax(decoder_output_probs)
        predicted_word = tel_tokenizer.index_word.get(predicted_index, '')

        # Stop translation when <end> token is generated
        if predicted_word == '<end>':
            break

        translated_words.append(predicted_word)

        # Update the decoder input for the next time step
        decoder_input = np.zeros((1, 1, len(tel_tokenizer.word_index) + 1))
        decoder_input[0, 0, predicted_index] = 1

        # Update the decoder state
        state_h = state_h  # No need to update, just pass the last state

    return ' '.join(translated_words)

# Test translations
print("\nTranslation Examples:")
test_sentences = [
    "I ran home.",
    "Who are we?",
    "Thank you.",
    "How tall is she?",
    "He's my son.",
    "I drank coffee.",
    "My head hurts."
]

for sent in test_sentences:
    print(f"English: {sent}")
    print(f"Telugu: {translate(sent)}\n")


Epoch 1/30




[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.1123 - loss: 5.9833 
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5294 - loss: 5.0176
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5986 - loss: 3.8982
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5991 - loss: 3.1087
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6030 - loss: 2.7193
Epoch 6/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6096 - loss: 2.4878
Epoch 7/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6206 - loss: 2.3063
Epoch 8/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6315 - loss: 2.2182
Epoch 9/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Telugu: కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి కావాలి

English: Who are we?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [3