In [1]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd




In [2]:
data = pd.read_csv("Hindi_English_Truncated_Corpus.csv")

In [3]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
input_texts = data.english_sentence	.values.astype('str')
target_texts = data.hindi_sentence.values.astype('str')

In [5]:
# Input Tokenization
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(input_texts)
input_vocab_size = len(input_tokenizer.word_index) + 1
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_max_len = max(len(seq) for seq in input_sequences)
padded_input_sequences = pad_sequences(input_sequences, maxlen=input_max_len, padding='post')

In [6]:
# output Tokenization
target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(target_texts)
target_vocab_size = len(target_tokenizer.word_index) + 1
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_max_len = max(len(seq) for seq in target_sequences)
padded_target_sequences = pad_sequences(target_sequences, maxlen=target_max_len, padding='post')

In [7]:
# Create encoder-decoder model
embedding_dim = 128
units = 256

In [8]:
# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]




In [9]:
# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [10]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [19]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(padded_input_sequences, padded_target_sequences, test_size=0.2, random_state=42)

In [None]:
model.fit([X_train, Y_train[:, :-1]], Y_train.reshape(Y_train.shape[0], Y_train.shape[1], 1)[:, 1:],
          epochs=2, batch_size=32, validation_data=([X_test, Y_test[:, :-1]], Y_test.reshape(Y_test.shape[0], Y_test.shape[1], 1)[:, 1:]), verbose=1)

Epoch 1/2


 460/3191 [===>..........................] - ETA: 16:18:04 - loss: 0.7531 - accuracy: 0.9582