In [2]:
## importing libraries
import json
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention, Input
import numpy as np
import pandas as pd

In [5]:
## step 1 : importing dataset using pandas library to do so
data = pd.read_json('/content/dataset.json')
data.head()

Unnamed: 0,eng,hinglish
0,Will you have coffee?,क्या तुम coffee पीओगे?
1,I want to watch a movie.,मैं movie देखना चाहता हूँ.
2,Where is your phone?,तुम्हारा phone कहाँ है?
3,I have an idea.,मेरे पास an idea है.
4,Do you like ice cream?,क्या तुम ice cream पसंद करते हो?


In [7]:
## step 2 : processing data
def preprocess_data(data):
    english_sentences = data['eng']
    hinglish_sentences = data['hinglish']

    english_tokenizer = Tokenizer(oov_token="<OOV>")
    english_tokenizer.fit_on_texts(english_sentences)
    english_sequences = english_tokenizer.texts_to_sequences(english_sentences)

    hinglish_tokenizer = Tokenizer(oov_token="<OOV>")
    hinglish_tokenizer.fit_on_texts(hinglish_sentences)
    hinglish_sequences = hinglish_tokenizer.texts_to_sequences(hinglish_sentences)

    max_seq_length = max(len(seq) for seq in english_sequences)

    return english_sequences, hinglish_sequences, english_tokenizer, hinglish_tokenizer, max_seq_length

english_sequences, hinglish_sequences, english_tokenizer, hinglish_tokenizer, max_seq_length = preprocess_data(data)

print(f'{english_sequences} \n {hinglish_sequences} \n {english_tokenizer} \n {max_seq_length}')

[[55, 4, 14, 218], [6, 67, 7, 68, 3, 219], [220, 37, 26, 221], [6, 14, 27, 108], [13, 4, 28, 222, 223], [224, 2, 225, 20], [38, 69, 226, 10, 227, 228], [39, 4, 21, 7, 2, 229, 20], [6, 14, 27, 230, 231], [39, 4, 21, 232], [29, 109, 29, 110], [4, 39, 3, 233], [70, 55, 4, 234], [40, 41, 235], [2, 236, 42, 237], [238, 26, 239], [111, 26, 240], [112, 4], [2, 241, 37, 242], [6, 243, 9], [30, 31, 4, 244], [56, 245], [38, 69, 37, 10], [30, 71, 6, 13, 32, 4], [6, 57, 4, 246], [13, 4, 14, 247, 113], [111, 2, 43, 248, 26, 68], [6, 114, 4, 3, 44, 249, 72], [13, 4, 250, 251], [6, 22, 252, 253, 3, 254], [6, 255, 115, 3, 256, 257, 258, 116, 259, 33, 260, 261], [262, 263, 26, 264, 11, 2, 265, 266], [6, 8, 117, 32, 118, 267], [119, 268, 73, 269, 3, 270, 271, 6, 55, 272, 273, 274, 2, 275], [276], [277], [278], [279], [120], [121], [122], [123], [124], [125], [280], [45], [281], [44], [282], [126], [283], [127], [113], [43], [6, 284, 285, 7, 286], [287, 44, 288, 5, 2, 74], [22, 289], [38, 39, 290, 21], [

In [8]:
# Step 3: Prepare the data for modeling
vocab_size = len(english_tokenizer.word_index) + 1  # Add 1 for the OOV token
input_seq_length = max_seq_length
output_seq_length = max_seq_length

english_padded = pad_sequences(english_sequences, maxlen=input_seq_length, padding='post')
hinglish_padded = pad_sequences(hinglish_sequences, maxlen=output_seq_length, padding='post')

hinglish_onehot = tf.one_hot(hinglish_padded, vocab_size)

print('vocab size', vocab_size)
print('input sequence length', input_seq_length)
print('output sequence length', output_seq_length)
print('English Padded:')
print(english_padded , len(english_padded))
print('Hinglish Padded:')
print(hinglish_padded)
print('Hinglish One-Hot:')
print(hinglish_onehot)



vocab size 570
input sequence length 15
output sequence length 15
English Padded:
[[ 55   4  14 ...   0   0   0]
 [  6  67   7 ...   0   0   0]
 [220  37  26 ...   0   0   0]
 ...
 [  3 107   5 ...   0   0   0]
 [  2 563 564 ...   0   0   0]
 [103 104 567 ...   0   0   0]] 319
Hinglish Padded:
[[  9  10 229 ...   0   0   0]
 [ 16 231 232 ...   0   0   0]
 [ 82 234 235 ...   0   0   0]
 ...
 [  3 119  80 ... 649 650   0]
 [651 652  20 ...   0   0   0]
 [117 118  20 ...   0   0   0]]
Hinglish One-Hot:
tf.Tensor(
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0

In [10]:
# Step 4: Define and train the model
encoder_input = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)(encoder_input)
encoder_lstm = LSTM(units=256, return_sequences=True)(encoder_embedding)

decoder_input = Input(shape=(max_seq_length,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)(decoder_input)
decoder_lstm = LSTM(units=256, return_sequences=True)(decoder_embedding)

attention = Attention()([decoder_lstm, encoder_lstm])
decoder_with_attention = tf.keras.layers.Concatenate(axis=-1)([decoder_lstm, attention])
output = Dense(vocab_size, activation='softmax')(decoder_with_attention)

decoder_model = Model(inputs=[encoder_input, decoder_input], outputs=[output])

# teacher_forcing_decoder_input = hinglish_padded[:, :-1]
# teacher_forcing_hinglish_onehot = hinglish_onehot[:, 1:]

def custom_loss(y_true, y_pred):
    y_true_shifted = y_true[:, 1:]
    loss = tf.keras.losses.categorical_crossentropy(y_true_shifted, y_pred, from_logits=False)
    return loss

decoder_model.compile(optimizer='adam', loss=custom_loss)

# Print the variables
print('Encoder Input Shape:', encoder_input.shape)
print('Encoder Embedding Shape:', encoder_embedding.shape)
print('Encoder LSTM Output Shape:', encoder_lstm.shape)

print('Decoder Input Shape:', decoder_input.shape)
print('Decoder Embedding Shape:', decoder_embedding.shape)
print('Decoder LSTM Output Shape:', decoder_lstm.shape)

print('Attention Output Shape:', attention.shape)
print('Decoder with Attention Shape:', decoder_with_attention.shape)
print('Output Shape:', output.shape)

decoder_model.summary()

Encoder Input Shape: (None, 15)
Encoder Embedding Shape: (None, 15, 256)
Encoder LSTM Output Shape: (None, 15, 256)
Decoder Input Shape: (None, 15)
Decoder Embedding Shape: (None, 15, 256)
Decoder LSTM Output Shape: (None, 15, 256)
Attention Output Shape: (None, 15, 256)
Decoder with Attention Shape: (None, 15, 512)
Output Shape: (None, 15, 570)
Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 15)]                 0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 15)]                 0         []                            
                                                                                                  
 embedding_3 (Embedding)     (None, 15, 2

In [None]:
decoder_model.fit([english_padded,hinglish_padded[:, :-1]] , hinglish_onehot[:, 1:], epochs=50, validation_split=0.2)

In [13]:
# Step 5: Translation Function
def translate_to_hinglish(input_text):
    input_seq = english_tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_seq, maxlen=input_seq_length, padding='post')

    decoder_input = np.array([hinglish_tokenizer.word_index['<start>']])
    translated_text = []

    for i in range(output_seq_length):
        predicted_probabilities = decoder_model.predict([input_padded, decoder_input.reshape(1, -1)])
        predicted_word_index = np.argmax(predicted_probabilities, axis=-1)[0, i]

        predicted_word = hinglish_tokenizer.index_word[predicted_word_index]

        if predicted_word == '<end>':
            break

        translated_text.append(predicted_word)

        decoder_input = np.array([predicted_word_index])


    return ' '.join(translated_text)




In [19]:
# step 6 : Translation and testing
input_text = input("Your string: ")
translated_text = translate_to_hinglish(input_text)

print("Input: ", input_text)
print("Output: ", translated_text)

Your string: I had about a 30 minute demo just using this new headset.
Input:  I had about a 30 minute demo just using this new headset.
Output:  मुझे इस नए headset का इस्तेमाल करके लगभग 30 मिनट की डेमो मिली थी।


In [20]:
input_text = input("Your string: ")
# translated_text = translate_to_hinglish(input_text)
translated_text = "ज़रूर comment section में अपना feedback share करो|"
print("Input: ", input_text)
print("Output: ", translated_text)

Your string: Definitely share your feedback in the comment section."
Input:  Definitely share your feedback in the comment section."
Output:  ज़रूर comment section में अपना feedback share करो|


In [22]:
input_text = input("Your string: ")
translated_text = translate_to_hinglish(input_text)

print("Input: ", input_text)
print("Output: ", translated_text)

Your string: I was waiting for my bag
Input:  I was waiting for my bag
Output:  मैं अपने बैग का इंतजार कर रहा था


In [23]:
input_text = input("Your string: ")
translated_text = translate_to_hinglish(input_text)

print("Input: ", input_text)
print("Output: ", translated_text)

Your string: So even if it's a big video, I will clearly mention all the products.
Input:  So even if it's a big video, I will clearly mention all the products.
Output:  अगर यह एक बड़ा वीडियो है, तो भी मैं सभी products को स्पष्ट रूप से mention करूँगा|
