# Importing Required Libraries

In [40]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import string

#Imorting google drive to acess dataset from drive

Note - you can skip this step if you have data locally.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Loading Dataset

In [41]:
# Load your dataset from the CSV file
df = pd.read_csv('drive/MyDrive/Assignment_3/Train.csv')

#Extarting and preprocessing data

In [51]:
# Extract the English and Hinglish sentences from the DataFrame
english_sentences = df['english_sentences'][:750].tolist()
hinglish_sentences = df['hinglish_sentences'][:750].tolist()
english_sentences = [''.join(char for char in item if char not in string.punctuation)for item in english_sentences]
hinglish_sentences = [''.join(char for char in item if char not in string.punctuation)for item in hinglish_sentences]

#Creating token by converting the text to vector formate

In [52]:
# Tokenization
english_tokenizer = keras.layers.TextVectorization(output_mode='int')
english_tokenizer.adapt(english_sentences)

hinglish_tokenizer = keras.layers.TextVectorization(output_mode='int')
hinglish_tokenizer.adapt(hinglish_sentences)

#Defining Encoder and Decoder

In [53]:
# Define the encoder-decoder model
input_layer = Input(shape=(None,))
encoder = keras.layers.Embedding(input_dim=len(english_tokenizer.get_vocabulary()), output_dim=256)(input_layer)
encoder, state_h, state_c = LSTM(256, return_state=True)(encoder)

decoder_input = Input(shape=(None,))
decoder = keras.layers.Embedding(input_dim=len(hinglish_tokenizer.get_vocabulary()), output_dim=256)(decoder_input)
decoder, _, _ = LSTM(256, return_sequences=True, return_state=True)(decoder, initial_state=[state_h, state_c])
output = Dense(len(hinglish_tokenizer.get_vocabulary()), activation='softmax')(decoder)

#Creating Models and training Models

In [54]:
model = Model(inputs=[input_layer, decoder_input], outputs=output)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training data
x_train = english_tokenizer(np.array(english_sentences))
y_train = hinglish_tokenizer(np.array(hinglish_sentences))

# Padding sequences
x_train = keras.preprocessing.sequence.pad_sequences(x_train, padding='post')
y_train = keras.preprocessing.sequence.pad_sequences(y_train, padding='post')

# Train the model
model.fit([x_train, y_train[:, :-1]], y_train[:, 1:], batch_size=1, epochs=20)

# Inference model (for translation)
encoder_model = Model(inputs=input_layer, outputs=[state_h, state_c])
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embed = keras.layers.Embedding(input_dim=len(hinglish_tokenizer.get_vocabulary()), output_dim=256)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)

decoder_input = Input(shape=(None,))
decoder_embedded = decoder_embed(decoder_input)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedded, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_dense = Dense(len(hinglish_tokenizer.get_vocabulary()), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#Creating Translation Fuction

In [57]:
# Translate English to Hinglish
def translate_english_to_hinglish(input_text):
    input_seq = english_tokenizer([input_text])
    initial_states = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))  # Initialize with start token

    stop_condition = False
    output_text = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + initial_states)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = hinglish_tokenizer.get_vocabulary()[sampled_token_index]

        if sampled_token != '<end>':
            output_text += sampled_token + ' '

        # Exit condition: either hitting max length or finding the end token
        if sampled_token == '<end>' or len(output_text.split()) > len(input_text)+5:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.array([[sampled_token_index]])

        # Update states
        initial_states = [h, c]

    return output_text

# Example usage
input_text = 'Hello how are you ?'
translated_text = translate_english_to_hinglish(input_text)
print(translated_text)

actually actually actually actually friends friends friends sirf sirf sirf sirf sirf sirf bhaag barbaad barbaad bob shaark b pehle pehle gae gae referred laaye 


In [58]:
input_text = 'Definitely share your feedback in the comment section.'
translated_text = translate_english_to_hinglish(input_text)
print(translated_text)

tim tim tim tim tim tim tim tim nihitaarth nihitaarth nihitaarth nihitaarth yek yek koriyograaphee per minions anubhav per minions anubhav parr accuracy parr kodabreking meril kheenchata ven meril ven traasadee traasadee rahi metacritic traasadee saupa kareng metacritic piece release kareng goyer sev likable pasandida kaha aya jarurat tareeke ghar victor dekhane californiamein jise bete bete kon jaega isaka kaisi 


In [59]:
input_text = 'So even if its a big video, I will clearly mention all the products.'
translated_text = translate_english_to_hinglish(input_text)
print(translated_text)

tim tim tim tim tim tim tim tim tim univerce nihitaarth nihitaarth nihitaarth nihitaarth man nihitaarth yek yek per limited low low per minions sawalbahut sawalbahut typo steven accuracy ironic anubhav zyadatar majakiya chase zyadatar sa actors gordon socho 1920 6 bhi tomaatoz position laghbhag depending remember hone khada vishwaas bad impact robin googlai classics jisne copley copley copley dekhthe apni definitely male dekhunga month paatr udaane paravaah choice chalta gru demiyan demiyan zack 


In [60]:
input_text = 'I was waiting for my bag.'
translated_text = translate_english_to_hinglish(input_text)
print(translated_text)

nefario nefario shaark shaark shaark shaark shaark shaark shaark shaark shaark sirf sirf sirf sirf sirf ninteen bhaag shaark un week victor b barbaad b better charitron philmon bhaga shaadee fight 
