In [1]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk.translate.bleu_score as bleu
import random
import string
from sklearn.model_selection import train_test_split
import os
import time

In [2]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
eng_hin = pd.read_csv(
    "/content/drive/MyDrive/PRL/task/data/synthetic-dataset/train.csv"
)
eng_hin.head()

Unnamed: 0,English,Hindi,Hinglish,Average rating,Disagreement
0,Program module is a file that contains instruc...,"माड्यूल, एक संचिका होती है, जिसमें या तो स्रोत...","module , ek program hoti hai , jismen ya to so...",7,6
1,And to Thamud We sent their brother Sali 'h. H...,और (हमने) क़ौमे समूद के पास उनके भाई सालेह को ...,aur hamne aume samood ke pas unke bhaee saleh ...,6,4
2,"and, when reminded, do not remember\n","और जब उन्हें याद दिलाया जाता है, तो वे याद नही...","aur jab unhen yad dilaya jata hai , to ve yad ...",10,0
3,you won the TED Prize 2011.\n,तुम्हें २०११ का टेड प्राइज़ मिल गया है.\n,tumhen २०११ ka ted prize mil gaya hai\n,9,1
4,He gone to Kerodemal College of Delhi Universi...,उन्होंने बाद अध्ययन करने के लिए ये दिल्ली विश्...,unhonne bad science karne ke lie ye delhi univ...,7,0


In [41]:
# create a new dataframe of english and hinglish column
df = pd.DataFrame()
df["english"] = eng_hin["English"]
df["hindi"] = eng_hin["Hindi"]
df.head()

Unnamed: 0,english,hindi
0,Program module is a file that contains instruc...,"माड्यूल, एक संचिका होती है, जिसमें या तो स्रोत..."
1,And to Thamud We sent their brother Sali 'h. H...,और (हमने) क़ौमे समूद के पास उनके भाई सालेह को ...
2,"and, when reminded, do not remember\n","और जब उन्हें याद दिलाया जाता है, तो वे याद नही..."
3,you won the TED Prize 2011.\n,तुम्हें २०११ का टेड प्राइज़ मिल गया है.\n
4,He gone to Kerodemal College of Delhi Universi...,उन्होंने बाद अध्ययन करने के लिए ये दिल्ली विश्...


In [42]:
eng_hin.dropna(inplace=True)
eng_hin.shape

(2766, 7)

In [69]:
import pickle

with open(
    "/content/drive/MyDrive/PRL/task/data/human-generated-dataset/train_human_generated.pkl",
    "rb",
) as f:
    human_generated = pickle.load(f)
    print("Human generated dataset size:", len(human_generated))

human_generated

Human generated dataset size: 1800


{'Program module is a file that contains instructions which are either in the form of source code or machine language.\n': ['Program module ek sanchika hoti hai, jisme ya to source code ya machine language ke rup me anudesh nihit hote hai.\n',
  'Program module ek sanchika hoti hai, that contains instructions which are either in the form of source code or machine language.\n',
  'Program module is a file jisme source code or machine language ke rup me instructions nihit hote he.\n',
  'Program module ak file hoti he that contains instructions which are either in source code or machine language. \n'],
 "And to Thamud We sent their brother Sali 'h. He said:' My people! Serve Allah; you have no god other than Him. He brought you into being out of the earth, and has made you dwell in it. So ask Him to forgive you, and do turn towards Him in repentance. Indeed My Lord is near, responsive to prayers.\n": ['Aur humne unke bhai Salih ko Thamud ke paas bheja. Usne kaha, “My people! Serve Allah;

In [None]:
exclude = set(string.punctuation)  # Set of all special characters
remove_digits = str.maketrans("", "", string.digits)  # Set of all digits

In [44]:
def preprocess(text):
    """Function to preprocess English sentence"""
    text = text.lower()  # lower casing
    text = re.sub("'", "", text)  # remove the quotation marks if any
    text = "".join(ch for ch in text if ch not in exclude)
    text = text.translate(remove_digits)  # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text)  # remove extra spaces
    text = "<start> " + text + " <end>"
    return text

In [45]:
def preprocess_hin(text):
    """Function to preprocess Marathi sentence"""
    text = re.sub("'", "", text)  # remove the quotation marks if any
    text = "".join(ch for ch in text if ch not in exclude)
    text = re.sub("[२३०८१५७९४६]", "", text)  # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text)  # remove extra spaces
    text = "<start> " + text + " <end>"
    return text

In [46]:
eng_hin["english"] = df["english"].apply(preprocess)
eng_hin["hindi"] = df["hindi"].apply(preprocess_hin)

eng_hin.rename(columns={"english": "english", "hindi": "hindi"}, inplace=True)

eng_hin.head()

Unnamed: 0,English,Hindi,Hinglish,Average rating,Disagreement,english,hindi
0,Program module is a file that contains instruc...,"माड्यूल, एक संचिका होती है, जिसमें या तो स्रोत...","module , ek program hoti hai , jismen ya to so...",7,6,<start> program module is a file that contains...,<start> माड्यूल एक संचिका होती है जिसमें या तो...
1,And to Thamud We sent their brother Sali 'h. H...,और (हमने) क़ौमे समूद के पास उनके भाई सालेह को ...,aur hamne aume samood ke pas unke bhaee saleh ...,6,4,<start> and to thamud we sent their brother sa...,<start> और हमने क़ौमे समूद के पास उनके भाई साल...
2,"and, when reminded, do not remember\n","और जब उन्हें याद दिलाया जाता है, तो वे याद नही...","aur jab unhen yad dilaya jata hai , to ve yad ...",10,0,<start> and when reminded do not remember <end>,<start> और जब उन्हें याद दिलाया जाता है तो वे ...
3,you won the TED Prize 2011.\n,तुम्हें २०११ का टेड प्राइज़ मिल गया है.\n,tumhen २०११ ka ted prize mil gaya hai\n,9,1,<start> you won the ted prize <end>,<start> तुम्हें का टेड प्राइज़ मिल गया है <end>
4,He gone to Kerodemal College of Delhi Universi...,उन्होंने बाद अध्ययन करने के लिए ये दिल्ली विश्...,unhonne bad science karne ke lie ye delhi univ...,7,0,<start> he gone to kerodemal college of delhi ...,<start> उन्होंने बाद अध्ययन करने के लिए ये दिल...


In [47]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")
    lang_tokenizer.fit_on_texts(lang)

    tensor = lang_tokenizer.texts_to_sequences(lang)

    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        tensor, padding="post", maxlen=20, dtype="int32"
    )

    return tensor, lang_tokenizer

In [48]:
def load_dataset():
    input_tensor, inp_lang_tokenizer = tokenize(eng_hin["english"].values)
    target_tensor, targ_lang_tokenizer = tokenize(eng_hin["hindi"].values)

    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [49]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()

In [50]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [51]:
(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(
    len(input_tensor_train),
    len(target_tensor_train),
    len(input_tensor_val),
    len(target_tensor_val),
)

2212 2212 554 554


In [52]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 32
N_BATCH = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 256
units = 1024
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

vocab_inp_size = len(inp_lang.word_index.keys())
vocab_tar_size = len(targ_lang.word_index.keys())

dataset = tf.data.Dataset.from_tensor_slices(
    (input_tensor_train, target_tensor_train)
).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [53]:
embeddings_index = dict()
f = open(
    "/content/drive/MyDrive/PRL/task/data/synthetic-dataset/glove.6B.300d.txt"
)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_inp_size + 1, 300))
for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [54]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            name="embedding_layer_encoder",
            trainable=False,
        )
        self.gru = tf.keras.layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_activation="sigmoid",
            recurrent_initializer="glorot_uniform",
        )

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [55]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            units,
            return_sequences=True,
            return_state=True,
            recurrent_activation="sigmoid",
            recurrent_initializer="glorot_uniform",
        )
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(
            tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        )

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)

        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [56]:
tf.keras.backend.clear_session()

encoder = Encoder(vocab_inp_size + 1, 300, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size + 1, embedding_dim, units, BATCH_SIZE)

In [57]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [58]:
checkpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer, encoder=encoder, decoder=decoder
)

In [59]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        encoder.get_layer("embedding_layer_encoder").set_weights(
            [embedding_matrix]
        )
        dec_hidden = enc_hidden

        dec_input = tf.expand_dims(
            [targ_lang.word_index["<start>"]] * BATCH_SIZE, 1
        )

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(
                dec_input, dec_hidden, enc_output
            )

            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [60]:
EPOCHS = 100

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inp, targ) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print(
                f"Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}"
            )
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print(f"Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}")
    print(f"Time taken for 1 epoch {time.time()-start:.2f} sec\n")

Epoch 1 Batch 0 Loss 6.5222
Epoch 1 Loss 5.2688
Time taken for 1 epoch 41.56 sec

Epoch 2 Batch 0 Loss 5.3985
Epoch 2 Loss 4.7789
Time taken for 1 epoch 10.67 sec

Epoch 3 Batch 0 Loss 4.6290
Epoch 3 Loss 4.5858
Time taken for 1 epoch 7.52 sec

Epoch 4 Batch 0 Loss 4.3748
Epoch 4 Loss 4.3632
Time taken for 1 epoch 8.77 sec

Epoch 5 Batch 0 Loss 4.1554
Epoch 5 Loss 4.1030
Time taken for 1 epoch 7.61 sec

Epoch 6 Batch 0 Loss 3.9583
Epoch 6 Loss 3.7828
Time taken for 1 epoch 8.60 sec

Epoch 7 Batch 0 Loss 3.2661
Epoch 7 Loss 3.3910
Time taken for 1 epoch 7.08 sec

Epoch 8 Batch 0 Loss 3.0859
Epoch 8 Loss 2.9276
Time taken for 1 epoch 8.78 sec

Epoch 9 Batch 0 Loss 2.5109
Epoch 9 Loss 2.3943
Time taken for 1 epoch 7.17 sec

Epoch 10 Batch 0 Loss 1.9882
Epoch 10 Loss 1.8577
Time taken for 1 epoch 7.87 sec

Epoch 11 Batch 0 Loss 1.2558
Epoch 11 Loss 1.4060
Time taken for 1 epoch 7.60 sec

Epoch 12 Batch 0 Loss 1.0407
Epoch 12 Loss 1.0609
Time taken for 1 epoch 8.16 sec

Epoch 13 Batch 0 Los

In [61]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess(sentence)

    inputs = [inp_lang.word_index[i] for i in sentence.split(" ")]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=20, padding="post"
    )
    inputs = tf.convert_to_tensor(inputs)

    result = ""

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index["<start>"]], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out
        )
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()

        result += targ_lang.index_word[predicted_id] + " "

        if targ_lang.index_word[predicted_id] == "<end>":
            return result, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, attention_plot

In [62]:
input_sentence = "please ensure that you use the appropriate form "
print("Input sentence in english : ", input_sentence)
predicted_output, attention_plot = evaluate(input_sentence)
print("Predicted sentence in hindi : ", predicted_output)

Input sentence in english :  please ensure that you use the appropriate form 
Predicted sentence in hindi :  आपकी मांग को बदलने का गाना मंजूर किया हुआ <end> 


In [63]:
input_sentence = "and do something with it to change the world "
print("Input sentence in english : ", input_sentence)
predicted_output, attention_plot = evaluate(input_sentence)
print("Predicted sentence in hindi : ", predicted_output)

Input sentence in english :  and do something with it to change the world 
Predicted sentence in hindi :  और अगर मालूम हो <end> 


# OpenInAPP sentences output


In [64]:
input_sentence = (
    "So even if its a big video I will clearly mention all the products "
)
print("Input sentence in english : ", input_sentence)
predicted_output, attention_plot = evaluate(input_sentence)
print("Predicted sentence in hindi : ", predicted_output)

Input sentence in english :  So even if its a big video I will clearly mention all the products 
Predicted sentence in hindi :  तो सच है <end> 


In [65]:
input_sentence = "I was waiting for my bag "
print("Input sentence in english : ", input_sentence)
predicted_output, attention_plot = evaluate(input_sentence)
print("Predicted sentence in hindi : ", predicted_output)

Input sentence in english :  I was waiting for my bag 
Predicted sentence in hindi :  मैंने मैं धैर्य न छूट जाए। <end> 


In [66]:
input_sentence = "definitely share your feedback in the comment section "
print("Input sentence in english : ", input_sentence)
predicted_output, attention_plot = evaluate(input_sentence)
print("Predicted sentence in hindi : ", predicted_output)

Input sentence in english :  definitely share your feedback in the comment section 


KeyError: ignored