In [1]:
"""
Create a Hinglish translation from English text. The text should sound natural and also
convert all the difficult words and phrases in English to Hinglish. This converted text should
be easy to understand for even a non-native Hindi speaker.
We have attached below the statements that are required to be used for this assignment.
1. Definitely share your feedback in the comment section.
2. So even if it's a big video, I will clearly mention all the products.
3. I was waiting for my bag.
Example:
Statement: I had about a 30 minute demo just using this new headset
Output required: मझु ेसि र्फ ३० minute का demo मि ला था इस नयेheadset का इस्तमे ाल करनेके
लि ए
Rules:
● The model must be able to generate a translation that is indistinguishable from
Hindi spoken by a casual Hindi speaker.
● Must be able to keep certain words in English to keep the Hindi translation Easy.
● The Hinglish sentences should be accurate to the meaning of the original sentence
"""

#use MT5 model for this
#use pretrained model
#use hinglish data for training

"\nCreate a Hinglish translation from English text. The text should sound natural and also\nconvert all the difficult words and phrases in English to Hinglish. This converted text should\nbe easy to understand for even a non-native Hindi speaker.\nWe have attached below the statements that are required to be used for this assignment.\n1. Definitely share your feedback in the comment section.\n2. So even if it's a big video, I will clearly mention all the products.\n3. I was waiting for my bag.\nExample:\nStatement: I had about a 30 minute demo just using this new headset\nOutput required: मझु ेसि र्फ ३० minute का demo मि ला था इस नयेheadset का इस्तमे ाल करनेके\nलि ए\nRules:\n● The model must be able to generate a translation that is indistinguishable from\nHindi spoken by a casual Hindi speaker.\n● Must be able to keep certain words in English to keep the Hindi translation Easy.\n● The Hinglish sentences should be accurate to the meaning of the original sentence\n"

In [2]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk.translate.bleu_score as bleu
import random
import string
from sklearn.model_selection import train_test_split
import os
import time

In [4]:
dataset = pd.read_csv("data/synthetic-dataset/train.csv")
dataset.head()

Unnamed: 0,English,Hindi,Hinglish,Average rating,Disagreement
0,Program module is a file that contains instruc...,"माड्यूल, एक संचिका होती है, जिसमें या तो स्रोत...","module , ek program hoti hai , jismen ya to so...",7,6
1,And to Thamud We sent their brother Sali 'h. H...,और (हमने) क़ौमे समूद के पास उनके भाई सालेह को ...,aur hamne aume samood ke pas unke bhaee saleh ...,6,4
2,"and, when reminded, do not remember\n","और जब उन्हें याद दिलाया जाता है, तो वे याद नही...","aur jab unhen yad dilaya jata hai , to ve yad ...",10,0
3,you won the TED Prize 2011.\n,तुम्हें २०११ का टेड प्राइज़ मिल गया है.\n,tumhen २०११ ka ted prize mil gaya hai\n,9,1
4,He gone to Kerodemal College of Delhi Universi...,उन्होंने बाद अध्ययन करने के लिए ये दिल्ली विश्...,unhonne bad science karne ke lie ye delhi univ...,7,0


In [5]:
dataset.describe()

Unnamed: 0,Average rating,Disagreement
count,2766.0,2766.0
mean,7.015184,2.20282
std,1.611146,1.885437
min,2.0,0.0
25%,6.0,1.0
50%,7.0,2.0
75%,8.0,3.0
max,10.0,9.0


In [7]:
dataset

Unnamed: 0,English,Hindi,Hinglish,Average rating,Disagreement
0,Program module is a file that contains instruc...,"माड्यूल, एक संचिका होती है, जिसमें या तो स्रोत...","module , ek program hoti hai , jismen ya to so...",7,6
1,And to Thamud We sent their brother Sali 'h. H...,और (हमने) क़ौमे समूद के पास उनके भाई सालेह को ...,aur hamne aume samood ke pas unke bhaee saleh ...,6,4
2,"and, when reminded, do not remember\n","और जब उन्हें याद दिलाया जाता है, तो वे याद नही...","aur jab unhen yad dilaya jata hai , to ve yad ...",10,0
3,you won the TED Prize 2011.\n,तुम्हें २०११ का टेड प्राइज़ मिल गया है.\n,tumhen २०११ ka ted prize mil gaya hai\n,9,1
4,He gone to Kerodemal College of Delhi Universi...,उन्होंने बाद अध्ययन करने के लिए ये दिल्ली विश्...,unhonne bad science karne ke lie ye delhi univ...,7,0
...,...,...,...,...,...
2761,Polar ice caps may melt further and increase t...,अधिक मात्रा में ध्रुवों की बर्फ पिघलने से सागर...,large size men polar ki barph pighalne se ocea...,6,4
2762,"It ' s what turns lead into gold , and makes t...","यही चक्र सीसे को सोना बना देता है , और सोने को...","yahi chakr lead into gold bana deta hai , aur ...",7,1
2763,The President said the North Eastern Hill Univ...,राष्ट्रपति ने कहा कि पूर्वोत्तर पर्वतीय विश्वव...,president ne kaha ki north parvtiy university ...,8,4
2764,The violin bow might well have grown out of th...,बहुत संभव है कि वायलिन का गज भी एक छड़ी को दूस...,bahut snbhav hai ki vaylin ka gaj bhi ek chhar...,5,3


In [10]:
#create a new dataframe of english and hinglish column
df = pd.DataFrame()
df['english'] = dataset['English']
df['hinglish'] = dataset['Hinglish']
df.head()

Unnamed: 0,english,hinglish
0,Program module is a file that contains instruc...,"module , ek program hoti hai , jismen ya to so..."
1,And to Thamud We sent their brother Sali 'h. H...,aur hamne aume samood ke pas unke bhaee saleh ...
2,"and, when reminded, do not remember\n","aur jab unhen yad dilaya jata hai , to ve yad ..."
3,you won the TED Prize 2011.\n,tumhen २०११ ka ted prize mil gaya hai\n
4,He gone to Kerodemal College of Delhi Universi...,unhonne bad science karne ke lie ye delhi univ...


In [11]:
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [12]:
#write function to preprocess english sentence
def preprocess_english_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub("'", '', sentence)
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    sentence = sentence.translate(remove_digits)
    sentence = sentence.strip()
    sentence = re.sub(" +", " ", sentence)
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

In [13]:
#preprocess hinglish sentense
def preprocess_hinglish_sentence(sentence):
    sentence = sentence.lower()
    sentence = re.sub("'", '', sentence)
    sentence = ''.join(ch for ch in sentence if ch not in exclude)
    sentence = sentence.translate(remove_digits)
    sentence = sentence.strip()
    sentence = re.sub(" +", " ", sentence)
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

In [14]:
df['english'] = df['english'].apply(preprocess_english_sentence)
df['hinglish'] = df['hinglish'].apply(preprocess_hinglish_sentence)

df.rename(columns={"english_sentence": "english", "hindi_sentence": "hindi"},inplace=True)

df.head()

Unnamed: 0,english,hinglish
0,<start> program module is a file that contains...,<start> module ek program hoti hai jismen ya t...
1,<start> and to thamud we sent their brother sa...,<start> aur hamne aume samood ke pas unke bhae...
2,<start> and when reminded do not remember <end>,<start> aur jab unhen yad dilaya jata hai to v...
3,<start> you won the ted prize <end>,<start> tumhen २०११ ka ted prize mil gaya hai ...
4,<start> he gone to kerodemal college of delhi ...,<start> unhonne bad science karne ke lie ye de...


In [15]:
#tokenzizer
def tokenizer(language):
    tokenizer = Tokenizer(filters='', split=" ")
    tokenizer.fit_on_texts(language)
    tensor = tokenizer.texts_to_sequences(language)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [18]:
def load_dataset():
    input_tensor, inp_lang_tokenizer = tokenizer(df['english'].values)
    target_tensor, targ_lang_tokenizer = tokenizer(df['hinglish'].values)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [19]:
input_tensor, target_tensor, input_lang, target_lang = load_dataset()

In [20]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]


In [21]:

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

2212 2212 554 554


In [23]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

vocab_inp_size =len(input_lang.word_index.keys())
vocab_tar_size =len(target_lang.word_index.keys())

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [32]:
embeddings_index = dict()
f = open('glove-2.txt',"w")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_inp_size+1, 300))
for word, i in input_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

UnsupportedOperation: not readable