In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the parallel-sentence CSV into a DataFrame and print its column summary.
csv_path = '/kaggle/input/english-to-hindi-parallel-dataset/newdata.csv'
data = pd.read_csv(csv_path)
data.info()

In [None]:
# Positional lookup: column 1 is used as the English side and column 2 as the
# Hindi side; both are pulled out as NumPy arrays.
english, hindi = (data.iloc[:, col].values for col in (1, 2))

print(english, hindi, sep='\n')

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import re
import pandas as pd

In [None]:
def preprocessing_data(line):
    """Normalize one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Args:
        line: The sentence to clean. Any object is accepted; it is converted
            with ``str()`` first.

    Returns:
        The lower-cased sentence with the punctuation marks ``? . ! , ¿ |``
        separated from neighbouring words by single spaces, prefixed with
        ``'<start> '`` and suffixed with ``' <end>'``.
    """
    text = str(line).lower().strip()

    # Surround each punctuation mark with spaces so it becomes its own token.
    text = re.sub(r"([?.!,¿|])", r" \1 ", text)

    # Collapse runs of spaces into one space. The character class also
    # contains the double-quote character, so double quotes are replaced by a
    # space as well.
    text = re.sub(r'[" "]+', " ", text).strip()

    # No character whitelist is applied, so digits and non-Latin scripts are
    # kept as they are.
    return f"<start> {text} <end>"

In [None]:
# Clean every sentence and wrap it in <start>/<end> markers.
# NOTE: the results are stored back under the same names, so running this cell
# a second time wraps the already-wrapped sentences again.
hindi = list(map(preprocessing_data, hindi))
english = list(map(preprocessing_data, english))

In [None]:
# Sanity check: show the first preprocessed sentence of each language
# (Hindi via print, English as the cell's displayed value).
print(hindi[0])
english[0]

In [None]:
def word_to_vec(inputs):
    """Turn a list of sentences into a padded matrix of integer token ids.

    Args:
        inputs: Sentences (strings) used both to fit the vocabulary and to be
            encoded.

    Returns:
        A ``(padded, vocab)`` pair: ``padded`` is the array produced by
        ``pad_sequences`` with padding added at the end of each row, and
        ``vocab`` is the fitted ``Tokenizer``.
    """
    # filters='' disables the Tokenizer's character filtering, so punctuation
    # tokens and the <start>/<end> markers are kept.
    vocab = Tokenizer(filters='')
    vocab.fit_on_texts(inputs)

    padded = pad_sequences(vocab.texts_to_sequences(inputs), padding='post')
    return padded, vocab

In [None]:
# Only the first 10 sentence pairs are tokenized; each language gets its own
# vocabulary and padded id matrix.
subset = slice(0, 10)
eng_tensor, eng_token = word_to_vec(english[subset])
hindi_tensor, hindi_token = word_to_vec(hindi[subset])

In [None]:
# Show the padded id sequence of the first sentence in each language.
print(eng_tensor[0])
hindi_tensor[0]

In [None]:
# Padded sequence length (number of columns) of each id matrix.
max_eng = eng_tensor.shape[1]
max_hindi = hindi_tensor.shape[1]

print(max_eng, max_hindi)

In [None]:
# Model / training hyper-parameters.
batch_size = 64
embedding_dim = 256
units = 1024

# Dataset bookkeeping derived from the number of tokenized sentences.
buffer_size = len(eng_tensor)
steps_per_epoch = buffer_size // batch_size

# Vocabulary sizes: one more than the number of known words, leaving room for
# index 0, which the padded matrices use as filler.
vocab_train = len(eng_token.word_index) + 1
vocab_label = len(hindi_token.word_index) + 1

#dataset = tf.data.Dataset.from_tensor_slices(([eng_tensor, hindi_tensor], hindi_tensor[:,1:])).shuffle(buffer_size)
#dataset = dataset.batch(batch_size, drop_remainder=True)

# decoder_input is the full Hindi id matrix; decoder_output is the same matrix
# without its first column (hindi_tensor[:, 1:]), i.e. without the leading
# <start> token, so it is shifted one step to the left.
encoder_input = eng_tensor
decoder_input = hindi_tensor[:, :]
decoder_output = hindi_tensor[:, 1:]

for split in (encoder_input, decoder_input, decoder_output):
    print(split.shape)

In [None]:
# Display the first full target row (still containing both <start> and <end>).
decoder_input[0]

In [None]:
# Buffer for the decoder's training input: one row per target sentence and one
# column fewer than the padded Hindi length. The row count is taken from
# decoder_input instead of being hard-coded, so changing the number of
# tokenized sentences does not break the shapes.
dec_input = np.zeros((len(decoder_input), max_hindi - 1), dtype=int)

In [None]:
# Fill dec_input with every target row minus its <end> token.
# The id of <end> is looked up in the tokenizer instead of assuming it is 2:
# the Tokenizer numbers words by frequency, so the id depends on the corpus.
end_id = hindi_token.word_index['<end>']
for i, row in enumerate(decoder_input):
    # Each row is expected to contain <end> exactly once, so dropping it
    # leaves max_hindi - 1 ids, matching the width of dec_input.
    dec_input[i, :] = row[row != end_id]

In [None]:
# Compare the decoder target with the decoder input for the first sentence.
print(decoder_output[0], dec_input[0], sep='\n')

In [None]:
#print(decoder_output.shape)

In [None]:
# Encoder: padded English token ids -> embedding -> GRU.
# The GRU returns both its per-step outputs (output_e) and its final state
# (hidden_e); hidden_e is what initializes the decoder GRU below.
encoder = tf.keras.Input(shape=(max_eng, ))
enc_embd = tf.keras.layers.Embedding(vocab_train, embedding_dim)(encoder)
encoder_gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, kernel_regularizer=tf.keras.regularizers.L2(0.001))
output_e, hidden_e = encoder_gru(enc_embd)

print(output_e.shape, hidden_e.shape)

# Decoder: Hindi token ids (one step shorter than the padded Hindi length)
# -> embedding -> GRU started from the encoder's final state -> Dense softmax
# over the Hindi vocabulary at every time step.
decoder = tf.keras.Input(shape=(max_hindi-1, ))
dec_embd = tf.keras.layers.Embedding(vocab_label, embedding_dim)(decoder)
decoder_gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, kernel_regularizer=tf.keras.regularizers.L2(0.001))
output_d, hidden_d = decoder_gru(dec_embd, initial_state = hidden_e)
# final_output is kept as a layer object (not just its result) so it can be
# applied again to other decoder outputs.
final_output = tf.keras.layers.Dense(vocab_label, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(0.001))
output_f = final_output(output_d)
print(output_f.shape)

In [None]:
# Training model: (English ids, Hindi decoder-input ids) -> per-step softmax
# over the Hindi vocabulary.
model = tf.keras.Model(inputs=[encoder, decoder], outputs=output_f)

In [None]:
# Model.summary() prints the layer table itself and returns None, so wrapping
# it in print() only adds a stray "None" line after the table.
model.summary()

In [None]:
# reduction='none' keeps one loss value per (sentence, time step). With the
# default reduction the loss object returns a single scalar, and multiplying
# that scalar by the padding mask does not remove the padded positions.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')

def loss_function(real, pred):
    """Sparse categorical cross-entropy averaged over non-padding tokens.

    Args:
        real: Target token ids; positions equal to 0 are treated as padding.
        pred: Predicted class probabilities with one extra trailing axis
            compared to ``real``.

    Returns:
        A scalar tensor: the mean per-token loss over the positions where
        ``real`` is not 0 (0 when every position is padding).
    """
    per_token = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    mask = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)

    # Average over real tokens only, so the amount of padding in a batch does
    # not scale the loss; divide_no_nan guards the all-padding case.
    return tf.math.divide_no_nan(tf.reduce_sum(per_token * mask), tf.reduce_sum(mask))

In [None]:
# Train with Adam on the padding-aware loss and report token accuracy.
model.compile(
    optimizer='adam',
    loss=loss_function,
    metrics=['accuracy'],
)

In [None]:
# Teacher forcing: the decoder is fed the target without <end> (dec_input) and
# is trained to produce the target without <start> (decoder_output).
model.fit(
    x=[encoder_input, dec_input],
    y=decoder_output,
    epochs=10,
    verbose=1,
)

In [None]:
# Run the training model on the first sentence pair only; slicing with [:1]
# keeps the leading batch axis of size 1.
a = model.predict([encoder_input[:1], dec_input[:1]])
print(a.shape)

In [None]:
print(english[0])
print(hindi[0])

# Print the most probable Hindi token at every time step of the prediction.
for token_id in np.argmax(a[0], axis=-1):
    # Id 0 is the padding value and has no entry in index_word; show a
    # placeholder instead of raising KeyError when the model predicts it.
    print(hindi_token.index_word.get(int(token_id), '<pad>'))

In [None]:
# customize for prediction
# Encoder-only model: English ids -> final encoder GRU state.
encoder_model = tf.keras.Model(encoder, hidden_e)

# Placeholder for the decoder's incoming state. A GRU state has `units`
# features per sample, so declare that width explicitly instead of leaving it
# unknown with shape=(None,).
state = tf.keras.Input(shape=(units, ))
output, hidden_dest = decoder_gru(dec_embd, initial_state = state)
#output_result = final_output(output)

# Step model: (decoder ids, previous state) -> [new state, GRU outputs].
# The new state comes first in the outputs, so callers unpack it as
# `state, output = decoder_model(...)`.
# NOTE(review): `decoder` was declared with a fixed length of max_hindi-1 while
# the prediction function feeds one token per step - confirm the installed
# Keras version accepts that.
decoder_model = tf.keras.Model([decoder, state], [hidden_dest, output])

In [None]:
# remove start end token
def Expand(sentence):
    """Return the text between the last ``<start>`` and the next ``<end>``.

    If ``<start>`` is absent the text is taken from the beginning; if
    ``<end>`` is absent it runs to the end of the string. Surrounding
    whitespace is left untouched.
    """
    after_start = sentence.rpartition("<start>")[2]
    return after_start.partition("<end>")[0]

In [None]:
def data(line):
    """Normalize one sentence and prefix it with ``<start>`` (no ``<end>``).

    NOTE: defining this function rebinds the notebook-level name ``data``,
    which earlier held the DataFrame read from the CSV file; cells that use
    the DataFrame must run before this one.

    Args:
        line: The sentence to clean. Any object is accepted; it is converted
            with ``str()`` first.

    Returns:
        The lower-cased sentence with the punctuation marks ``? . ! , ¿ |``
        separated from neighbouring words by single spaces, prefixed with
        ``'<start> '``.
    """
    text = str(line).lower().strip()

    # Surround each punctuation mark with spaces so it becomes its own token.
    text = re.sub(r"([?.!,¿|])", r" \1 ", text)

    # Collapse runs of spaces (and double quotes, which are part of the
    # character class) into a single space.
    text = re.sub(r'[" "]+', " ", text).strip()

    return f"<start> {text}"

In [None]:
# prediction function
def call(inp):
    """Translate one English sentence to Hindi with greedy decoding.

    Args:
        inp: The English sentence, either raw text or text already wrapped in
            ``<start>`` / ``<end>`` markers.

    Returns:
        The predicted Hindi tokens joined by spaces (each token followed by a
        space), with any ``<start>`` / ``<end>`` markers removed.
    """
    # Strip markers that may already be present, then apply the same
    # preprocessing that produced the training sentences, so the encoder sees
    # exactly one <start> and one <end> as it did during training.
    sentence = preprocessing_data(Expand(str(inp)))

    # texts_to_sequences is the same encoder used for the training data; words
    # missing from the vocabulary are skipped instead of raising KeyError.
    token_ids = eng_token.texts_to_sequences([sentence])
    enc_in = pad_sequences(token_ids, maxlen=max_eng, padding='post') # set data in training format

    state = encoder_model.predict(enc_in) # initial state for the decoder
    decoder_input = tf.expand_dims([hindi_token.word_index['<start>']], 0) # initial input of decoder

    ans = ''

    # Bound the output by the Hindi length the decoder was trained on
    # (max_hindi - 1 steps), not by the English input length.
    for _ in range(max_hindi - 1):
        state, output = decoder_model([decoder_input, state])
        pred = final_output(output)

        # Greedy choice: most probable token at this step.
        token_id = int(np.argmax(pred[0][0]))
        word = hindi_token.index_word.get(token_id)
        if word is None:
            # Id 0 is padding and has no word; treat it as end of sentence.
            break

        ans += word + ' '
        if word == '<end>':
            break

        decoder_input = tf.expand_dims([token_id], 0) # input for next word prediction

    return Expand(ans)

In [None]:
# Smoke test: translate one sentence from the tokenized subset.
# NOTE(review): english[5] was already wrapped in <start>/<end> by the
# preprocessing cell, so the prediction function receives a wrapped sentence.
print(english[5])
call(english[5])