In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re

In [2]:
from nltk.translate.bleu_score import sentence_bleu

In [3]:
import tensorflow as tf
tf.__version__

'2.3.0'

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
data = open('/content/drive/My Drive/Colab Notebooks/Machine Translation/English_Hindi/hin.txt','r') 

In [6]:
sent = data.readlines()

In [7]:
sentence_dict=[]
for i in sent:
  x = i.split('\t')
  temp = {'english_sentence':x[0],'hindi_sentence':x[1]}
  sentence_dict.append(temp)

In [8]:
df = pd.DataFrame(data=sentence_dict)

In [9]:
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,Wow!,वाह!
1,Help!,बचाओ!
2,Jump.,उछलो.
3,Jump.,कूदो.
4,Jump.,छलांग.


In [10]:
df.shape

(2778, 2)

In [11]:
df.isnull().sum()

english_sentence    0
hindi_sentence      0
dtype: int64

In [12]:
msk = np.random.rand(len(df)) < 0.8

In [13]:
test_df = df[~msk]
df = df[msk]

In [14]:
len(df),len(test_df)

(2225, 553)

In [15]:
def clean_eng(text):
  text = re.sub('[^a-zA-Z.]',' ',text)
  text = text.lower()
  return text

In [16]:
def clean_hindi(a):
  a = re.sub(r'\([^)]*\)', '', a)
  x = re.sub('[२३०८१५७९४६0-9a-zA-Z.]',' ',a)
  return x

In [17]:
# Cleaning the english sentences and appending it into a list
english_sentences = []
for i in df['english_sentence']:
  lines = clean_eng(i)
  english_sentences.append(lines)

In [18]:
hindi_decoder_input = []
for i in df['hindi_sentence']:
  i = clean_hindi(i)
  i = '_start '+i
  hindi_decoder_input.append(i)

In [19]:
hindi_decoder_output = []
for i in df['hindi_sentence']:
  i = clean_hindi(i)
  i = i +' end_'
  hindi_decoder_output.append(i)

In [20]:
en_tokenizer = Tokenizer(filters='')
hin_tokenizer = Tokenizer(filters='')

In [21]:
en_tokenizer.fit_on_texts(english_sentences)

In [22]:
hin_tokenizer.fit_on_texts(hindi_decoder_input)
hin_tokenizer.fit_on_texts(hindi_decoder_output)

In [23]:
word_index = en_tokenizer.word_index
en_vocab_size = len(word_index)+1
en_vocab_size

2585

In [24]:
word_index = hin_tokenizer.word_index
hin_vocab_size = len(word_index)+1
hin_vocab_size

2763

In [25]:
en_sequence = en_tokenizer.texts_to_sequences(english_sentences)
en_padded = pad_sequences(en_sequence,padding='post')

In [26]:
hin_in_seq = hin_tokenizer.texts_to_sequences(hindi_decoder_input)
hin_in_padded = pad_sequences(hin_in_seq,padding='post')

In [27]:
hin_out_seq = hin_tokenizer.texts_to_sequences(hindi_decoder_output)
hin_out_padded = pad_sequences(hin_out_seq,padding='post')

In [28]:
dataset = tf.data.Dataset.from_tensor_slices((en_padded, hin_in_padded, hin_out_padded))
dataset = dataset.shuffle(20).batch(10,drop_remainder=True)

In [29]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, lstm_size):
    super().__init__()
    self.lstm_size = lstm_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.lstm = tf.keras.layers.LSTM(lstm_size, return_sequences=True, return_state=True)

  def call(self, sequence, states):
    embed = self.embedding(sequence)
    output, state_h, state_c = self.lstm(embed, initial_state=states)
    return output, state_h, state_c

  def init_states(self, batch_size):
    return (tf.zeros([batch_size, self.lstm_size]),tf.zeros([batch_size, self.lstm_size]))

In [30]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, lstm_size):
    super().__init__()
    self.lstm_size = lstm_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.lstm = tf.keras.layers.LSTM(lstm_size, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, sequence, state):
    embed = self.embedding(sequence)
    lstm_out, state_h, state_c = self.lstm(embed, state)
    logits = self.dense(lstm_out)
    return logits, state_h, state_c

In [31]:
EMBEDDING_SIZE = 32
LSTM_SIZE = 64

encoder = Encoder(en_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)
decoder = Decoder(hin_vocab_size, EMBEDDING_SIZE, LSTM_SIZE)

In [32]:
def loss_func(targets, logits):
  crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  mask = tf.math.logical_not(tf.math.equal(targets, 0))
  mask = tf.cast(mask, dtype=tf.int64)
  loss = crossentropy(targets, logits, sample_weight=mask)
  return loss

In [33]:
optimizer = tf.keras.optimizers.Adam()

In [34]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out, en_initial_states):
    with tf.GradientTape() as tape:
        en_outputs = encoder(source_seq, en_initial_states)
        en_states = en_outputs[1:]
        de_states = en_states

        de_outputs = decoder(target_seq_in, de_states)
        logits = de_outputs[0]
        loss = loss_func(target_seq_out, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [45]:
def predict(test_source_text):
    print(test_source_text)
    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])

    en_initial_states = encoder.init_states(1)
    en_outputs = encoder(tf.constant(test_source_seq), en_initial_states)

    de_input = tf.constant([[hin_tokenizer.word_index['_start']]])
    de_state_h, de_state_c = en_outputs[1:]
    out_words = []

    while True:
        de_output, de_state_h, de_state_c = decoder(
            de_input, (de_state_h, de_state_c))
        de_input = tf.argmax(de_output, -1)
        out_words.append(hin_tokenizer.index_word[de_input.numpy()[0][0]])

        if out_words[-1] == 'end_' or len(out_words) >= 30:
            break

    print(' '.join(out_words))

In [36]:
NUM_EPOCHS = 100
BATCH_SIZE = 10
for e in range(NUM_EPOCHS):
  en_initial_states = encoder.init_states(BATCH_SIZE)
  for batch, (source_seq, target_seq_in, target_seq_out) in enumerate(dataset.take(-1)):
    loss = train_step(source_seq, target_seq_in,target_seq_out, en_initial_states)
  print('Epoch {} Loss {:.4f}'.format(e + 1, loss.numpy())) 

Epoch 1 Loss 3.8011
Epoch 2 Loss 3.6113
Epoch 3 Loss 3.9066
Epoch 4 Loss 3.3500
Epoch 5 Loss 3.5470
Epoch 6 Loss 3.6124
Epoch 7 Loss 3.4894
Epoch 8 Loss 3.3717
Epoch 9 Loss 3.1936
Epoch 10 Loss 3.3608
Epoch 11 Loss 3.3102
Epoch 12 Loss 3.2515
Epoch 13 Loss 2.9626
Epoch 14 Loss 2.8534
Epoch 15 Loss 2.8570
Epoch 16 Loss 2.8859
Epoch 17 Loss 3.1025
Epoch 18 Loss 2.5776
Epoch 19 Loss 3.1639
Epoch 20 Loss 2.7877
Epoch 21 Loss 2.8151
Epoch 22 Loss 2.9984
Epoch 23 Loss 2.5312
Epoch 24 Loss 2.4930
Epoch 25 Loss 2.7667
Epoch 26 Loss 2.8828
Epoch 27 Loss 2.7963
Epoch 28 Loss 2.4005
Epoch 29 Loss 2.1605
Epoch 30 Loss 2.3453
Epoch 31 Loss 2.1455
Epoch 32 Loss 2.8195
Epoch 33 Loss 2.4740
Epoch 34 Loss 2.1812
Epoch 35 Loss 2.2963
Epoch 36 Loss 1.9052
Epoch 37 Loss 2.0626
Epoch 38 Loss 2.2422
Epoch 39 Loss 1.9438
Epoch 40 Loss 1.9441
Epoch 41 Loss 1.9534
Epoch 42 Loss 1.7661
Epoch 43 Loss 1.8674
Epoch 44 Loss 2.1150
Epoch 45 Loss 1.9565
Epoch 46 Loss 1.8712
Epoch 47 Loss 1.8225
Epoch 48 Loss 1.7196
E

In [37]:
test_df['english_sentence'] = test_df['english_sentence'].apply(clean_eng)
test_df['hindi_sentence'] = test_df['hindi_sentence'].apply(clean_hindi)

In [38]:
test_df = test_df.reset_index()
test_df

Unnamed: 0,index,english_sentence,hindi_sentence
0,7,cheers,वाह-वाह!
1,14,go away,चले जाओ!
2,19,welcome.,स्वागतम्।
3,21,have fun.,मौज करना।
4,26,i m fine.,मैं ठीक हूँ।
...,...,...,...
548,2764,instead of laying off these workers why don t...,इन कर्मचारियों को नौकरी से निकालने की बजाय क्य...
549,2765,our parents took care of us and now it s our t...,हमारे माता-पिता ने हमारी देखभाल की थी और अभ हम...
550,2767,i learned to drive a car and got a driver s li...,जब मैं अठारह साल का था मैंने गाड़ी चलानी सीखी ...
551,2771,in sister teresa was sent to calcutta t...,"में सिस्टर टेरेसा को कलकत्ता भेजा गया था,..."


In [43]:
test_df.iloc[0]

index                      7
english_sentence     cheers 
hindi_sentence      वाह-वाह!
Name: 0, dtype: object

In [48]:
predict(test_df['english_sentence'].iloc[3])

have fun.
ताकतशाली है। अखबारों end_
