In [2]:
#Make imports
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
#TPU settings
%tensorflow_version 2.x
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
Tensorflow version 2.8.2
Running on TPU  ['10.87.82.234:8470']


In [8]:
def preprocess(text):
  text = ''.join(ch for ch in text if ch not in string.punctuation)
  text = text.lower()
  text = re.sub(r'\d','',text)
  text = re.sub(r'\s+',' ',text)
  text = text.strip()
  return text

In [12]:
#Extract dataset and preprocess
dataset_root = "./drive/MyDrive/ml/"

if os.path.exists(dataset_root + "parallel/preprocessed_data.pickle"):
  with open(dataset_root + "parallel/preprocessed_data.pickle", 'rb') as f:
    english_sentences, hindi_sentences = pickle.load(f)
else:
  if not os.path.exists(dataset_root + "parallel/IITB.en-hi.en"):
    os.system("tar -xzf " + dataset_root + "parallel.tgz -C " + dataset_root)

  with open(dataset_root + "test.en",'r') as f:
    english_sentences = f.read().split('\n')

  with open(dataset_root + "test.hi",'r') as f:
    hindi_sentences = f.read().split('\n')

  english_sentences = [preprocess(en) for en in english_sentences]
  hindi_sentences = ['<START> ' + re.sub('[a-zA-Z]','',preprocess(hi)) + ' <END>' for hi in hindi_sentences]

  #Remove duplicate sentences
  english_unique = set()
  english_sentences_temp = []
  hindi_sentences_temp = []
  l = len(english_sentences)
  for i in range(l):
    if english_sentences[i] not in english_unique:
      english_unique.add(english_sentences[i])
      english_sentences_temp.append(english_sentences[i])
      hindi_sentences_temp.append(hindi_sentences[i])

  english_sentences = english_sentences_temp
  hindi_sentences = hindi_sentences_temp
  
  with open(dataset_root + "preprocessed_data.pickle",'wb') as f:
    pickle.dump((english_sentences, hindi_sentences), f)

In [33]:
print(len(english_sentences), len(hindi_sentences))
print()
english_sentences[:3], hindi_sentences[:3]

998998 998998



(['on the sidelines of this event i hope the delegates joining us from abroad shall have some time to see the history and splendour of delhi',
  'we are proud to be the global host for world environment day',
  'we are also committed to ensure that we do so in a way that is sustainable and green'],
 ['<START> मुझे उम्मीद है कि विदेशों से आए प्रतिनिधियों के पास दिल्ली के इतिहास और गौरव को देखने के लिए कुछ समय मिलेगा। <END>',
  '<START> हमें विश्व पर्यावरण दिवस के लिए वैश्विक मेजबान बनने का गर्व है। <END>',
  '<START> हम वह करने के लिए संकल्पबद्ध हैं जो सतत् औऱ हरित है। <END>'])

In [14]:
#Some parameters
vocab_size = 10000
total_sentences = 25000
maxlen = 10
epochs = 70
validation_split = 0.05

In [15]:
en_data = []
hi_data = []

cnt = 0

for (en,hi) in zip(english_sentences, hindi_sentences):
  l = min(len(en.split()), len(hi.split()))
  if l <= maxlen:
    en_data.append(en)
    hi_data.append(hi)
    cnt += 1
  if cnt == total_sentences:
    break

In [16]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  1579
Hindi Vocab Size:  1672


In [17]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

In [18]:
#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

In [35]:
# Training and Testing split
# 80%, 20%
split = int(0.80 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

(23750, 10) (23750, 10) (23750, 10)


In [20]:
#Define LSTM model
d_model = 256

#Encoder
inputs = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='relu',return_state=True)(x)

#Decoder
targets = tf.keras.layers.Input(shape=(None,))
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='relu',return_sequences=True, return_state=True)
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
model.summary()

loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    404224      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    428032      ['input_2[0][0]']                
                                                                                              

In [21]:
#Save model after each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./drive/MyDrive/ml/en-hi.h5',
    monitor='val_accuracy',
    mode='max'
)

In [22]:
model.fit(X_train, y_train, epochs=epochs, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x7fd4b61c4710>

In [28]:
#Retrieve previously saved stuff
saved_model = tf.keras.models.load_model('./drive/MyDrive/ml/en-hi.h5')

saved_model.summary()

inputs = saved_model.get_layer('input_1').output
_,state_h,state_c = saved_model.get_layer('lstm').output
targets = saved_model.get_layer('input_2').output
embedding_layer = saved_model.get_layer('embedding_1')
decoder_lstm = saved_model.get_layer('lstm_1')
dense1 = saved_model.get_layer('dense')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    404224      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    428032      ['input_2[0][0]']                
                                                                                              

In [29]:
#Inference Model

#Encoder
encoder = tf.keras.models.Model(inputs, [state_h, state_c])

#Decoder
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
x = embedding_layer(targets)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=[decoder_input_h, decoder_input_c])
x = dense1(x)
decoder = tf.keras.models.Model([targets] + [decoder_input_h, decoder_input_c], 
                                [x] + [decoder_output_h, decoder_output_c])

In [37]:
def predict_sentence(en_input):
  input_seq = en_tokenizer.texts_to_sequences([en_input])

  next_h, next_c = encoder.predict(input_seq)

  curr_token = np.zeros(1)
  curr_token[0] = hi_tokenizer.word_index['<START>']

  pred_sentence = ''

  for i in range(maxlen):
    output, next_h, next_c = decoder.predict([curr_token] + [next_h, next_c])
    next_token = np.argmax(output[0, 0, :])
    next_word = hi_tokenizer.index_word[next_token]
    if next_word == '<END>':
      break
    else:
      pred_sentence += ' ' + next_word
      curr_token[0] = next_token

  return pred_sentence

Original output:  give your application an accessibility workout
Input:   अपने अनुप्रयोग को पहुंचनीयता करें
Prediction:  your application has to do access workout

Original output:  accerciser accessibility explorer
Input:   एक्सेर्साइसर पहुंचनीयता अन्वेषक
Prediction:  exerciser access explorer

Original output:  the default plugin layout for the bottom panel
Input:   ऊपरी पटल के लिए डिफोल्ट प्लगइन खाका
Prediction:  default plugin layout for upper upper

Original output:  a list of plugins that are disabled by default
Input:   उन प्लगइनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है 
Prediction: list disabled by defaulted list of of plugins 

Original output:  highlight duration
Input:   अवधि को हाइलाइट रकें
Prediction:  duration keep highlighted

Original output:  the duration of the highlight box when selecting accessible nodes
Input:   पहुंचनीय आसंधि नोड को चुनते समय हाइलाइट बक्से की अवधि
Prediction:  permission given nodes to certified highlight box of of

Original output: 