In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM , Dense, Input, Embedding
from keras.models import Model
import numpy as np
import re
import string
import pandas as pd
from string import digits
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


In [5]:
help(Embedding)

Help on class Embedding in module tensorflow.python.keras.layers.embeddings:

class Embedding(tensorflow.python.keras.engine.base_layer.Layer)
 |  Embedding(*args, **kwargs)
 |  
 |  Turns positive integers (indexes) into dense vectors of fixed size.
 |  
 |  e.g. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]`
 |  
 |  This layer can only be used as the first layer in a model.
 |  
 |  Example:
 |  
 |  >>> model = tf.keras.Sequential()
 |  >>> model.add(tf.keras.layers.Embedding(1000, 64, input_length=10))
 |  >>> # The model will take as input an integer matrix of size (batch,
 |  >>> # input_length), and the largest integer (i.e. word index) in the input
 |  >>> # should be no larger than 999 (vocabulary size).
 |  >>> # Now model.output_shape is (None, 10, 64), where `None` is the batch
 |  >>> # dimension.
 |  >>> input_array = np.random.randint(1000, size=(32, 10))
 |  >>> model.compile('rmsprop', 'mse')
 |  >>> output_array = model.predict(input_array)
 |  >>> print(output_array.sh

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/prismspeechproject/neural/master/implementation/cl2.csv', encoding='utf-8')
df.head()

Unnamed: 0,englsih,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [None]:
df.rename(columns={'englsih' : 'English'}, inplace=True)
df.columns

Index(['English', 'Hindi'], dtype='object')

In [None]:
df = df[~pd.isnull(df['English'])]
df.shape

(2772, 2)

In [None]:
df.drop_duplicates(subset="English", keep='first' , inplace=True)
print(df.shape)
df.head(10)

(2570, 2)


Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
4,Hello!,नमस्ते।
6,Cheers!,वाह-वाह!
8,Got it?,समझे कि नहीं?
9,I'm OK.,मैं ठीक हूँ।
10,Awesome!,बहुत बढ़िया!
11,Come in.,अंदर आ जाओ।
12,Get out!,बाहर निकल जाओ!
13,Go away!,चले जाओ!


In [None]:
punctuations = set(string.punctuation)
rm_digits = str.maketrans('', '', digits)

# typing to str
df['English'] = df['English'].apply(lambda e: str(e))
df['Hindi'] = df['Hindi'].apply(lambda h: str(h))

#Here all the data inside () or [] are removed
df['English'] = df['English'].apply(lambda e: re.sub(r"[\(\[].*?[\)\]]", '', e))
df['Hindi'] = df['Hindi'].apply(lambda h : re.sub(r"[\(\[].*?[\)\]]", '', h))

#removing punctuations 
df['English'] = df['English'].apply(lambda e: ''.join(c for c in e if c not in punctuations))
df['Hindi'] = df['Hindi'].apply(lambda h: ''.join(c for c in h if c not in punctuations))

#lower
df['English'] = df['English'].apply(lambda e: e.lower())
df['Hindi'] = df['Hindi'].apply(lambda h: h.lower())

#Remove digits
df['English']=df['English'].apply(lambda e: e.translate(rm_digits))
df['Hindi']=df['Hindi'].apply(lambda h: h.translate(rm_digits))
df['Hindi']=df['Hindi'].apply(lambda h: re.sub("[a-z२३०८१५७९४६]", '', h))

#Striping
df['English'] = df['English'].apply(lambda e: e.strip())
df['Hindi'] = df['Hindi'].apply(lambda h: h.strip())

df.head()

Unnamed: 0,English,Hindi
0,help,बचाओ
1,jump,उछलो
4,hello,नमस्ते।
6,cheers,वाहवाह
8,got it,समझे कि नहीं


In [None]:
#Appending Staring and Ending notation in Hindi column
df['Hindi'] = df['Hindi'].apply(lambda h: 'START_ ' + h + ' _END')
df.head()

Unnamed: 0,English,Hindi
0,help,START_ बचाओ _END
1,jump,START_ उछलो _END
4,hello,START_ नमस्ते। _END
6,cheers,START_ वाहवाह _END
8,got it,START_ समझे कि नहीं _END


In [None]:
# extract words
eng_words = []
for sen in df['English']:
  for w in sen.split():
    if w not in eng_words:
      eng_words.append(w)
hin_words = []
for sen in df['Hindi']:
  for w in sen.split():
    if w not in hin_words:
      hin_words.append(w)
len(eng_words), len(hin_words)

(2340, 2886)

In [None]:
# Calculate number of words in each line of 2 columns
df['len_eng_words'] = df['English'].apply(lambda e: len(e.split(" ")))
df['len_hin_words'] = df['Hindi'].apply(lambda h: len(h.split(" ")))
df.head()

Unnamed: 0,English,Hindi,len_eng_words,len_hin_words
0,help,START_ बचाओ _END,1,3
1,jump,START_ उछलो _END,1,3
4,hello,START_ नमस्ते। _END,1,3
6,cheers,START_ वाहवाह _END,1,3
8,got it,START_ समझे कि नहीं _END,2,5


In [None]:
max_len_encoder = max(df['len_eng_words'])
max_len_decoder = max(df['len_hin_words'])
max_len_encoder , max_len_decoder


(22, 27)

In [None]:
input_words= sorted(list(eng_words))
target_words = sorted(list(hin_words))
num_encoder_tokens = len(input_words) + 1
num_decoder_tokens = len(target_words) + 1
num_encoder_tokens, num_decoder_tokens

(2341, 2887)

In [None]:
input_token_index = dict([(word, i+1) for i,word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i,word in enumerate(target_words)])
input_token_index, target_token_index

In [None]:
df = shuffle(df)
df.head()

Unnamed: 0,English,Hindi,len_eng_words,len_hin_words
1019,i found out where she was,START_ मुझे पता लगा कि वह कहाँ थी। _END,6,9
347,where do you live,START_ आप कहाँ रहते हैं _END,4,6
117,i am who i am,START_ मैं हूँ जो हूँ। _END,5,6
2497,you make mistakes if you do things in a hurry,START_ जल्दबाज़ी में काम करोगे तो ग़लतियाँ तो ...,10,11
462,i know both of them,START_ मैं दोनो को जानता था। _END,5,7


In [None]:
X, Y = df['English'], df['Hindi']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=43)
X_train.shape, X_test.shape

((2056,), (514,))

In [None]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')
len(X_train)

2056

In [None]:
encoder_input_data = np.zeros((len(X_train), max_len_encoder, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros((len(X_train), max_len_decoder, num_decoder_tokens), dtype='float32')
decoder_target_data = np.zeros((len(X_train), max_len_decoder, num_decoder_tokens), dtype='float32')

In [None]:
for i, (input_text, target_text) in enumerate(zip(X_train, Y_train)):
  for t, word in enumerate(input_text.split()):
    encoder_input_data[i, t, input_token_index[word]] = 1
  #encoder_input_data[i, t+1: , input_token_index[' ']] = 1
  for t, word in enumerate(target_text.split()):
    decoder_input_data[i, t, target_token_index[word]] = 1
    if t>0:
      decoder_target_data[i, t-1, target_token_index[word]] = 1
  # decoder_input_data[i, t+1: , target_token_index[' ']] = 1
  # decoder_target_data[i, t: , target_token_index[' ']] = 1

encoder_input_data[0].shape

(22, 2341)

In [None]:
# Encoder
latent_dim = 300
encoder_inputs = Input(shape=(None, num_encoder_tokens))
#enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

#discarding encoder_output
encoder_state = [state_h, state_c]


In [None]:
# Set up the decoder using encoder_states
decoder_inputs = Input(shape=(None, num_decoder_tokens))
#dec_emb = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _ , _ = decoder_lstm(decoder_inputs, initial_state=encoder_state)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# creating model instance
batch_size= 128
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics= ['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=batch_size, epochs=100, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f06d8dd2750>

In [None]:
encoder_model = Model(encoder_inputs, encoder_state)


decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
reverse_input_token_index = dict([(i, word) for word, i in input_token_index.items()])
reverse_target_token_index = dict([(i, word) for word , i in target_token_index.items()])

In [None]:
def decode_sequence(input_seq):
  # Encode the input_seq as a state vector
  states_value = encoder_model.predict(input_seq)
  #Generate empty target seq of length 1
  target_seq = np.zeros((1,1, num_decoder_tokens))
  # Populate the first word of target sequence with the start word.
  target_seq[0,0, target_token_index['START_']] = 1

  stop_condition = False
  decoded_sentence = ''
  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    sampled_token_index = np.argmax(output_tokens[0, -1,:])
    sampled_word = reverse_target_token_index[sampled_token_index]
    decoded_sentence += sampled_word

    if(sampled_word == '_END' or len(decoded_sentence) > max_len_decoder):
      stop_condition=True
    
    # Update the target sequence (of length 1)
    target_seq = np.zeros((1,1, num_decoder_tokens))
    target_seq[0,0, sampled_token_index]  =1.

    states_value = [h, c]
  return decoded_sentence    



In [None]:
for seq_index in range(20):

  input_seq = encoder_input_data[seq_index: seq_index+1]
  decoded_sentence= decode_sequence(input_seq)
  print('-')
  print(X_train.iloc[seq_index])
  print(decoded_sentence)

-
i want my money back
मुझेमुझेको_END
-
whos that
वहबहुत_END
-
few students use pencils these days
मेंमेंमेंमें_END
-
if the coffee is too strong add some more water
अगरसेसेसेसेसेसे_END
-
do you feel any pain in your stomach
तुमतुममेंमेंमेंमें_END
-
your hair is too long
वहमेंमेंमेंमें_END
-
they furnished the library with many books
उसनेअपनेकोकोमें_END
-
i can swim
मुझेनहीं_END
-
i cannot walk any farther
मैंनहींनहीं_END
-
you should make use of this chance
तुमइसकेकेमेंमें_END
-
i will give you a call as soon as i get home
मैंमैंसेनहीं_END
-
you must keep an eye on the child
हमपासके_END
-
he could not come because of his illness
वहवहकीसेसेनहीं_END
-
can i use your pencil
मैंमैंमेंमें_END
-
i tried to tell you
मुझेमुझेमुझेनहींनहीं_END
-
the medicine tastes bitter
तुमपासकेकेमें_END
-
what does it contain
मज़े_END
-
waking up is the opposite of going to sleep
उन्होंनेनेकोको_END
-
always keep your office tidy
उसनेअपनेकोके_END
-
he leveled his gun at me
उसनेउसनेअपनेकोको_END


In [None]:
model.save_weights('/content/drive/MyDrive/2KwordEncodingModel.h5')

In [None]:
model.save_weights('sample_data/prism/2KwordEncodingModel.h5')

In [None]:
for i, (input_text, target_text) in enumerate(zip(X_train, Y_train)):
  print(i, input_text,)

0 i want my money back
1 whos that
2 few students use pencils these days
3 if the coffee is too strong add some more water
4 do you feel any pain in your stomach
5 your hair is too long
6 they furnished the library with many books
7 i can swim
8 i cannot walk any farther
9 you should make use of this chance
10 i will give you a call as soon as i get home
