In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import pickle as pkl
import numpy as np

In [18]:
with open('hin.txt','r') as f:
  data = f.read()

In [36]:
uncleaned_data_list = data.split('\n')
len(uncleaned_data_list)
uncleaned_data_list = uncleaned_data_list[:38695]
len(uncleaned_data_list)

english_word = []
hindi_word = []
cleaned_data_list = []
for word in uncleaned_data_list:
  english_word.append(word.split('\t')[:1][0])
  hindi_word.append(word.split('\t')[:2][-1])
  
language_data = pd.DataFrame(columns=['English','Hindi'])
language_data['English'] = english_word
language_data['Hindi'] = hindi_word
language_data.to_csv('language_data.csv', index=False)
language_data.head(10)

Unnamed: 0,English,Hindi
0,Wow!,वाह!
1,Duck!,झुको!
2,Duck!,बतख़!
3,Help!,बचाओ!
4,Jump.,उछलो.
5,Jump.,कूदो.
6,Jump.,छलांग.
7,Hello!,नमस्ते।
8,Hello!,नमस्कार।
9,Cheers!,वाह-वाह!


In [37]:
english_text = language_data['English'].values
hindi_text = language_data['Hindi'].values
len(english_text), len(hindi_text)

(2953, 2953)

In [38]:
#to lower case
english_text_ = [x.lower() for x in english_text]
hindi_text_ = [x.lower() for x in hindi_text]

#removing inverted commas
english_text_ = [re.sub("'",'',x) for x in english_text_]
hindi_text_ = [re.sub("'",'',x) for x in hindi_text_]
def remove_punc(text_list):
  table = str.maketrans('', '', string.punctuation)
  removed_punc_text = []
  for sent in text_list:
    sentance = [w.translate(table) for w in sent.split(' ')]
    removed_punc_text.append(' '.join(sentance))
  return removed_punc_text
english_text_ = remove_punc(english_text_)
hindi_text_ = remove_punc(hindi_text_)
remove_digits = str.maketrans('', '', digits)
removed_digits_text = []
for sent in english_text_:
  sentance = [w.translate(remove_digits) for w in sent.split(' ')]
  removed_digits_text.append(' '.join(sentance))
english_text_ = removed_digits_text

# removing the digits from the hindi sentances
hindi_text_ = [re.sub("[२३०८१५७९४६]","",x) for x in hindi_text_]
hindi_text_ = [re.sub("[\u200d]","",x) for x in hindi_text_]

# removing the stating and ending whitespaces
english_text_ = [x.strip() for x in english_text_]
hindi_text_ = [x.strip() for x in hindi_text_]

In [40]:
# Putting the start and end words in the hindi sentances
hindi_text_ = ["start " + x + " end" for x in hindi_text_]

# manipulated_hindi_text_
hindi_text_[0], english_text_[0]

('start start वाह end end', 'wow')

In [41]:
X = english_text_
Y = hindi_text_
X_train, X_test, y_train, y_test=train_test_split(X,Y,test_size=0.1)

In [42]:
def Max_length(data):
  max_length_ = max([len(x.split(' ')) for x in data])
  return max_length_

#Training data
max_length_english = Max_length(X_train)
max_length_hindi = Max_length(y_train)

#Test data
max_length_english_test = Max_length(X_test)
max_length_hindi_test = Max_length(y_test)
max_length_hindi, max_length_english

(29, 22)

In [43]:
englishTokenizer = Tokenizer()
englishTokenizer.fit_on_texts(X_train)
Eword2index = englishTokenizer.word_index
vocab_size_source = len(Eword2index) + 1

X_train = englishTokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_length_english, padding='post')

X_test = englishTokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen = max_length_english, padding='post')

hindiTokenizer = Tokenizer()
hindiTokenizer.fit_on_texts(y_train)
Hword2index = hindiTokenizer.word_index
vocab_size_target = len(Hword2index) + 1

y_train = hindiTokenizer.texts_to_sequences(y_train)
y_train = pad_sequences(y_train, maxlen=max_length_hindi, padding='post')

y_test = hindiTokenizer.texts_to_sequences(y_test)
y_test = pad_sequences(y_test, maxlen = max_length_hindi, padding='post')

vocab_size_source, vocab_size_target

(2275, 2888)

In [44]:
X_train[0], y_train[0]

(array([ 47, 116, 424, 112,   4, 117,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32),
 array([  1,   1, 363,  22, 327,  69,  16, 953, 328,   5,   3,   2,   2,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0], dtype=int32))

In [47]:
with open('NMT_data.pkl','wb') as f:
  pkl.dump([X_train, y_train, X_test, y_test],f)

with open('NMT_Etokenizer.pkl','wb') as f:
  pkl.dump([vocab_size_source, Eword2index, englishTokenizer], f)

with open('NMT_Htokenizer.pkl', 'wb') as f:
  pkl.dump([vocab_size_target, Hword2index, hindiTokenizer], f)
  
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [48]:
from attention import AttentionLayer
from keras import backend as K 
K.clear_session() 
latent_dim = 500

# Encoder 
encoder_inputs = Input(shape=(max_length_english,)) 
enc_emb = Embedding(vocab_size_source, latent_dim,trainable=True)(encoder_inputs)

#LSTM 1 
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

#LSTM 2 
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

#LSTM 3 
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)

# Set up the decoder. 
decoder_inputs = Input(shape=(None,)) 
dec_emb_layer = Embedding(vocab_size_target, latent_dim,trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs)

#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

#Attention Layer
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])

# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

#Dense layer
decoder_dense = TimeDistributed(Dense(vocab_size_target, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
plot_model(model, to_file='train_model.png', show_shapes=True)

ModuleNotFoundError: ignored