### We will now implement the LSTM Model for training the NMT with the cleaned data

In [26]:
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


In [2]:
VOCAB_SIZE = 20000 # vocabulary size cap handed to the tokenizer (only the most frequent words are kept)
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence)
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
OOV_TOKEN = '<OOV>'  # placeholder for out-of-vocabulary words; NOTE(review): not referenced in the visible cells

In [3]:
# Read the cleaned sentence pairs from CSV and preview the first rows
df = pd.read_csv('./Data/cleaned_data.csv')
df.head()


Unnamed: 0,source,target,comments,cleaned_source,cleaned_target
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ Ve _END
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ Vete _END
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ Vaya _END
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ Váyase _END
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,hi,START_ Hola _END


### Let us now convert the sentences into number sequences and pad them with zeros so that all inputs have the same length

In [38]:
# Longest sentence on each side of the corpus, counted in whitespace-separated words
max_en_words_per_sample = max(len(sentence.split()) for sentence in df.cleaned_source)
max_es_words_per_sample = max(len(sentence.split()) for sentence in df.cleaned_target)

In [39]:
# Report the longest sentence length (in words) for the source (EN) and target (ES) columns
print(f'Maximum EN words in sample: {max_en_words_per_sample}')
print(f'Maximum ES words in sample: {max_es_words_per_sample}')

Maximum EN words in sample: 43
Maximum ES words in sample: 51


In [69]:
# Create word2index and index2word
def vocab_creator(texts, vocab_size=VOCAB_SIZE):
    """Fit a Keras tokenizer on ``texts`` and build capped vocabulary lookups.

    Args:
        texts: iterable of sentences (strings) used to fit the tokenizer.
        vocab_size: exclusive upper bound on the word indices that are kept.
            Indices start at 1 and are assigned by descending frequency, so
            this keeps the ``vocab_size - 1`` most frequent words.

    Returns:
        A ``(word2index, index2word)`` tuple of dicts that are exact inverses
        of each other.
    """
    # Honour the caller-supplied limit instead of the module-level constant.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)

    # `word_index` lists every word seen during fitting, whatever `num_words`
    # is, so the cap has to be applied explicitly here.
    word2index = {
        word: index
        for word, index in tokenizer.word_index.items()
        if index < vocab_size
    }
    index2word = {index: word for word, index in word2index.items()}
    return word2index, index2word
        

In [70]:
# Build the lookups for the source (EN) sentences: word -> index and index -> word
source_word2index ,source_index2word = vocab_creator(df.cleaned_source)

In [71]:
# Preview the first 15 index -> word entries of the source vocabulary
dict(list(source_index2word.items())[:15])

{1: 'be',
 2: 'the',
 3: 'to',
 4: 'tom',
 5: 'do',
 6: 'a',
 7: "n't",
 8: 'have',
 9: "'s",
 10: 'that',
 11: 'in',
 12: 'of',
 13: 'this',
 14: 'go',
 15: 'for'}

In [72]:
# Build the lookups for the target (ES) sentences: word -> index and index -> word
target_word2index ,target_index2word = vocab_creator(df.cleaned_target)

In [73]:
# Preview the first 15 word -> index entries of the target vocabulary
dict(list(target_word2index.items())[:15])

{'start': 1,
 'end': 2,
 'de': 3,
 'que': 4,
 'no': 5,
 'a': 6,
 'tom': 7,
 'la': 8,
 '¿': 9,
 'el': 10,
 'en': 11,
 'es': 12,
 'un': 13,
 'se': 14,
 'por': 15}

In [74]:
# Shuffle the rows of the dataframe.
# NOTE(review): the train/test split in the following cell reads from `df`, not
# `df_shuffled` — confirm whether this shuffled copy is still needed.
df_shuffled = shuffle(df)

In [75]:
# Hold out 10% of the sentence pairs for testing, then show the resulting split sizes.
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(df.cleaned_source, df.cleaned_target, test_size = 0.1)
X_train.shape, X_test.shape

((111393,), (12377,))

### Create Word Embeddings

In [29]:
# Load Glove vector
# NOTE(review): EMBEDDING_DIM is also assigned in the configuration cell near the
# top of the notebook; keep the two values in sync (or drop this duplicate).
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
# Despite its name, GLOVE_DIR is the path of the GloVe text file, not a directory
GLOVE_DIR = f"../GloVe/glove.42B.{EMBEDDING_DIM}d.txt"

In [30]:
def create_embeddings_index(glove_dir):
    """Load word vectors from a GloVe-style text file into a dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace.

    Args:
        glove_dir: path of the text file to read (opened as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    # `with` closes the file even when a line fails to parse.
    with open(glove_dir, encoding="utf8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to store.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    return embeddings_index

In [31]:
# Read the GloVe file into memory as a word -> vector dictionary
embeddings_index = create_embeddings_index(GLOVE_DIR)

In [32]:
# Assemble the embedding matrix for the source vocabulary: row i receives the
# GloVe vector of the word whose index is i. Row 0 and every word without a
# GloVe entry keep their all-zero initial value.
embeddings_matrix = np.zeros((len(source_word2index)+1,EMBEDDING_DIM))
for word,i in source_word2index.items():
    embeddings_vector = embeddings_index.get(word)
    if embeddings_vector is None:
        continue
    embeddings_matrix[i] = embeddings_vector

### Encoder

We create the encoder and decoder inputs with a generator, which zero-pads all sequences so that they have the same size.

In [77]:
def generate_batch(x,y,batch_size=128):
    """Endlessly yield zero-padded training batches for the encoder-decoder model.

    Args:
        x: sequence of source sentences (strings).
        y: sequence of target sentences (strings), aligned with ``x``.
        batch_size: number of rows allocated per batch; the last batch of a
            pass keeps this shape and leaves its unused rows all-zero.

    Yields:
        ``([encoder_data_input, decoder_data_input], decoder_target_input)``:
        * encoder_data_input: (batch_size, max_en_words_per_sample) word indices
        * decoder_data_input: (batch_size, max_es_words_per_sample) word indices
        * decoder_target_input: (batch_size, max_es_words_per_sample,
          len(target_word2index)+1) one-hot targets, i.e. the decoder input
          shifted one step to the left.
    """
    def to_indices(text, word2index, width):
        # Tokenise with the same rules (lower-casing, punctuation filtering) as
        # the Keras Tokenizer that built the vocabulary, so 'START_' is looked
        # up as 'start'. Words outside the capped vocabulary are dropped, as
        # Tokenizer.texts_to_sequences does without an OOV token, and the
        # result is truncated so it always fits the pre-allocated row.
        tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
        return [word2index[token] for token in tokens if token in word2index][:width]

    while True:
        for current_batch_index in range(0,len(x),batch_size):
            encoder_data_input = np.zeros((batch_size,max_en_words_per_sample),dtype='float32')
            decoder_data_input = np.zeros((batch_size,max_es_words_per_sample),dtype='float32')
            decoder_target_input = np.zeros((batch_size,max_es_words_per_sample,len(target_word2index)+1 ),dtype='float32')

            batch_end = current_batch_index + batch_size
            batch_pairs = zip(x[current_batch_index:batch_end], y[current_batch_index:batch_end])
            for row_index, (input_text, target_text) in enumerate(batch_pairs):
                source_ids = to_indices(input_text, source_word2index, max_en_words_per_sample)
                encoder_data_input[row_index, :len(source_ids)] = source_ids

                target_ids = to_indices(target_text, target_word2index, max_es_words_per_sample)
                decoder_data_input[row_index, :len(target_ids)] = target_ids
                # The target at step t is the decoder input at step t+1, so the
                # first token (the start marker) never appears as a target.
                for step, target_id in enumerate(target_ids[1:]):
                    decoder_target_input[row_index, step, target_id] = 1

            # Yield once per batch, inside the loops. The original `yield` sat
            # after the infinite `while True` and was therefore never reached.
            yield ([encoder_data_input,decoder_data_input],decoder_target_input)


In [None]:
# We will create a generator 

In [78]:
latent_dim = 50 # number of units in the encoder and decoder LSTMs; also used as the decoder embedding size

In [88]:
# Encoder input: a batch of zero-padded word-index sequences of any length
encoder_inputs = tf.keras.layers.Input(shape=(None, ),name="encoder_inputs", dtype='int32')

# Hidden layers of the encoder :
# Frozen embedding layer initialised from the pre-trained GloVe matrix.
# mask_zero=True marks index 0 as padding so that the LSTM skips the padded
# time steps; without it the final encoder state would be computed over the
# trailing zeros of every padded sentence. The decoder embedding already
# masks zeros in the same way.
embedding_layer = tf.keras.layers.Embedding(len(source_word2index)+1,
                                            output_dim=EMBEDDING_DIM,
                                            weights = [embeddings_matrix],
                                            mask_zero=True,
                                            trainable=False,
                                            name = 'embeddings' )

embedded_inputs = embedding_layer(encoder_inputs)
# return_state=True exposes the final hidden and cell states of the LSTM
encoder_LSTM = tf.keras.layers.LSTM(latent_dim,return_state=True)

# NOTE: the name `encoder_ouputs` (sic) is kept unchanged in case later cells refer to it
encoder_ouputs,state_h,state_c = encoder_LSTM(embedded_inputs)

# Next we discard the Encoder output and only keep the states
encoder_states =  [state_h,state_c]

In [91]:
# Building the decoder
# Input layer of the decoder : a batch of target word-index sequences of any length
decoder_inputs = tf.keras.layers.Input(shape=(None,))
# Embed the target word indices into latent_dim-sized vectors; mask_zero=True
# treats index 0 as padding. Note that `emb_layer_decoder` holds the embedded
# tensor (the layer's output), not the Embedding layer object itself.
emb_layer_decoder = tf.keras.layers.Embedding(len(target_word2index)+1,latent_dim, mask_zero=True)(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. The returned states are discarded
# in the training model below.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)

# The decoder LSTM starts from the final hidden/cell states of the encoder
decoder_outputs, _, _ = decoder_lstm(emb_layer_decoder, initial_state=encoder_states)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step

decoder_dense = tf.keras.layers.Dense(len(target_word2index)+1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


# Define the model that maps the encoder and decoder input sequences to the
# per-time-step probability distribution over the target vocabulary

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# categorical_crossentropy expects one-hot encoded targets
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

