### We will now implement the LSTM Model for training the NMT with the cleaned data

In [1]:
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


In [2]:
VOCAB_SIZE = 20000 # max no. of words for tokenizer , Top 5000 Words in the Vocabulary
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence)
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
OOV_TOKEN = '<OOV>'

In [3]:
# Read Data
df = pd.read_csv('./Data/cleaned_data.csv')
df = df.head(10000)


### Let us now convert the sentences into number sequences and  pad the sequences with Zeros to make all the inputs of equal size

In [4]:
# find the maximum words in the sample
max_en_words_per_sample = max([len(sample.split()) for sample in df.cleaned_source])
max_es_words_per_sample = max([len(sample.split()) for sample in df.cleaned_target])

In [5]:
print(f'Maximum EN words in sample: {max_en_words_per_sample}')
print(f'Maximum ES words in sample: {max_es_words_per_sample}')

Maximum EN words in sample: 5
Maximum ES words in sample: 10


In [6]:
# Create word2index and index2word
# def vocab_creator(texts,vocab_size = VOCAB_SIZE):
#     tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
#     tokenizer.fit_on_texts(texts)
#     # sequences = tokenizer.texts_to_sequences(texts)
#     # sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences,
#     # maxlen=MAX_SEQUENCE_LENGTH,padding='post')
#     word_index_dictionary = tokenizer.word_index

#     word2index = {}
#     index2word = {}

#     for key,value in word_index_dictionary.items():
#         if value < VOCAB_SIZE:
#             word2index[key] = value
#             index2word[value] = key
#         if value >= VOCAB_SIZE-1:
#              continue
#     return word2index,index2word
        

In [7]:
# # Find all the source and target words and sort them
# Vocabulary of Source language
all_source_words=set()
for source in df.cleaned_source:
    for word in source.split():
        if word not in all_source_words:
            all_source_words.add(word)
# Vocabulary of Target 
all_target_words=set()
for target in df.cleaned_target:
    for word in target.split():
        if word not in all_target_words:
            all_target_words.add(word)
# sort all unique source and target words

source_words= sorted(list(all_source_words))
target_words=sorted(list(all_target_words))

num_source_tokens = len(source_words)
num_target_tokens = len(target_words)

In [8]:
# creating a word to index(word2idx) for source and target
source_word2index= dict([(word, i+1) for i,word in enumerate(source_words)])
target_word2index=dict([(word, i+1) for i, word in enumerate(target_words)])

#creating a dictionary for index to word for source and target vocabulary
source_index2word= dict([(i, word) for word, i in  source_word2index.items()])
target_index2word =dict([(i, word) for word, i in target_word2index.items()])

In [9]:
# source_word2index ,source_index2word = vocab_creator(df.cleaned_source)

In [10]:
dict(list(source_index2word.items())[:15])

{1: 'a',
 2: 'abandon',
 3: 'abducted',
 4: 'able',
 5: 'aboard',
 6: 'about',
 7: 'above',
 8: 'abroad',
 9: 'absent',
 10: 'absurd',
 11: 'accelerated',
 12: 'accept',
 13: 'ache',
 14: 'ached',
 15: 'aches'}

In [11]:
# target_word2index ,target_index2word = vocab_creator(df.cleaned_target)

In [12]:
dict(list(target_word2index.items())[:15])

{'START_': 1,
 '_END': 2,
 'a': 3,
 'aabe': 4,
 'abajo': 5,
 'abandona': 6,
 'abandonaron': 7,
 'abandonen': 8,
 'abandono': 9,
 'abatio': 10,
 'abejas': 11,
 'abierta': 12,
 'abofeteo': 13,
 'abogado': 14,
 'abogados': 15}

In [13]:
# target_word2index['¿']

In [14]:
df_shuffled = shuffle(df)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df.cleaned_source, df.cleaned_target, test_size = 0.1)
X_train.shape, X_test.shape

((9000,), (1000,))

### Create Word Embeddings

In [16]:
# Load Glove vector
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
GLOVE_DIR = f"../GloVe/glove.42B.{EMBEDDING_DIM}d.txt"

In [17]:
def create_embeddings_index(glove_dir): 
    
    embeddings_index = {}
    f = open(glove_dir,encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:],dtype='float32')
    f.close()

    return embeddings_index  

In [18]:
embeddings_index = create_embeddings_index(GLOVE_DIR)

In [19]:
# create an embedding matrix for the words we have in the dataset
embeddings_matrix = np.zeros((num_source_tokens+1,EMBEDDING_DIM))
for word,i in source_word2index.items():
    embeddings_vector = embeddings_index.get(word)
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector

In [20]:
# Add START_ and END_ to tokenized dictionary
# target_word2index.pop('start')
# target_word2index.pop('end')

# target_word2index['START_']  = 1
# target_word2index['_END']  = 2

### Encoder

We have to create the encoder and decoder inputs as a generator which will zero pad all sequences and make them of the same size

In [52]:
num_target_tokens +=1

In [53]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_en_words_per_sample),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_es_words_per_sample),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_es_words_per_sample, num_target_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                   
                    encoder_input_data[i, t] = source_word2index[word] 
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_word2index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        #print(word)
                        decoder_target_data[i, t - 1, target_word2index[word]] = 1.
                    
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

def generate_batch(x,y,batch_size=128):
    while True:
        for current_batch_index in range(0,len(x),batch_size):
            encoder_data_input = np.zeros((batch_size,max_en_words_per_sample),dtype='float32')
            decoder_data_input = np.zeros((batch_size,max_es_words_per_sample),dtype='float32')
            decoder_target_input = np.zeros((batch_size,max_es_words_per_sample,len(target_word2index)+1 ),dtype='float32')

            for row_index ,(input_text, target_text) in enumerate(zip(x[current_batch_index:current_batch_index+batch_size], y[current_batch_index:current_batch_index+batch_size])):
                for word_index, word in enumerate(input_text.split()):
                    encoder_data_input[row_index,word_index] = source_word2index[word]
                for word_index, word in enumerate(target_text.split()):
                    if word_index < (len(target_text.split())) - 1:
                        # print(word_index)
                        # print(word)
                        decoder_data_input[row_index,word_index] = target_word2index[word] 
                    if word_index > 0:
                        decoder_target_input[row_index,word_index-1,target_word2index[word]] = 1
    yield ([encoder_data_input,decoder_data_input],decoder_target_input)


In [54]:
# We will create a generator 

In [55]:
latent_dim = 50 # Hidden layers dimension 

In [56]:
# encoder_inputs = tf.keras.layers.Input(shape=(max_en_words_per_sample, ),name="encoder_inputs", dtype='int32')

encoder_inputs = tf.keras.layers.Input(shape=(None, ),name="encoder_inputs", dtype='int32')

# Hidden layers of the encoder :
embedding_layer = tf.keras.layers.Embedding(num_source_tokens+1,
                                            output_dim=EMBEDDING_DIM,        
                                            weights = [embeddings_matrix],
                                            # input_length=max_en_words_per_sample,
                                            trainable=False,
                                            name = 'embeddings' )
# embedding_layer =  tf.keras.layers.Embedding(num_source_tokens+1, latent_dim, mask_zero = True)

embedded_inputs = embedding_layer(encoder_inputs)
encoder_LSTM = tf.keras.layers.LSTM(latent_dim,return_state=True) 

# Output layer of the encoder :
# encoder_LSTM2_Layer = tf.keras.layers.LSTM(1024,return_sequences=True)

encoder_ouputs,state_h,state_c = encoder_LSTM(embedded_inputs)

# Next we discard the Encoder output and only keep the states
encoder_states =  [state_h,state_c]

In [57]:
# Building the decoder
# Input layer of the decoder :
decoder_inputs = tf.keras.layers.Input(shape=(None,))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the   `
# return states in the training model, but we will use them in inference.
emb_layer_decoder = tf.keras.layers.Embedding(num_target_tokens,latent_dim, mask_zero=True)(decoder_inputs)

decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(emb_layer_decoder, initial_state=encoder_states)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step

decoder_dense = tf.keras.layers.Dense(num_target_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])



Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embeddings (Embedding)          (None, None, 300)    686400      encoder_inputs[0][0]             
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 50)     227100      input_4[0][0]                    
____________________________________________________________________________________________

In [66]:
# Define Checkpoint
import os
checkpoint_name = os.path.join('checkpoints','Weights-{epoch:03d}--{val_loss:.5f}.hdf5')
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_weights_only = True, mode ='auto')

import datetime
log_dir = "logs\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)

# callbacks_list = [checkpoint,tensorboard_callback]
callbacks_list = [tensorboard_callback]

In [67]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [68]:
logs_base_dir = "logs"
os.makedirs(logs_base_dir, exist_ok=True)
%tensorboard --logdir {logs_base_dir}

Reusing TensorBoard on port 6006 (pid 26064), started 1 day, 1:38:02 ago. (Use '!kill 26064' to kill it.)

In [69]:
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
# for gpu in gpus:
#         tf.config.experimental.set_memory_growth(gpu, True)
# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [70]:
# start training
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
model.fit(generate_batch(X_train,y_train,batch_size=batch_size),
        epochs = 10,
        steps_per_epoch = train_samples//batch_size,
        validation_data=generate_batch(X_test,y_test,batch_size = batch_size),
        validation_steps= val_samples // batch_size,
        verbose =1,
        callbacks = callbacks_list)

Epoch 1/10
Epoch 2/10


UnknownError:  [_Derived_]  CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1496): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)'
	 [[{{node cond_38/then/_0/CudnnRNNV3}}]]
	 [[model_3/lstm_7/StatefulPartitionedCall]]
	 [[gradient_tape/model_3/embedding_3/embedding_lookup/Reshape/_52]] [Op:__inference_train_function_41971]

Function call stack:
train_function -> train_function -> train_function
