# SEQUENCE TO SEQUENCE MODEL

In [7]:
# IMPORTING ALL NECCESSERRY LIBRARIES
import os
import numpy as np
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.layers import *
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
import matplotlib.pyplot as plt

In [8]:
def softmax_over_time(x):
    assert(K.ndim(x) > 2)
    e = K.exp(x - K.max(x,axis=1,keepdims=True))
    s = K.sum(e,axis=1,keepdims=True)
    return e/s

In [9]:
# config
BATCH_SIZE=64
EPOCHS=40
LATENT_DIM=256
NUM_OF_SAMPLES=10000
MAX_NUM_WORDS=20000
EMBEDDING_DIM=100

In [10]:
# WHERE WE WILL STORE THE DATA
input_texts=[]
target_texts=[]
target_texts_inputs=[] # for force teaching in decoding layer

In [11]:
# CAPTURING THE DATA only 10000 sentences
t=0
for line in open("./spa-eng/spa.txt"):
    t+=1
    if t > NUM_OF_SAMPLES:
        break
    
    if '\t' not in line:
        continue
    a=line
    input_text,translation,_=line.rstrip().split("\t")
    target_text= translation + '<eos>'
    target_text_input = '<sos>' + translation
    
    input_texts.append(input_text)
    target_texts.append(target_text)
    target_texts_inputs.append(target_text_input)
    

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 4638: character maps to <undefined>

In [15]:
# no of samples
print("No. of samples : ",len(input_texts))

No. of samples :  10000


In [16]:
# Tokenize our input sentences
tokenizer_input=Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_input.fit_on_texts(input_texts)
input_sequences=tokenizer_input.texts_to_sequences(input_texts)

In [17]:
# get word to idx mapping for each word
word2idx_inputs=tokenizer_input.word_index

In [18]:
# unique input words
print("Unique words : ",len(word2idx_inputs))

Unique words :  2146


In [19]:
# MAXIMUM LENGTH IN INPUT SENTENCES
max_len_input=max(len(s) for s in input_sequences)
print("Maximum length : ",max_len_input )

Maximum length :  5


In [20]:
# TOKENIZE OUTPUT SENTENCES
tokenizer_outputs=Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_outputs.fit_on_texts(target_texts+target_texts_inputs)

target_sequences=tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_input=tokenizer_outputs.texts_to_sequences(target_texts_inputs)

In [21]:
# WORD 2 IDX MAPPING FOR EACH WORD
word2idx_output=tokenizer_outputs.word_index
num_words_output = len(word2idx_output) + 1

In [22]:
print("Unique words : ",len(word2idx_output) )

Unique words :  4521


In [23]:
# MAXIMUM LENGTH OF TARGET SENTENCE
max_len_target=max(len(a) for a in target_sequences)
print(max_len_target)

11


In [24]:
# PADDING FOR ENCODER
encoder_inputs = pad_sequences(input_sequences,maxlen=max_len_input)
print("encoder_inputs.shape:", encoder_inputs.shape)
print("encoder_inputs[0]:", encoder_inputs[0])


encoder_inputs.shape: (10000, 5)
encoder_inputs[0]: [ 0  0  0  0 15]


In [25]:
# PADDING FOR DECODER 
decoder_inputs= pad_sequences(target_sequences_input, maxlen = max_len_target, padding = 'post')
print("decoder_inputs.shape:", decoder_inputs.shape)
print("decoder_inputs[0]:", decoder_inputs[0])


decoder_targets=pad_sequences(target_sequences,maxlen = max_len_target, padding = 'post')
print("decoder_targets.shape:", decoder_targets.shape)
print("decoder_targets[0]:", decoder_targets[0])

decoder_inputs.shape: (10000, 11)
decoder_inputs[0]: [ 2 51  0  0  0  0  0  0  0  0  0]
decoder_targets.shape: (10000, 11)
decoder_targets[0]: [51  1  0  0  0  0  0  0  0  0  0]


In [26]:
# STORING ALL WORDS FROM GLOVE TO DICT
word2vec = {}

with open("./glove6b100dtxt/glove.6B.100d.txt",encoding='utf8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vector=np.asarray((values[1:]),dtype='float32')
        word2vec[word]=vector


In [27]:
# PREPARE EMBEDDING MATRIX
num_words = min(MAX_NUM_WORDS,len(word2idx_inputs)+1)
embedding_matrix = np.zeros((num_words,EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    if i < MAX_NUM_WORDS:
        embedding_vector=word2vec.get(word)
        
        if embedding_vector is not None:
            embedding_matrix[i]=embedding_vector


In [28]:
# ONE HOT ENCODING
decoder_targets_onehot=to_categorical(decoder_targets)
print(decoder_targets_onehot.shape)

(10000, 11, 4522)


# MODEL PREPERATION

In [29]:
# CREATE EMBEDDING LAYER
embedding_layer=Embedding(num_words,EMBEDDING_DIM,weights=[embedding_matrix],input_length=max_len_input)

In [99]:
# SETTING UP ENCODER
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(
  LATENT_DIM,
  return_sequences=True,
  dropout=0.5 # dropout not available on gpu
))
encoder_outputs = encoder(x)

# Set up the decoder - not so simple
decoder_inputs_placeholder = Input(shape=(max_len_target,))

# this word embedding will not use pre-trained vectors
# although you could
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)




######### Attention #########
# Attention layers need to be global because
# they will be repeated Ty times at the decoder
attn_repeat_layer = RepeatVector(max_len_input)
attn_concat_layer = Concatenate(axis=-1)
attn_dense1 = Dense(10, activation='tanh')
attn_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1) # to perform the weighted sum of alpha[t] * h[t]

def one_step_attention(h, st_1):
    # h = h(1), ..., h(Tx), shape = (Tx, LATENT_DIM * 2)
    # st_1 = s(t-1), shape = (LATENT_DIM_DECODER,)

    # copy s(t-1) Tx times
    # now shape = (Tx, LATENT_DIM_DECODER)
    st_1 = attn_repeat_layer(st_1)

    # Concatenate all h(t)'s with s(t-1)
    # Now of shape (Tx, LATENT_DIM_DECODER + LATENT_DIM * 2)
    x = attn_concat_layer([h, st_1])

    # Neural net first layer
    x = attn_dense1(x)
    # Neural net second layer with special softmax over time
    alphas = attn_dense2(x)

    # "Dot" the alphas and the h's
    # Remember a.dot(b) = sum over a[t] * b[t]
    context = attn_dot([alphas, h])
    

    return context


# define the rest of the decoder (after attention)
decoder_lstm = LSTM(LATENT_DIM, return_state=True)
decoder_dense = Dense(num_words_output, activation='softmax')

initial_s = Input(shape=(LATENT_DIM,), name='s0')
initial_c = Input(shape=(LATENT_DIM,), name='c0')
context_last_word_concat_layer = Concatenate(axis=2)


# Unlike previous seq2seq, we cannot get the output
# all in one step
# Instead we need to do Ty steps
# And in each of those steps, we need to consider
# all Tx h's

# s, c will be re-assigned in each iteration of the loop
s = initial_s
c = initial_c

# collect outputs in a list at first
outputs = []
for t in range(max_len_target): # Ty times
    # get the context using attention
    context = one_step_attention(encoder_outputs, s)

    # we need a different layer for each time step
    selector = Lambda(lambda x: x[:, t:t+1]) 
    
    xt = selector(decoder_inputs_x)
    # combine 
    decoder_lstm_input = context_last_word_concat_layer([context, xt])

    # pass the combined [context, last word] into the LSTM
    # along with [s, c]
    # get the new [s, c] and output
    o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s, c])
    # final dense layer to get next word prediction
    decoder_outputs = decoder_dense(o)

    outputs.append(decoder_outputs)


# 'outputs' is now a list of length Ty
# each element is of shape (batch size, output vocab size)
# therefore if we simply stack all the outputs into 1 tensor
# it would be of shape T x N x D
# we would like it to be of shape N x T x D

def stack_and_transpose(x):
    # x is a list of length T, each element is a batch_size x output_vocab_size tensor
    x = K.stack(x) # is now T x batch_size x output_vocab_size tensor
    x = K.permute_dimensions(x, pattern=(1, 0, 2)) # is now batch_size x T x output_vocab_size
    return x

# make it a layer
stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)
print(outputs.shape)

# create the model
model = Model(
  inputs=[
    encoder_inputs_placeholder,
    decoder_inputs_placeholder,
    initial_s, 
    initial_c,
  ],
  outputs=outputs
)


(None, 11, 4522)


In [40]:
model.summary()
model.compile(optimizer='adam',loss='categorical_crossentropy')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           [(None, 5)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 5, 100)       214700      input_15[0][0]                   
__________________________________________________________________________________________________
s0 (InputLayer)                 [(None, 256)]        0                                            
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) (None, 5, 512)       731136      embedding[7][0]                  
______________________________________________________________________________________________

In [None]:
z = np.zeros((len(encoder_inputs), LATENT_DIM_DECODER)) # initial [s, c]
r = model.fit(
  [encoder_inputs, decoder_inputs, z, z], decoder_targets_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
)

In [None]:
model.load_weights("seq2seqweights.h5")

# MAKE MODEL

In [100]:
##### Make predictions #####
# As with the poetry example, we need to create another model
# that can take in the RNN state and previous word as input
# and accept a T=1 sequence.

# The encoder will be stand-alone
# From this we will get our initial decoder hidden state
# i.e. h(1), ..., h(Tx)
encoder_model = Model(encoder_inputs_placeholder, encoder_outputs)

# next we define a T=1 decoder model
encoder_outputs_as_input = Input(shape=(max_len_input, LATENT_DIM * 2,))
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# no need to loop over attention steps this time because there is only one step
context = one_step_attention(encoder_outputs_as_input, initial_s)

# combine context with last word
decoder_lstm_input = context_last_word_concat_layer([context, decoder_inputs_single_x])




# lstm and final dense
o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s, initial_c])
decoder_outputs = decoder_dense(o)


# note: we don't really need the final stack and tranpose
# because there's only 1 output
# it is already of size N x D
# no need to make it 1 x N x D --> N x 1 x D



# create the model object
decoder_model = Model(
  inputs=[
    decoder_inputs_single,
    encoder_outputs_as_input,
    initial_s, 
    initial_c
  ],
  outputs=[decoder_outputs, s, c]
)

In [102]:
idx2word_eng = {v:k for k, v in word2idx_inputs.items()}
idx2word_trans = {v:k for k, v in word2idx_output.items()}

In [115]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    enc_out = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))

    # Populate the first character of target sequence with the start character.
    # NOTE: tokenizer lower-cases all words
    target_seq[0, 0] = word2idx_output['sos']

    # if we get this we break
    eos = word2idx_output['eos']


    # [s, c] will be updated in each loop iteration
    s = np.zeros((1, LATENT_DIM))
    c = np.zeros((1, LATENT_DIM))


    # Create the translation
    output_sentence = []
    for _ in range(max_len_target):
        o, s, c = decoder_model.predict([target_seq, enc_out, s, c])


        # Get next word
        idx = np.argmax(o.flatten())

        # End sentence of EOS
        if eos == idx:
            break

        word = ''
        if idx > 0:
            word = idx2word_trans[idx]
            output_sentence.append(word)

        # Update the decoder input
        # which is just the word just generated
        target_seq[0, 0] = idx

    return ' '.join(output_sentence)


In [116]:
while True:
    # Do some test translations
    i = np.random.choice(len(input_texts))
    input_seq = encoder_inputs[i:i+1]
    translation_a = decode_sequence(input_seq)
    print('-')
    print('Input:', input_texts[i])
    print('Translation:', translation_a)
    print('Original:',target_texts[i])

    ans = input("Continue? [Y/n]")
    if ans and ans.lower().startswith('n'):
        break

-
Input: I'm so unlucky!
Translation: vã©rifie vã©rifie soit arnaque arnaque arnaque arnaque arnaque arnaque arnaque arnaque
Original: Quelle poisse j'aiâ€¯!<eos>
Continue? [Y/n]n
