# Encoder-Decoder MT Attention Network

These are all the papers I reference:


1.   [Google Paper](https://arxiv.org/pdf/1609.08144.pdf): This was a study from Google on how to configure encoder/decoder networks with attention for machine translation. Most of the decisions I made on which layers to use, how many to use, which attention to use, and the other hyperparameters are taken from this paper. I mention most of them as they come up.
2.   [Luong et al](https://arxiv.org/pdf/1508.04025.pdf): This is the general scaffolding I used for writing the network. This paper is more digestible than the Bahdanau paper for the general structure of a seq2seq model.
3.   [Bahdanau et al](https://arxiv.org/pdf/1409.0473.pdf): Google references Bahdanau attention as the better choice of attention. Bahdanau attention is referenced in the Luong paper too, but is slightly modified. The difference between the two is explored below. 



In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Declare Static Variables

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

These parameters are mostly stolen from the Google Paper, except for the embedding dimension, which is determined by the GloVe vectors.

In [3]:
# Width of each word vector; matches the 200-d GloVe file loaded below.
EMBEDDING_DIM = 200
# Units of the two projection layers inside the attention layer.
ATTENTION_UNITS = 512
# LSTM units per direction in the bidirectional encoder.
ENCODER_UNITS = 1024
# LSTM units in the decoder.
DECODER_UNITS = 1024
# Number of sentence pairs per training batch.
BATCH_SIZE = 64
# Maximum input length in tokens; the holdout inputs are padded/truncated
# to this length.
MAX_INPUT_LENGTH = 32

### Load Data

In [4]:
# BASE_PATH = '/content/drive/MyDrive/Data/Data'  # location when running on Colab
# Root data directory when running locally.
BASE_PATH = '../../Data'

# All four parallel-corpus files live in the same sub-directory.
_EM_DIR = BASE_PATH + '/Supervised Data/Entertainment_Music'

# Training split (formal / informal sides of the parallel corpus).
FORMAL_PATH_TRAIN = _EM_DIR + '/S_Formal_EM_Train.txt'
INFORMAL_PATH_TRAIN = _EM_DIR + '/S_Informal_EM_Train.txt'

# Held-out split used for evaluation.
FORMAL_PATH_HOLDOUT = _EM_DIR + '/S_Formal_EM_ValTest.txt'
INFORMAL_PATH_HOLDOUT = _EM_DIR + '/S_Informal_EM_ValTest.txt'

# Pretrained 200-d GloVe vectors.
EMBEDDING_PATH = BASE_PATH + '/glove.6B.200d.txt'

In [5]:
def _read_text(path):
    """Return the full contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the contents have been read.
    """
    with open(path) as handle:
        return handle.read()


# Raw training text for both sides of the parallel corpus.
formal = _read_text(FORMAL_PATH_TRAIN)
informal = _read_text(INFORMAL_PATH_TRAIN)

# Raw held-out text for both sides of the parallel corpus.
formal_holdout = _read_text(FORMAL_PATH_HOLDOUT)
informal_holdout = _read_text(INFORMAL_PATH_HOLDOUT)

In [6]:
def process_sequence(seq):
    """Space out punctuation and wrap the sentence in start/end markers.

    Each of ``. , ! ? ( )`` is surrounded by spaces so it becomes its own
    whitespace-separated token, runs of two or more whitespace characters
    are collapsed to a single space, and surrounding whitespace is removed
    so exactly one space separates the text from the markers.

    Args:
        seq: one raw sentence.

    Returns:
        The sentence as ``'<start> ... <end>'``.
    """
    spaced = re.sub(r'([.,!?()])', r' \1 ', seq)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r'\s{2,}', ' ', spaced).strip()

    return '<start> ' + collapsed + ' <end>'

In [7]:
# One processed sentence per line of the raw text; "f" = formal, "if" = informal.
f_corpus = list(map(process_sequence, formal.split('\n')))
if_corpus = list(map(process_sequence, informal.split('\n')))

# Same treatment for the held-out split.
f_holdout = list(map(process_sequence, formal_holdout.split('\n')))
if_holdout = list(map(process_sequence, informal_holdout.split('\n')))

### Preprocess data

In [8]:
def tokenize(corpus, tokenizer=None, maxlen=None):
    """Convert sentences to post-padded integer sequences.

    Args:
        corpus: list of sentence strings.
        tokenizer: a Tokenizer to reuse. When ``None`` a new one is created
            and fitted on ``corpus``.
        maxlen: forwarded to ``pad_sequences``.

    Returns:
        ``(padded_seqs, tokenizer)``: the padded id matrix and the tokenizer
        that produced it (the one passed in, or the newly fitted one).
    """
    if tokenizer is None:
        # '<' and '>' are not in the filter list, so the '<start>' / '<end>'
        # markers survive tokenization as single tokens.
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                              oov_token='<OOV>')
        tokenizer.fit_on_texts(corpus)

    seqs = tokenizer.texts_to_sequences(corpus)
    # NOTE(review): only the padding side is set here; if maxlen is given,
    # which end gets truncated is left to the pad_sequences default — confirm
    # that is the intended end.
    padded_seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)

    return padded_seqs, tokenizer

In [9]:
# Informal sentences are the model input, formal sentences the target.
# No tokenizer is passed, so each call fits a fresh one on its own corpus.
input_train, input_tokenizer = tokenize(corpus=if_corpus)
target_train, target_tokenizer = tokenize(corpus=f_corpus)

In [10]:
# Reuse the tokenizers fitted on the training data so holdout ids share the
# training vocabulary. The input length cap comes from the named constant
# rather than a repeated literal.
input_test, _ = tokenize(if_holdout, input_tokenizer, MAX_INPUT_LENGTH)
target_test, _ = tokenize(f_holdout, target_tokenizer)

In [11]:
# Shuffle over the whole training set.
buffer_size = len(input_train)
steps_per_epoch = buffer_size // BATCH_SIZE

# One extra slot on top of the tokenizer's word index; id 0 is the padding
# value masked out in the loss.
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Shuffled training batches; the incomplete final batch is dropped so every
# batch has exactly BATCH_SIZE rows.
train = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(buffer_size)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# The holdout set is fed one sentence pair at a time.
test = tf.data.Dataset.from_tensor_slices((input_test, target_test)).batch(1)

In [12]:
# Pull one training batch to smoke-test the layers defined below.
example_input_batch, example_target_batch = next(iter(train))

In [13]:
# Pull one holdout example (batch size 1) for inspection.
test_in_batch, test_out_batch = next(iter(test))

# Setup Embedding Weights

In [14]:
def embedding_matrix(tokenizer, vocab_size, embedding_dim, embedding_path=None):
    """Build an embedding weight matrix from a GloVe-style text file.

    Each non-empty line of the file is expected to hold a word followed by
    its vector components, separated by whitespace.

    Args:
        tokenizer: object exposing ``word_index``, a mapping from word to
            integer row index.
        vocab_size: number of rows of the returned matrix.
        embedding_dim: number of columns of the returned matrix; must equal
            the vector length stored in the file.
        embedding_path: path of the vector file. Defaults to the module-level
            ``EMBEDDING_PATH`` when ``None``.

    Returns:
        A ``(vocab_size, embedding_dim)`` array. Rows of words that do not
        appear in the file (and row 0) are left as zeros.
    """
    path = EMBEDDING_PATH if embedding_path is None else embedding_path

    embeddings_index = {}
    # Explicit UTF-8 so loading does not depend on the platform's default
    # encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embeddings_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    return embeddings_matrix

In [15]:
# Pretrained vectors are looked up for the input (informal) vocabulary only.
enc_E_mat = embedding_matrix(input_tokenizer, input_vocab_size, EMBEDDING_DIM)

### Encoder

Using a single bidirectional LSTM because that was reported to get almost as good performance as 2 LSTMs.

In [16]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by a single bidirectional LSTM.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        encoder_units: LSTM units per direction.
        weight_matrix: initial embedding weights, handed to the Embedding
            layer through ``weights=[weight_matrix]``.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, weight_matrix):
        super().__init__()
        self.encoder_units = encoder_units
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, weights=[weight_matrix])

        # The wrapped LSTM returns the per-step outputs and its final states.
        recurrent = tf.keras.layers.LSTM(encoder_units,
                                         return_sequences=True,
                                         return_state=True)
        self.lstm_1 = tf.keras.layers.Bidirectional(recurrent)

    def call(self, x, hidden_state):
        """Encode a batch of token ids.

        Args:
            x: token ids, (batch_size, max_length).
            hidden_state: initial state passed to the bidirectional LSTM
                (callers in this file pass four tensors or ``None``).

        Returns:
            ``(output, h_f)``: the per-step outputs and the first of the four
            returned states. The remaining three states are discarded.
        """
        # embedded shape: (batch_size, max_length, embedding_dim)
        embedded = self.embedding(x)

        # output shape: (batch_size, max_length, 2 * encoder_units)
        # h_f shape: (batch_size, encoder_units)
        # assumes the state order is forward h, forward c, backward h,
        # backward c — TODO confirm against the Keras Bidirectional docs
        output, h_f, _c_f, _h_b, _c_b = self.lstm_1(
            embedded, initial_state=hidden_state)

        return output, h_f

In [17]:
encoder = Encoder(vocab_size=input_vocab_size,
                  embedding_dim=EMBEDDING_DIM,
                  encoder_units=ENCODER_UNITS,
                  weight_matrix=enc_E_mat)

# Four zero tensors of shape (BATCH_SIZE, ENCODER_UNITS) used as the initial
# state of the bidirectional LSTM.
init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

# Smoke-test the encoder on the example batch.
sample_output, hidden = encoder(example_input_batch, init_state)

### Attention Layer

In [18]:
class GlobalAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over the encoder outputs.

    Named GlobalAttention because that is what Bahdanau attention is called
    in the Luong paper, whose scaffolding most of this implementation
    follows.

    NOTE(review): the original description said the difference from Luong's
    global attention is concatenating the forward and backward hidden states,
    but the sentence was left unfinished, and the Encoder in this file
    returns only one hidden state. Confirm which was intended.
    """

    def __init__(self, units):
        """Create the W, U and v layers (notation from the Bahdanau paper).

        Args:
            units: output size of the ``W`` and ``U`` projections.
        """
        super().__init__()
        self.W = tf.keras.layers.Dense(units)
        self.U = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)

    def call(self, enc_opt, hidden):
        """Return the attention-weighted sum of the encoder outputs.

        Args:
            enc_opt: encoder outputs, (batch_size, max_length, features).
            hidden: query state, (batch_size, hidden_units).

        Returns:
            Context vector of shape (batch_size, features).
        """
        # Add a length-1 time axis so the query broadcasts against every
        # encoder step: (batch_size, 1, hidden_units).
        query = tf.expand_dims(hidden, 1)

        # Alignment model from A.1.2 of Bahdanau et al.
        # energy shape: (batch_size, max_length, units)
        energy = tf.nn.tanh(self.W(query) + self.U(enc_opt))
        # score shape: (batch_size, max_length, 1)
        score = self.v(energy)

        # Softmax over the time axis (generalisation of eq. (7) in Luong).
        # attention_weights shape: (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs over the time axis.
        context = tf.reduce_sum(attention_weights * enc_opt, axis=1)

        return context

In [19]:
# Smoke-test the attention layer on the encoder's sample outputs.
attention_layer = GlobalAttention(units=ATTENTION_UNITS)
attention_result = attention_layer(sample_output, hidden)

### Decoder

In [20]:
class Decoder(tf.keras.Model):
    """Single-step decoder: attention, embedding, LSTM and a vocab projection.

    Args:
        vocab_size: size of the target vocabulary (embedding rows and number
            of output logits).
        embedding_dim: width of the target embeddings.
        attention_units: units of the attention projections.
        decoder_units: units of the decoder LSTM.
    """

    def __init__(self, vocab_size, embedding_dim, attention_units,
                 decoder_units):
        super().__init__()
        self.attention = GlobalAttention(attention_units)
        # No pretrained weights are passed to the target-side embedding.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm_1 = tf.keras.layers.LSTM(decoder_units,
                                           return_sequences=True,
                                           return_state=True)
        self.flatten = tf.keras.layers.Flatten()
        self.opt = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, encoder_output):
        """Run one decoding step.

        Args:
            x: previous token ids, (batch_size, 1).
            hidden: state used as the attention query.
            encoder_output: per-step encoder outputs attended over.

        Returns:
            ``(logits, h_f)``: vocabulary logits of shape (batch_size, vocab)
            and the LSTM's hidden state after this step.
        """
        context_vector = self.attention(encoder_output, hidden)

        # embedded shape: (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # Prepend the context to the embedding along the feature axis.
        # lstm_input shape: (batch_size, 1, context_features + embedding_dim)
        context_step = tf.expand_dims(context_vector, 1)
        lstm_input = tf.concat([context_step, embedded], axis=-1)

        # output shape: (batch_size, 1, decoder_units); the time axis only
        # matters if the decoder is made deeper.
        # NOTE(review): the LSTM is called without initial_state, so `hidden`
        # only drives attention and the recurrent state is not carried from
        # one decoding step to the next — confirm this is intended.
        output, h_f, _ = self.lstm_1(lstm_input)

        # Flatten away the length-1 time axis, then project to the vocabulary.
        logits = self.opt(self.flatten(output))

        return logits, h_f

In [21]:
decoder = Decoder(vocab_size=target_vocab_size,
                  embedding_dim=EMBEDDING_DIM,
                  attention_units=ATTENTION_UNITS,
                  decoder_units=DECODER_UNITS)

# A (BATCH_SIZE, 1) column holding the '<start>' id for every sequence.
_start_id = target_tokenizer.word_index['<start>']
sample_decoder_input = tf.expand_dims([_start_id] * BATCH_SIZE, 1)

# Smoke-test one decoding step on the encoder's sample outputs.
sample_decoder_output, _ = decoder(sample_decoder_input, hidden, sample_output)

### Optimizer and Loss Function

In [26]:
# Print the layer/parameter tables of both models.
for _model in (encoder, decoder):
    _model.summary()

Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  3511000   
_________________________________________________________________
bidirectional (Bidirectional multiple                  10035200  
Total params: 13,546,200
Trainable params: 13,546,200
Non-trainable params: 0
_________________________________________________________________
Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
global_attention_1 (GlobalAt multiple                  1574401   
_________________________________________________________________
embedding_1 (Embedding)      multiple                  3303400   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  13406208  
______________________________

Here we define the optimizer and the loss function. In our loss function we mask the zeros since that's the padding.

Also of note in the loss function: the default `reduction` argument does some really wonky things which threw off all the results. I had to change the reduction from its default, `auto`, to `none`. I'm not exactly sure what `auto` does in this context, but it tries to sum over batches. I didn't work with it because I wanted to control all of the loss calculation manually. 

In [22]:
# Adam with its default settings, shared by the encoder and decoder updates.
optimizer = tf.keras.optimizers.Adam()
# The model emits raw logits (from_logits=True). reduction='none' keeps one
# loss value per example so padding can be masked before averaging.
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [23]:
def loss_function(real, preds):
    """Return the padding-masked mean cross-entropy for one decoding step.

    Args:
        real: target token ids for this step; id 0 marks padding.
        preds: logits produced by the decoder for this step.

    Returns:
        Scalar mean of the per-example losses. Padded positions contribute
        zero but are still counted in the mean's denominator.
    """
    per_example = static_loss(real, preds)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    is_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_token, dtype=per_example.dtype)

    return tf.reduce_mean(per_example * weights)

### Training

This is the training loop. I'm using teacher forcing during training, and plain greedy decoding at evaluation time because I don't think I have enough computational power to use beam search. Google recommends beam search with a beam width of 10, but that isn't an option here. Teacher forcing will provide better results than training without it. 

In [24]:
@tf.function
def train_step(inpt, trgt, init_state):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inpt: input token ids for the batch.
        trgt: target token ids for the batch, (batch_size, target_length).
        init_state: initial state handed to the encoder.

    Returns:
        The batch loss: the sum over target positions 1..target_length-1 of
        the per-step ``loss_function`` value.

    The start-token column is built with ``BATCH_SIZE`` entries, so every
    batch must contain exactly ``BATCH_SIZE`` rows.
    """
    loss = 0
    start_id = target_tokenizer.word_index['<start>']

    with tf.GradientTape() as tape:
        enc_output, dec_hidden_forward = encoder(inpt, init_state)

        # Start token for every sequence in the batch: (batch_size, 1)
        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

        for i in range(1, trgt.shape[1]):
            # dec_hidden_forward shape: (batch_size, units)
            # dec_input shape: (batch_size, 1)
            predictions, dec_hidden_forward = decoder(dec_input,
                                                      dec_hidden_forward,
                                                      enc_output)

            loss += loss_function(trgt[:, i], predictions)

            # Teacher forcing: the next input is the ground-truth token at
            # the current position i (not a fixed column of the target).
            dec_input = tf.expand_dims(trgt[:, i], 1)

    # Differentiate and update outside the tape context so the optimiser's
    # own operations are not recorded.
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss

In [25]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = datetime.now()

    total_loss = 0

    # Zero initial state for the encoder, rebuilt at the start of every epoch
    # and passed unchanged to every batch.
    init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

    for inpt, trgt in train.take(steps_per_epoch):
        total_loss += train_step(inpt, trgt, init_state)

    # Report the mean batch loss: total_loss is a sum over steps_per_epoch
    # batches, so that count (not the batch size) is the divisor. The max()
    # guards against a dataset smaller than one batch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / max(steps_per_epoch, 1)))
    print('Time taken {}\n'.format(datetime.now() - start))

KeyboardInterrupt: 

In [None]:
def evaluate(data):
    """Greedily decode every example in ``data`` and score the output.

    Args:
        data: iterable of ``(x, y)`` pairs of input and target token ids.
            Each pair is expected to hold a single sentence (the decoder is
            seeded with one start token), and example ``i`` is scored against
            ``f_holdout[i]``.

    Returns:
        ``(results, mean_loss, mean_bleu)``: the decoded sentences as
        ``'<start> ... <end>'`` strings, and the loss and sentence-level BLEU
        averaged over the examples seen.
    """
    start_id = target_tokenizer.word_index['<start>']

    loss = 0
    bleu = 0
    results = []
    n_examples = 0

    for i, (x, y) in enumerate(data):
        n_examples += 1
        tokens = ['<start>']

        # Feed the start token as the first decoder input.
        dec_input = tf.expand_dims([start_id], 0)

        # No initial state is supplied to the encoder at evaluation time.
        enc_output, dec_hidden_forward = encoder(x, None)

        # Decode at most one token per remaining target position.
        for j in range(1, y.shape[1]):
            # dec_input shape: (1, 1)
            predictions, dec_hidden_forward = decoder(dec_input,
                                                      dec_hidden_forward,
                                                      enc_output)

            loss += loss_function(y[:, j], predictions)

            # Greedy choice: the id with the largest logit.
            word_idx = tf.argmax(predictions[0]).numpy()
            # Id 0 (padding) has no entry in index_word; treat it as an empty
            # token instead of raising KeyError.
            word = target_tokenizer.index_word.get(word_idx, '')

            if word == '<end>':
                tokens.append(word)
                break
            if word:
                tokens.append(word)

            dec_input = tf.expand_dims([word_idx], 0)
        else:
            # Length budget exhausted without an '<end>' prediction: close
            # the sentence exactly once.
            tokens.append('<end>')

        results.append(' '.join(tokens))

        # sentence_bleu expects a list of tokenised references and a
        # tokenised hypothesis, not raw strings.
        # NOTE(review): the reference keeps its original casing and
        # punctuation tokens while the hypothesis comes from the tokenizer's
        # vocabulary — confirm whether the reference should be normalised the
        # same way before scoring.
        bleu += sentence_bleu([f_holdout[i].split()], tokens)

    # Average over the examples actually seen; guard the empty case.
    denominator = max(n_examples, 1)

    return results, float(loss) / denominator, bleu / denominator

In [None]:
# Decode the holdout set; keeps the decoded sentences plus the averaged loss
# and BLEU returned by evaluate().
results, test_loss, bleu = evaluate(test)

In [None]:
# Notebook display of the holdout loss.
test_loss

In [None]:
# Notebook display of the holdout BLEU score.
bleu

In [None]:
def examine(index):
    """Print the informal source, formal reference and prediction at ``index``.

    Reads the module-level ``if_holdout``, ``f_holdout`` and ``results``
    lists.
    """
    rows = (
        ("Informal: ", if_holdout),
        ("Formal: ", f_holdout),
        ("Predicted: ", results),
    )
    for label, sentences in rows:
        print(label, sentences[index])

In [None]:
# Spot-check the holdout example at index 2.
examine(2)

In [None]:
# Persist the encoder and decoder weights under BASE_PATH/Results.
# NOTE(review): '.txt' is an unusual suffix for Keras weight files — confirm
# which on-disk format save_weights produces for these paths.
encoder.save_weights(BASE_PATH + '/Results/Global-Attention-Encoder.txt')
decoder.save_weights(BASE_PATH + '/Results/Global-Attention-Decoder.txt')

In [None]:
# Write the decoded sentences to disk, one per line.
with open(BASE_PATH + '/Results/global.txt', 'w') as f:
    f.writelines(seq + '\n' for seq in results)