# Encoder-Decoder MT Attention Network

In [37]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Declare Static Variables

These hyperparameters are mostly borrowed from the Google paper.

In [2]:
# Model and training hyperparameters shared by the cells below.
EMBEDDING_DIM = 256  # embedding_dim passed to both the Encoder and the Decoder
ATTENTION_UNITS = 10  # units passed to GlobalAttention
ENCODER_UNITS = 1024  # units given to each encoder LSTM
DECODER_UNITS = 1024  # units given to the decoder LSTM
BATCH_SIZE = 64  # batch size used when batching the training dataset

### Load Data

In [3]:
def _read_text(path):
    """Return the full contents of the text file at ``path``.

    The file is opened inside a context manager so the handle is closed as
    soon as the contents have been read (the bare ``open(path).read()`` form
    leaves the handle open until garbage collection).
    """
    with open(path) as handle:
        return handle.read()


# Parallel training corpora: one sentence per line in each file.
formal = _read_text('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt')
informal = _read_text('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt')

# Held-out (validation/test) counterparts of the two corpora.
formal_holdout = _read_text('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt')
informal_holdout = _read_text('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt')

In [4]:
def process_sequence(seq):
    """Space out punctuation and wrap the sentence in boundary markers.

    Each of the characters ``. , ! ? ( )`` is surrounded by single spaces so
    that it becomes its own whitespace-delimited token, runs of two or more
    whitespace characters are collapsed to one space, and the result is
    returned as ``'<start> ' + text + ' <end>'``.

    A single leading or trailing space left behind by punctuation at the
    edge of ``seq`` is not stripped, so the returned string can contain two
    consecutive spaces next to a marker (e.g. ``'. ' + ' <end>'``).

    Args:
        seq: One raw sentence.

    Returns:
        The processed sentence as a string.
    """
    s = re.sub(r'([.,!?()])', r' \1 ', seq)
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    s = re.sub(r'\s{2,}', ' ', s)

    return '<start> ' + s + ' <end>'

In [5]:
# Split each raw file on newlines and run every line through
# process_sequence (training corpora first, then the held-out ones).
f_corpus = list(map(process_sequence, formal.split('\n')))
if_corpus = list(map(process_sequence, informal.split('\n')))

f_holdout = list(map(process_sequence, formal_holdout.split('\n')))
if_holdout = list(map(process_sequence, informal_holdout.split('\n')))

### Preprocess data

In [6]:
def tokenize(corpus):
    """Fit a new tokenizer on ``corpus`` and return its padded id sequences.

    The filter string does not contain ``<`` or ``>``, and unknown words map
    to the ``<OOV>`` token. Sequences are padded at the end (``'post'``).

    Args:
        corpus: Iterable of sentences (strings).

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple.
    """
    fitted = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', oov_token='<OOV>')
    fitted.fit_on_texts(corpus)

    padded = pad_sequences(fitted.texts_to_sequences(corpus), padding='post')
    return padded, fitted

In [7]:
# Informal sentences are the model input and formal sentences the target;
# each corpus gets its own tokenizer.
input_train, input_tokenizer = tokenize(if_corpus)
target_train, target_tokenizer = tokenize(f_corpus)

In [8]:
# Sizes derived from the padded training arrays.
buffer_size = len(input_train)
steps_per_epoch = buffer_size // BATCH_SIZE

# Vocabulary sizes: one more than the number of entries in each word index
# (presumably to leave room for the padding id 0 -- TODO confirm).
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Shuffle across the whole training set, then emit full batches only.
train = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(buffer_size)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [9]:
# Pull one (input, target) batch from the training dataset for the sample
# calls in the cells below.
example_input_batch, example_target_batch = next(iter(train))

### Encoder

In [10]:
class Encoder(tf.keras.Model):
    """Embedding followed by two stacked bidirectional LSTM layers.

    Args:
        vocab_size: Size of the input vocabulary (rows of the embedding).
        embedding_dim: Width of the token embeddings.
        encoder_units: Units of each LSTM direction.
        batch_size: Batch size, used only to build the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # First layer only passes its per-step outputs on to the second one.
        self.lstm_1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.encoder_units,
                return_sequences=True
            )
        )
        # Second layer additionally returns the final states of both
        # directions.
        self.lstm_2 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.encoder_units,
                return_sequences=True,
                return_state=True
            )
        )

    def call(self, x, hidden=None):
        """Encode a batch of token-id sequences.

        Args:
            x: Batch of token ids.
            hidden: Optional initial state for the first bidirectional
                layer; when given it must be a list in the layout Keras
                expects for a bidirectional LSTM (see
                ``get_initial_hidden_state``). ``None`` uses the default.

        Returns:
            ``(output, h_f, c_f)`` where ``output`` is the per-step output
            of the second bidirectional layer, ``h_f`` is the final hidden
            state of its forward pass and ``c_f`` the final cell state of
            its forward pass. The backward-pass states are discarded.
        """

        x = self.embedding(x)
        x = self.lstm_1(x, initial_state=hidden)
        output, h_f, c_f, _, _ = self.lstm_2(x)

        return output, h_f, c_f

    def get_initial_hidden_state(self):
        """Return an all-zero initial state usable as ``hidden`` in ``call``.

        The result is a list of four ``(batch_size, encoder_units)`` zero
        tensors -- hidden and cell state for the forward LSTM followed by
        hidden and cell state for the backward LSTM -- because a
        bidirectional layer rejects a single tensor as ``initial_state``.
        """
        state_shape = (self.batch_size, self.encoder_units)
        return [tf.zeros(state_shape) for _ in range(4)]

    def initialize_hidden_state(self):
        """Alias of ``get_initial_hidden_state`` (name used by the training loop)."""
        return self.get_initial_hidden_state()

In [11]:
# Build the encoder and run it once on the example batch; the results are
# reused as sample inputs by the attention and decoder cells.
encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, BATCH_SIZE)

sample_output, sample_forward_hidden, sample_cell = encoder(example_input_batch)

### Attention Layer

This is an implementation of the additive (concat) score
$$ score = {v_a}^T \tanh(W_a h_t + U_a h_s) $$
where $h_t$ is the query hidden state and $h_s$ are the encoder outputs. The attention weights are then obtained with
$$ \text{softmax}(score) $$
This strays from the Bahdanau paper by using only the forward-pass hidden state of the BiRNN from the last LSTM layer of the encoder.

In [29]:
class GlobalAttention(tf.keras.layers.Layer):
    """Additive attention over all encoder outputs.

    Args:
        units: Width of the two projection layers ``W`` and ``U``.
    """

    def __init__(self, units):
        super(GlobalAttention, self).__init__()
        self.W = tf.keras.layers.Dense(units)  # projects the query state
        self.U = tf.keras.layers.Dense(units)  # projects the encoder outputs
        self.v = tf.keras.layers.Dense(1)      # reduces each position to one score

    def call(self, enc_opt, hidden):
        """Compute the context vector and attention weights.

        Args:
            enc_opt: Encoder outputs; axis 1 is the axis the softmax and the
                weighted sum run over.
            hidden: Query state; a length-1 axis is inserted at position 1
                so it broadcasts against ``enc_opt`` along axis 1.

        Returns:
            ``(context_vector, attention_weights)``: the attention-weighted
            sum of ``enc_opt`` over axis 1, and the softmax weights.
        """
        hidden_broad = tf.expand_dims(hidden, 1)
        score = self.v(tf.nn.tanh(self.W(hidden_broad) + self.U(enc_opt)))
        attention_weights = tf.nn.softmax(score, axis=1)

        # The softmax weights already sum to 1 along axis 1, so the weighted
        # average is a plain sum; reduce_mean would additionally divide the
        # context by the sequence length.
        context_vector = attention_weights * enc_opt
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [30]:
# Try the attention layer on the sample encoder outputs, using the sample
# forward hidden state as the query, and inspect the weights' shape.
attention_layer = GlobalAttention(ATTENTION_UNITS)
attention_result, attention_weights = attention_layer(sample_output, 
                                                      sample_forward_hidden)
attention_weights.shape

TensorShape([64, 32, 1])

### Decoder

In [31]:
# Shape of the context vector returned by the attention layer.
attention_result.shape

TensorShape([64, 2048])

In [32]:
class Decoder(tf.keras.Model):
    """One-step decoder: attend, embed the token, run the LSTM, project to the vocabulary.

    Args:
        vocab_size: Size of the target vocabulary.
        embedding_dim: Width of the token embeddings.
        attention_units: Units passed to ``GlobalAttention``.
        decoder_units: Units of the decoder LSTM.
        batch_size: Stored on the instance; not read by ``call``.
    """

    def __init__(self, vocab_size, embedding_dim, attention_units, decoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.attention = GlobalAttention(attention_units)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm_1 = tf.keras.layers.LSTM(
            decoder_units,
            return_sequences=True,
            return_state=True,
        )
        self.opt = tf.keras.layers.Dense(vocab_size)

    def call(self, x, h_f, encoder_output):
        """Run a single decoding step.

        Args:
            x: Current input token ids.
            h_f: Hidden state used as the attention query. The LSTM itself
                is called without an initial state.
            encoder_output: Encoder outputs to attend over.

        Returns:
            ``(scores, h, c, attention_weights)``: the output of the final
            Dense layer, the LSTM's returned hidden and cell states, and the
            attention weights.
        """
        context, attention_weights = self.attention(encoder_output, h_f)

        embedded = self.embedding(x)

        # Prepend the context vector to the embedded token along the last axis.
        context_step = tf.expand_dims(context, 1)
        lstm_input = tf.concat([context_step, embedded], axis=-1)

        sequence_out, next_h, next_c = self.lstm_1(lstm_input)
        # Merge the leading axes so the Dense layer sees a rank-2 tensor.
        flat_out = tf.reshape(sequence_out, (-1, sequence_out.shape[2]))

        scores = self.opt(flat_out)
        return scores, next_h, next_c, attention_weights

In [33]:
decoder = Decoder(target_vocab_size, EMBEDDING_DIM, ATTENTION_UNITS, DECODER_UNITS, BATCH_SIZE)

# Run one decoder step on random *integer* token ids drawn from the target
# vocabulary. The default float draw lies in [0, 1), which is not a
# meaningful input for the embedding lookup.
sample_decoder_input = tf.random.uniform(
    (BATCH_SIZE, 1), maxval=target_vocab_size, dtype=tf.int32
)
sample_decoder_output, _, _, _ = decoder(sample_decoder_input, sample_forward_hidden, sample_output)

In [36]:
# sample_decoder_output

### Optimizer and Loss Function

In [18]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element so it can be masked below.
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)


def loss_function(real, pred):
    """Cross-entropy with positions whose target id is 0 zeroed out.

    Args:
        real: Target token ids.
        pred: Unnormalised prediction scores (the loss uses ``from_logits``).

    Returns:
        The mean of the masked per-element losses; zeroed entries still
        count towards the mean's denominator.
    """
    per_element = static_loss(real, pred)

    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_real_token, dtype=per_element.dtype)

    return tf.reduce_mean(per_element * weights)

### Training

In [41]:
# Look up the integer id the target tokenizer assigned to the '<start>' marker.
target_tokenizer.word_index['<start>']

2

In [40]:
# NOTE(review): in the cells visible here `dec_input` is only assigned inside
# train_step, so this cell appears to rely on leftover notebook state -- confirm.
dec_input.shape

TensorShape([64, 1])

In [31]:
@tf.function
def train_step(inpt, trgt, enc_hidden):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        inpt: Batch of input token ids.
        trgt: Batch of target token ids; column 0 holds the ``<start>``
            marker that ``process_sequence`` prepends to every sentence.
        enc_hidden: Unused; kept so existing callers keep working. The
            decoder's first query state comes from the encoder output.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    with tf.GradientTape() as tape:
        # The encoder returns (outputs, forward hidden, forward cell).
        enc_output, enc_forward_hidden, _ = encoder(inpt)

        # First decoder step: query with the encoder's forward hidden state
        # and feed the <start> column of the targets.
        dec_hidden = enc_forward_hidden
        dec_input = tf.expand_dims(trgt[:, 0], 1)

        for t in range(1, trgt.shape[1]):
            # The decoder returns (scores, hidden, cell, attention weights).
            predictions, dec_hidden, _, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(trgt[:, t], predictions)
            # Teacher forcing: the true token becomes the next input.
            dec_input = tf.expand_dims(trgt[:, t], 1)

    batch_loss = loss / int(trgt.shape[1])

    # Differentiate and update outside the tape context so the optimiser
    # step itself is not recorded.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [34]:
# Checkpoints are written under checkpoint_dir with the file prefix 'ckpt'.
checkpoint_dir = './training-checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

# Objects whose state is saved together in each checkpoint.
_tracked_objects = {
    'optimizer': optimizer,
    'encoder': encoder,
    'decoder': decoder,
}
checkpoint = tf.train.Checkpoint(**_tracked_objects)

In [35]:
EPOCHS = 10
for epoch in range(EPOCHS):
    start = datetime.now()

    # train_step never reads its enc_hidden argument (the encoder is called
    # without an initial state), so no initial state is constructed here.
    enc_hidden = None
    total_loss = 0

    for (batch, (inpt, trgt)) in enumerate(train.take(steps_per_epoch)):
        batch_loss = train_step(inpt, trgt, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                         batch,
                                                         batch_loss.numpy()))

    # Per-epoch work: these statements run once after all batches rather
    # than once per batch.
    if (epoch + 1) % 2 == 0:
        # Save a checkpoint every second epoch.
        checkpoint.save(file_prefix=checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch+1,
                                        total_loss/steps_per_epoch))
    # Report elapsed time as a number of seconds, matching the message.
    elapsed_seconds = (datetime.now() - start).total_seconds()
    print('Time taken for 1 epoch {} seconds\n'.format(elapsed_seconds))

KeyboardInterrupt: 

In [None]:
def evaluate(sentence):
    attention_plot = np.zeros((input_se))