# Vanilla Encoder-Decoder
This is a basic encoder decoder model with no attention

In [116]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [117]:
import warnings
warnings.filterwarnings('ignore')

### Declare Static Variables

In [118]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


These parameters are mostly stolen from the Google Paper, except for embedding dim which is determined from GloVE.

In [119]:
EMBEDDING_DIM = 200
ENCODER_UNITS = 1024
DECODER_UNITS = 1024
BATCH_SIZE = 128

### Load Data

In [120]:
BASE_PATH = '/content/drive/MyDrive/Data/Data'  # on local is path to directory
# BASE_PATH = '../../Data'

FORMAL_PATH_TRAIN = '{}/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt'.format(BASE_PATH)
INFORMAL_PATH_TRAIN = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt'.format(BASE_PATH)

FORMAL_PATH_HOLDOUT = '{}/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt'.format(BASE_PATH)
INFORMAL_PATH_HOLDOUT = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt'.format(BASE_PATH)

EMBEDDING_PATH = '{}/glove.6B.200d.txt'.format(BASE_PATH)

In [121]:
formal = open(FORMAL_PATH_TRAIN).read()
informal = open(INFORMAL_PATH_TRAIN).read()

formal_holdout = open(FORMAL_PATH_HOLDOUT).read()
informal_holdout = open(INFORMAL_PATH_HOLDOUT).read()

In [122]:
def process_sequence(seq):
    """This inserts a space in between the last word and a period"""
    s = re.sub('([.,!?()])', r' \1 ', seq)
    s = re.sub('\s{2,}', ' ', s)
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
    
    return '<start> ' + s + ' <end>'

In [123]:
f_corpus = [process_sequence(seq) for seq in formal.split('\n')]
if_corpus = [process_sequence(seq) for seq in informal.split('\n')]

f_holdout = [process_sequence(seq) for seq in formal_holdout.split('\n')]
if_holdout = [process_sequence(seq) for seq in informal_holdout.split('\n')]

### Preprocess data

In [124]:
def tokenize(corpus, tokenizer=None, maxlen=None):
    """ Tokenize data and pad sequences """
    if not tokenizer: 
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                              oov_token='<OOV>')
        tokenizer.fit_on_texts(corpus)
    
    seqs = tokenizer.texts_to_sequences(corpus)
    padded_seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)

    return padded_seqs, tokenizer

In [125]:
input_train, input_tokenizer = tokenize(if_corpus)
target_train, target_tokenizer = tokenize(f_corpus)

In [126]:
input_test, _ = tokenize(if_holdout, input_tokenizer, 32)
target_test, _ = tokenize(f_holdout, target_tokenizer)

In [127]:
buffer_size = len(input_train)
steps_per_epoch = len(input_train) // BATCH_SIZE
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

train = tf.data.Dataset.from_tensor_slices((input_train, target_train)).shuffle(buffer_size)
train = train.batch(BATCH_SIZE, drop_remainder=True)

test = tf.data.Dataset.from_tensor_slices((input_test, target_test)).batch(1)

In [128]:
example_input_batch, example_target_batch = next(iter(train))

In [129]:
test_in_batch, test_out_batch = next(iter(test))

# Setup Embedding Weights

In [130]:
def embedding_matrix(tokenizer, vocab_size, embedding_dim):
    embeddings_index = {}
    with open(EMBEDDING_PATH) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embeddings_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    return embeddings_matrix

In [131]:
enc_E_mat = embedding_matrix(input_tokenizer, input_vocab_size, EMBEDDING_DIM)

### Encoder

Using one bidirectional LSTMs because that was reported to get almost as good performance as 2 LSTMs

In [132]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size,
                 weight_matrix):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, 
                                                   embedding_dim,
                                                   weights=[weight_matrix])
        self.lstm_1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.encoder_units,
                return_sequences=True,
                return_state=False
            )
        )
        self.lstm_2 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.encoder_units,
                return_sequences=True,
                return_state=False
            )
        )

    def call(self, x, hidden_state):
        """Shove into latent space"""
        # x shape: (batch_size x max_length x embedding_dim)
        output = self.embedding(x)

        # output shape: (batch_size x max_length x 2 * encoder_units)
        # h_f, h_b shapes: (batch_size x encoder_units)
        output = self.lstm_1(output)

        output = self.lstm_2(output)

        return output

In [133]:
encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, 
                  BATCH_SIZE, enc_E_mat)

init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

sample_output = encoder(example_input_batch, init_state)

### Decoder

In [134]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm_1 = tf.keras.layers.LSTM(decoder_units,
                                           return_sequences=True,
                                           return_state=False)
        self.lstm_2 = tf.keras.layers.LSTM(decoder_units,
                                           return_sequences=True,
                                           return_state=False)
        self.flatten = tf.keras.layers.Flatten()
        self.opt = tf.keras.layers.Dense(vocab_size)

    def call(self, x, encoder_output):
        # x shape: (batch_size, 1, embedding_dim)
        output = self.embedding(x)

        # output shape: (batch_size, 1, decoder_units)
        # this shape is only important for expanding decoder depth
        output = self.lstm_1(output)
        # flatten to feed into opt
        # output shape: (batch_size, hidden_size)

        output = self.lstm_2(output)
        output = self.flatten(output)

        # get logits
        # x shape: (batch_size, vocab)
        x = self.opt(output)

        return x

In [135]:
decoder = Decoder(target_vocab_size, EMBEDDING_DIM, DECODER_UNITS, BATCH_SIZE)

sample_decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

sample_decoder_output = decoder(sample_decoder_input, sample_output)

### Optimizer and Loss Function

Here we define the optimizer and the loss function. In our loss function we mask the zeros since that's the padding.

Also of note is in the loss function. The reduction argument at default does some really wonky things which threw off all results. Had to change the reduciton to none, which at default is auto. Not exactly sure what it does in this context but it tries to sum over batches. I didn't work with it because I wanted to control all loss calculation manually. 

In [136]:
optimizer = tf.keras.optimizers.Adam()
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [137]:
def loss_function(real, preds):
    """Calculate and return loss"""

    # caclulate loss
    loss = static_loss(real, preds)
    
    # create padding mask 
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=loss.dtype)
    
    # apply mask
    loss *= mask

    return tf.reduce_mean(loss)

### Training

In [138]:
@tf.function
def train_step(inpt, trgt, init_state):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output = encoder(inpt, init_state)

        # Get start token for every sequence in batch
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        for i in range(1, trgt.shape[1]):
            # dec_hidden shape: (batch_size, decoder_units)
            # dec_input shape: (batch_size, 1)
            predictions = decoder(dec_input, 
                                  enc_output)

            loss += loss_function(trgt[:, i], predictions)
            dec_input = tf.expand_dims(trgt[:, i], 1)

    # Apply gradients 
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    # return batch loss
    return loss

In [139]:
EPOCHS = 20
# encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, 
#                   BATCH_SIZE, enc_E_mat)
# decoder = Decoder(target_vocab_size, EMBEDDING_DIM, ATTENTION_UNITS, 
#                   DECODER_UNITS, BATCH_SIZE)

for epoch in range(EPOCHS):
    start = datetime.now()

    total_loss = 0

#     This resets the hidden state of the LSTM for every epoch
#     init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]
    init_state = None

    for inpt, trgt in train.take(steps_per_epoch):
        total_loss += train_step(inpt, trgt, init_state)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / BATCH_SIZE))
    print('Time taken {}\n'.format(datetime.now() - start))

Epoch 1 Loss 118.2043
Time taken 0:01:58.798355

Epoch 2 Loss 102.9621
Time taken 0:00:38.876684

Epoch 3 Loss 96.2491
Time taken 0:00:39.004727

Epoch 4 Loss 92.0841
Time taken 0:00:39.003300

Epoch 5 Loss 89.1074
Time taken 0:00:38.983993

Epoch 6 Loss 86.6985
Time taken 0:00:39.033888

Epoch 7 Loss 84.6881
Time taken 0:00:39.067447

Epoch 8 Loss 83.0205
Time taken 0:00:39.114792

Epoch 9 Loss 81.5041
Time taken 0:00:39.130544

Epoch 10 Loss 80.2495
Time taken 0:00:39.056233

Epoch 11 Loss 79.0821
Time taken 0:00:39.019724

Epoch 12 Loss 78.0836
Time taken 0:00:38.963129

Epoch 13 Loss 77.2166
Time taken 0:00:38.910364

Epoch 14 Loss 76.4435
Time taken 0:00:38.951223

Epoch 15 Loss 75.7820
Time taken 0:00:39.035326

Epoch 16 Loss 75.1494
Time taken 0:00:39.012864

Epoch 17 Loss 74.6106
Time taken 0:00:38.970462

Epoch 18 Loss 74.1337
Time taken 0:00:38.936559

Epoch 19 Loss 73.7373
Time taken 0:00:39.068407

Epoch 20 Loss 73.3458
Time taken 0:00:38.891191



In [140]:
def evaluate(data):
    loss = 0
    results = []
    bleu = 0

    for i, (x, y) in enumerate(data):

        result = '<start> '
        next_word = True

        # This feeds in the start token for the first initial state
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
        
        # Get inputs
        enc_output = encoder(x, None)

        j = 1  # iterative count
        while next_word: 
            # dec_hidden shape: (batch_size, decoder_units)
            # dec_input shape: (batch_size, 1)
            predictions = decoder(dec_input, enc_output)
            loss += loss_function(y[:, j], predictions)
            
            # max logit for tokenized word
            word_idx = tf.argmax(predictions[0]).numpy()
            word = target_tokenizer.index_word[word_idx]
            result += word + ' '

            dec_input = tf.expand_dims([word_idx], 0)

            if word == '<end>':
                next_word = False
            
            if j >= y.shape[1] - 1:
                result += '<end>'
                next_word = False
            
            j += 1

        results.append(result)
        bleu += sentence_bleu(f_holdout[i], result)

    return results, loss.numpy() / len(f_holdout), bleu / len(f_holdout)

In [141]:
results, test_loss, bleu = evaluate(test)

In [142]:
test_loss

39.462165350539294

In [143]:
bleu

0.8322362969156308

In [144]:
corpus_bleu(f_holdout, results)

0.8326549060715791

In [150]:
def examine(index):
    print("Informal: ", if_holdout[index])
    print("Formal: ", f_holdout[index])
    print("Predicted: ", results[index])
    print("Sequence BLEU: ", sentence_bleu(f_holdout[index], results[index]))

In [151]:
examine(1000)

Informal:  <start> Ask him to spell his name . . . m o r o . . .  <end>
Formal:  <start> If you ask him to spell his name he would say , M o r o . . .  <end>
Predicted:  <start> i am not know <end> 
Sequence BLEU:  0.855526185871245


In [153]:
with open(BASE_PATH + '/Results/vanilla_encoder_decoder_results_custom', 'w') as f:
    for seq in results:
        f.write(seq + '\n')