# Rule Assisted Concat with Global Attention
This is a fancy name for a seq2seq network with multiple encoders. These use some prebuilt rules that are as follows:
1. Capitalize the first word
2. Lowercase words that are all upper case
3. Expand contractions
4. Replace Slang words
5. Replace Swear words
6. Replace repeated characters

For the hierarchical approach we have two sequences: $x$ and $x^\prime$


In [71]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Declare Static Variables

In [72]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


These parameters are mostly stolen from the Google Paper, except for embedding dim which is determined from GloVE.

In [73]:
EMBEDDING_DIM = 200
ATTENTION_UNITS = 512
ENCODER_UNITS = 1024
DECODER_UNITS = 1024
BATCH_SIZE = 64
MAX_INPUT_LENGTH = 32

### Load Data

In [74]:
BASE_PATH = '/content/drive/MyDrive/Data/Data'  # on local is path to directory
# BASE_PATH = '../../Data'

FORMAL_PATH_TRAIN = '{}/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt'.format(BASE_PATH)
INFORMAL_PATH_TRAIN = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt'.format(BASE_PATH)

FORMAL_PATH_HOLDOUT = '{}/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt'.format(BASE_PATH)
INFORMAL_PATH_HOLDOUT = '{}/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt'.format(BASE_PATH)

CONTRACTIONS_PATH = '{}/Rule Data/Contractions.txt'.format(BASE_PATH)
SLANG_PATH = '{}/Rule Data/Slang.txt'.format(BASE_PATH)
SWEARS_PATH = '{}/Rule Data/Swears.csv'.format(BASE_PATH)

EMBEDDING_PATH = '{}/glove.6B.200d.txt'.format(BASE_PATH)

In [75]:
formal = open(FORMAL_PATH_TRAIN).read()
informal = open(INFORMAL_PATH_TRAIN).read()

formal_holdout = open(FORMAL_PATH_HOLDOUT).read()
informal_holdout = open(INFORMAL_PATH_HOLDOUT).read()

In [76]:
def process_sequence(seq):
    """This inserts a space in between the last word and a period"""
    s = re.sub('([.,!?()])', r' \1 ', seq)
    s = re.sub('\s{2,}', ' ', s)
    
    return '<start> ' + s + ' <end>'

In [77]:
f_corpus = [process_sequence(seq) for seq in formal.split('\n')]
if_corpus = [process_sequence(seq) for seq in informal.split('\n')]

f_holdout = [process_sequence(seq) for seq in formal_holdout.split('\n')]
if_holdout = [process_sequence(seq) for seq in informal_holdout.split('\n')]

### Load slang and Contractions

In [78]:
import csv
cont = []
cont_corr = []

slang = []
slang_corr = []

swears = []
with open(CONTRACTIONS_PATH, 'r') as f:
    for line in csv.reader(f, dialect='excel-tab'):
        cont.append(line[0])
        cont_corr.append(line[1])
        
with open(SLANG_PATH) as f:
    for line in csv.reader(f, dialect='excel-tab'):
        slang.append(line[0])
        slang_corr.append(line[1])
        
with open(SWEARS_PATH) as f:
    for line in csv.reader(f):
        swears.append(line)
swears = swears[0]

### Apply Rules
1. Capitalize the first word
2. Lowercase words that are all upper case
3. Expand contractions
4. Replace Slang words
5. Replace Swear words

In [79]:
def rule(seqs):
    out = []
    for seq in seqs:
        curr_seq = seq.split()
        temp = []
        
        for i, word in enumerate(curr_seq):
            curr_word = word
            
            if i == 1:
                curr_word = curr_word.capitalize()
            
            if curr_word == curr_word.upper():
                curr_word = curr_word.lower()
                
            if curr_word in slang:
                curr_word = slang_corr[slang.index(curr_word)]
                
            if curr_word in swears:
                curr_word = curr_word[0] + '*' * (len(curr_word) - 2)
            
            temp.append(curr_word)
                
        out.append(' '.join(temp))  
    return out

In [80]:
x_prime_raw = rule(if_corpus)
x_prime_holdout = rule(if_holdout)

### Preprocess data

In [81]:
def tokenize(corpus, tokenizer=None, maxlen=None):
    """ Tokenize data and pad sequences """
    if not tokenizer: 
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', 
                              oov_token='<OOV>', lower=False)
        tokenizer.fit_on_texts(corpus)
    
    seqs = tokenizer.texts_to_sequences(corpus)
    padded_seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)

    return padded_seqs, tokenizer

In [82]:
input_train, input_tokenizer = tokenize(if_corpus)
input_prime, _ = tokenize(x_prime_raw, input_tokenizer)
target_train, target_tokenizer = tokenize(f_corpus)

In [83]:
input_test, _ = tokenize(if_holdout, input_tokenizer)
primt_test, _ = tokenize(x_prime_holdout, input_tokenizer)
target_test, _ = tokenize(f_holdout, target_tokenizer)

In [84]:
buffer_size = len(input_train)
steps_per_epoch = len(input_train) // BATCH_SIZE
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

train = tf.data.Dataset.from_tensor_slices((input_train, input_prime, target_train)).shuffle(buffer_size)
train = train.batch(BATCH_SIZE, drop_remainder=True)

test = tf.data.Dataset.from_tensor_slices((input_test, primt_test, target_test)).batch(1)

In [85]:
example_input_batch, example_prime_batch, example_target_batch = next(iter(train))

# Setup Embedding Weights

In [86]:
def embedding_matrix(tokenizer, vocab_size, embedding_dim):
    embeddings_index = {}
    with open(EMBEDDING_PATH) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embeddings_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    return embeddings_matrix

In [87]:
enc_E_mat = embedding_matrix(input_tokenizer, input_vocab_size, EMBEDDING_DIM)
enc_E_mat_prime = embedding_matrix(input_tokenizer, input_vocab_size, EMBEDDING_DIM)

### Encoder

Using one bidirectional LSTMs because that was reported to get almost as good performance as 2 LSTMs

In [88]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, encoder_units, weight_matrix):
        super(Encoder, self).__init__()
        self.encoder_units = encoder_units
        if weight_matrix is not None:
            self.embedding = tf.keras.layers.Embedding(vocab_size, 
                                                       embedding_dim,
                                                       weights=[weight_matrix])
        else:
            self.embedding = tf.keras.layers.Embedding(vocab_size, 
                                                       embedding_dim)           
        self.lstm_1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.encoder_units,
                return_sequences=True,
                return_state=False
            )
        )

        self.lstm_2 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                self.encoder_units,
                return_sequences=True,
                return_state=True
            )
        )

    def call(self, x, hidden_state):
        """Shove into latent space"""
        # x shape: (batch_size x max_length x embedding_dim)
        x = self.embedding(x)

        # output shape: (batch_size x max_length x 2 * encoder_units)
        # h_f, h_b shapes: (batch_size x encoder_units)
        output = self.lstm_1(x, initial_state=hidden_state)

        output, h_f, _, _, _ = self.lstm_2(output)

        return output, h_f 

In [89]:
encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, enc_E_mat)

encoder_prime = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, None)

init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

sample_output, hidden = encoder(example_input_batch, 
                                init_state)
prime_output, hidden_prime = encoder_prime(example_prime_batch, 
                                          init_state)

### Attention Layer

In [90]:
class GlobalAttention(tf.keras.layers.Layer):
    """
    This is actually global attention from Luong, 
    it ignors the hidden backward state
    """
    def __init__(self, units):
        """These parameters follow notation from Bahdanau paper"""
        super(GlobalAttention, self).__init__()
        self.W = tf.keras.layers.Dense(units)
        self.U = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)

    def call(self, enc_opt, hidden):
        # expand dims to meet shape of latent tensor
        # hidden_broad shape: (batch_size, 1, encoder_units)
        hidden_broad = tf.expand_dims(hidden, 1)

        # Alignment model score from A.1.2 of Bahdanau et al 
        # score shape: (batch_size, max_length, v_units)
        score = self.v(tf.nn.tanh(self.W(hidden_broad) + self.U(enc_opt)))

        # softmax generalization of eq(7) Luong
        # attention_weights shape: (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)  

        # This takes weighted average with attention weights
        # context shape: (batch_size, 2 * encoder_units)
        context = attention_weights * enc_opt
        context = tf.reduce_sum(context, axis=1)

        return context 

In [91]:
attention_layer = GlobalAttention(ATTENTION_UNITS)
attention_result = attention_layer(sample_output, 
                                   hidden)
attention_prime = attention_layer(prime_output,
                                  hidden_prime)

### Decoder

In [92]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, attention_units, 
                 decoder_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.attention = GlobalAttention(attention_units)
        self.attention_prime = GlobalAttention(attention_units)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.lstm_1 = tf.keras.layers.LSTM(decoder_units,
                                           return_sequences=True,
                                           return_state=True)
        self.flatten = tf.keras.layers.Flatten()
        self.W = tf.keras.layers.Dense(2 * ENCODER_UNITS)
        self.opt = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden, hidden_prime, encoder_output, prime_output, dec_hidden_forward):
        # output shape: (BATCH_SIZE x MAX_LENGTH x 4 * ENCODER_UNITS)
        output = tf.concat([encoder_output, prime_output], axis=-1)

        # hidden shape: (BATHC_SIZE x 2 * ENCODER_UNITS)
        hidden = tf.concat([hidden, hidden_prime], axis=-1)
        
        # context_vector shape: (batch_size, 2 * encoder_units)
        context_vector = self.attention(output, hidden)

        # x shape: (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape: (batch_size, 1, embedding_dim + 2 * encoder_units)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # output shape: (batch_size, 1, decoder_units)
        # this shape is only important for expanding decoder depth
        output, h_f, _ = self.lstm_1(x)

        # flatten to feed into opt
        # output shape: (batch_size, hidden_size)
        output = self.flatten(output)

        # get logits
        # x shape: (batch_size, vocab)
        x = self.opt(output)

        return x, h_f

In [93]:
decoder = Decoder(target_vocab_size, EMBEDDING_DIM, ATTENTION_UNITS, 
                  DECODER_UNITS, BATCH_SIZE)

sample_decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

dec_hidden_forward = tf.zeros((64, ENCODER_UNITS))

sample_decoder_output, aa = decoder(sample_decoder_input, hidden, hidden_prime, 
                                   sample_output, prime_output, dec_hidden_forward)

### Optimizer and Loss Function

Here we define the optimizer and the loss function. In our loss function we mask the zeros since that's the padding.

Also of note is in the loss function. The reduction argument at default does some really wonky things which threw off all results. Had to change the reduciton to none, which at default is auto. Not exactly sure what it does in this context but it tries to sum over batches. I didn't work with it because I wanted to control all loss calculation manually. 

In [94]:
optimizer = tf.keras.optimizers.Adam()
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [95]:
def loss_function(real, preds):
    """Calculate and return loss"""

    # caclulate loss
    loss = static_loss(real, preds)
    
    # create padding mask 
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=loss.dtype)
    
    # apply mask
    loss *= mask

    return tf.reduce_mean(loss)

### Training

This is the training loop. I'm using teacher forcing because I don't think I have enough computational power to use beam search. Google reccomends beam search with a beam width of 10, but that isn't an option here. Teacher forcing will provide better results than without using it. 

In [96]:
@tf.function
def train_step(inpt, prime, trgt, init_state):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, hidden = encoder(inpt, 
                                     init_state)
        prime_output, hidden_prime = encoder_prime(prime, 
                                                   init_state)
        # Get start token for every sequence in batch
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        
        dec_hidden_forward = tf.zeros((BATCH_SIZE, ENCODER_UNITS))

        for i in range(1, trgt.shape[1]):
            # dec_hidden shape: (batch_size, decoder_units)
            # dec_input shape: (batch_size, 1)
            predictions, dec_hidden_forward = decoder(dec_input, hidden, hidden_prime, 
                                                      enc_output, prime_output, dec_hidden_forward)
                                                         
            loss += loss_function(trgt[:, i], predictions)
            dec_input = tf.expand_dims(trgt[:, 1], 1)

        # Apply gradients 
        trainable_variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(gradients, trainable_variables))

        # return batch loss
        return loss

In [97]:
encoder.load_weights(BASE_PATH + '/Model Weights/Rule-Assisted/encoder')
encoder_prime.load_weights(BASE_PATH + '/Model Weights/Rule-Assisted/encoder_prime')
decoder.load_weights(BASE_PATH + '/Model Weights/Rule-Assisted/decoder')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f489025beb8>

In [108]:
EPOCHS = 30

for epoch in range(EPOCHS):
    start = datetime.now()

    total_loss = 0

    # This resets the hidden state of the LSTM for every epoch
    init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

    for inpt, prime, trgt in train.take(steps_per_epoch):
        total_loss += train_step(inpt, prime, trgt, init_state)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / BATCH_SIZE))
    print('Time taken {}\n'.format(datetime.now() - start))

Epoch 1 Loss 276.4538
Time taken 0:02:45.801963

Epoch 2 Loss 272.6833
Time taken 0:02:46.147221

Epoch 3 Loss 269.2672
Time taken 0:02:46.186608

Epoch 4 Loss 265.8460
Time taken 0:02:45.872715

Epoch 5 Loss 262.6496
Time taken 0:02:46.098787

Epoch 6 Loss 259.5744
Time taken 0:02:46.005524

Epoch 7 Loss 256.4969
Time taken 0:02:46.225745

Epoch 8 Loss 253.7922
Time taken 0:02:46.078150

Epoch 9 Loss 250.9256
Time taken 0:02:46.368682

Epoch 10 Loss 248.2212
Time taken 0:02:46.153584

Epoch 11 Loss 245.5357
Time taken 0:02:46.418215

Epoch 12 Loss 243.1682
Time taken 0:02:46.304968

Epoch 13 Loss 240.7848
Time taken 0:02:46.506083

Epoch 14 Loss 238.2824
Time taken 0:02:46.397549

Epoch 15 Loss 235.9997
Time taken 0:02:46.139736

Epoch 16 Loss 233.6890
Time taken 0:02:46.286823

Epoch 17 Loss 231.6549
Time taken 0:02:46.445390

Epoch 18 Loss 229.4175
Time taken 0:02:46.103484

Epoch 19 Loss 227.5420
Time taken 0:02:46.088770

Epoch 20 Loss 225.6653
Time taken 0:02:46.510555

Epoch 21 

In [109]:
def evaluate(data):
    loss = 0
    results = []
    bleu = 0

    for i, (inpt, prime, y) in enumerate(data):

        result = '<start>'
        next_word = True

        init_state = None
        
        enc_output, hidden = encoder(inpt, 
                                     init_state)
        prime_output, hidden_prime = encoder_prime(prime, 
                                                   init_state)

        # This feeds in the start token for the first initial state
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)


        dec_hidden_forward = tf.zeros((1, ENCODER_UNITS))

        j = 1  # iterative count
        while next_word: 
            # dec_hidden shape: (batch_size, decoder_units)
            # dec_input shape: (batch_size, 1)
            predictions, dec_hidden_forward = decoder(dec_input, hidden, 
                                                      hidden_prime, enc_output, 
                                                      prime_output, 
                                                      dec_hidden_forward)
                                                         
            loss += loss_function(y[:, j], predictions)
            
            # max logit for tokenized word
            word_idx = tf.argmax(predictions[0]).numpy()
            word = target_tokenizer.index_word[word_idx]
            result += word + ' '

            dec_input = tf.expand_dims([word_idx], 0)

            if word == '<end>':
                next_word = False
            
            if j >= y.shape[1] - 1:
                result += '<end>'
                next_word = False
            
            j += 1

        results.append(result)
        bleu += sentence_bleu(f_holdout[i], result)

    return results, loss.numpy() / len(f_holdout), bleu / len(f_holdout)

In [110]:
results, test_loss, bleu = evaluate(test)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [111]:
bleu

0.7734854206757209

In [112]:
test_loss

51.366802291987675

In [113]:
def examine(index):
    print("Informal: ", if_holdout[index])
    print("Formal: ", f_holdout[index])
    print("Predicted: ", results[index])

In [114]:
examine(1300)

Informal:  <start> I have a few I picked up in a New & Used bookstore near me .  <end>
Formal:  <start> I own a few that I found at a new and used bookstore near me .  <end>
Predicted:  <start>I a years years years years years years years years years years years years years years years years years years years years years years years years years years years years years years years <end>


In [104]:
from nltk.translate.bleu_score import corpus_bleu

In [115]:
with open(BASE_PATH + '/Results/HA_results.txt', 'w') as f:
    for seq in results:
        f.write(seq + '\n')

In [116]:
encoder.save_weights(BASE_PATH + '/Model Weights/Rule-Assisted/encoder')
encoder_prime.save_weights(BASE_PATH + '/Model Weights/Rule-Assisted/encoder_prime')
decoder.save_weights(BASE_PATH + '/Model Weights/Rule-Assisted/decoder')