# Hierarchical Encoding with Rules

In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Declare Static Variables

In [2]:
# Colab-only: mount Google Drive so that paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


These parameters are mostly borrowed from the Google paper, except for the embedding dimension, which is fixed by the pretrained GloVe vectors.

In [3]:
# Width of the word vectors; matches the 200-d GloVe file (glove.6B.200d.txt).
EMBEDDING_DIM = 200
# Width of the Dense projections inside BahdanauAttention.
ATTENTION_UNITS = 512
# LSTM units per direction in each bidirectional encoder.
ENCODER_UNITS = 1024
# LSTM units in each decoder.
DECODER_UNITS = 1024
# Number of sequences per training batch.
BATCH_SIZE = 64
# NOTE(review): not referenced anywhere else in this notebook.
MAX_INPUT_LENGTH = 32

### Load Data

In [4]:
# Root directory that holds every data file used by this notebook.
BASE_PATH = '/content/drive/MyDrive/Data/Data'  # on local is path to directory
# BASE_PATH = '../../Data'

# Sub-directories shared by several of the paths below.
_SUPERVISED_DIR = f'{BASE_PATH}/Supervised Data/Entertainment_Music'
_RULE_DIR = f'{BASE_PATH}/Rule Data'

# Parallel formal / informal sentences used for training.
FORMAL_PATH_TRAIN = f'{_SUPERVISED_DIR}/S_Formal_EM_Train.txt'
INFORMAL_PATH_TRAIN = f'{_SUPERVISED_DIR}/S_Informal_EM_Train.txt'

# Parallel formal / informal sentences held out for validation and testing.
FORMAL_PATH_HOLDOUT = f'{_SUPERVISED_DIR}/S_Formal_EM_ValTest.txt'
INFORMAL_PATH_HOLDOUT = f'{_SUPERVISED_DIR}/S_Informal_EM_ValTest.txt'

# Lookup tables consumed by the rule-based rewriting step.
CONTRACTIONS_PATH = f'{_RULE_DIR}/Contractions.txt'
SLANG_PATH = f'{_RULE_DIR}/Slang.txt'
SWEARS_PATH = f'{_RULE_DIR}/Swears.csv'

# Pretrained 200-dimensional GloVe vectors.
EMBEDDING_PATH = f'{BASE_PATH}/glove.6B.200d.txt'

In [5]:
def _read_text(path):
    """Return the entire contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the read finishes instead of being left for the garbage collector.
    """
    with open(path) as f:
        return f.read()


# Raw training corpora: one sentence per line, formal and informal in parallel.
formal = _read_text(FORMAL_PATH_TRAIN)
informal = _read_text(INFORMAL_PATH_TRAIN)

# Raw holdout corpora in the same layout.
formal_holdout = _read_text(FORMAL_PATH_HOLDOUT)
informal_holdout = _read_text(INFORMAL_PATH_HOLDOUT)

In [6]:
def process_sequence(seq):
    """Separate punctuation from words and wrap the sentence in boundary tokens.

    Every occurrence of ``. , ! ? ( )`` is surrounded by spaces so that it
    becomes its own whitespace-delimited token, runs of whitespace are
    collapsed to a single space, and the result is wrapped as
    ``'<start> ... <end>'``.

    Args:
        seq: one raw sentence (may be empty).

    Returns:
        The normalised sentence as a single-space-separated string. An empty
        input yields ``'<start> <end>'``.
    """
    # Raw strings avoid the invalid-escape warning that '\s' triggers.
    spaced = re.sub(r'([.,!?()])', r' \1 ', seq)

    # Collapse whitespace and trim the edges; otherwise trailing punctuation
    # leaves a double space in front of <end>.
    body = re.sub(r'\s+', ' ', spaced).strip()

    return ' '.join(part for part in ('<start>', body, '<end>') if part)

In [7]:
def _build_corpus(raw_text):
    """Run process_sequence over every newline-separated line of ``raw_text``."""
    return list(map(process_sequence, raw_text.split('\n')))


# Training sentences (formal targets / informal inputs).
f_corpus = _build_corpus(formal)
if_corpus = _build_corpus(informal)

# Holdout sentences in the same order.
f_holdout = _build_corpus(formal_holdout)
if_holdout = _build_corpus(informal_holdout)

### Load slang and Contractions

In [8]:
import csv


def _read_pairs(path):
    """Read a tab-separated two-column file into two parallel lists.

    Args:
        path: file whose rows hold ``<term>\\t<replacement>``.

    Returns:
        ``(terms, replacements)``, index-aligned. Rows with fewer than two
        columns (for example a trailing blank line) are skipped rather than
        raising ``IndexError``.
    """
    terms, replacements = [], []
    # newline='' lets the csv module do its own newline handling.
    with open(path, 'r', newline='') as f:
        for row in csv.reader(f, dialect='excel-tab'):
            if len(row) < 2:
                continue
            terms.append(row[0])
            replacements.append(row[1])
    return terms, replacements


# Contractions and their expansions (index-aligned).
cont, cont_corr = _read_pairs(CONTRACTIONS_PATH)

# Slang terms and their replacements (index-aligned).
slang, slang_corr = _read_pairs(SLANG_PATH)

# Only the first non-empty row of the swears CSV is used as the word list.
with open(SWEARS_PATH, newline='') as f:
    _swear_rows = [row for row in csv.reader(f) if row]
swears = _swear_rows[0] if _swear_rows else []

### Apply Rules
1. Capitalize the first word
2. Lowercase words that are all upper case
3. Expand contractions
4. Replace Slang words
5. Replace Swear words

In [9]:
def _first_wins(keys, values):
    """Build a dict from parallel lists, keeping the first value of a repeated key."""
    table = {}
    for key, value in zip(keys, values):
        table.setdefault(key, value)
    return table


def _mask_swear(word):
    """Star out the interior of ``word``, keeping its length unchanged."""
    if len(word) <= 2:
        return word[0] + '*' * (len(word) - 1)
    return word[0] + '*' * (len(word) - 2) + word[-1]


def rule(seqs, contractions=None, slang_map=None, swear_words=None):
    """Apply the hand-written formalisation rules to every sequence.

    Per whitespace-delimited token, in this order:
      1. lowercase tokens that are entirely upper case,
      2. expand contractions,
      3. replace slang,
      4. mask swear words,
      5. capitalise the first real word (index 1, right after ``<start>``).

    Capitalisation runs last: doing it before the all-caps check turns a
    one-letter first word such as "I" straight back into "i".

    Args:
        seqs: iterable of sentences as produced by ``process_sequence``.
        contractions: optional ``{contraction: expansion}`` mapping; defaults
            to the module-level ``cont`` / ``cont_corr`` lists.
        slang_map: optional ``{slang: replacement}`` mapping; defaults to the
            module-level ``slang`` / ``slang_corr`` lists.
        swear_words: optional collection of words to mask; defaults to the
            module-level ``swears`` list.

    Returns:
        A list with one rewritten, single-space-joined string per input.
    """
    # Build the lookup tables once instead of scanning lists for every token.
    if contractions is None:
        contractions = _first_wins(cont, cont_corr)
    if slang_map is None:
        slang_map = _first_wins(slang, slang_corr)
    if swear_words is None:
        swear_words = swears
    swear_words = set(swear_words)

    out = []
    for seq in seqs:
        rewritten = []

        for i, word in enumerate(seq.split()):
            curr_word = word

            # True for all-caps words; a no-op for tokens without letters.
            if curr_word == curr_word.upper():
                curr_word = curr_word.lower()

            # The original never consulted the contraction table.
            if curr_word in contractions:
                curr_word = contractions[curr_word]

            if curr_word in slang_map:
                curr_word = slang_map[curr_word]

            if curr_word in swear_words:
                curr_word = _mask_swear(curr_word)

            # Index 0 is the <start> token, so index 1 is the first word.
            if i == 1:
                curr_word = curr_word.capitalize()

            rewritten.append(curr_word)

        out.append(' '.join(rewritten))
    return out

In [10]:
# Rule-processed copies of the informal training and holdout sentences.
x_prime_raw = rule(if_corpus)
x_prime_holdout = rule(if_holdout)

### Preprocess data

In [11]:
def tokenize(corpus, tokenizer=None, maxlen=None):
    """Turn sentences into post-padded integer id sequences.

    Args:
        corpus: list of sentences to encode.
        tokenizer: an already-fitted tokenizer to reuse; when omitted a new
            case-preserving one with an ``<OOV>`` token is fitted on ``corpus``.
        maxlen: optional length forwarded to ``pad_sequences``.

    Returns:
        ``(padded_sequences, tokenizer)``.
    """
    fitted = tokenizer
    if not fitted:
        # '<' and '>' are absent from the filters, so <start>/<end> survive.
        fitted = Tokenizer(
            oov_token='<OOV>',
            lower=False,
            filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
        )
        fitted.fit_on_texts(corpus)

    id_sequences = fitted.texts_to_sequences(corpus)
    return pad_sequences(id_sequences, padding='post', maxlen=maxlen), fitted

In [12]:
# Fit one tokenizer on the informal sentences and reuse it for the
# rule-processed copy; the formal targets get their own tokenizer.
input_train, input_tokenizer = tokenize(if_corpus)
input_prime, _ = tokenize(x_prime_raw, input_tokenizer)
target_train, target_tokenizer = tokenize(f_corpus)

In [13]:
# Encode the holdout sets with the tokenizers fitted on the training data.
input_test, _ = tokenize(if_holdout, input_tokenizer)
prime_test, _ = tokenize(x_prime_holdout, input_tokenizer)
target_test, _ = tokenize(f_holdout, target_tokenizer)

In [14]:
# Shuffle over the whole training set.
buffer_size = len(input_train)
steps_per_epoch = buffer_size // BATCH_SIZE

# +1 because index 0 is reserved for padding and never appears in word_index.
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Training triples: (informal, rule-processed informal, formal).
train = (
    tf.data.Dataset
    .from_tensor_slices((input_train, input_prime, target_train))
    .shuffle(buffer_size)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Holdout triples, one example per batch.
test = (
    tf.data.Dataset
    .from_tensor_slices((input_test, prime_test, target_test))
    .batch(1)
)

In [15]:
# Grab a single training batch; it is used below to smoke-test the layers.
example_input_batch, example_prime_batch, example_target_batch = next(iter(train))

# Setup Embedding Weights

In [16]:
def embedding_matrix(tokenizer, vocab_size, embedding_dim, path=None):
    """Build an embedding weight matrix from a GloVe-format text file.

    Args:
        tokenizer: object exposing ``word_index`` (``{word: row index}``).
        vocab_size: number of rows in the returned matrix.
        embedding_dim: number of columns; must match the vectors in the file.
        path: GloVe-format file to read; defaults to ``EMBEDDING_PATH``.

    Returns:
        A ``(vocab_size, embedding_dim)`` array. Row ``i`` holds the vector of
        the word whose index is ``i``; words missing from the file keep an
        all-zero row.
    """
    if path is None:
        path = EMBEDDING_PATH

    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Split from the right so a token containing spaces stays whole.
            values = line.rstrip().rsplit(' ', embedding_dim)
            if len(values) != embedding_dim + 1:
                continue  # blank line or wrong dimensionality
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embeddings_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i >= vocab_size:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            # The tokenizer keeps case (lower=False) while glove.6B is
            # lowercase, so retry with the lowercased word.
            embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector

    return embeddings_matrix

In [17]:
# Pretrained embedding weights for the input vocabulary; passed to the encoders.
enc_E_mat = embedding_matrix(input_tokenizer, input_vocab_size, EMBEDDING_DIM)

### Encoder

Using a single bidirectional LSTM because that was reported to perform almost as well as two LSTMs.

In [18]:
class Encoder(tf.keras.Model):
    """Embedding layer followed by one L2-regularised bidirectional LSTM."""

    def __init__(self, vocab_size, embedding_dim, encoder_units, weight_matrix):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: width of each embedding vector.
            encoder_units: LSTM units per direction.
            weight_matrix: initial weights for the embedding layer.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, weights=[weight_matrix])

        l2 = tf.keras.regularizers.l2
        recurrent = tf.keras.layers.LSTM(
            encoder_units,
            return_sequences=True,
            return_state=True,
            kernel_regularizer=l2(0.01),
            recurrent_regularizer=l2(0.01),
            bias_regularizer=l2(0.01),
        )
        self.lstm_1 = tf.keras.layers.Bidirectional(recurrent)

    def call(self, x, hidden_state):
        """Encode a batch of token ids.

        Args:
            x: integer token ids.
            hidden_state: initial state forwarded to the bidirectional LSTM.

        Returns:
            ``(output, h_f, h_b)``: the per-step outputs plus the final
            forward and backward hidden states (the cell states are dropped).
        """
        # embedded shape: (batch_size x max_length x embedding_dim)
        embedded = self.embedding(x)

        # output shape: (batch_size x max_length x 2 * encoder_units)
        # h_f, h_b shapes: (batch_size x encoder_units)
        output, h_f, _c_f, h_b, _c_b = self.lstm_1(embedded, initial_state=hidden_state)

        return output, h_f, h_b

In [19]:
# Encoder for the raw informal sentences.
encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, enc_E_mat)

# Separate encoder for the rule-processed sentences.
encoder_prime = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, enc_E_mat)

# Four zero tensors used as the initial state of the bidirectional LSTM.
init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

# Run both encoders once on the example batch as a smoke test.
sample_output, enc_hidden_forward, enc_hidden_backward = encoder(example_input_batch, init_state)
prime_output, prime_hidden_forward, prime_hidden_backward = encoder_prime(example_prime_batch, init_state)

## Combined Encoders

In [None]:
class CombinedEncoder(tf.keras.Model):
    """Pair of encoders: one for the raw input, one for the rule-processed input."""

    def __init__(self, vocab_size, embedding_dim, encoder_units, weight_matrix):
        """Create both encoders from the constructor arguments.

        The arguments are forwarded to ``Encoder`` unchanged. Previously they
        were ignored in favour of notebook-level globals.
        """
        super(CombinedEncoder, self).__init__()
        self.seq_encoder = Encoder(vocab_size, embedding_dim, encoder_units, weight_matrix)
        self.prime_encoder = Encoder(vocab_size, embedding_dim, encoder_units, weight_matrix)

    def call(self, x, x_prime, hidden_state):
        """Encode the raw and rule-processed batches with the same initial state.

        Returns:
            ``(seq_result, prime_result)``, each the ``(output, h_f, h_b)``
            tuple returned by the corresponding ``Encoder``.
        """
        seq_result = self.seq_encoder(x, hidden_state)
        prime_result = self.prime_encoder(x_prime, hidden_state)
        return seq_result, prime_result

### Attention Layer

In [20]:
class BahdanauAttention(tf.keras.layers.Layer):
    """
    Additive attention over the encoder outputs.

    This is different from the Luong paper since it
    concatenates the forward and backward hidden states
    to calculate attention.
    """
    def __init__(self, units):
        """Layer names follow the notation of the Bahdanau paper."""
        super().__init__()
        self.W = tf.keras.layers.Dense(units)
        self.U = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)

    def call(self, enc_opt, hidden_f, hidden_b):
        """Return the attention-weighted sum of ``enc_opt`` over axis 1."""
        # query: forward and backward states joined as in eq 7 of Bahdanau,
        # with a length-1 axis inserted so it broadcasts against enc_opt
        # query shape: (batch_size, 1, encoder_units * 2)
        query = tf.expand_dims(tf.concat([hidden_f, hidden_b], axis=-1), 1)

        # Alignment model score from A.1.2 of Bahdanau et al
        # energy shape: (batch_size, max_length, 1)
        energy = self.v(tf.nn.tanh(self.W(query) + self.U(enc_opt)))

        # softmax generalization of eq(7) Luong, normalised over axis 1
        # weights shape: (batch_size, max_length, 1)
        weights = tf.nn.softmax(energy, axis=1)

        # weighted sum of the encoder outputs
        # result shape: (batch_size, 2 * encoder_units)
        return tf.reduce_sum(weights * enc_opt, axis=1)

### Decoder

In [21]:
class Decoder(tf.keras.Model):
    """Single-step attention decoder: attention, embedding, LSTM, vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, attention_units,
                 decoder_units):
        """Create the attention, embedding, LSTM, flatten and output layers."""
        super().__init__()
        layers = tf.keras.layers

        self.attention = BahdanauAttention(attention_units)
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        self.lstm_1 = layers.LSTM(decoder_units,
                                  return_sequences=True,
                                  return_state=True)
        self.flatten = layers.Flatten()
        self.opt = layers.Dense(vocab_size)

    def call(self, x, hidden_f, hidden_b, encoder_output):
        """Decode one time step.

        Args:
            x: token ids fed to the decoder at this step.
            hidden_f, hidden_b: states used as the attention query.
            encoder_output: encoder outputs that attention is taken over.

        Returns:
            ``(logits, h_f)``: vocabulary logits and the LSTM hidden state.
        """
        context = self.attention(encoder_output, hidden_f, hidden_b)

        # embedded shape: (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # lstm_input shape: (batch_size, 1, embedding_dim + 2 * encoder_units)
        lstm_input = tf.concat([tf.expand_dims(context, 1), embedded], axis=-1)

        # step_output shape: (batch_size, 1, decoder_units); cell state unused
        step_output, h_f, _ = self.lstm_1(lstm_input)

        # flatten to (batch_size, hidden_size), then project to (batch_size, vocab)
        logits = self.opt(self.flatten(step_output))

        return logits, h_f

In [22]:
# One decoder per encoder stream.
seq_decoder = Decoder(target_vocab_size, EMBEDDING_DIM, ATTENTION_UNITS, DECODER_UNITS)
prime_decoder = Decoder(target_vocab_size, EMBEDDING_DIM, ATTENTION_UNITS, DECODER_UNITS)

# A <start> token for every sequence in the batch, shape (BATCH_SIZE, 1).
sample_decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

# Use BATCH_SIZE rather than a literal 64 so this follows the config cell.
dec_hidden_forward = tf.zeros((BATCH_SIZE, ENCODER_UNITS))

# Smoke-test both decoders on the example batch's encoder outputs.
seq_decoder_out, seq_decoder_hidden = seq_decoder(sample_decoder_input, enc_hidden_forward,
                                                  enc_hidden_backward, sample_output)
prime_decoder_out, prime_decoder_hidden = prime_decoder(sample_decoder_input, prime_hidden_forward,
                                                        prime_hidden_backward, prime_output)

## Hierarchical Model

In [38]:
class CombinedDecoders(tf.keras.Model):
    """Two decoders whose logits are blended by a learned element-wise gate."""

    def __init__(self, input_vocab_size, target_vocab_size, embedding_dim,
                 attention_units, encoder_units, decoder_units, embedding_weights):
        """Create both decoders and the gating layer.

        ``input_vocab_size``, ``encoder_units`` and ``embedding_weights`` are
        accepted for interface compatibility but are not used here.
        """
        super(CombinedDecoders, self).__init__()

        # declare decoders
        self.seq_decoder = Decoder(target_vocab_size, embedding_dim, attention_units, decoder_units)
        self.prime_decoder = Decoder(target_vocab_size, embedding_dim, attention_units, decoder_units)

        # shared projection that produces the gate scores
        self.W = tf.keras.layers.Dense(target_vocab_size)

    def call(self, dec_input, seq_enc_out, seq_enc_forward, seq_enc_backward,
             prime_enc_out, prime_enc_forward, prime_enc_backward):
        """Decode one step with both decoders and mix their logits.

        Returns:
            ``(mixed_logits, seq_decoder_forward, prime_decoder_forward)``.
        """
        seq_decoder_out, seq_decoder_forward = self.seq_decoder(dec_input, seq_enc_forward,
                                                                seq_enc_backward, seq_enc_out)
        prime_decoder_out, prime_decoder_forward = self.prime_decoder(dec_input, prime_enc_forward,
                                                                      prime_enc_backward, prime_enc_out)

        # Hierarchical attention, adaptation of eq (3) from Wang et al.
        # exp(a) / (exp(a) + exp(b)) equals sigmoid(a - b). The sigmoid form
        # cannot overflow to inf/inf = NaN and applies W once per decoder.
        seq_score = self.W(seq_decoder_out)
        prime_score = self.W(prime_decoder_out)
        alpha = tf.sigmoid(seq_score - prime_score)
        beta = 1 - alpha

        return alpha * seq_decoder_out + beta * prime_decoder_out, seq_decoder_forward, prime_decoder_forward

In [39]:
# Hierarchical decoder that mixes the raw and rule-processed streams.
ha_decoder = CombinedDecoders(input_vocab_size, target_vocab_size, EMBEDDING_DIM,
                              ATTENTION_UNITS, ENCODER_UNITS, DECODER_UNITS, enc_E_mat)

# A <start> token for every sequence in the batch, shape (BATCH_SIZE, 1).
sample_decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

# Use BATCH_SIZE rather than a literal 64 so this follows the config cell.
dec_hidden_forward = tf.zeros((BATCH_SIZE, ENCODER_UNITS))

# Smoke-test the standalone decoders on the example batch.
seq_decoder_out, seq_decoder_hidden = seq_decoder(sample_decoder_input, enc_hidden_forward,
                                                  enc_hidden_backward, sample_output)
prime_decoder_out, prime_decoder_hidden = prime_decoder(sample_decoder_input, prime_hidden_forward,
                                                        prime_hidden_backward, prime_output)

# Smoke-test the combined decoder on the same inputs.
sample_prediciton, seq_dec_foward, prime_dec_forward = ha_decoder(sample_decoder_input, sample_output,
                                                                  enc_hidden_forward, enc_hidden_backward,
                                                                  prime_output, prime_hidden_forward,
                                                                  prime_hidden_backward)

### Optimizer and Loss Function

Here we define the optimizer and the loss function. In our loss function we mask the zeros since that's the padding.

Also of note in the loss function: the default `reduction` argument (auto) did some really wonky things that threw off all results, so I changed the reduction to `none`. I'm not exactly sure what the default does in this context, but it tries to sum over batches. I didn't work with it because I wanted to control all of the loss calculation manually.

In [40]:
# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that loss_function
# can mask padding before averaging.
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [41]:
def loss_function(real, preds):
    """Mean cross-entropy with padded targets contributing zero.

    Args:
        real: integer target ids; id 0 marks padding.
        preds: logits for the same positions.

    Returns:
        The mean over all positions of the per-position loss, after the
        padded positions have been zeroed out.
    """
    per_position = static_loss(real, preds)

    # 1.0 where the target is a real token, 0.0 where it is padding
    is_token = tf.cast(
        tf.math.logical_not(tf.math.equal(real, 0)),
        dtype=per_position.dtype,
    )

    return tf.reduce_mean(per_position * is_token)

### Training

This is the training loop. I'm using teacher forcing because I don't think I have enough computational power to use beam search. Google recommends beam search with a beam width of 10, but that isn't an option here. Teacher forcing will provide better results than training without it.

In [44]:
@tf.function
def train_step(inpt, prime, trgt, init_state):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        inpt: token ids of the informal sentences.
        prime: token ids of the rule-processed informal sentences.
        trgt: token ids of the formal targets.
        init_state: initial state passed to both encoders.

    Returns:
        The loss summed over all decoding steps of this batch.
    """
    loss = 0

    with tf.GradientTape() as tape:
        seq_enc_out, seq_enc_forward, seq_enc_backward = seq_encoder(inpt, hidden_state=init_state)
        prime_enc_out, prime_enc_forward, prime_enc_backward = prime_encoder(prime, hidden_state=init_state)

        # Get start token for every sequence in batch
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        # initialize decoder hidden forward states
        seq_dec_forward = seq_enc_forward
        prime_dec_forward = prime_enc_forward

        for i in range(1, trgt.shape[1]):
            # dec_input shape: (batch_size, 1)
            # Every argument comes from THIS batch's encoders. The earlier
            # version passed leftover globals from the smoke-test cell and
            # misspelled seq_dec_forward, so the decoder state never advanced
            # and the encoders were not trained on the batch.
            prediction, seq_dec_forward, prime_dec_forward = ha_decoder(dec_input,
                                                                        seq_enc_out,
                                                                        seq_dec_forward,
                                                                        seq_enc_backward,
                                                                        prime_enc_out,
                                                                        prime_dec_forward,
                                                                        prime_enc_backward)

            loss += loss_function(trgt[:, i], prediction)

            # teacher forcing: feed the ground-truth token as the next input
            dec_input = tf.expand_dims(trgt[:, i], 1)

    # Compute and apply gradients outside the tape context
    trainable_variables = (prime_encoder.trainable_variables
                           + seq_encoder.trainable_variables
                           + ha_decoder.trainable_variables)
    gradients = tape.gradient(loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    # return batch loss
    return loss

In [45]:
EPOCHS = 20

# Fresh models for the actual training run.
seq_encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, enc_E_mat)
prime_encoder = Encoder(input_vocab_size, EMBEDDING_DIM, ENCODER_UNITS, enc_E_mat)
ha_decoder = CombinedDecoders(input_vocab_size, target_vocab_size, EMBEDDING_DIM,
                              ATTENTION_UNITS, ENCODER_UNITS, DECODER_UNITS, enc_E_mat)

for epoch in range(EPOCHS):
    start = datetime.now()

    total_loss = 0

    # Zero initial state handed to both encoders on every step of this epoch
    init_state = [tf.zeros((BATCH_SIZE, ENCODER_UNITS)) for _ in range(4)]

    for inpt, prime, trgt in train.take(steps_per_epoch):
        total_loss += train_step(inpt, prime, trgt, init_state)

    # total_loss is a sum over batches, so average over the number of
    # batches (steps), not over the batch size.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / max(steps_per_epoch, 1)))
    print('Time taken {}\n'.format(datetime.now() - start))



ResourceExhaustedError: ignored

In [None]:
def evaluate(data):
    """Greedy-decode every example in ``data`` with the trained hierarchical model.

    Args:
        data: dataset yielding ``(input, prime, target)`` triples with one
            example per batch, like the ``test`` dataset.

    Returns:
        ``(results, mean_loss)``: the decoded sentences (each starting with
        ``<start>`` and ending with ``<end>``) and the summed step loss
        divided by the number of examples seen.
    """
    total_loss = 0.0
    results = []
    n_examples = 0
    start_id = target_tokenizer.word_index['<start>']

    for inpt, prime, trgt in data:
        n_examples += 1

        # Encode both views of the sentence with the trained encoders.
        seq_enc_out, seq_enc_forward, seq_enc_backward = seq_encoder(inpt, hidden_state=None)
        prime_enc_out, prime_enc_forward, prime_enc_backward = prime_encoder(prime, hidden_state=None)

        seq_dec_forward = seq_enc_forward
        prime_dec_forward = prime_enc_forward

        # This feeds in the start token for the first initial state
        dec_input = tf.expand_dims([start_id], 0)
        words = ['<start>']

        for step in range(1, trgt.shape[1]):
            # dec_input shape: (batch_size, 1)
            predictions, seq_dec_forward, prime_dec_forward = ha_decoder(
                dec_input,
                seq_enc_out, seq_dec_forward, seq_enc_backward,
                prime_enc_out, prime_dec_forward, prime_enc_backward)

            total_loss += float(loss_function(trgt[:, step], predictions))

            # max logit for tokenized word
            word_idx = int(tf.argmax(predictions[0]).numpy())
            # Index 0 is padding and has no entry in index_word.
            word = target_tokenizer.index_word.get(word_idx, '<pad>')
            words.append(word)

            if word == '<end>':
                break

            # no teacher forcing here: feed the model's own prediction back in
            dec_input = tf.expand_dims([word_idx], 0)
        else:
            # Length limit reached without an <end>; close the sentence once.
            words.append('<end>')

        results.append(' '.join(words))

    return results, total_loss / max(n_examples, 1)

In [None]:
# Decode the holdout set; keeps the predicted sentences and the reported loss.
results, test_loss = evaluate(test)

In [None]:
def examine(index):
    """Print the informal input, formal reference and prediction at ``index``."""
    labelled_sources = (
        ("Informal: ", if_holdout),
        ("Formal: ", f_holdout),
        ("Predicted: ", results),
    )
    for label, sentences in labelled_sources:
        print(label, sentences[index])

In [None]:
# Spot-check a single holdout example.
examine(1999)

In [None]:
# Persist the predictions, one sentence per line.
results_path = BASE_PATH + '/Results/Bahdanau_Attention_Results_Custom.txt'
with open(results_path, 'w') as f:
    f.writelines(seq + '\n' for seq in results)