# Attention is All You Need

In [None]:
import numpy as np
import seaborn as sns
import tensorflow as tf

import re 
import os
from datetime import datetime

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Declare Static Variables

In [None]:
# Number of examples per training batch (used when batching the train dataset).
BATCH_SIZE = 64
# Width of the token embeddings and of every encoder/decoder sub-layer output.
D_MODEL = 512
# Padded sequence length in tokens; tokenize() defaults to the same value.
MAX_LENGTH = 43
# Number of stacked layers in the encoder and in the decoder.
NX = 6
# Number of attention heads in each multi-head attention layer.
H = 8

In [None]:
# Colab-only: mount Google Drive so the data under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load Data

In [None]:
# Root folder of the data set. For a local checkout use:
# BASE_PATH = '../../Data'
BASE_PATH = '/content/drive/MyDrive/Data/Data'

# All parallel formal/informal files live in the same sub-folder.
_EM_DIR = f'{BASE_PATH}/Supervised Data/Entertainment_Music'

# Training pairs.
FORMAL_PATH_TRAIN = f'{_EM_DIR}/S_Formal_EM_Train.txt'
INFORMAL_PATH_TRAIN = f'{_EM_DIR}/S_Informal_EM_Train.txt'

# Held-out (validation/test) pairs.
FORMAL_PATH_HOLDOUT = f'{_EM_DIR}/S_Formal_EM_ValTest.txt'
INFORMAL_PATH_HOLDOUT = f'{_EM_DIR}/S_Informal_EM_ValTest.txt'

# Pre-trained GloVe vectors (200 dimensions).
EMBEDDING_PATH = f'{BASE_PATH}/glove.6B.200d.txt'

In [None]:
def _read_text(path):
    """Return the entire contents of the text file at ``path``."""
    # The context manager closes the handle even when the read fails; the
    # original bare open(...).read() calls left every file open.
    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# Parallel training corpora (one sentence per line).
formal = _read_text(FORMAL_PATH_TRAIN)
informal = _read_text(INFORMAL_PATH_TRAIN)

# Parallel held-out corpora.
formal_holdout = _read_text(FORMAL_PATH_HOLDOUT)
informal_holdout = _read_text(INFORMAL_PATH_HOLDOUT)

In [None]:
def process_sequence(seq):
    """Separate punctuation from words in a source (encoder input) sentence.

    Every occurrence of ``. , ! ? ( )`` is surrounded by spaces, then any run
    of two or more whitespace characters is collapsed to a single space.
    Leading/trailing single spaces are kept.

    Args:
        seq: raw sentence string.

    Returns:
        The sentence with punctuation split off as separate tokens.
    """
    s = re.sub(r'([.,!?()])', r' \1 ', seq)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    s = re.sub(r'\s{2,}', ' ', s)

    return s

In [None]:
def process_seq_target_output(seq):
    """Prepare a target sentence as the decoder's expected output.

    Punctuation (``. , ! ? ( )``) is padded with spaces, runs of whitespace
    are collapsed to one space, and the end marker ``' <end>'`` is appended.

    Args:
        seq: raw target sentence string.

    Returns:
        The normalised sentence followed by ``' <end>'``.
    """
    s = re.sub(r'([.,!?()])', r' \1 ', seq)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    s = re.sub(r'\s{2,}', ' ', s)

    return s + ' <end>'

In [None]:
def process_seq_target_input(seq):
    """Prepare a target sentence as the decoder's (teacher-forced) input.

    Punctuation (``. , ! ? ( )``) is padded with spaces, runs of whitespace
    are collapsed to one space, and the start marker ``'<start> '`` is
    prepended.

    Args:
        seq: raw target sentence string.

    Returns:
        ``'<start> '`` followed by the normalised sentence.
    """
    s = re.sub(r'([.,!?()])', r' \1 ', seq)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    s = re.sub(r'\s{2,}', ' ', s)

    return '<start> ' + s

In [None]:
# Training data: formal sentences are the targets (once with the end marker
# for the expected output, once with the start marker for the decoder input);
# informal sentences are the encoder input.
f_corpus = list(map(process_seq_target_output, formal.split('\n')))
f_corpus_input = list(map(process_seq_target_input, formal.split('\n')))
if_corpus = list(map(process_sequence, informal.split('\n')))

# Held-out data: only the expected output and the encoder input are needed.
f_holdout = list(map(process_seq_target_output, formal_holdout.split('\n')))
if_holdout = list(map(process_sequence, informal_holdout.split('\n')))

### Preprocess data

This is a little hacky. I force the max length since I already know what it is. 

In [None]:
def tokenize(corpus, tokenizer=None, maxlen=43):
    """Convert sentences to fixed-length integer sequences.

    Args:
        corpus: list of sentence strings.
        tokenizer: an already-fitted Keras ``Tokenizer`` to reuse. When
            ``None`` a new tokenizer is fitted on ``corpus`` plus the
            ``<start>``/``<end>`` markers.
        maxlen: length every sequence is padded (or truncated) to.

    Returns:
        ``(padded_seqs, tokenizer)`` where ``padded_seqs`` is the array
        returned by ``pad_sequences`` (zero-padded at the end) and
        ``tokenizer`` is the tokenizer that was used.
    """
    # Explicit None check: the decision must not depend on object truthiness.
    if tokenizer is None:
        # '<' and '>' are deliberately absent from the filter list so the
        # '<start>' / '<end>' markers survive tokenisation.
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
                              oov_token='<OOV>')
        tokenizer.fit_on_texts(corpus)
        # Make sure both markers are in the vocabulary even when the corpus
        # contains only one of them.
        tokenizer.fit_on_texts(['<start>', '<end>'])

    seqs = tokenizer.texts_to_sequences(corpus)
    padded_seqs = pad_sequences(seqs, padding='post', maxlen=maxlen)

    return padded_seqs, tokenizer

In [None]:
# Informal sentences: encoder input, with their own vocabulary.
input_train, input_tokenizer = tokenize(if_corpus)
# Formal sentences ending in '<end>': expected decoder output; fits the target vocabulary.
target_train, target_tokenizer = tokenize(f_corpus)
# Formal sentences starting with '<start>': decoder input, encoded with the same target vocabulary.
target_input_train, _ = tokenize(f_corpus_input, target_tokenizer)

In [None]:
# Held-out data is encoded with the tokenizers fitted on the training data.
input_test, _ = tokenize(if_holdout, input_tokenizer)
target_test, _ = tokenize(f_holdout, target_tokenizer)

In [None]:
# Shuffle over the whole training set; incomplete final batches are dropped,
# so an epoch has exactly this many steps.
buffer_size = len(input_train)
steps_per_epoch = buffer_size // BATCH_SIZE

# +1 leaves room for id 0, which pad_sequences uses as the padding value.
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Training examples: (encoder input, decoder input, expected decoder output).
train = tf.data.Dataset.from_tensor_slices(
    (input_train, target_input_train, target_train))
train = train.shuffle(buffer_size)
train = train.batch(BATCH_SIZE, drop_remainder=True)

# Held-out examples are served one sentence pair at a time.
test = tf.data.Dataset.from_tensor_slices((input_test, target_test))
test = test.batch(1)

In [None]:
# Pull a single batch from the training set to smoke-test the model below.
example_input_batch, example_target_input_batch, example_target_batch = next(iter(train))

## Positional Embedding

We need to compute the positional embedding from Section 3.5 of the paper.

$$ PE_{(pos, 2i)} = \sin(pos / 10000^{2i/d_{model}}) \\
PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i/d_{model}})
$$

In [None]:
def positional_embedding(p, d_model):
    """Return the sinusoidal positional encoding for one position.

    Implements PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and
    PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)).

    Args:
        p: position index in the sequence.
        d_model: size of the encoding vector.

    Returns:
        ``numpy`` array of shape ``(1, d_model)``.
    """
    dims = np.arange(d_model)
    # Each sin/cos pair shares one frequency: dimension 2i and 2i+1 both use
    # the exponent 2i/d_model. (Using the odd index itself would give the
    # cosine a different frequency from its paired sine.)
    exponents = (2 * (dims // 2)) / d_model
    angles = p / np.power(10000.0, exponents)

    p_emb = np.zeros((1, d_model))
    p_emb[0, 0::2] = np.sin(angles[0::2])
    p_emb[0, 1::2] = np.cos(angles[1::2])
    return p_emb


# Pre-compute one encoding row per position, up to the padded sequence length
# (MAX_LENGTH instead of a second hard-coded 43).
pes = [positional_embedding(i, D_MODEL) for i in range(MAX_LENGTH)]

# Stack the (1, D_MODEL) rows into a (MAX_LENGTH, D_MODEL) float32 constant.
pes = np.concatenate(pes, axis=0)
pes = tf.constant(pes, dtype=tf.float32)

## Multi-Head Attention

Computing 
$$ \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,...,head_h)W^o$$ 
where $$head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)$$
and attention is 
$$ \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V$$ 

In Section 3.2.3 of AAYN, for the encoder-decoder attention of seq2seq models, the keys and values both come from the encoder output, so the key and value are treated as the same input here.

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with keys == values.

    Each of the ``h`` heads has its own query/key/value projections of width
    ``d_model // h``; the concatenated heads are projected back to
    ``d_model``.
    """

    def __init__(self, d_model, h):
        """
        Args:
            d_model: width of the layer output.
            h: number of attention heads.
        """
        super(MultiHeadAttention, self).__init__()
        self.mha_size = d_model // h
        self.h = h

        # One independent projection per head for queries, keys and values.
        self.wq = [tf.keras.layers.Dense(self.mha_size) for _ in range(h)]
        self.wk = [tf.keras.layers.Dense(self.mha_size) for _ in range(h)]
        self.wv = [tf.keras.layers.Dense(self.mha_size) for _ in range(h)]
        # Output projection applied to the concatenated heads.
        self.wo = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, q, k, i, mask=None):
        """Compute attention head ``i``.

        Args:
            q: query input, expected shape (batch_size, query_length, features).
            k: key/value input, expected shape (batch_size, key_length, features).
            i: index of the head whose projections are used.
            mask: optional tensor broadcastable to
                (batch_size, query_length, key_length); entries equal to 1
                mark positions that must NOT be attended to.

        Returns:
            Tensor of shape (batch_size, query_length, d_model // h).
        """
        query = self.wq[i](q)
        key = self.wk[i](k)
        value = self.wv[i](k)

        # score shape: (batch_size, query_length, key_length)
        score = tf.matmul(query, key, transpose_b=True)

        # eq(1) from AAYN: scale by sqrt(d_k)
        score /= tf.math.sqrt(tf.cast(self.mha_size, dtype=tf.float32))

        # Push masked positions towards -inf so softmax gives them ~0 weight.
        # Previously the mask argument was accepted but silently ignored.
        if mask is not None:
            score += tf.cast(mask, score.dtype) * -1e9

        # attention shape: (batch_size, query_length, key_length)
        attention = tf.nn.softmax(score, axis=-1)

        # head shape: (batch_size, query_length, d_model // h)
        return tf.matmul(attention, value)

    def call(self, q, k, mask=None):
        """Run every head, concatenate them and apply the output projection.

        Args:
            q: query input.
            k: input used for both keys and values.
            mask: optional attention mask, see
                ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch_size, query_length, d_model).
        """
        # compute one attention head per projection set
        heads = [self.scaled_dot_product_attention(q, k, i, mask)
                 for i in range(self.h)]

        # concat all heads along the feature axis
        multi_head = tf.concat(heads, axis=2)

        # multi_head shape: (batch_size, query_length, d_model)
        return self.wo(multi_head)

## Encoder

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each
    wrapped in a residual connection followed by layer normalisation."""

    def __init__(self, vocab_size, d_model, h):
        """
        Args:
            vocab_size: accepted for signature compatibility; not used here.
            d_model: width of the block's input and output.
            h: number of attention heads.
        """
        super().__init__()
        self.d_model = d_model
        self.h = h

        # Self-attention sub-layer.
        self.mha = MultiHeadAttention(d_model, h)
        self.mha_norm = tf.keras.layers.LayerNormalization()

        # Position-wise feed-forward sub-layer (hidden width 4 * d_model).
        self.FFN_l1 = tf.keras.layers.Dense(4 * d_model, activation='relu')
        self.FFN_l2 = tf.keras.layers.Dense(d_model)
        self.FFN_norm = tf.keras.layers.LayerNormalization()

    def call(self, E_out, mask=None):
        """Apply the block to ``E_out``; ``mask`` is handed to the attention layer."""
        # Self-attention, then residual + norm.
        attended = self.mha(E_out, E_out, mask)
        attended = self.mha_norm(E_out + attended)

        # Feed-forward network, then residual + norm.
        hidden = self.FFN_l1(attended)
        projected = self.FFN_l2(hidden)
        return self.FFN_norm(projected + attended)

In [None]:
class Encoder(tf.keras.Model):
    """Token embedding plus positional encoding followed by a stack of
    ``EncoderLayer`` blocks."""

    def __init__(self, vocab_size, d_model, num_layers, h):
        """
        Args:
            vocab_size: size of the input vocabulary (embedding rows).
            d_model: embedding / model width.
            num_layers: number of stacked encoder layers.
            h: number of attention heads per layer.
        """
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.h = h

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        layers = [EncoderLayer(vocab_size, d_model, h)
                  for _ in range(num_layers)]
        self.encoder_layers = layers

    def call(self, seq, mask=None):
        """Encode a batch of token-id sequences.

        Args:
            seq: integer tensor, (batch_size, sequence_length).
            mask: optional mask passed to every encoder layer.

        Returns:
            Output of the last encoder layer.
        """
        seq_len = seq.shape[1]

        # Embed the tokens and add the encoding of each position.
        # shape: (batch_size, sequence_length, d_model)
        x = self.embedding(seq) + pes[:seq_len, :]

        # Run the stacked encoder layers in order.
        for layer in self.encoder_layers:
            x = layer(x, mask)

        return x

## Decoder

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, encoder-decoder attention and
    a feed-forward network, each followed by a residual connection and layer
    normalisation."""

    def __init__(self, d_model, h):
        """
        Args:
            d_model: width of the block's input and output.
            h: number of attention heads.
        """
        super(DecoderLayer, self).__init__()
        # Self-attention over the target sequence.
        self.mha1 = MultiHeadAttention(d_model, h)
        self.mha1_norm = tf.keras.layers.LayerNormalization()

        # Attention over the encoder output.
        self.mha2 = MultiHeadAttention(d_model, h)
        self.mha2_norm = tf.keras.layers.LayerNormalization()

        # Position-wise feed-forward network. ReLU on the hidden layer matches
        # EncoderLayer; without it the two Dense layers collapse to one linear map.
        self.FFN_l1 = tf.keras.layers.Dense(4 * d_model, activation='relu')
        self.FFN_l2 = tf.keras.layers.Dense(d_model)
        self.FFN_norm = tf.keras.layers.LayerNormalization()

    def call(self, x, enc_opt, mask=None):
        """Apply the block.

        Args:
            x: decoder-side input, (batch_size, target_length, d_model).
            enc_opt: encoder output used as keys/values of the second
                attention layer.
            mask: optional mask handed to the encoder-decoder attention.

        Returns:
            Tensor of shape (batch_size, target_length, d_model).
        """
        # Build the causal mask locally (the name was previously read from an
        # undefined global): 1.0 marks future positions, i.e. the strictly
        # upper-triangular part of a (target_length, target_length) matrix.
        seq_len = tf.shape(x)[1]
        look_ahead_mask = 1.0 - tf.linalg.band_part(
            tf.ones((seq_len, seq_len)), -1, 0)

        # First MHA layer: self-attention over the target sequence
        # mha1_out shape: (batch_size, target_length, d_model)
        mha1_out = self.mha1(x, x, look_ahead_mask)
        mha1_out = self.mha1_norm(mha1_out + x)

        # Second MHA layer: queries come from the previous sub-layer's output
        # (not from the raw input x), keys/values from the encoder.
        mha2_out = self.mha2(mha1_out, enc_opt, mask)
        mha2_out = self.mha2_norm(mha2_out + mha1_out)

        # FFN with residual connection and normalisation
        FFN_out = self.FFN_l2(self.FFN_l1(mha2_out))
        FFN_out = self.FFN_norm(FFN_out + mha2_out)

        return FFN_out

In [None]:
class Decoder(tf.keras.Model):
    """Token embedding plus positional encoding, a stack of ``DecoderLayer``
    blocks and a final projection to vocabulary logits."""

    def __init__(self, vocab_size, d_model, num_layers, h):
        """
        Args:
            vocab_size: size of the target vocabulary (embedding rows and
                number of output logits).
            d_model: embedding / model width.
            num_layers: number of stacked decoder layers.
            h: number of attention heads per layer.
        """
        super(Decoder, self).__init__()
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.decoder_layers = [DecoderLayer(d_model, h)
                               for _ in range(num_layers)]
        # Un-normalised scores (logits) over the target vocabulary.
        self.opt = tf.keras.layers.Dense(vocab_size)

    def call(self, seq, enc_opt, mask=None):
        """Decode a batch of target token-id sequences.

        Args:
            seq: integer tensor, (batch_size, target_length).
            enc_opt: encoder output attended to by every decoder layer.
            mask: optional mask forwarded to every decoder layer.

        Returns:
            Logits of shape (batch_size, target_length, vocab_size).
        """
        E_out = self.embedding(seq)

        # add the encoding of each position
        x = E_out + pes[:seq.shape[1], :]

        # Forward the mask; it was previously accepted but dropped here.
        for layer in self.decoder_layers:
            x = layer(x, enc_opt, mask)

        output = self.opt(x)

        return output

In [None]:
# Instantiate the encoder/decoder pair with NX layers of H heads each.
encoder = Encoder(input_vocab_size, D_MODEL, NX, H)
decoder = Decoder(target_vocab_size, D_MODEL, NX, H)

In [None]:
# Run one example batch through both models (this also creates their weights).
enc_output = encoder(example_input_batch)
dec_output = decoder(example_target_input_batch, enc_output)

## Define Training

In [None]:
# Adam with the Keras default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()
# Per-token cross-entropy on raw logits; reduction='none' keeps one loss value
# per position so padding can be masked out before averaging.
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, preds):
    """Return the mean cross-entropy over the non-padding target tokens.

    Args:
        real: integer target ids; id 0 is treated as padding.
        preds: logits whose last axis is the vocabulary.

    Returns:
        Scalar tensor: summed token loss divided by the number of
        non-padding tokens (0 when every target is padding).
    """
    # per-token loss
    loss = static_loss(real, preds)

    # 1.0 for real tokens, 0.0 for padding
    mask = tf.cast(tf.not_equal(real, 0), dtype=loss.dtype)

    # zero out the loss at padded positions
    loss *= mask

    # Average over real tokens only. A plain reduce_mean also divides by the
    # padded positions, so the value would shrink as padding grows.
    # The floor of 1 avoids 0/0 when the whole input is padding.
    return tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1.0)

In [None]:
@tf.function
def train_step(in_seq, targ_in_seq, targ_out_seq):
    """Run one optimisation step on a batch and return its loss.

    Args:
        in_seq: encoder input ids.
        targ_in_seq: decoder input ids (teacher forcing).
        targ_out_seq: expected decoder output ids.

    Returns:
        The loss computed by ``loss_function`` for this batch.
    """
    with tf.GradientTape() as tape:
        # Forward pass through both models, recorded for differentiation.
        encoded = encoder(in_seq)
        logits = decoder(targ_in_seq, encoded)
        loss = loss_function(targ_out_seq, logits)

    # Update encoder and decoder weights together.
    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return loss

In [None]:
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    total_loss = 0
    start = datetime.now()
    for inpt, targ_inpt, targ_out in train.take(steps_per_epoch):
        total_loss += train_step(inpt, targ_inpt, targ_out)

    # train_step returns one loss value per batch, so the epoch average is the
    # sum divided by the number of batches (not by the batch size).
    # max(..., 1) guards against an empty training set.
    epoch_loss = total_loss / max(steps_per_epoch, 1)
    # Report the elapsed time as a number of seconds, matching the message.
    elapsed_seconds = (datetime.now() - start).total_seconds()

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, epoch_loss))
    print('Time taken for 1 epoch {} seconds\n'.format(elapsed_seconds))

Epoch 1 Loss 10.6616
Time taken for 1 epoch 0:02:48.664173 seconds

Epoch 2 Loss 10.4342
Time taken for 1 epoch 0:02:13.017185 seconds

Epoch 3 Loss 10.4217
Time taken for 1 epoch 0:02:13.301650 seconds

Epoch 4 Loss 10.4159
Time taken for 1 epoch 0:02:13.320810 seconds

Epoch 5 Loss 10.4118
Time taken for 1 epoch 0:02:13.698403 seconds



## Evaluate

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
def evaluate(data):
    """Greedy-decode every example in ``data`` and score the output.

    Args:
        data: iterable of ``(x, y)`` pairs with batch size 1, where ``x`` is
            the encoder input and ``y`` the expected target ids (ending in
            ``<end>``, without ``<start>``). Example ``i`` is scored against
            ``f_holdout[i]``.

    Returns:
        ``(results, mean_loss, mean_bleu)``: the decoded sentences (words
        each followed by a space), the summed per-token loss averaged over
        examples, and the sentence-level BLEU averaged over examples.
    """
    start_id = target_tokenizer.word_index['<start>']
    results = []
    total_loss = 0.0
    total_bleu = 0.0

    for i, (x, y) in enumerate(data):
        enc_output = encoder(x)
        dec_input = tf.expand_dims([start_id], 0)
        words = []

        # Target position j is predicted from '<start>' plus the j words
        # generated so far, so j starts at 0 (the targets hold no '<start>').
        for j in range(y.shape[1]):
            # Only the last time step is the prediction for position j.
            logits = decoder(dec_input, enc_output)[:, -1, :]
            total_loss += float(loss_function(y[:, j], logits))

            word_idx = int(tf.argmax(logits, axis=-1).numpy()[0])
            # Id 0 is padding and has no entry in index_word.
            word = target_tokenizer.index_word.get(word_idx, '')
            if word:
                words.append(word)

            if word == '<end>':
                break

            # Feed the prediction back in; keep the dtype of dec_input so the
            # concat does not mix int32 with numpy's int64.
            next_token = tf.constant([[word_idx]], dtype=dec_input.dtype)
            dec_input = tf.concat((dec_input, next_token), axis=-1)

        results.append(''.join(word + ' ' for word in words))

        # sentence_bleu expects a list of tokenised references and a
        # tokenised hypothesis. Normalise the reference text the same way the
        # target tokenizer does so both sides use comparable tokens.
        reference = tf.keras.preprocessing.text.text_to_word_sequence(
            f_holdout[i],
            filters=target_tokenizer.filters,
            lower=target_tokenizer.lower)
        total_bleu += sentence_bleu([reference], words)

    # Average over the examples actually seen (at least 1 to avoid 0/0).
    n_examples = max(len(results), 1)
    return results, total_loss / n_examples, total_bleu / n_examples

In [None]:
# `results` is never assigned elsewhere in this file, so decode the held-out
# set here before writing it out.
results, test_loss, test_bleu = evaluate(test)
print('Test loss {:.4f} BLEU {:.4f}'.format(test_loss, test_bleu))

# Create the output folder if needed, then write one decoded sentence per line.
results_dir = BASE_PATH + '/Results'
os.makedirs(results_dir, exist_ok=True)
with open(results_dir + '/Custom_Transformer_Results.txt', 'w',
          encoding='utf-8') as f:
    f.writelines(seq + '\n' for seq in results)