# Encoder-Decoder MT Attention Network

In [1]:
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load Data

In [2]:
def _read_text(path):
    """ Return the whole contents of the text file at `path`, closing it afterwards """
    # The context manager releases the file handle as soon as the read is
    # done instead of leaving it open until garbage collection.
    with open(path) as handle:
        return handle.read()


# Training split: formal sentences and their informal counterparts.
formal = _read_text('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_Train.txt')
informal = _read_text('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_Train.txt')

# Held-out split (validation/test, per the file names).
formal_holdout = _read_text('../Data/Supervised Data/Entertainment_Music/S_Formal_EM_ValTest.txt')
informal_holdout = _read_text('../Data/Supervised Data/Entertainment_Music/S_Informal_EM_ValTest.txt')

In [3]:
def _add_boundary_tokens(text):
    """ Split `text` on newlines and wrap every line in <start> ... <end> markers """
    # NOTE(review): split('\n') yields an empty last element when the text
    # ends with a newline, which becomes a '<start>  <end>' entry -- confirm
    # the data files have no trailing newline.
    wrapped = []
    for line in text.split('\n'):
        wrapped.append('<start> ' + line + ' <end>')
    return wrapped


f_corpus = _add_boundary_tokens(formal)
if_corpus = _add_boundary_tokens(informal)

f_holdout = _add_boundary_tokens(formal_holdout)
if_holdout = _add_boundary_tokens(informal_holdout)

### Preprocess Data

In [4]:
def tokenize(corpus):
    """ Fit a tokenizer on `corpus` and encode it.

    Args:
        corpus: the texts to fit the tokenizer on and to convert to id sequences.

    Returns:
        (padded_seqs, tokenizer): the id sequences after `pad_sequences` with
        padding='post', and the fitted Tokenizer.
    """
    # '<' and '>' are deliberately absent from the filter characters so the
    # '<start>' / '<end>' markers are kept as whole tokens.
    filtered_chars = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
    tokenizer = Tokenizer(filters=filtered_chars, oov_token='<OOV>')
    tokenizer.fit_on_texts(corpus)

    id_sequences = tokenizer.texts_to_sequences(corpus)
    return pad_sequences(id_sequences, padding='post'), tokenizer

In [5]:
# The informal corpus is the model input and the formal corpus is the target;
# each side gets its own tokenizer fitted on its own text.
input_train, input_tokenizer = tokenize(if_corpus)
target_train, target_tokenizer = tokenize(f_corpus)

In [6]:
# Display the word -> integer id mapping learned from the informal corpus.
input_tokenizer.word_index

{'<OOV>': 1,
 '<start>': 2,
 '<end>': 3,
 'i': 4,
 'the': 5,
 'and': 6,
 'a': 7,
 'it': 8,
 'to': 9,
 'you': 10,
 'is': 11,
 'that': 12,
 'of': 13,
 'but': 14,
 'in': 15,
 'like': 16,
 'on': 17,
 'have': 18,
 'they': 19,
 'my': 20,
 'for': 21,
 'not': 22,
 'he': 23,
 'if': 24,
 'was': 25,
 'just': 26,
 'think': 27,
 'know': 28,
 'me': 29,
 "don't": 30,
 'so': 31,
 'are': 32,
 'with': 33,
 'be': 34,
 'no': 35,
 'good': 36,
 'or': 37,
 'one': 38,
 'all': 39,
 'u': 40,
 'she': 41,
 'do': 42,
 'what': 43,
 'get': 44,
 'go': 45,
 'can': 46,
 'its': 47,
 'out': 48,
 'love': 49,
 'your': 50,
 'this': 51,
 'there': 52,
 'song': 53,
 'her': 54,
 'dont': 55,
 'would': 56,
 'movie': 57,
 'up': 58,
 'about': 59,
 'at': 60,
 "it's": 61,
 'really': 62,
 'them': 63,
 'com': 64,
 'his': 65,
 'because': 66,
 'from': 67,
 'music': 68,
 'too': 69,
 'got': 70,
 'then': 71,
 'by': 72,
 'who': 73,
 'him': 74,
 'will': 75,
 'some': 76,
 'has': 77,
 "i'm": 78,
 'as': 79,
 'say': 80,
 'now': 81,
 '2': 82,
 'wa

In [7]:
# Model / training hyperparameters.
BATCH_SIZE = 64
embedding_dim = 256
units = 1024

# The shuffle buffer covers the full training set.
BUFFER_SIZE = len(input_train)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# One more than the number of known words: the word_index ids start at 1,
# and 0 is the value loss_function treats as padding.
input_vocab_size = 1 + len(input_tokenizer.word_index)
target_vocab_size = 1 + len(target_tokenizer.word_index)

# Shuffle the (input, target) pairs, then cut them into fixed-size batches;
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
train = (
    tf.data.Dataset.from_tensor_slices((input_train, target_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [8]:
# Pull a single batch from the dataset and display the input/target shapes.
example_input_batch, example_target_batch = next(iter(train))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 32]), TensorShape([64, 43]))

### Encoder

In [9]:
class Encoder(tf.keras.Model):
    """ Embeds input token ids and runs them through a single GRU layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        encoder_units: number of GRU units.
        batch_size: batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, encoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoder_units = encoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences + return_state: the GRU hands back both its
        # per-step outputs and its state.
        self.gru = tf.keras.layers.GRU(
            units=self.encoder_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """ Return (output, state) of the GRU for token ids `x`, starting from `hidden` """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """ Return an all-zero tensor of shape (batch_size, encoder_units) """
        state_shape = (self.batch_size, self.encoder_units)
        return tf.zeros(state_shape)

In [10]:
# Build the encoder over the input (informal) vocabulary.
encoder = Encoder(input_vocab_size, embedding_dim, units, BATCH_SIZE)

In [11]:
# Run the encoder once on the example batch, starting from an all-zero state.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [12]:
# Look up the integer id assigned to 'with' in the target (formal) vocabulary.
target_tokenizer.word_index['with']

31

### Attention Layer

In [13]:
class BahdanauAttention(tf.keras.layers.Layer):
    """ Additive attention: score = V(tanh(W1(query) + W2(values))).

    Args:
        units: output size of the two projection layers W1 and W2.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """ Return (context_vector, attention_weights) for `query` over `values`.

        Assumes `query` is (batch, hidden) and `values` is (batch, time, hidden),
        as in the calls made elsewhere in this file -- TODO confirm for other callers.
        """
        # Insert an axis at position 1 so the projected query broadcasts
        # against the projected values when the two are added.
        expanded_query = tf.expand_dims(query, 1)
        combined = self.W1(expanded_query) + self.W2(values)
        score = self.V(tf.nn.tanh(combined))

        # Normalise over axis 1, then take the weighted sum of `values`
        # along that same axis.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [14]:
# Smoke-test a 10-unit attention layer, using the state returned by the
# encoder as the query and the encoder's sequence output as the values.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

### Decoder

In [15]:
class Decoder(tf.keras.Model):
    """ Attention decoder: embedding + Bahdanau attention + GRU + dense output layer.

    Args:
        vocab_size: size of the embedding table and of the output layer.
        embedding_dim: size of each embedding vector.
        decoder_units: number of GRU units (also used for the attention layer).
        batch_size: stored on the instance; not read by `call`.
    """

    def __init__(self, vocab_size, embedding_dim, decoder_units, batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.decoder_units = decoder_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            units=self.decoder_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.decoder_units)

    def call(self, x, hidden, encoder_output):
        """ Run one decoding call.

        Args:
            x: decoder input token ids.
            hidden: state used as the attention query.
            encoder_output: encoder outputs the attention layer attends over.

        Returns:
            (x, state, attention_weights): the output-layer activations, the
            state returned by the GRU, and the attention weights.
        """
        context_vector, attention_weights = self.attention(hidden, encoder_output)

        # Prepend the context vector to the embedded input along the last axis.
        embedded = self.embedding(x)
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        # NOTE(review): `hidden` only feeds the attention query; the GRU is
        # called without initial_state -- confirm this is intentional.
        output, state = self.gru(gru_input)

        # Collapse the leading axes so the dense layer sees one row per step.
        flat_output = tf.reshape(output, (-1, output.shape[2]))

        logits = self.fc(flat_output)
        return logits, state, attention_weights

In [16]:
# Build the decoder over the target (formal) vocabulary.
decoder = Decoder(target_vocab_size, embedding_dim, units, BATCH_SIZE)

# Smoke test: run the decoder once on a random (BATCH_SIZE, 1) input together
# with the sample encoder state and outputs.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

### Optimizer and Loss Function

In [17]:
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: predictions are passed in as raw logits.
# reduction='none': keep one loss value per example so loss_function can
# apply its padding mask before averaging.
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [18]:
def loss_function(real, pred):
    """ Cross-entropy between `real` and `pred` with id-0 positions zeroed out.

    Args:
        real: target token ids; entries equal to 0 are masked.
        pred: predictions passed to `static_loss`.

    Returns:
        The mean of the masked per-example losses. Masked entries contribute
        0 to the sum but still count in the mean's denominator.
    """
    per_example = static_loss(real, pred)

    # 1 where the target is a real token, 0 where it is the padding id.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)

### Training

In [31]:
@tf.function
def train_step(inpt, trgt, enc_hidden):
    """ Run one optimisation step on a single batch.

    Args:
        inpt: batch of input token ids fed to the encoder.
        trgt: batch of target token ids; column 0 is skipped and columns
            1..end are predicted one at a time.
        enc_hidden: initial hidden state passed to the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    # Only the forward pass needs to be recorded, so the tape scope ends
    # before gradients are computed and applied.
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inpt, enc_hidden)

        dec_hidden = enc_hidden

        # Every sequence starts decoding from the '<start>' token.
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, trgt.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(trgt[:, t], predictions)
            # Teacher forcing: the true token at step t is the next input.
            dec_input = tf.expand_dims(trgt[:, t], 1)

    batch_loss = loss / int(trgt.shape[1])

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [32]:
from datetime import datetime

In [33]:
import os

In [34]:
# Checkpoints are written under checkpoint_dir using the 'ckpt' file prefix.
checkpoint_dir = './training-checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
# Track the optimizer and both models in a single checkpoint object.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [35]:
EPOCHS = 10
for epoch in range(EPOCHS):
    start = datetime.now()

    # Each epoch starts from a fresh all-zero encoder state and loss total.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for batch, (inpt, trgt) in enumerate(train.take(steps_per_epoch)):
        batch_loss = train_step(inpt, trgt, enc_hidden)
        total_loss += batch_loss

        # Progress report every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                         batch,
                                                         batch_loss.numpy()))

    # Epoch-level work lives outside the batch loop: save a checkpoint once
    # every second epoch and print the summary once per epoch.
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch+1,
                                        total_loss/steps_per_epoch))
    # total_seconds() makes the printed value match the "seconds" label
    # (formatting the timedelta directly prints H:MM:SS.ffffff).
    elapsed_seconds = (datetime.now() - start).total_seconds()
    print('Time taken for 1 epoch {} seconds\n'.format(elapsed_seconds))

KeyboardInterrupt: 