# Autoencoder
The autoencoder model failed to generate good-quality speech.

In [1]:
import numpy as np
import tensorflow as tf

import re 
import os
from datetime import datetime

from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import TweetTokenizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import workflow_manager as wm

## Load Data

In [2]:
# Hyperparameters shared by the model-construction and training cells.
# Embedding size handed to both wm.Encoder and wm.Decoder.
EMBEDDING_DIM = 50
# Handed to wm.Encoder; also the width of the zero initial states built in predict().
ENCODER_UNITS = 512
# Handed to wm.Decoder.
DECODER_UNITS = 512
# Handed to wm.Decoder.
ATTENTION_UNITS = 256
# Used by predict() to size the start-token column and the initial states.
BATCH_SIZE = 32

### Autoencoder Data

In [3]:
# Directory handed to the workflow_manager loaders below.
BASE_PATH = '../Data'
# NOTE(review): presumably train/validation/test splits — confirm in workflow_manager.
# `context` is a dict; this notebook reads 'input_tokenizer', 'target_tokenizer',
# 'input_vocab_size', 'target_vocab_size' and 'steps_per_epoch' from it.
train, val, test, context = wm.load_and_tokenize(BASE_PATH)

# Weights that are later passed to wm.Encoder as its last argument.
# NOTE(review): presumably an embedding matrix loaded from BASE_PATH — confirm.
E_weights = wm.embedding_matrix(context['input_tokenizer'], 
                                context['input_vocab_size'], 
                                BASE_PATH)

## Get Models 

In [4]:
# Encoder: built from the input vocabulary size, embedding size, encoder units
# and the E_weights matrix computed in the data-loading cell.
encoder = wm.Encoder(context['input_vocab_size'], EMBEDDING_DIM,
                         ENCODER_UNITS, E_weights)
# Decoder: built from the target vocabulary size, embedding size, attention
# units and decoder units.
decoder = wm.Decoder(context['target_vocab_size'], EMBEDDING_DIM,
                         ATTENTION_UNITS, DECODER_UNITS)

## Train AutoEncoder

### Seq2Seq loss and Optimizer

In [5]:
# Cross-entropy over raw logits. reduction='none' keeps the un-reduced loss
# values so loss_function can zero out padding positions before averaging.
static_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
# Adam optimizer with its default settings; used by train_step.
optimizer = tf.keras.optimizers.Adam()

In [6]:
def loss_function(real, preds):
    """Padding-masked sequence-to-sequence loss.

    Args:
        real: target token ids; positions equal to 0 are treated as padding.
        preds: predictions scored against ``real`` by ``static_loss``.

    Returns:
        The mean of the un-reduced ``static_loss`` values after padding
        positions have been zeroed. The mean is taken over every position,
        padded ones included.
    """
    # Un-reduced loss values from the module-level loss object.
    per_token = static_loss(real, preds)

    # 1 where there is a real token, 0 where the target is padding (id 0).
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)

    # Zero the padded positions, then average.
    return tf.reduce_mean(per_token * keep)

### Predict Function

In [7]:
def predict(inpt, trgt, train=True):
    """Run one teacher-forced encoder/decoder pass over a batch.

    The decoder is always fed the ground-truth token ``trgt[:, i]`` as its
    next input (teacher forcing), whatever the value of ``train``; ``train``
    only controls whether the loss is accumulated and returned.

    Args:
        inpt: batch of input sequences, passed straight to ``encoder``.
        trgt: batch of target sequences, indexed as ``trgt[:, i]`` (time on
            axis 1).
        train: when truthy, return ``(gen_seqs, loss)``; otherwise skip the
            loss computation and return only ``gen_seqs``.

    Returns:
        gen_seqs: int64 tensor holding the ``<start>`` id followed by the
            argmax prediction of every decoding step, one row per sequence.
        loss: (only when ``train`` is truthy) the sum over decoding steps of
            ``loss_function(trgt[:, i], predictions)``.
    """
    loss = 0
    start_id = context['target_tokenizer'].word_index['<start>']

    # Size everything from the batch that was actually passed in, so a batch
    # whose size differs from BATCH_SIZE still works. Fall back to BATCH_SIZE
    # when the leading dimension is not statically known.
    batch_size = inpt.shape[0]
    if batch_size is None:
        batch_size = BATCH_SIZE
    start_column = [start_id] * batch_size

    # Running output, seeded with <start>. It is int64 so it can be
    # concatenated with tf.argmax results, which are int64 by default.
    gen_seqs = tf.constant(start_column, dtype=tf.int64)
    gen_seqs = tf.expand_dims(gen_seqs, axis=1)

    # Fresh all-zero initial states on every call, i.e. for every batch.
    init_state = [tf.zeros((batch_size, ENCODER_UNITS)) for _ in range(4)]

    ## Generate Sequences
    enc_output, h_f, h_b = encoder(inpt, init_state)

    # First decoder input: the <start> token for every sequence, shape (batch, 1).
    dec_input = tf.expand_dims(start_column, 1)

    for i in range(1, trgt.shape[1]):
        # Only h_f is refreshed from the decoder's return value; h_b keeps
        # the value produced by the encoder for the whole loop.
        predictions, h_f = decoder(dec_input, h_b, h_f, enc_output)

        # The loss is only needed when the caller asked for it.
        if train:
            loss += loss_function(trgt[:, i], predictions)

        # Teacher forcing: the next input is the ground-truth token.
        dec_input = tf.expand_dims(trgt[:, i], 1)

        # Keep the greedy (argmax) prediction of this step.
        new_preds = tf.argmax(predictions, axis=1)
        new_preds = tf.expand_dims(new_preds, axis=1)
        gen_seqs = tf.concat([gen_seqs, new_preds], axis=1)

    if not train:
        return gen_seqs

    return gen_seqs, loss

In [17]:
# Pull one (input, target) batch from the training data for a quick manual check.
ein, eout = next(iter(train))

In [18]:
# `train` defaults to True, so x is the (gen_seqs, loss) tuple.
x = predict(ein, eout)

In [19]:
# x[0] is gen_seqs; convert the generated token ids back to text for inspection.
context['target_tokenizer'].sequences_to_texts(x[0].numpy())

['<start> please do a yahoo search to find a specific product <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>',
 '<start> it is illegal to do <end> it <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>',
 '<start> yes i have watched it and it is quite good <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>',
 '<start> think really hard about this one what does one put between their legs and turn <OOV> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <

## Training Loop
This learns a sequence and then goes through the usual GAN paradigm.

### Training Step

In [11]:
@tf.function
def train_step(inpt, trgt):
    """Run one optimisation step on a single batch.

    Calls ``predict`` under a gradient tape, then updates the encoder and
    decoder variables with the module-level ``optimizer``.

    Args:
        inpt: batch of input sequences.
        trgt: batch of target sequences.

    Returns:
        The loss returned by ``predict`` for this batch.
    """
    # Forward pass; the generated sequences are not needed for the update.
    with tf.GradientTape() as tape:
        _, batch_loss = predict(inpt, trgt)

    # Collect the variables after the forward pass, as the original does.
    params = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(batch_loss, params)
    optimizer.apply_gradients(zip(grads, params))

    return batch_loss

### Training Loop

In [23]:
# Number of passes over `train` (each limited to context['steps_per_epoch'] batches).
EPOCHS = 10

for epoch in range(EPOCHS):
    start = datetime.now()
    total_loss = 0
    num_batches = 0

    for inpt, trgt in train.take(context['steps_per_epoch']):
        total_loss += train_step(inpt, trgt)
        num_batches += 1

    # total_loss is a sum of per-batch losses, so the epoch average is obtained
    # by dividing by the number of batches processed, not by the batch size.
    # max(..., 1) avoids a division by zero if the dataset yielded nothing.
    mean_loss = total_loss / max(num_batches, 1)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, mean_loss))
    print('Time taken {}\n'.format(datetime.now() - start))

In [25]:
# Persist the encoder and decoder weights under the 'GAN Seq model weights/' prefix.
encoder.save_weights('GAN Seq model weights/encoder')
decoder.save_weights('GAN Seq model weights/decoder')

In [26]:
import pickle

In [29]:
# Pickle both tokenizers next to the saved model weights so the same
# vocabularies can be restored later. Input tokenizer first, then target.
tokenizer_files = (
    ('GAN Seq model weights/input_tokenizer.pickle', 'input_tokenizer'),
    ('GAN Seq model weights/target_tokenizer.pickle', 'target_tokenizer'),
)

for pickle_path, context_key in tokenizer_files:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(context[context_key], handle, protocol=pickle.HIGHEST_PROTOCOL)