In [None]:
# Note: requires TensorFlow >= 1.10 (1.x API) with eager execution enabled.

# Based on https://github.com/tensorflow/tensorflow/blob/r1.11/tensorflow/contrib/eager/python/
from __future__ import absolute_import, division, print_function
import tensorflow as tf
tf.enable_eager_execution()

import re
import os
import time
import numpy as np
import unicodedata
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
class _FrozenVocabulary(dict):
    """
    Word-to-id mapping whose misses resolve to a fixed id WITHOUT being stored.

    A plain `defaultdict` inserts every missing key it is asked for, so it
    cannot be used to "freeze" a vocabulary: each unknown word would silently
    become a new entry. This mapping only answers the lookup.
    """

    def __init__(self, mapping, unk_id):
        super(_FrozenVocabulary, self).__init__(mapping)
        self.unk_id = unk_id

    def __missing__(self, key):
        # Called by dict.__getitem__ on a miss; returning without assigning
        # keeps the vocabulary (and its length) unchanged.
        return self.unk_id


class WordIndex():
    """
    This class implements "word to id" and "id to word" mapping functions.
    It's used in the embedding layer of the Seq2Seq model.

    Life cycle:
      1. `create_forward_index` may be called any number of times; every
         previously unseen word is assigned the next free id.
      2. `create_reverse_index` freezes the vocabulary and builds `idx2word`.
         From then on unknown words map to `<unk>` and are never added.

    Attributes:
        word2idx: mapping word -> id.
        idx2word: mapping id -> word (only available after
            `create_reverse_index`).
        pad: id of the "<pad>" token (0, because it is registered first).
        unk: id of the "<unk>" token (1).
    """

    def __init__(self):
        # stores word to id mappings; a missing word gets the next free id
        # (the factory runs before the key is inserted, so len() is that id)
        self.word2idx = defaultdict(lambda: len(self.word2idx))
        self.pad = self.word2idx["<pad>"]
        self.unk = self.word2idx["<unk>"]

    def create_forward_index(self, sentences):
        """
        Maps tokenised sentences to lists of word ids.

        Args:
            sentences: iterable of sentences, each an iterable of word strings.

        Returns:
            A list with one list of integer ids per sentence. Before the
            vocabulary is frozen unseen words are added to it; afterwards
            they are mapped to the `<unk>` id.
        """
        return [[self.word2idx[word] for word in sent] for sent in sentences]

    def create_reverse_index(self):
        """
        Freezes the vocabulary and builds the id -> word mapping.

        After this call looking up a word that is not in the vocabulary
        returns the `<unk>` id and leaves the vocabulary untouched, so
        `len(self.word2idx)` stays stable and calling this method again
        rebuilds the same `idx2word`.
        """
        self.word2idx = _FrozenVocabulary(self.word2idx, self.unk)
        # stores id to word mappings
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

In [None]:
class Dataset:
    """
    Reads a dialogue corpus and turns it into padded id tensors and a
    batched `tf.data.Dataset`.

    The corpus is a UTF-8 text file with one pair per line, the source and
    the target sentence separated by a single TAB character. Blank lines are
    ignored.

    Args:
        path: location of the corpus file.
        batch_size: number of pairs per training batch (default 64).

    Raises:
        ValueError: if `batch_size` is not positive, a non-blank line does not
            contain exactly two TAB-separated fields, or the corpus holds no
            pairs at all.
    """

    def __init__(self, path, batch_size=64):
        if batch_size < 1:
            raise ValueError('batch_size must be positive, got {}'.format(batch_size))

        self.path = path
        self.word_index = WordIndex()
        self.max_length_source = -1
        self.max_length_target = -1
        self.source_tensor = []
        self.target_tensor = []
        self.buffer_size = -1
        self.batch_size = batch_size
        self.n_batch = -1
        self.tf_dataset = None

        # load the dataset from path location
        self.load_dataset()

    def unicode_to_ascii(self, s):
        """
        Strips accents: decomposes `s` (NFD) and drops every combining mark
        (Unicode category 'Mn'). Characters without a decomposition are kept
        unchanged.
        """
        return ''.join(c for c in unicodedata.normalize('NFD', s)
                       if unicodedata.category(c) != 'Mn')

    def max_length(self, tensor):
        """Returns the length of the longest sequence in `tensor`."""
        return max(len(t) for t in tensor)

    def preprocess_sentence(self, w):
        """
        Normalises a raw sentence and wraps it in start/end markers.

        The sentence is lower-cased, accents are stripped, punctuation
        (? . ! , and the inverted question mark) is separated from words by
        spaces, and every other non-letter run is replaced by a single space.

        Returns:
            A single-space separated string "<start> ... <end>". When nothing
            is left after cleaning, the result is "<start> <end>" so that
            splitting on ' ' never yields an empty token.
        """
        w = self.unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
        w = w.strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        # Empty bodies are skipped so no double space (empty token) is produced.
        return ' '.join(part for part in ('<start>', w, '<end>') if part)

    def load_dataset(self):
        """
        Reads the corpus, builds the vocabulary and fills the tensors,
        size attributes and `tf_dataset`.

        Raises:
            ValueError: on a malformed line or an empty corpus.
        """
        # the context manager guarantees the file handle is closed
        with open(self.path, encoding='UTF-8') as corpus:
            lines = corpus.read().split('\n')

        # split every non-blank line into a <question, answer> pair of token lists
        word_pairs = []
        for line_no, line in enumerate(lines, start=1):
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    '{}: line {} has {} tab-separated field(s), expected 2'.format(
                        self.path, line_no, len(fields)))
            word_pairs.append([self.preprocess_sentence(w).split(' ') for w in fields])

        if not word_pairs:
            raise ValueError('{}: corpus contains no sentence pairs'.format(self.path))

        source = [pair[0] for pair in word_pairs]
        target = [pair[1] for pair in word_pairs]

        # maps the words to ids (source and target share one vocabulary)
        source_tensor = self.word_index.create_forward_index(source)
        target_tensor = self.word_index.create_forward_index(target)

        # freeze the vocabulary and build the id -> word mapping
        self.word_index.create_reverse_index()

        # Calculate max_length of input and output tensor
        # Here, we'll set those to the longest sentence in the dataset
        self.max_length_source = self.max_length(source_tensor)
        self.max_length_target = self.max_length(target_tensor)

        # Padding the input and output tensor to the maximum length
        # (pad_sequences pads with 0 by default, the id WordIndex gives "<pad>")
        self.source_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            source_tensor, maxlen=self.max_length_source, padding='post')
        self.target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            target_tensor, maxlen=self.max_length_target, padding='post')

        self.buffer_size = len(self.source_tensor)
        # number of full batches per epoch; the incomplete last batch is dropped below
        self.n_batch = self.buffer_size // self.batch_size

        self.tf_dataset = tf.data.Dataset.from_tensor_slices(
            (self.source_tensor, self.target_tensor)).shuffle(self.buffer_size)
        self.tf_dataset = self.tf_dataset.batch(self.batch_size, drop_remainder=True)

In [None]:
def gru(units):
    """
    Builds the recurrent layer shared by the encoder and the decoder.

    The layer returns both the full output sequence and the final state.
    When TensorFlow reports an available GPU the cuDNN-backed implementation
    is chosen (the original authors quote roughly a 3x speedup); otherwise
    the generic GRU is used with a sigmoid recurrent activation.

    Args:
        units: size of the hidden state.

    Returns:
        A `tf.keras.layers.CuDNNGRU` or `tf.keras.layers.GRU` instance.
    """
    # options common to both implementations
    shared_options = dict(return_sequences=True,
                          return_state=True,
                          recurrent_initializer='glorot_uniform')

    if not tf.test.is_gpu_available():
        return tf.keras.layers.GRU(units,
                                   recurrent_activation='sigmoid',
                                   **shared_options)

    return tf.keras.layers.CuDNNGRU(units, **shared_options)

In [None]:
class Encoder(tf.keras.Model):
    """
    Embedding layer followed by a GRU that reads the source sentence.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        enc_units: size of the GRU hidden state.
        batch_sz: batch size used to build the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz, self.enc_units = batch_sz, enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        """
        Encodes a batch of id sequences.

        Args:
            x: batch of word ids.
            hidden: initial GRU state.

        Returns:
            (output, state): the GRU output for every time step and the
            final hidden state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Returns an all-zero state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [None]:
class Decoder(tf.keras.Model):
    """
    Embedding layer, GRU and dense projection that score the next word.

    Args:
        vocab_size: size of the embedding table and of the output logits.
        embedding_dim: width of each embedding vector.
        dec_units: size of the GRU hidden state.
        batch_sz: batch size used to build the initial hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz, self.dec_units = batch_sz, dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        """
        Runs one decoding step.

        Args:
            x: ids of the current input words, one time step per example.
            hidden: GRU state carried over from the previous step (or the
                encoder).

        Returns:
            (logits, state): vocabulary logits for every (example, step)
            row and the updated GRU state.
        """
        # embedded shape == (batch_size, 1, embedding_dim)
        embedded = self.embedding(x)

        # the embedded input is the only thing fed to the GRU
        gru_output, new_state = self.gru(embedded, initial_state=hidden)

        # collapse batch and time: (batch_size * 1, hidden_size)
        hidden_size = gru_output.shape[2]
        flat_output = tf.reshape(gru_output, (-1, hidden_size))

        # logits shape == (batch_size * 1, vocab)
        logits = self.fc(flat_output)
        return logits, new_state

    def initialize_hidden_state(self):
        """Returns an all-zero state of shape (batch_sz, dec_units)."""
        state_shape = (self.batch_sz, self.dec_units)
        return tf.zeros(state_shape)
             

In [None]:
class Config(object):
    """Hyperparameter container handed to model objects at construction.

    All values are class attributes, so `Config()` needs no arguments and
    individual settings can be overridden on an instance before it is
    passed to a model.
    """
    # --- training ---
    # number of passes over the dataset made by the training loop
    max_epochs = 10
    # NOTE(review): instantiated once, when the class body is executed, so
    # every Config instance (and every model built from one) shares this
    # optimizer object -- confirm that is intended.
    optimizer = tf.train.AdamOptimizer()

    # --- architecture ---
    # hidden-state size of the encoder and decoder GRUs
    units = 512
    # width of the word embedding vectors
    embedding_dim = 128

In [None]:
class Seq2SeqModel():
    """
    GRU encoder-decoder dialogue model trained with teacher forcing.

    Args:
        config: object exposing `embedding_dim`, `units`, `optimizer` and
            `max_epochs`.
        path: location of the dialogue corpus handed to `Dataset`.
    """

    def __init__(self, config, path):
        self.config = config
        self.dataset = Dataset(path)

        # encoder and decoder share one vocabulary
        vocab_size = len(self.dataset.word_index.word2idx)
        self.encoder = Encoder(vocab_size, self.config.embedding_dim,
                               self.config.units, self.dataset.batch_size)
        self.decoder = Decoder(vocab_size, self.config.embedding_dim,
                               self.config.units, self.dataset.batch_size)

    def train(self):
        """
        Trains encoder and decoder for `config.max_epochs` epochs.

        Prints the batch loss every 100 batches and, at the end of every
        epoch, the mean batch loss and the elapsed time.

        Raises:
            ValueError: if the dataset does not contain a single full batch
                (nothing would be trained and the epoch mean is undefined).
        """
        if self.dataset.n_batch < 1:
            raise ValueError(
                'dataset holds {} example(s), fewer than one batch of {}'.format(
                    self.dataset.buffer_size, self.dataset.batch_size))

        for epoch in range(self.config.max_epochs):
            start = time.time()

            hidden = self.encoder.initialize_hidden_state()
            total_loss = 0

            for (batch, (inp, targ)) in enumerate(self.dataset.tf_dataset):
                batch_loss = self._train_step(inp, targ, hidden)
                total_loss += batch_loss

                if batch % 100 == 0:
                    print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                                 batch,
                                                                 batch_loss.numpy()))

            # per-epoch summary: must live inside the epoch loop
            print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                                total_loss / self.dataset.n_batch))
            print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    def _train_step(self, inp, targ, hidden):
        """Runs one teacher-forced forward/backward pass on a batch, applies
        the gradients and returns the loss averaged over target length."""
        loss = 0
        start_id = self.dataset.word_index.word2idx['<start>']

        with tf.GradientTape() as tape:
            # only the final encoder state is used; it seeds the decoder
            _, enc_hidden = self.encoder(inp, hidden)
            dec_hidden = enc_hidden
            dec_input = tf.expand_dims([start_id] * self.dataset.batch_size, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                predictions, dec_hidden = self.decoder(dec_input, dec_hidden)

                loss += self.loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))
        variables = self.encoder.variables + self.decoder.variables
        # gradients are taken of the summed (not averaged) step losses
        gradients = tape.gradient(loss, variables)
        self.config.optimizer.apply_gradients(zip(gradients, variables))
        return batch_loss

    def evaluate(self, sentence):
        """
        Greedily decodes a response for `sentence`.

        Args:
            sentence: raw input text.

        Returns:
            (result, sentence): the generated words joined by spaces, each
            followed by a space (including "<end>" when it was produced),
            and the pre-processed input sentence. Decoding stops at "<end>"
            or after `dataset.max_length_target` steps.
        """
        sentence = self.dataset.preprocess_sentence(sentence)
        word_index = self.dataset.word_index

        inputs = [word_index.word2idx[token] for token in sentence.split(' ')]
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            [inputs], maxlen=self.dataset.max_length_source, padding='post')
        inputs = tf.convert_to_tensor(inputs)

        result = ''
        # a single sentence is decoded, hence a batch dimension of 1
        hidden = [tf.zeros((1, self.config.units))]
        _, enc_hidden = self.encoder(inputs, hidden)

        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([word_index.word2idx['<start>']], 0)

        for _ in range(self.dataset.max_length_target):
            predictions, dec_hidden = self.decoder(dec_input, dec_hidden)

            # Un-comment the following for non-greedy decoding
            #predicted_id = tf.multinomial(predictions, num_samples=1)[0][0].numpy()
            predicted_id = tf.argmax(predictions[0]).numpy()

            predicted_word = word_index.idx2word[predicted_id]
            result += predicted_word + ' '

            if predicted_word == '<end>':
                return result, sentence

            # the predicted ID is fed back into the model
            dec_input = tf.expand_dims([predicted_id], 0)

        return result, sentence

    def loss_function(self, real, pred):
        """
        Cross-entropy between target ids and logits with padding masked out.

        Positions whose target is the pad id contribute zero loss; the mean
        is taken over all positions of the batch, padded ones included.
        """
        mask = 1 - np.equal(real, self.dataset.word_index.pad)
        loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask
        return tf.reduce_mean(loss_)

    def converse(self, sentence):
        """Prints the pre-processed `sentence` and the model's response."""
        result, sentence = self.evaluate(sentence)

        print('Input : {}'.format(sentence))
        print('Response : {}'.format(result))

In [None]:
def test_dialogue_agent():
    """
    End-to-end smoke test: builds a model on the bundled test corpus, trains
    it with the default configuration and prints replies to two prompts.
    """
    model = Seq2SeqModel(Config(), "./data/test.txt")
    model.train()

    for prompt in ("how was your day?", "what's your favourite band?"):
        model.converse(prompt)

In [None]:
# Entry point: run the end-to-end smoke test when this code is executed as
# the main module.
if __name__ == "__main__":
    test_dialogue_agent()