<a href="https://colab.research.google.com/github/prashantmishra311/Attentive-Seq2Seq-with-Copying-Mechanism-for-Table-to-text/blob/master/table2text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import regex as re
from collections import Counter

In [None]:
class Vocab(object):
    """Word <-> id vocabulary with per-example handling of source OOV words.

    Ids ``0 .. vocab_size - 1`` are fixed vocabulary entries (special tokens
    first, then corpus words).  Out-of-vocabulary words found in a source
    sequence are given temporary ids ``vocab_size + k`` where ``k`` is the
    position of the word in that sequence's OOV list.

    Args:
        corpus: iterable of whitespace-separated sentences (strings).
        num_words: maximum vocabulary size including the special tokens, or
            ``None`` for no limit.
        min_freq: minimum corpus frequency for a word to be kept, or ``None``.
        unk_token: token used for unknown words (always id 0).
        pad_token: padding token (always id 1).
        sos_token: optional start-of-sequence token.
        eos_token: optional end-of-sequence token.
    """

    def __init__(self, corpus, num_words=None, min_freq=None, unk_token='<unk>', pad_token='<pad>', sos_token=None, eos_token=None):

        self.unk_token = unk_token
        self.pad_token = pad_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.word_to_id = {}
        self.id_to_word = {}

        # list.append() returns None, so the optional tokens are added with
        # extend() instead of rebinding the list to the result of append().
        specials = [unk_token, pad_token]
        specials.extend(tok for tok in (sos_token, eos_token) if tok is not None)
        for word in specials:
            self._register(word)

        # The special tokens count towards the num_words budget.
        size = max(num_words - len(specials), 0) if num_words is not None else None
        self.word_freq = dict(self._construct_vocab(corpus, size, min_freq))

        # A corpus word that equals a special token keeps the special's id
        # rather than being assigned a second one.
        for word in self.word_freq:
            self._register(word)

        self.vocab_size = len(self.word_to_id)

    def _register(self, word):
        """Assign the next free id to ``word`` unless it already has one."""
        if word not in self.word_to_id:
            new_id = len(self.word_to_id)
            self.word_to_id[word] = new_id
            self.id_to_word[new_id] = word

    def source_to_ids(self, text):
        """Encode a tokenised source sequence.

        Args:
            text: iterable of tokens.

        Returns:
            ``(ids, oovs)`` where ``oovs`` lists the distinct unknown words in
            order of first appearance and each unknown word is encoded as
            ``vocab_size + oovs.index(word)``.
        """
        ids, oovs = [], []
        oov_position = {}
        for word in text:
            word_id = self.word_to_id.get(word)
            if word_id is None:
                if word not in oov_position:
                    oov_position[word] = len(oovs)
                    oovs.append(word)
                word_id = self.vocab_size + oov_position[word]
            ids.append(word_id)

        return ids, oovs

    def caption_to_ids(self, text, source_oovs):
        """Encode a tokenised caption against a source sequence's OOV list.

        Unknown words that appear in ``source_oovs`` receive the same
        temporary id they have in the source; all other unknown words map to
        the ``unk_token`` id.
        """
        # First occurrence wins, matching list.index().
        oov_position = {}
        for position, word in enumerate(source_oovs):
            oov_position.setdefault(word, position)

        unk_id = self.word_to_id[self.unk_token]
        ids = []
        for word in text:
            if word in self.word_to_id:
                ids.append(self.word_to_id[word])
            elif word in oov_position:
                ids.append(self.vocab_size + oov_position[word])
            else:
                ids.append(unk_id)

        return ids

    def ids_to_caption(self, ids, source_oovs):
        """Decode ids back to words, resolving temporary ids via ``source_oovs``.

        Raises:
            IndexError: if an id is neither a vocabulary id nor a valid
                offset into ``source_oovs``.
        """
        words = []
        for word_id in ids:
            if word_id in self.id_to_word:
                words.append(self.id_to_word[word_id])
                continue
            oov_id = word_id - self.vocab_size
            # Reject negative offsets explicitly: Python would otherwise
            # index from the end of the list and return the wrong word.
            if not 0 <= oov_id < len(source_oovs):
                raise IndexError(f'oov id {oov_id} out of range')
            words.append(source_oovs[oov_id])
        return words

    def _construct_vocab(self, corpus, size, min_freq):
        """Count corpus words and apply the size cap, then the frequency floor.

        Returns:
            dict mapping word -> frequency.
        """
        vocab = Counter(word for sent in corpus for word in sent.split())
        if size is not None:
            vocab = dict(vocab.most_common(size))
        if min_freq is not None:
            vocab = {word: freq for (word, freq) in vocab.items() if freq >= min_freq}

        return vocab

In [None]:
class DataMaker(object):
    """Encode a collection of whitespace-separated sequences with a vocabulary.

    Args:
        sequences: iterable of strings.
        vocab: object exposing ``source_to_ids(tokens) -> (ids, oovs)``.

    Attributes set by ``__init__``:
        sequences, vocab: the constructor arguments.
        max_len: token count of the longest sequence (0 when there are none).
        ids: one list of ids per sequence.
        oovs: one list of out-of-vocabulary words per sequence.
    """

    def __init__(self, sequences, vocab):

        self.sequences = sequences
        self.vocab = vocab

        tokenized = [seq.split() for seq in sequences]
        # default=0 keeps an empty input from raising in max().
        self.max_len = max((len(tokens) for tokens in tokenized), default=0)

        # Keep the encodings on the instance; the loop variable is used
        # consistently so the encoding no longer hits an undefined name.
        self.ids, self.oovs = [], []
        for tokens in tokenized:
            ids, oov = vocab.source_to_ids(tokens)
            self.ids.append(ids)
            self.oovs.append(oov)

    def seq_to_ids(self):
        """Unfinished placeholder: performs no work and returns ``None``."""

In [None]:
# Smoke test: build a vocabulary from two toy sentences using the default
# special tokens only (no start/end tokens are passed).
test = ['i am using keras', 'i prefer tensorflow over pytorch']
voc = Vocab(test)

In [None]:
# Encode a sentence containing words that are absent from the toy vocabulary;
# the second return value lists those out-of-vocabulary words.
a = 'i started using r yesterday'
ids, oovs = voc.source_to_ids(a.split())
ids, oovs

([2, 10, 4, 11, 12], ['started', 'r', 'yesterday'])

In [None]:
# Look up the id assigned to the padding token.
pad = voc.pad_token
voc.word_to_id[pad]

1

In [None]:
# Load the table/caption data from a hard-coded path and wrap every caption
# in '<start>' / '<end>' markers.
df = pd.read_csv('/content/drive/MyDrive/Prashant/StudentGradeComment.csv')
df.captions = df.captions.apply(lambda text: '<start> ' + text + ' <end>')
df.head()

Unnamed: 0,attributes,cells,captions
0,name gender math reading writing,liam female 72 72 74,<start> liam performance was decent and she wa...
1,name gender math reading writing,noah female 69 90 88,<start> noah scored good in reading and writin...
2,name gender math reading writing,william female 90 95 93,<start> william was one of the top performers ...
3,name gender math reading writing,james male 47 57 44,<start> james performed poorly across all thre...
4,name gender math reading writing,oliver male 76 78 75,<start> oliver was consistent and with more ef...


In [None]:
# Tokenizer for the attribute (field name) column. filters='' is passed so no
# characters are stripped; unseen attributes map to '<unk_a>'.
attr_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk_a>')
attr_tokenizer.fit_on_texts(df['attributes'])

In [None]:
# Tokenizer for the cell (field value) column; unseen values map to '<unk_c>'.
cell_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk_c>')
cell_tokenizer.fit_on_texts(df['cells'])

In [None]:
# Tokenizer for the target captions; unseen words map to '<unk>'.
# Table2Text reads this module-level tokenizer for the '<start>' and '<unk>' ids.
targ_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<unk>')
targ_tokenizer.fit_on_texts(df['captions'])

In [None]:
# Convert each text column to integer sequences and pad them at the end
# (padding='post') so every row of a column has the same length.
attr_input = attr_tokenizer.texts_to_sequences(df['attributes'])
attr_input = tf.keras.preprocessing.sequence.pad_sequences(attr_input, padding='post')

cell_input = cell_tokenizer.texts_to_sequences(df['cells'])
cell_input = tf.keras.preprocessing.sequence.pad_sequences(cell_input, padding='post')

targ_input = targ_tokenizer.texts_to_sequences(df['captions'])
targ_input = tf.keras.preprocessing.sequence.pad_sequences(targ_input, padding='post')

In [None]:
# Sanity check of the padded array shapes.
attr_input.shape, cell_input.shape, targ_input.shape

((100, 5), (100, 5), (100, 23))

In [None]:
# Build the (attributes, cells, captions) dataset and batch it in groups of 32;
# drop_remainder=True discards a final incomplete batch.
# NOTE(review): the shuffle buffer (32) is smaller than the number of rows
# shown in the shape output above (100), so the shuffle is only approximate —
# consider a buffer of at least the dataset size.
dataset = tf.data.Dataset.from_tensor_slices((attr_input,cell_input,targ_input))
dataset = dataset.shuffle(32).batch(32, drop_remainder=True)

In [None]:
class InteractiveEncoder(tf.keras.Model):
    '''
    Encodes a table row by embedding each cell and its attribute, joining the
    two embeddings and projecting them through a tanh-activated dense layer.

    Arguments:-
        attr_vocab_size : (int) Size of field or attribute vocabulary
        attr_embedding_size : (int) attribute embedding size
        vocab_size: (int) Size of word (cell) vocabulary
        embedding_size: (int) word (cell) embedding size
        encoder_size: (int) Dimensions of encoder hidden state
        batch_size: (int) Batch size (stored only; not used by call)
    '''

    def __init__(self, attr_vocab_size, attr_embedding_size, vocab_size, embedding_size, encoder_size, batch_size):
        super().__init__()

        # Hyper-parameters are kept on the instance for reference.
        self.vocab_size, self.embedding_size = vocab_size, embedding_size
        self.encoder_size, self.batch_size = encoder_size, batch_size
        self.attr_vocab_size, self.attr_embedding_size = attr_vocab_size, attr_embedding_size

        # (batch_size, enc_length) --> (batch_size, enc_length, attr_embedding_size)
        self.Attr_Embedding = tf.keras.layers.Embedding(attr_vocab_size, attr_embedding_size)

        # (batch_size, enc_length) --> (batch_size, enc_length, embedding_size)
        self.Embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)

        # (batch_size, enc_length, *) --> (batch_size, enc_length, encoder_size)
        self.W_e = tf.keras.layers.Dense(encoder_size, use_bias=True)

    def call(self, attr_input, cell_input):
        '''
        attr_input, cell_input : (batch_size, enc_length) integer ids

        Returns (output, hidden):
            output : (batch_size, enc_length, encoder_size) per-position encoding
            hidden : (batch_size, encoder_size) mean of output over positions
        '''
        attr_vectors = self.Attr_Embedding(attr_input)
        cell_vectors = self.Embedding(cell_input)

        # Cell embedding first, attribute embedding second, joined on the
        # feature axis: (batch_size, enc_length, embedding_size+attr_embedding_size)
        joined = tf.concat([cell_vectors, attr_vectors], axis=2)

        projected = self.W_e(joined)
        output = tf.nn.tanh(projected)

        # Averaging over the sequence axis gives the initial decoder state.
        hidden = tf.reduce_mean(output, axis=1)

        return output, hidden

In [None]:
class InteractiveAttn(tf.keras.layers.Layer):
    '''
    Additive attention over the (pseudo) encoder output, queried by an
    intermediate decoder state produced by a dedicated GRU.

    Arguments:-
        units: (int) attention units; keep equal to the decoder size
    '''

    def __init__(self, units):
        super().__init__()

        self.units = units

        # GRU that advances the previous decoder state with the previous input
        self.interGRU = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)

        # Scoring layers: W1 projects the query, W2 the keys, v reduces to a scalar
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.v = tf.keras.layers.Dense(1)

    def call(self, dec_prev_input_embed, dec_prev_hidden, enc_prev_output):
        '''
        dec_prev_input_embed : (batch_size, 1, embedding_size)
        dec_prev_hidden      : (batch_size, decoder_size)
        enc_prev_output      : (batch_size, enc_length, encoder_size)

        Returns (context_vector, attn_weights, dec_inter_hidden):
            context_vector   : (batch_size, encoder_size)
            attn_weights     : (batch_size, enc_length, 1)
            dec_inter_hidden : (batch_size, decoder_size)
        '''
        # Only the final state of the intermediate GRU is needed.
        gru_result = self.interGRU(dec_prev_input_embed, initial_state=dec_prev_hidden)
        dec_inter_hidden = gru_result[1]

        # (batch_size, 1, decoder_size) so it broadcasts over enc_length below
        query = tf.expand_dims(dec_inter_hidden, axis=1)

        query_proj = self.W1(query)
        keys_proj = self.W2(enc_prev_output)
        score = self.v(tf.nn.tanh(query_proj + keys_proj))
        # score --> (batch_size, enc_length, 1)

        # Normalise across encoder positions.
        attn_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder positions.
        weighted = attn_weights*enc_prev_output
        context_vector = tf.reduce_sum(weighted, axis=1)

        return context_vector, attn_weights, dec_inter_hidden

In [None]:
class InteractiveDecoder(tf.keras.Model):
    '''
    One-step decoder that attends over a time-dependent (pseudo) encoder
    output and rewrites that encoder output after every step using forget
    and update gates.

    Arguments:-
        vocab_size: (int) Size of target vocabulary
        embedding_size: (int) Embedding size
        encoder_size: (int) Dimensions of encoder hidden state
        decoder_size: (int) Dimensions of decoder hidden state
        batch_size: (int) Batch size (stored only; not used by call)
    '''
    def __init__(self, vocab_size, embedding_size, encoder_size, decoder_size, batch_size):
        super(InteractiveDecoder, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.batch_size = batch_size

        # embedding_input_shape = (batch_size, 1) [seq_length = 1 for decoder]
        self.Embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        # embedding_output_shape = (batch_size, 1, embedding_size)

        # gru_input_shape = (batch_size, 1, encoder_size) [fed with the context vector]
        self.GRU = tf.keras.layers.GRU(decoder_size, 
                                       return_sequences=True, 
                                       return_state=True)
        # output_shape = (batch_size, 1, decoder_size)
        # hidden_state_shape = (batch_size, decoder_size)

        # projection of the GRU output onto the target vocabulary
        self.Linear = tf.keras.layers.Dense(vocab_size)

        self.attention = InteractiveAttn(decoder_size)
        
        # to derive new pseudo encoder output
        self.Wf = tf.keras.layers.Dense(encoder_size) # --> (, encoder_size)
        self.Wu = tf.keras.layers.Dense(encoder_size) # --> (, encoder_size)

    def call(self, dec_prev_input, dec_prev_hidden, enc_prev_output):
        '''
        Run a single decoding step.

        Returns a 6-tuple:
            output               : (batch_size, vocab_size) softmax probabilities
            dec_curr_hidden      : (batch_size, decoder_size)
            enc_curr_output      : (batch_size, enc_length, encoder_size)
            context_vector       : (batch_size, encoder_size)
            dec_prev_input_embed : (batch_size, 1, embedding_size)
            attn_weights         : (batch_size, enc_length, 1)
        '''

        # dec_prev_input --> (batch_size, 1) [dec_length = 1 for decoder]
        dec_prev_input_embed = self.Embedding(dec_prev_input)
        # dec_prev_input_embed --> (batch_size, 1, embedding_size)

        # enc_prev_output --> (batch_size, enc_length, encoder_size)
        # dec_prev_hidden --> (batch_size, decoder_size)
        context_vector, attn_weights, dec_inter_hidden = self.attention(dec_prev_input_embed, 
                                                                        dec_prev_hidden, 
                                                                        enc_prev_output)
        # context_vector --> (batch_size, encoder_size)
        # attn_weights --> (batch_size, enc_length, 1)
        # dec_inter_hidden --> (batch_size, decoder_size)
        
        # --------constructing new pseudo encoder output----------

        context_vector_ = tf.expand_dims(context_vector, axis=1)
        # context_vector_ --> (batch_size, 1, encoder_size)

        # the context vector is the GRU input; the intermediate state from the
        # attention layer is its initial state
        output_, dec_curr_hidden = self.GRU(context_vector_, initial_state=dec_inter_hidden)
        # output_ --> (batch_size, 1, decoder_size)
        # dec_curr_hidden --> (batch_size, decoder_size)
        
        enc_length = enc_prev_output.shape[1]
        # (a) Forget Part
        F = tf.nn.sigmoid(self.Wf(dec_curr_hidden))
        F = tf.expand_dims(F, axis=1)
        F = tf.tile(F, [1,enc_length,1])
        # F --> (batch_size, enc_length, encoder_size)
        
        # (b) Update Part
        U = tf.nn.sigmoid(self.Wu(dec_curr_hidden))
        U = tf.expand_dims(U, axis=1)
        U = tf.tile(U, [1,enc_length,1])
        # U --> (batch_size, enc_length, encoder_size)
        
        # (c) New (pseudo) encoder output: each position is attenuated by the
        # forget gate and incremented by the update gate, both scaled by that
        # position's attention weight
        enc_curr_output = enc_prev_output*(1-attn_weights*F) + attn_weights*U
        
        output_ = tf.squeeze(output_, axis=1)
        # output_ --> (batch_size, decoder_size)
        output = tf.nn.softmax(self.Linear(output_))
        # output --> (batch_size, vocab_size) [probabilities, not logits]
        return output, dec_curr_hidden, enc_curr_output, context_vector, dec_prev_input_embed, attn_weights

In [None]:
class Table2Text(object):
    """Table-to-text model: interactive encoder/decoder plus a copy gate.

    At every decoding step the decoder's vocabulary distribution (``beta``)
    and the attention distribution over table cells (``alpha``) are mixed by
    a sigmoid gate ``g``: ``pred = concat((1-g)*beta, g*alpha)``, giving a
    distribution of width ``targ_vocab_size + enc_length``.

    The class reads the module-level ``targ_tokenizer`` for the ``'<start>'``
    and ``'<unk>'`` ids and for turning ids back into text, and calls
    ``.numpy()`` on its inputs, so it must run eagerly.

    Args:
        attr_vocab_size, attr_embedding_size: attribute vocabulary/embedding sizes.
        cell_vocab_size, cell_embedding_size: cell vocabulary/embedding sizes.
        targ_vocab_size, targ_embedding_size: target vocabulary/embedding sizes.
        encoder_size: encoder hidden size.
        decoder_size: decoder hidden size.
        batch_size: number of rows in every batch passed to the step methods.
    """

    def __init__(self, attr_vocab_size, attr_embedding_size, 
                 cell_vocab_size, cell_embedding_size, 
                 targ_vocab_size, targ_embedding_size, 
                 encoder_size, decoder_size, batch_size):

        self.batch_size = batch_size
        self.targ_vocab_size = targ_vocab_size
        self.history = dict()

        self.encoder = InteractiveEncoder(attr_vocab_size, attr_embedding_size, 
                                          cell_vocab_size, cell_embedding_size, 
                                          encoder_size, batch_size)
        self.decoder = InteractiveDecoder(targ_vocab_size, targ_embedding_size, 
                                          encoder_size, decoder_size, batch_size)

        # Copy gate: one sigmoid unit deciding between generating and copying.
        self.Wg = tf.keras.layers.Dense(1, use_bias=True)

        self.optimizer = tf.keras.optimizers.Adam()
        # pred is a mixture of a softmax output and attention weights, i.e.
        # already a probability distribution, so it must not be treated as logits.
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, 
                                                                         reduction='none')

    def loss_function(self, real, pred):
        """Mean cross-entropy over the batch with padding (id 0) targets zeroed.

        Args:
            real: (batch_size,) target ids for one time step.
            pred: (batch_size, targ_vocab_size + enc_length) probabilities.
        """
        loss_ = self.loss_object(real, pred)
        not_padding = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ *= tf.cast(not_padding, dtype=loss_.dtype)

        return tf.reduce_mean(loss_)

    def _retrieve_attr(self, attr_input, cell_input, dec_prev_input):
        """Return, per row, the largest attribute id among the cells whose id
        equals the previous decoder input (0 when no cell matches).

        NOTE(review): cell ids and decoder input ids come from different
        tokenizers in this notebook, so equal ids need not mean equal words —
        confirm this comparison is intended.
        """
        attr = attr_input.numpy()
        cell = cell_input.numpy()
        targ = dec_prev_input.numpy()
        # targ is (batch_size, 1) and broadcasts across the cell positions.
        # Padding cells (id 0) never count as a match. The mask is built
        # without writing into the array returned by .numpy().
        matched = (cell == targ) & (cell != 0)
        attr_retrieved = tf.reduce_max(attr*matched.astype(attr.dtype), axis=1)

        return attr_retrieved

    def _start_tokens(self):
        """Return a (batch_size, 1) int32 tensor filled with the '<start>' id."""
        start_id = targ_tokenizer.word_index['<start>']
        return tf.cast(tf.expand_dims([start_id]*self.batch_size, axis=1), dtype=tf.int32)

    def _decode_step(self, attr_input, cell_input, enc_hidden,
                     dec_prev_input, dec_prev_hidden, enc_prev_output):
        """Run one decoder step and mix generation/copy distributions.

        Returns:
            (pred, dec_curr_hidden, enc_curr_output) where pred has shape
            (batch_size, targ_vocab_size + enc_length).
        """
        (beta, dec_curr_hidden, enc_curr_output,
         context_vector, dec_prev_input_embed, attn_weights) = self.decoder(dec_prev_input, 
                                                                            dec_prev_hidden, 
                                                                            enc_prev_output)

        attr_retrieved = self._retrieve_attr(attr_input, cell_input, dec_prev_input)
        attr_retrieved_embed = self.encoder.Attr_Embedding(attr_retrieved)
        dec_prev_input_embed = tf.squeeze(dec_prev_input_embed, axis=1)

        g_inp = tf.concat([dec_prev_input_embed, 
                           dec_curr_hidden, 
                           context_vector, 
                           enc_hidden, 
                           attr_retrieved_embed], axis=1)
        g_out = tf.nn.sigmoid(self.Wg(g_inp))

        alpha = tf.squeeze(attn_weights, axis=2)
        pred = tf.concat([(1-g_out)*beta, g_out*alpha], axis=1)

        return pred, dec_curr_hidden, enc_curr_output

    #@tf.function
    def train_step(self, attr_input, cell_input, targ, live_pred):
        """Run one teacher-forced optimisation step on a batch.

        Args:
            attr_input, cell_input: (batch_size, enc_length) id tensors.
            targ: (batch_size, dec_length) target id tensor.
            live_pred: when truthy, print one random reference/prediction pair.

        Returns:
            The summed step loss divided by the target length.
        """
        loss = 0
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = self.encoder(attr_input, cell_input)

            enc_prev_output, dec_prev_hidden = enc_output, enc_hidden
            dec_prev_input = self._start_tokens()
            pred_batch = dec_prev_input

            for t in range(1, targ.shape[1]):
                pred, dec_prev_hidden, enc_prev_output = self._decode_step(
                    attr_input, cell_input, enc_hidden,
                    dec_prev_input, dec_prev_hidden, enc_prev_output)

                loss += self.loss_function(targ[:, t], pred)
                new_pred = tf.expand_dims(tf.argmax(pred, axis=1, output_type=tf.int32), axis=1)
                pred_batch = tf.concat([pred_batch, new_pred], axis=1)
                # Teacher forcing - feeding the target as the next input
                dec_prev_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss/int(targ.shape[1]))
        # The copy gate has its own weights; they must be optimised together
        # with the encoder and decoder or the gate stays at its initial values.
        variables = (self.encoder.trainable_variables
                     + self.decoder.trainable_variables
                     + self.Wg.trainable_variables)
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        if live_pred:
            self._rand_prediction(pred_batch, targ)

        return batch_loss

    def val_step(self, attr_input, cell_input, targ, live_pred):
        """Evaluate a batch by feeding the model's own predictions back in.

        Arguments and return value match ``train_step``; no weights are updated.
        """
        loss = 0
        enc_output, enc_hidden = self.encoder(attr_input, cell_input)

        enc_prev_output, dec_prev_hidden = enc_output, enc_hidden
        dec_prev_input = self._start_tokens()
        pred_batch = dec_prev_input
        unk_id = tf.constant(targ_tokenizer.word_index['<unk>'], dtype=tf.int32)

        for t in range(1, targ.shape[1]):
            pred, dec_prev_hidden, enc_prev_output = self._decode_step(
                attr_input, cell_input, enc_hidden,
                dec_prev_input, dec_prev_hidden, enc_prev_output)

            loss += self.loss_function(targ[:, t], pred)

            # not using teacher forcing
            pred_id = tf.argmax(pred, axis=1, output_type=tf.int32)
            # Valid embedding ids are 0 .. targ_vocab_size-1; anything at or
            # beyond targ_vocab_size is a copy position and is fed back as '<unk>'.
            pred_id = tf.where(pred_id >= self.targ_vocab_size, unk_id, pred_id)
            dec_prev_input = tf.expand_dims(pred_id, 1)
            pred_batch = tf.concat([pred_batch, dec_prev_input], axis=1)

        batch_loss = (loss/int(targ.shape[1]))
        if live_pred:
            self._rand_prediction(pred_batch, targ)

        return batch_loss

    def _ids_to_text(self, ids):
        """Decode one id sequence with the target tokenizer and tidy the text:
        '_' and '$' become spaces and whitespace runs collapse to one space."""
        text = targ_tokenizer.sequences_to_texts([ids])[0]
        text = re.sub(r'[_$$_]', ' ', text)
        return re.sub(r'\s+', ' ', text)

    def _rand_prediction(self, pred, targ=None, idx=None):
        """Print the generated text (and the reference, if given) for one row.

        Args:
            pred: (batch_size, length) predicted ids; the first column is skipped.
            targ: optional target ids with the same layout.
            idx: row to show; a random row is chosen when ``None``.
        """
        idx = np.random.randint(0,pred.shape[0]) if idx is None else idx

        if targ is not None:
            print('(Reference) ', self._ids_to_text(targ.numpy()[idx][1:]))
        print('(Generated) ', self._ids_to_text(pred.numpy()[idx][1:]))

    def fit(self, train_set, epochs, val_set=None, live_pred=True):
        """Train for ``epochs`` epochs, recording losses in ``self.history``.

        ``self.history['train']`` (and ``['val']`` when ``val_set`` is given)
        receives one summed loss per epoch.

        NOTE(review): only the first batch of each dataset is consumed per
        epoch (``take(1)``) — confirm this is intended and not a debugging
        leftover.
        """
        for _ in range(epochs):
            train_loss = 0
            for (attr, cell, targ) in train_set.take(1):
                batch_loss = self.train_step(attr, cell, targ, live_pred)
                print('train batch loss: ',batch_loss)
                train_loss += batch_loss
            self.history.setdefault('train', []).append(train_loss)

            if val_set is not None:
                val_loss = 0
                for (attr, cell, targ) in val_set.take(1):
                    batch_loss = self.val_step(attr, cell, targ, live_pred)
                    print('val batch loss: ',batch_loss)
                    val_loss += batch_loss
                self.history.setdefault('val', []).append(val_loss)

In [None]:
# Instantiate the model. Each vocabulary size is the tokenizer's word count
# plus one, leaving room for id 0, which the padded inputs use as filler.
# batch_size matches the batch size used when building `dataset`.
model = Table2Text(attr_vocab_size=len(attr_tokenizer.word_index)+1, 
                attr_embedding_size=4, 
                cell_vocab_size=len(cell_tokenizer.word_index)+1, 
                cell_embedding_size=16, 
                targ_vocab_size=len(targ_tokenizer.word_index)+1, 
                targ_embedding_size=16, 
                encoder_size=64, 
                decoder_size=64, 
                batch_size=32)

In [None]:
# Train for 1000 epochs with no validation set; live predictions are printed
# because live_pred defaults to True.
model.fit(dataset, 1000)