## Test script for LSTM models

Test script for LSTM models. The model parameters must be the same as those used when training the model. "restore_path" is the path to the model to be tested. Test results are saved as a pickle file for detailed analysis.

Testing does not take long on the legislative corpus: a few minutes on an Intel i5 processor with 4 GB of memory.

In [None]:
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.contrib.layers import xavier_initializer

In [None]:
import codecs
import numpy as np
import time

In [None]:
# Inputs: locations of the test data, vocab and embedding files, plus the special tokens used by the dataset builder.
# NOTE: every path below is built as data_path + "/<file>", so with data_path left as "" the paths start at "/".

data_path = "" # Data path

text_data = [data_path + "/leg_test_text.txt"]  # Sentence test data (spacy parsed)
text_whole_data = [data_path + "/leg_test_original.txt"] # Whole sentence (not tokenised)
labels_data = [data_path + "/leg_test_label.txt"] # Labels for sentences
embed_vocab_data = data_path + "/leg_embeddings_vocab.txt" # Embedding vocab: words from training sentences 
# for which embeddings exist and have been extracted in embed_file. (If full, this is "embed_vocab.txt")
full_vocab_data = data_path + "/total_vocab.txt" # Full sentence vocab. ("total_vocab.txt")


txt_eos = "</S>" # Special tokens: end of sentence (also used to pad the word-id sequences)
lbl_sos = "<l>" # Start-of-labels token, prepended to the decoder input labels
lbl_eos = "</l>" # End-of-labels token, appended to the target labels and used to pad label sequences
embed_file = data_path + "/leg_embeddings.txt" # Embeddings file (full is "embeddings.txt")

restore_path = "" # Path to model to be tested (passed to saver.restore)

In [None]:
# Model parameters. These should match the parameters of the training model, otherwise the checkpoint variables
# will not line up with the graph built here.

bi_directional = False # True builds a forward + backward encoder, False a forward-only encoder
num_dec_layers = 3 # Number of stacked LSTM layers in the decoder
num_bi_layers = 3 # Number of stacked LSTM layers in the encoder (used for both the uni- and bi-directional encoder)

batch_size = 25 # Number of sentences per batch
num_enc_units = 128 # Hidden units per encoder LSTM layer (per direction when bi-directional)
num_dec_units = 128 # If uni-directional, then same as enc_units. If bi, make twice as big.

beam_search = False # False = greedy decoding, True = beam search decoding
beam_width = 4 # Only used when beam_search is True. There are only 3 outputs...

In [None]:
# Build a tf dataset: an iterator that returns batched data for processing.

def build_dataset(text_data, text_whole_data, labels_data, embed_vocab_data, full_vocab_data, 
                  txt_eos, lbl_sos, lbl_eos, batch_size):
    """Build the batched, padded tf.data input pipeline for testing.

    Args:
        text_data: file name(s) of the tokenised sentences, one sentence per line.
        text_whole_data: file name(s) of the untokenised sentences, one per line.
        labels_data: file name(s) of the whitespace-separated integer labels, one sentence per line.
        embed_vocab_data: vocab file for the words that have embeddings (one word per line).
        full_vocab_data: vocab file for the full vocabulary (one word per line).
        txt_eos: end-of-sentence token; its id is used to pad the word-id sequences.
        lbl_sos: start-of-labels token; its id is prepended to the decoder input labels.
        lbl_eos: end-of-labels token; its id is appended to the target labels and pads both label sequences.
        batch_size: number of examples per batch (Python int).

    Returns:
        (iterator, batch_size, lbl_sos_id, lbl_eos_id) where iterator is an initializable iterator, batch_size is
        the tf.int64 constant created here (not the Python int passed in), and the two ids are int32 scalar
        tensors looked up in the embedding vocab table. Each iterator element is the 7-tuple
        (full_vocab_ids, whole_sentence, embed_vocab_ids, text_size, labels_in_size, labels_in, labels_out).

    The lookup tables must be initialised (tf.tables_initializer) before the iterator is used.
    """

    # Build the word to id lookup table from the text data. OOV words point at 0 = <unk> = random (but all same)
    vocab_table = lookup_ops.index_table_from_file(embed_vocab_data, default_value=0)
    
    # Build a residual lookup table for all vocab, so can convert words back at end of process (evaluation only)
    full_vocab_table = lookup_ops.index_table_from_file(full_vocab_data, default_value=0)

    txt_eos_id = tf.cast(vocab_table.lookup(tf.constant(txt_eos)), tf.int32)
    txt_full_eos_id = tf.cast(full_vocab_table.lookup(tf.constant(txt_eos)), tf.int32) # Probably not strictly necessary, since
    # eos ends up in the same place in both vocab files.
    # NOTE: the label tokens are looked up in the embedding vocab table, so they must appear in that vocab file
    # (otherwise both resolve to the default id 0).
    lbl_sos_id = tf.cast(vocab_table.lookup(tf.constant(lbl_sos)), tf.int32)
    lbl_eos_id = tf.cast(vocab_table.lookup(tf.constant(lbl_eos)), tf.int32)
    fill = tf.constant("f") # Padding value for the whole-sentence string (scalar component, so nothing is actually padded)

    # Read each line of the text file. Each line is a sentence (where text has been tokenised using spacy)
    # NB can pass multiple files to TextLineDataset (so can prep data in batches)
    sent_data = tf.data.TextLineDataset(text_data)
    sent_whole_data = tf.data.TextLineDataset(text_whole_data)
    labels_data = tf.data.TextLineDataset(labels_data)

    # For each line, split on white space; labels are additionally parsed from strings to int32
    sent_data = sent_data.map(lambda string: tf.string_split([string]).values)
    labels_data = labels_data.map(lambda label: tf.string_split([label]).values)
    labels_data = labels_data.map(lambda label: tf.string_to_number(label, tf.int32))

    # Lookup word ids (in the embedding vocab and in the full vocab)
    embed_sent_data = sent_data.map(lambda token: tf.cast(vocab_table.lookup(token), tf.int32))
    full_sent_data = sent_data.map(lambda token: tf.cast(full_vocab_table.lookup(token), tf.int32))

    # Zip datasets together (the files are consumed in lockstep, so line n of each file forms one example)
    sent_label_data = tf.data.Dataset.zip((full_sent_data, sent_whole_data, embed_sent_data, labels_data))
    
    # Create input dataset (labels prefixed by sos) and target dataset (labels suffixed with eos)
    
    sent_label_data = sent_label_data.map(lambda full_words, full_sent, embed_words, 
                                          labels: (full_words, full_sent, embed_words,
                                                   tf.concat(([lbl_sos_id], labels), 0),
                                                   tf.concat((labels, [lbl_eos_id]), 0),))

    # Add sequence lengths: number of word ids and number of decoder input labels (which includes the sos prefix)
    sent_label_data = sent_label_data.map(lambda full_words, full_sent, embed_words, 
                                          labels_in, labels_out: (full_words, full_sent, embed_words,
                                                                  tf.size(embed_words),
                                                                  tf.size(labels_in),
                                                                  labels_in, 
                                                                  labels_out))
    
    # Random shuffle (disabled: test examples are processed in file order)
    # sent_label_data = sent_label_data.shuffle(buffer_size=5000)

    # Batching the input, padding to the length of the longest sequence in the batch. Can also bucket these. Form of dataset
    # is: txt_ids_for_full_vocab, whole_sentence, txt_ids_for_embed_vocab, text_size, label_size, labels_in, labels_out.
    # NOTE: padded_batch is called without dropping the remainder, so the final batch may hold fewer than batch_size examples.
    
    batch_size = tf.constant(batch_size, tf.int64)
    
    batched_input = sent_label_data.padded_batch(batch_size, padded_shapes=(tf.TensorShape([None]),
                                                                            tf.TensorShape([]),
                                                                            tf.TensorShape([None]),
                                                                            tf.TensorShape([]), 
                                                                            tf.TensorShape([]),
                                                                            tf.TensorShape([None]), 
                                                                            tf.TensorShape([None])), 
                                                 padding_values=(txt_full_eos_id,
                                                                 fill,
                                                                 txt_eos_id,
                                                                 0,
                                                                 0,
                                                                 lbl_eos_id, 
                                                                 lbl_eos_id))
    iterator = batched_input.make_initializable_iterator()
    return iterator, batch_size, lbl_sos_id, lbl_eos_id

In [None]:
# Preparatory step to create_emb_matrix. Each line of the embedding file is a word followed by space-delimited numbers forming
# the vector. load_embed_txt splits on spaces and builds a dictionary whose keys are the words in the embedding file.

def load_embed_txt(embed_file):
    """Load a text embedding file into a dictionary.

    Each non-blank line is expected to hold a word followed by its vector components, all separated by single
    spaces.

    Args:
        embed_file: path to the embedding text file (read as UTF-8 through tf.gfile).

    Returns:
        (emb_dict, emb_size): emb_dict maps each word to its vector as a list of floats (a word that occurs more
        than once keeps its last vector); emb_size is the length of the last vector read, or 0 if the file
        contains no vectors.
    """
    emb_dict = dict()
    emb_size = 0  # Defined up front so an empty file returns cleanly instead of raising UnboundLocalError
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing newline at the end of the file); otherwise they would add an
                # empty vector under the key "" and make the embedding matrix ragged.
                continue
            tokens = stripped.split(" ")
            word = tokens[0]
            vec = list(map(float, tokens[1:]))
            emb_dict[word] = vec
            emb_size = len(vec)
    return emb_dict, emb_size

In [None]:
# Create an embedding matrix (numpy array of embeddings). Includes an <unk> value for oov words. These are the values that are
# looked-up when the model is run.

def create_emb_matrix(embed_file):
    """Return the embeddings in embed_file as a float32 tensor, one row per word.

    Rows follow the iteration order of the dictionary returned by load_embed_txt.
    """
    vectors_by_word, _ = load_embed_txt(embed_file)
    stacked_vectors = np.array(list(vectors_by_word.values()))
    return tf.convert_to_tensor(stacked_vectors, dtype=tf.float32)

In [None]:
# A hack to help with the input to the decoder. Creates a matrix where keys and values are just integers in single item lists.

def create_dec_matrix(num):
    """Return a float32 tensor whose row i is the single-element vector [i], for i in 0..num-1.

    Used as the decoder "embedding" table, so looking up a label id simply yields that id as a float.
    """
    id_rows = [[label_id] for label_id in range(num)]
    return tf.convert_to_tensor(np.array(id_rows), dtype=tf.float32)

In [None]:
# Build the id to vocab dictionary (reverse of the vocab lookup). This is for the "embed vocab" (i.e. where lots of words are
# still mapped to <unk>). This assumes there is both an "embed vocab file" (a file of the vocab for which embeddings exist) and
# an embed file. Recall unk and special characters are included in the vocab file, so no need to manually add to the dictionary.
# The words are just set out on each line of the file, so "strip" / "split" is a bit overkill but works well enough.

def ids_to_embed_vocab(embed_vocab_data):
    """Map line number (0-based) to the first space-separated token on that line of the embed vocab file."""
    utf8_reader = codecs.getreader("utf-8")
    with utf8_reader(tf.gfile.GFile(embed_vocab_data, 'rb')) as vocab_file:
        return {line_no: line.strip().split(" ")[0] for line_no, line in enumerate(vocab_file)}

In [None]:
# Build the id to vocab dictionary (reverse of the vocab lookup). This is for the full vocab. This is a hack, not really
# necessary for the model but allows you to read the outputs easier (otherwise you would be left with lots of "unks" in the
# final output.) We don't compute with these ids, they are just preserved through the batch input so we know what words went in.

def ids_to_full_vocab(full_vocab_data):
    """Map line number (0-based) to the first space-separated token on that line of the full vocab file."""
    id_to_word = {}
    with codecs.getreader("utf-8")(tf.gfile.GFile(full_vocab_data, 'rb')) as handle:
        for word_id, raw_line in enumerate(handle):
            # Text before the first space (the whole stripped line if it has no space)
            first_token, _, _ = raw_line.strip().partition(" ")
            id_to_word[word_id] = first_token
    return id_to_word

In [None]:
# Single LSTM cell instance

def single_cell(num_units, name):
    """Return one BasicLSTMCell with num_units hidden units and the given layer name."""
    return tf.nn.rnn_cell.BasicLSTMCell(num_units, name=name)

In [None]:
# Multi-layer RNN definition. The "direction" argument is just to help with naming when using bi-directional model.

def RNN_cell(num_layers, num_units, direction):
    """Return a single LSTM cell (num_layers == 1) or a MultiRNNCell stack of LSTM cells.

    direction is only a name prefix: the single cell is named "<direction>_LSTM_layer" and stacked cells are
    named "<direction>_LSTM_layer_<i>".
    """
    if num_layers == 1:
        return single_cell(num_units, direction + "_LSTM_layer")

    stacked_cells = [single_cell(num_units, direction + "_LSTM_layer_" + str(layer_idx))
                     for layer_idx in range(num_layers)]
    return tf.nn.rnn_cell.MultiRNNCell(stacked_cells)

In [None]:
# Compose cells and get output

def build_encoder(encoder_emb_inp, num_layers, num_units, txt_size, bi_directional):
    """Build the (optionally bi-directional) LSTM encoder and return its outputs and final state.

    Args:
        encoder_emb_inp: batch-major embedded input (time_major=False is passed to the RNN calls).
        num_layers: number of stacked LSTM layers (per direction when bi-directional).
        num_units: hidden units per LSTM layer (per direction when bi-directional).
        txt_size: per-example sequence lengths, passed as sequence_length.
        bi_directional: truthy for a forward + backward encoder, falsy for forward only.

    Returns:
        (encoder_outputs, encoder_state). In the bi-directional case the forward and backward outputs are
        concatenated on axis 2 and the forward and backward final states are concatenated on axis 1, layer by
        layer, so a decoder initialised from this state needs 2 * num_units units per layer.
    """
    if not bi_directional:
        encoder_cells = RNN_cell(num_layers, num_units, "enc_fw")
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cells,
                                                           encoder_emb_inp,
                                                           sequence_length=txt_size,
                                                           time_major=False,
                                                           dtype=tf.float32)
        return encoder_outputs, encoder_state

    fw_cell = RNN_cell(num_layers, num_units, "enc_fw")
    bw_cell = RNN_cell(num_layers, num_units, "enc_bw")
    outputs, (encoder_fw_state, encoder_bw_state) = tf.nn.bidirectional_dynamic_rnn(fw_cell,
                                                                                    bw_cell,
                                                                                    encoder_emb_inp,
                                                                                    sequence_length=txt_size,
                                                                                    time_major=False,
                                                                                    dtype=tf.float32)

    encoder_outputs = tf.concat(outputs, 2)

    def concat_lstm_state(fw_state, bw_state):
        # Join one layer's forward and backward LSTM state along the unit axis, keeping the (c, h) structure
        return tf.contrib.rnn.LSTMStateTuple(
            c=tf.concat((fw_state.c, bw_state.c), 1,
                        name="bidirectional_concat_c"),
            h=tf.concat((fw_state.h, bw_state.h), 1,
                        name="bidirectional_concat_h"))

    if isinstance(encoder_fw_state, tf.contrib.rnn.LSTMStateTuple):
        # Single-layer LSTM: the state is one LSTMStateTuple. This must be checked first because LSTMStateTuple
        # is itself a tuple; passing it to the generic tf.concat below would pack c and h into one tensor and
        # concatenate along the batch axis instead of the unit axis.
        encoder_state = concat_lstm_state(encoder_fw_state, encoder_bw_state)
    elif isinstance(encoder_fw_state, tuple) and isinstance(encoder_fw_state[0], tf.contrib.rnn.LSTMStateTuple):
        # MultiRNNCell of LSTM cells: a tuple with one LSTMStateTuple per layer
        encoder_state = tuple(map(concat_lstm_state, encoder_fw_state, encoder_bw_state))
    else:
        # Plain tensor state
        encoder_state = tf.concat(
            (encoder_fw_state, encoder_bw_state), 1,
            name="bidirectional_state_concat")

    return encoder_outputs, encoder_state

In [None]:
# Build a basic decoder

def build_basic_decoder(decoder_cells, encoder_state, dec_mat, lbl_sos_id, lbl_eos_id, output_layer):
    """Build a greedy (argmax) decoder initialised from the encoder state.

    Args:
        decoder_cells: the decoder RNN cell.
        encoder_state: final encoder state, used as the decoder's initial state.
        dec_mat: lookup table turning a label id into the decoder input vector.
        lbl_sos_id: scalar int32 id fed as the first decoder input for every example.
        lbl_eos_id: scalar int32 id that marks a sequence as finished.
        output_layer: layer applied to the decoder cell output at each step.

    Returns:
        A tf.contrib.seq2seq.BasicDecoder using a GreedyEmbeddingHelper.
    """
    # Size the start tokens from the encoder state rather than a fixed batch size: the last batch of a dataset
    # can be smaller than the configured batch size, and a fixed-size start vector would then not match the
    # state's batch dimension.
    first_state_tensor = tf.contrib.framework.nest.flatten(encoder_state)[0]
    current_batch_size = tf.shape(first_state_tensor)[0]

    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(dec_mat,
                                                      tf.fill([current_batch_size], lbl_sos_id),
                                                      lbl_eos_id)

    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cells, helper, encoder_state, output_layer)
    return decoder

In [None]:
# Build a beam search decoder

def build_BM_decoder(decoder_cells, encoder_state, dec_mat, beam_width, lbl_sos_id, lbl_eos_id, output_layer):
    """Build a beam search decoder initialised from the encoder state.

    Args:
        decoder_cells: the decoder RNN cell.
        encoder_state: final encoder state; it is tiled beam_width times to form the initial state.
        dec_mat: lookup table turning a label id into the decoder input vector.
        beam_width: number of beams to keep.
        lbl_sos_id: scalar int32 id fed as the first decoder input for every example.
        lbl_eos_id: scalar int32 id that marks a sequence as finished.
        output_layer: layer applied to the decoder cell output at each step.

    Returns:
        A tf.contrib.seq2seq.BeamSearchDecoder with no length penalty.
    """
    # Size the start tokens from the (untiled) encoder state rather than a fixed batch size: the last batch of a
    # dataset can be smaller than the configured batch size.
    first_state_tensor = tf.contrib.framework.nest.flatten(encoder_state)[0]
    current_batch_size = tf.shape(first_state_tensor)[0]
    start_tokens = tf.fill([current_batch_size], lbl_sos_id)

    decoder_initial_state = tf.contrib.seq2seq.tile_batch(encoder_state, multiplier=beam_width)
    decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=decoder_cells,
                                                   embedding=dec_mat,
                                                   start_tokens=start_tokens,
                                                   end_token=lbl_eos_id,
                                                   initial_state=decoder_initial_state,
                                                   beam_width=beam_width,
                                                   output_layer=output_layer,
                                                   length_penalty_weight=0.0)
    return decoder

In [None]:
class Model():
    """Inference-time encoder/decoder graph that predicts a label for each input token.

    The constructor builds the whole graph inside the "main" variable scope. It reads the module-level tensors
    `emb_mat` (word embedding table) and `dec_mat` (decoder input table), which must exist before instantiation.

    Attributes set for fetching with sess.run:
        words_in: embedding-vocab word ids of the batch.
        full_words_in: full-vocab word ids of the batch.
        text_whole_in: the untokenised sentences of the batch.
        labels_out: target labels (suffixed with the end-of-labels id).
        mask_labels: int32 mask built from labels_size; 1 for real label positions, 0 for padding.
        preds: predicted label ids, batch-major (batch, time).

    The labels_in and batch_size arguments are accepted but not used when building this inference graph.
    """
    def __init__(self, num_enc_units, num_dec_units, num_dec_layers, 
                 num_bi_layers, bi_directional, embed_words, full_words, text_whole_data, txt_size, labels_size,
                 labels_in, labels_out, beam_search, beam_width, batch_size, lbl_sos_id, lbl_eos_id):
        
        self.num_enc_units = num_enc_units
        
        self.words_in = embed_words
        self.full_words_in = full_words
        self.text_whole_in = text_whole_data
        
        with tf.variable_scope("main", initializer=xavier_initializer()):
            
            # Inputs
            
            mask_labels = tf.sequence_mask(labels_size, dtype=tf.int32) # To mask the padded positions
            
            self.labels_out = labels_out
            self.mask_labels = mask_labels
            
            encoder_emb_inp = tf.nn.embedding_lookup(emb_mat, embed_words) # Encoder embedding lookup
            
            # Encoder definition (by default, encoder_state is just the final state). Encoder can be multi-layer and
            # bi-directional
            
            encoder_outputs, encoder_state = build_encoder(encoder_emb_inp, num_bi_layers, num_enc_units, 
                                                           txt_size, bi_directional)
            
            # Decoder definition. The number of decoder layers is set separately from the number of encoder layers, but
            # the encoder state is passed in as the decoder's initial state, so the two structures have to be compatible.
            
            decoder_cells = RNN_cell(num_dec_layers, num_dec_units, "dec")
            
            # Output layer which takes decoder output and maps to 3 categories (0,1,2) - these are the same as the target labels.
            # Recall 2 just maps to </l>, which is the prediction for </s>
            
            output_layer = layers_core.Dense(3, use_bias=False, name="output_projection")
            
            if beam_search:
                decoder = build_BM_decoder(decoder_cells, encoder_state, dec_mat, beam_width, 
                                           lbl_sos_id, lbl_eos_id, output_layer)
            else:
                decoder = build_basic_decoder(decoder_cells, encoder_state, dec_mat, lbl_sos_id, lbl_eos_id, output_layer)
            
            # NOTE: no maximum_iterations is set, so decoding runs until every sequence has emitted the end id.
            outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                                                impute_finished = False,
                                                                                output_time_major=False)
            
            if beam_search:
                # Beam search already returns token ids, shaped (batch, time, beam_width) with beams ordered best
                # first, so select beam 0. An argmax over axis 2 would pick a beam index, not a label.
                # Cast to int64 to match the dtype tf.argmax produces in the greedy case.
                self.preds = tf.cast(outputs.predicted_ids[:, :, 0], tf.int64)
            else:
                # Greedy decoding returns the output-layer logits; the argmax over the class axis is the prediction.
                self.preds = tf.argmax(outputs.rnn_output, axis=2)

In [None]:
# Collect outputs

# Accumulators filled by the batch processing loop; one entry is added per test sentence.
test_inputs = []  # Words of each sentence (from the full vocab), limited to the unmasked positions
test_inputs_whole = []  # The untokenised sentence as read from the original-text file
predictions = []  # Words whose predicted label is non-zero (or ["de_bug_error"] if an index error occurred)
predicted_labels = []  # Raw predicted label sequence for each sentence
targets = []  # Words whose target label is non-zero (or ["de_bug_error"] if an index error occurred)
target_labels = []  # Raw target label sequence for each sentence
errors = []  # [sentence words, predicted labels, target labels] for each sentence that hit an index error
error_count = 0  # Number of sentences that hit an index error

In [None]:
# Run the graph

with tf.Graph().as_default(): 

    iterator, batch_size_, lbl_sos_id_, lbl_eos_id_ = build_dataset(text_data, text_whole_data, labels_data, 
                                                                    embed_vocab_data, full_vocab_data, txt_eos, 
                                                                    lbl_sos, lbl_eos, batch_size)
    
    emb_mat = create_emb_matrix(embed_file)
    dec_mat = create_dec_matrix(4)
    
    # Four extra embedding rows for unk and the other special characters, placed in front of the loaded embeddings.
    # The random values are only the variable's initial value: the variable is never initialised here, its value
    # comes from the checkpoint restored below.
    random_embeddings = np.random.uniform(low=-1, high=1, size=(4,300))
    embeddings = tf.Variable(tf.convert_to_tensor(random_embeddings, dtype=tf.float32), name="saved_embeddings")
    emb_mat = tf.concat((embeddings, emb_mat), 0)
    
    ids_to_embed_vocab_ = ids_to_embed_vocab(embed_vocab_data)
    ids_to_full_vocab_ = ids_to_full_vocab(full_vocab_data)
    
    # A call to the iterator for inputs
    
    full_words_, text_whole_data_, embed_words_, txt_size_, label_size_, labels_in_, labels_out_  = iterator.get_next()
    
    # Model instantiation
    
    model = Model(num_enc_units, num_dec_units, num_dec_layers, num_bi_layers, bi_directional,
                  embed_words_, full_words_, text_whole_data_, txt_size_, label_size_, labels_in_, labels_out_, 
                  beam_search, beam_width, batch_size_, lbl_sos_id_, lbl_eos_id_)
    
    saver = tf.train.Saver() # Saver for variables. Not full graph.
        
    with tf.Session() as sess:
        
        # All variables are loaded from the checkpoint, so no variable initialiser is run
        saver.restore(sess, restore_path)
        print("Model restored.")
        
        # Initialise the vocab tables and the dataset iterator
        sess.run(tf.tables_initializer())
        sess.run(iterator.initializer)
        counter = 0
        
        # Batch processing loop. The iterator raises OutOfRangeError once the input files are exhausted.
        while True:
            try:
                preds, full_words_in, text_whole_in, labels_out, mask_labels = sess.run([model.preds, 
                                                                                         model.full_words_in, 
                                                                                         model.text_whole_in, 
                                                                                         model.labels_out, 
                                                                                         model.mask_labels])
            except tf.errors.OutOfRangeError:
                break

            for j in range(len(full_words_in)):

                full_sent = []
                target_sent = []
                predicted_sent = []
                de_bug = False
                
                target_labels.append(labels_out[j])
                predicted_labels.append(preds[j])

                for i in range(len(full_words_in[j])):
                    if mask_labels[j][i] == 1:
                        word = ids_to_full_vocab_[full_words_in[j][i]]
                        full_sent.append(word)
                        # The decoder stops when it predicts the end id, so preds can be shorter than the
                        # sentence; an out-of-range position marks the whole sentence as an error.
                        try:
                            if preds[j][i] != 0:
                                predicted_sent.append(word)
                        except IndexError:
                            de_bug = True
                        try:
                            if labels_out[j][i] != 0:
                                target_sent.append(word)
                        except IndexError:
                            de_bug = True

                # Every result list gets exactly one entry per sentence, so index k refers to the same sentence
                # in all of them (including test_inputs_whole when the sentence is recorded as an error).
                test_inputs.append(full_sent)
                test_inputs_whole.append(text_whole_in[j])
                if de_bug == False:
                    predictions.append(predicted_sent)
                    targets.append(target_sent)
                else:
                    predictions.append(["de_bug_error"])
                    targets.append(["de_bug_error"])
                    errors.append([full_sent, preds[j], labels_out[j]])
                    error_count += 1

                counter += 1
                if counter % 1000 == 0:
                    print("Number of examples processed:")
                    print(counter)

        print("End of input file.")

In [None]:
# Save the test data

import pickle

results_file = "" # Set output file

# Gather every result list under the key it is stored with in the pickle
test_outputs = {
    'stringInputs': test_inputs,
    'wholeInput': test_inputs_whole,
    'predictions': predictions,
    'predicted_labels': predicted_labels,
    'targets': targets,
    'target_labels': target_labels,
}

# Write the results dictionary with the highest pickle protocol available
with open(results_file, 'wb') as out_file:
    pickle.dump(test_outputs, out_file, protocol=pickle.HIGHEST_PROTOCOL)