## Transfer learning with ULMFiT script

This script implements transfer learning for the LSTM models using the ULMFiT procedures. The core parameters are specified in "slanted_learning_rate". Because the TensorFlow computational graph is rigid, the procedures are hard-coded (with a separate learning rate and optimiser specified for each model layer). This is hugely inefficient, and a cleaner implementation may be possible in newer versions of TensorFlow.

Given the size of the legislative training dataset, these inefficiencies don't matter much: one training epoch takes around 80 seconds on an Intel i5 processor with 4GB of memory.

Adapted from <https://github.com/tensorflow/nmt>

In [None]:
import tensorflow as tf
from tensorflow.python.ops import lookup_ops
from tensorflow.python.layers import core as layers_core
from tensorflow.contrib.layers import xavier_initializer

In [None]:
import codecs
import numpy as np
import time

In [None]:
# Inputs: file locations, special tokens and checkpoint / logging settings used by the training cell.

data_path = "" # Data path (directory prefix prepended to every data file name below)

text_data = [data_path + "/leg_train_text.txt"]  # Sentence training data (spacy parsed), one sentence per line
text_whole_data = [data_path + "/leg_train_original.txt"] # Whole sentence (not tokenised); not referenced by the code shown in this script
labels_data = [data_path + "/leg_train_label.txt"] # Labels for sentences (whitespace-separated integers per line)
embed_vocab_data = data_path + "/leg_embeddings_vocab.txt" # Embedding vocab: words from training sentences 
# for which embeddings exist and have been extracted in embed_file. (If full, this is "embed_vocab.txt")
full_vocab_data = data_path + "/total_vocab.txt" # Full sentence vocab. ("total_vocab.txt")


txt_eos = "</S>" # Special characters: end-of-sentence token for the text (also used as the padding value for word ids)
lbl_sos = "<l>" # Start-of-sequence token prepended to the decoder input labels
lbl_eos = "</l>" # End-of-sequence token appended to the target labels
embed_file = data_path + "/leg_embeddings.txt" # Embeddings file (full is "embeddings.txt")

restore_path = "./LSTM_base/model.ckpt" # Checkpoint to restore trainable variables from; None means initialise from scratch
save_model = True # Set to True if you want to save model variables
log_path = "" # Log directory (TensorFlow summaries are written here)
save_path = "" # Save model path, only used if save_model is True
log_freq = 100 # Show some outputs every log_freq training steps

In [None]:
# Model parameters

num_layers = 3 # Number of stacked LSTM layers in the encoder and in the decoder
num_total_layers = 7 # Number of rows in the learning-rate schedule: output projection + 3 decoder + 3 encoder layers

num_units = 128 # If uni-directional, then same as enc_units. If bi, make twice as big.

beam_search = False # Not referenced by the code shown in this script
beam_width = 4 # There are only 3 outputs... (not referenced by the code shown in this script)

batch_size = 25 # Sentences per padded batch; also read by the model's loss and by slanted_learning_rate
forget_bias = 0 # Passed to BasicLSTMCell as its forget_bias
dropout = 0.2 # Input dropout rate; cells are wrapped with input_keep_prob = 1 - dropout when this is > 0
max_gradient_norm = 1 # Not referenced by the code shown in this script (no gradient clipping is applied)
learning_rate = 0.002 # This doesn't do anything - see slanted_learning_rate
epochs = 10 # Number of passes over the training data; also sets the length of the learning-rate schedule

In [None]:
# Input pipeline: turns the text / label files into an iterator over shuffled, padded batches.

def build_dataset(text_data, labels_data, embed_vocab_data, full_vocab_data, txt_eos, lbl_sos, lbl_eos, batch_size):
    """Return an initializable iterator yielding padded training batches.

    Args:
        text_data: file name(s) of tokenised sentences, one sentence per line.
        labels_data: file name(s) of whitespace-separated integer labels, one line per sentence.
        embed_vocab_data: vocab file for the embedding lookup table (unknown words map to id 0).
        full_vocab_data: vocab file for the full lookup table (unknown words map to id 0).
        txt_eos: end-of-sentence token; its ids are used to pad the two word-id sequences.
        lbl_sos: token whose embed-vocab id is prepended to the labels (decoder input).
        lbl_eos: token whose embed-vocab id is appended to the labels (decoder target) and
            used to pad both label sequences.
        batch_size: number of examples per batch.

    Returns:
        An initializable iterator whose elements are the 6-tuple
        (full-vocab word ids, embed-vocab word ids, text length, label-input length,
        labels prefixed with sos, labels suffixed with eos).
    """

    def _as_int32(tensor):
        return tf.cast(tensor, tf.int32)

    def _split_on_whitespace(line):
        return tf.string_split([line]).values

    # Word -> id tables. Out-of-vocabulary words fall back to id 0 in both tables.
    embed_table = lookup_ops.index_table_from_file(embed_vocab_data, default_value=0)
    full_table = lookup_ops.index_table_from_file(full_vocab_data, default_value=0)

    # Ids of the special tokens. Note the label tokens are looked up in the embedding vocab table.
    eos_embed_id = _as_int32(embed_table.lookup(tf.constant(txt_eos)))
    eos_full_id = _as_int32(full_table.lookup(tf.constant(txt_eos)))
    sos_label_id = _as_int32(embed_table.lookup(tf.constant(lbl_sos)))
    eos_label_id = _as_int32(embed_table.lookup(tf.constant(lbl_eos)))

    # One dataset element per line; TextLineDataset accepts a list of files.
    token_lines = tf.data.TextLineDataset(text_data).map(_split_on_whitespace)
    label_lines = tf.data.TextLineDataset(labels_data).map(_split_on_whitespace)
    label_ids = label_lines.map(lambda tokens: tf.string_to_number(tokens, tf.int32))

    # The same tokens are mapped to ids twice: once per vocabulary.
    embed_ids = token_lines.map(lambda tokens: _as_int32(embed_table.lookup(tokens)))
    full_ids = token_lines.map(lambda tokens: _as_int32(full_table.lookup(tokens)))

    def _with_decoder_labels(full_words, embed_words, labels):
        # Decoder input is sos + labels; decoder target is labels + eos.
        decoder_in = tf.concat(([sos_label_id], labels), 0)
        decoder_out = tf.concat((labels, [eos_label_id]), 0)
        return full_words, embed_words, decoder_in, decoder_out

    def _with_lengths(full_words, embed_words, decoder_in, decoder_out):
        # Sequence lengths are recorded before padding so they can be used for masking later.
        return (full_words, embed_words,
                tf.size(embed_words), tf.size(decoder_in),
                decoder_in, decoder_out)

    examples = tf.data.Dataset.zip((full_ids, embed_ids, label_ids))
    examples = examples.map(_with_decoder_labels).map(_with_lengths)

    # Shuffle individual examples before they are grouped into batches.
    examples = examples.shuffle(buffer_size=5000)

    # Pad every sequence to the longest one in its batch; the two length fields are scalars.
    ragged = tf.TensorShape([None])
    scalar = tf.TensorShape([])
    batches = examples.padded_batch(
        tf.constant(batch_size, tf.int64),
        padded_shapes=(ragged, ragged, scalar, scalar, ragged, ragged),
        padding_values=(eos_full_id, eos_embed_id, 0, 0, eos_label_id, eos_label_id))

    return batches.make_initializable_iterator()

In [None]:
# Preparatory step to create_emb_matrix. Each line of the embedding file is a word followed by space delimited numbers forming
# the vector. load_embed_txt splits on single spaces and builds a dictionary where keys are the words in the embedding file.

def load_embed_txt(embed_file):
    """Read a text embedding file into a ``{word: vector}`` dictionary.

    Args:
        embed_file: path of a UTF-8 text file with one ``word v1 v2 ... vN`` entry per line.

    Returns:
        A tuple ``(emb_dict, emb_size)`` where ``emb_dict`` maps each word to its vector (a list
        of floats, in file order; a repeated word keeps its last vector) and ``emb_size`` is the
        vector length, or ``None`` if the file contains no entries.

    Raises:
        ValueError: if a component cannot be parsed as a float, or if two entries have
            vectors of different lengths.
    """
    emb_dict = dict()
    emb_size = None  # Stays None for an empty file instead of being left unbound.
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
        for line_number, line in enumerate(f, start=1):
            tokens = line.strip().split(" ")
            if tokens == [""]:
                # Blank line (e.g. trailing newline at the end of the file): nothing to load.
                continue
            word = tokens[0]
            vec = [float(value) for value in tokens[1:]]
            if emb_size is None:
                emb_size = len(vec)
            elif len(vec) != emb_size:
                # A ragged file cannot be turned into a rectangular embedding matrix.
                raise ValueError(
                    "Line %d of %s has %d vector components, expected %d"
                    % (line_number, embed_file, len(vec), emb_size))
            emb_dict[word] = vec
    return emb_dict, emb_size

In [None]:
# Embedding matrix: the loaded word vectors stacked into a float32 tensor. These are the values that are
# looked-up when the model is run.

def create_emb_matrix(embed_file):
    """Stack the vectors from ``embed_file`` into a float32 tensor.

    Rows follow the iteration order of the dictionary returned by ``load_embed_txt``.
    This function adds no extra rows of its own for unknown or special tokens.
    """
    vectors_by_word, _ = load_embed_txt(embed_file)
    stacked = np.array(list(vectors_by_word.values()))
    return tf.convert_to_tensor(stacked, dtype=tf.float32)

In [None]:
# A hack to help with the input to the decoder: a lookup matrix whose i-th row is just [i].

def create_dec_matrix(num):
    """Return a float32 tensor built from the rows ``[0], [1], ..., [num - 1]``.

    Looking label ids up in this matrix turns each integer label into a
    one-element float vector holding that same value.
    """
    rows = np.array([[index] for index in range(num)])
    return tf.convert_to_tensor(rows, dtype=tf.float32)

In [None]:
# Reverse of the embedding-vocab lookup: id -> word for the "embed vocab" (i.e. where lots of words are still mapped
# to <unk>). Expects the embed vocab file, i.e. the vocab for which embeddings exist. unk and the special characters are
# expected to be in the vocab file already, so nothing is added to the dictionary by hand. Each line holds one word, so
# the strip / split is more than strictly needed but harmless.

def ids_to_embed_vocab(embed_vocab_data):
    """Map zero-based line numbers of the embed vocab file to the first token on each line."""
    open_utf8 = codecs.getreader("utf-8")
    with open_utf8(tf.gfile.GFile(embed_vocab_data, 'rb')) as vocab_file:
        return {line_index: raw_line.strip().split(" ")[0]
                for line_index, raw_line in enumerate(vocab_file)}

In [None]:
# Reverse of the full-vocab lookup: id -> word for the full vocab. This is a convenience rather than something the
# model needs: it lets you read the outputs more easily (otherwise the final output would be full of "unks"). These ids
# are not computed with; they are only carried through the batch input so we know which words went in.

def ids_to_full_vocab(full_vocab_data):
    """Map zero-based line numbers of the full vocab file to the first token on each line."""
    open_utf8 = codecs.getreader("utf-8")
    with open_utf8(tf.gfile.GFile(full_vocab_data, 'rb')) as vocab_file:
        return {line_index: raw_line.strip().split(" ")[0]
                for line_index, raw_line in enumerate(vocab_file)}

In [None]:
# One LSTM cell, optionally wrapped with input dropout.

def single_cell(num_units, forget_bias, dropout, name):
    """Create a named ``BasicLSTMCell``; wrap it in input dropout when ``dropout`` is positive.

    Args:
        num_units: number of units in the LSTM cell.
        forget_bias: forget-gate bias handed to ``BasicLSTMCell``.
        dropout: input dropout rate; the keep probability used is ``1.0 - dropout``.
        name: name given to the cell.
    """
    lstm = tf.nn.rnn_cell.BasicLSTMCell(num_units, forget_bias=forget_bias, name=name)
    if dropout > 0.0:
        return tf.nn.rnn_cell.DropoutWrapper(cell=lstm, input_keep_prob=(1.0 - dropout))
    return lstm

In [None]:
# Stacked RNN definition. "direction" only affects cell names (useful to tell cells apart in a bi-directional model).

def RNN_cell(num_layers, num_units, forget_bias, dropout, direction):
    """Build a single cell, or a ``MultiRNNCell`` stack when ``num_layers`` is not 1.

    A lone cell is named ``<direction>_LSTM_layer``; stacked cells are named
    ``<direction>_LSTM_layer_<i>`` with ``i`` counting up from 0.
    """
    base_name = direction + "_LSTM_layer"
    if num_layers == 1:
        return single_cell(num_units, forget_bias, dropout, base_name)

    stacked = [single_cell(num_units, forget_bias, dropout, base_name + "_" + str(layer))
               for layer in range(num_layers)]
    return tf.nn.rnn_cell.MultiRNNCell(stacked)

In [None]:
# Cells for a bi-directional LSTM: two identical stacks that differ only in their name prefix.

def build_bi_directional_LSTM(num_units, forget_bias, dropout, num_layers):
    """Return ``(fw_cell, bw_cell)``, built by ``RNN_cell`` with the prefixes "fw" and "bw"."""
    return tuple(RNN_cell(num_layers, num_units, forget_bias, dropout, prefix)
                 for prefix in ("fw", "bw"))

In [None]:
# Calculate the slanted triangular learning rates to use at each training iteration, combined with gradual unfreezing and
# a per-layer discount. Returns one row of learning rates per layer, one column per training iteration.

def slanted_learning_rate(epochs, num_total_layers, batch_size, num_examples=750,
                          lr_max=0.01, ratio=32, cut_frac=0.1, reduce_factor=1/2.6):
    """Build the per-layer, per-step learning-rate schedule as a float32 tensor.

    For step ``i`` out of ``T`` total steps, with ``cut = T * cut_frac``:

    * ``p = i / cut`` while ``i < cut`` (linear warm-up), otherwise
      ``p = 1 - (i - cut) / (cut * (1 / cut_frac - 1))`` (linear decay);
    * the base rate is ``lr_max * (1 + p * (ratio - 1)) / ratio``;
    * layer ``l`` uses ``base * reduce_factor ** l`` and is held at 0 until the start of epoch ``l``.

    Args:
        epochs: number of training epochs.
        num_total_layers: number of rows (layers / parameter groups) in the schedule.
        batch_size: examples per batch; must be positive.
        num_examples: number of training examples per epoch (defaults to the 750 the
            script was written for).
        lr_max: peak learning rate of layer 0.
        ratio: how much smaller the lowest base rate is than ``lr_max``.
        cut_frac: fraction of the steps spent increasing the rate; must lie strictly between 0 and 1.
        reduce_factor: multiplicative discount applied once per layer index.

    Returns:
        A float32 tensor with ``num_total_layers`` rows and one column per training step.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``cut_frac`` is not in (0, 1).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))
    if not 0 < cut_frac < 1:
        raise ValueError("cut_frac must lie strictly between 0 and 1, got %r" % (cut_frac,))

    # Ceiling division: a final, smaller batch still counts as a training step, so the schedule
    # must have a column for it (truncating here would leave the last steps without a rate).
    steps_per_epoch = -(-num_examples // batch_size)
    total_steps = steps_per_epoch * epochs
    cut = total_steps * cut_frac

    steps = np.arange(total_steps)
    warm_up = steps / cut
    decay = 1 - (steps - cut) / (cut * (1 / cut_frac - 1))
    progress = np.where(steps < cut, warm_up, decay)
    base_rates = lr_max * ((1 + progress * (ratio - 1)) / ratio)

    # Column vector of layer indices so the expressions below broadcast to (layers, steps).
    layer_ids = np.arange(num_total_layers)[:, np.newaxis]

    # Gradual unfreezing: layer l trains from the first step of epoch l onwards. The comparison is
    # inclusive so that the first step of each epoch (and step 0 for layer 0) is not wasted on a zero rate.
    unfrozen = steps[np.newaxis, :] >= layer_ids * steps_per_epoch
    schedule = np.where(unfrozen, base_rates * reduce_factor ** layer_ids, 0.0)

    return tf.convert_to_tensor(schedule, dtype=tf.float32)

In [None]:
class Model():
    """Encoder-decoder LSTM tagger trained with a separate learning rate per layer.

    The constructor builds the whole training graph: embedding lookups, a multi-layer LSTM
    encoder, a multi-layer LSTM decoder with a 3-way output projection, a masked cross-entropy
    loss, one gradient-descent optimiser per parameter group, and the summaries.

    Besides its arguments, the constructor reads these module-level names: ``emb_mat`` and
    ``dec_mat`` (lookup matrices) and ``epochs``, ``num_total_layers`` and ``batch_size``
    (used for the learning-rate schedule and the loss normalisation).

    Attributes set for callers: ``train_ops``, ``loss``, ``logits``, ``preds``, ``merged``,
    ``labels_out``, ``mask_labels``, ``words_in``, ``full_words_in``, ``global_step``,
    ``learning_rates_`` and ``learning_rate1`` ... ``learning_rate7``.
    """

    # Variable scopes of the seven parameter groups, ordered from the output projection back to
    # the first encoder layer. Row k of the learning-rate schedule is applied to _LAYER_SCOPES[k].
    # These names are fixed strings, so they only match a model built with three LSTM layers.
    _LAYER_SCOPES = (
        "main/decoder/output_projection/",
        "main/decoder/multi_rnn_cell/cell_2/dec_LSTM_layer_2/",
        "main/decoder/multi_rnn_cell/cell_1/dec_LSTM_layer_1/",
        "main/decoder/multi_rnn_cell/cell_0/dec_LSTM_layer_0/",
        "main/rnn/multi_rnn_cell/cell_2/enc_fw_LSTM_layer_2/",
        "main/rnn/multi_rnn_cell/cell_1/enc_fw_LSTM_layer_1/",
        "main/rnn/multi_rnn_cell/cell_0/enc_fw_LSTM_layer_0/",
    )

    def __init__(self, dropout, num_units, num_layers, forget_bias, 
                 embed_words, full_words, txt_size, labels_size, labels_in, labels_out,
                 global_step):
        """Build the training graph.

        Args:
            dropout: input dropout rate for every LSTM cell.
            num_units: number of units per LSTM cell.
            num_layers: number of stacked LSTM layers in the encoder and in the decoder.
            forget_bias: forget-gate bias for every LSTM cell.
            embed_words: batch of word ids in the embedding vocabulary (encoder input).
            full_words: batch of word ids in the full vocabulary (kept only for display).
            txt_size: length of each sentence in the batch.
            labels_size: length of each decoder-input label sequence in the batch.
            labels_in: decoder input labels (prefixed with the start id).
            labels_out: decoder target labels (suffixed with the end id).
            global_step: variable counting training steps; also indexes the learning-rate schedule.
        """
        
        self.global_step = global_step
        self.learning_rates_ = slanted_learning_rate(epochs, num_total_layers, batch_size)
        
        # Current learning rate of each parameter group: row k of the schedule at the current step.
        # The same tensors feed both the optimisers and the summaries.
        layer_rates = [self.learning_rates_[row][global_step] for row in range(len(self._LAYER_SCOPES))]
        (self.learning_rate1, self.learning_rate2, self.learning_rate3, self.learning_rate4,
         self.learning_rate5, self.learning_rate6, self.learning_rate7) = layer_rates
        
        self.dropout = dropout
        self.num_units = num_units
        self.forget_bias = forget_bias
        self.num_layers = num_layers
        
        self.words_in = embed_words
        self.full_words_in = full_words
        
        with tf.variable_scope("main", initializer=xavier_initializer()):
            
            # Inputs
            
            mask_labels = tf.sequence_mask(labels_size, dtype=tf.int32) # To mask the padded input
            labels_in = labels_in * mask_labels
            
            self.labels_out = labels_out
            self.mask_labels = mask_labels
            
            encoder_emb_inp = tf.nn.embedding_lookup(emb_mat, embed_words) # Encoder embedding lookup
            decoder_emb_inp = tf.nn.embedding_lookup(dec_mat, labels_in) # Decoder embedding lookup (easiest way to get it in
            # right shape)
            
            # Encoder definition (encoder_state is the final state). Only a forward, multi-layer encoder is built here.
            
            encoder_cells = RNN_cell(num_layers, num_units, forget_bias, dropout, "enc_fw")
            
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cells, 
                                                               encoder_emb_inp, 
                                                               sequence_length=txt_size, 
                                                               time_major=False, 
                                                               dtype = tf.float32)
            
            # Decoder definition. The decoder has the same number of layers as the encoder and starts from the
            # encoder's final state. The helper feeds the ground-truth labels at every step (training only).
            
            decoder_cells = RNN_cell(num_layers, num_units, forget_bias, dropout, "dec")
            
            helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, 
                                                       labels_size, 
                                                       time_major=False)
            
            # Output layer which takes decoder output and maps to 3 categories (0,1,2) - these are the same as the
            # target labels.
            
            output_layer = layers_core.Dense(3, use_bias=False, name="output_projection")
            
            decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cells, helper, encoder_state, output_layer)
            
            outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                                                output_time_major=False)
            
            # Loss: the logits are the decoder outputs after output_layer. The per-position cross-entropy against the
            # target labels is masked, summed, and divided by the module-level batch_size.
            
            self.logits = outputs[0]
            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels_out, logits=self.logits)
            self.loss = tf.reduce_sum(crossent * tf.cast(mask_labels, tf.float32)) / tf.cast(batch_size, tf.float32)
            
            ########################################################################################################
            # Transfer learning regime: one plain gradient-descent optimiser per parameter group, each driven by
            # its own row of the learning-rate schedule. No gradient clipping is applied.
            ########################################################################################################
            
            train_steps = []
            for index, (scope, rate) in enumerate(zip(self._LAYER_SCOPES, layer_rates)):
                optimizer = tf.train.GradientDescentOptimizer(rate)
                layer_variables = tf.trainable_variables(scope=scope)
                grads_and_vars = optimizer.compute_gradients(self.loss, var_list=layer_variables)
                # Only the first optimiser advances global_step, so it moves once per training step.
                train_steps.append(optimizer.apply_gradients(
                    grads_and_vars, global_step=global_step if index == 0 else None))
            
            self.train_ops = tf.group(*train_steps)
            
            # Predicted class per position: argmax over axis 2 of the logits.
            self.preds = tf.argmax(self.logits, axis=2)
            
            # Summaries: Tensorflow summaries
            
            self.make_summaries(self.learning_rate1, self.learning_rate2, self.learning_rate3, 
                                self.learning_rate4, self.learning_rate5, self.learning_rate6,
                                self.learning_rate7, self.loss)
                        
    def make_summaries(self, learning_rate1, learning_rate2, learning_rate3, learning_rate4, 
                       learning_rate5, learning_rate6, learning_rate7, loss):
        """Register scalar summaries for the loss and the seven learning rates.

        The merged summary op is stored in ``self.merged``. It is built with
        ``tf.summary.merge_all()``, so it covers every summary registered so far.
        """
        
        tf.summary.scalar("loss", loss)
        
        tagged_rates = (
            ("learning_rate_dense_layer", learning_rate1),
            ("learning_dec_layer2", learning_rate2),
            ("learning_dec_layer1", learning_rate3),
            ("learning_dec_layer0", learning_rate4),
            ("learning_enc_layer2", learning_rate5),
            ("learning_enc_layer1", learning_rate6),
            ("learning_enc_layer0", learning_rate7),
        )
        for tag, rate in tagged_rates:
            tf.summary.scalar(tag, rate)
        
        self.merged = tf.summary.merge_all()

In [None]:
# Run the graph: build the input pipeline and model, restore (or initialise) the variables, train for
# `epochs` epochs while writing summaries, and optionally save the trainable variables.

with tf.Graph().as_default(): 

    iterator = build_dataset(text_data, labels_data, embed_vocab_data, full_vocab_data, txt_eos, lbl_sos, lbl_eos, batch_size)
    
    # emb_mat and dec_mat are read as module-level names by Model.
    emb_mat = create_emb_matrix(embed_file)
    dec_mat = create_dec_matrix(4)
    
    random_embeddings = np.random.uniform(low=-1, high=1, size=(4,300)) # A random choice for unk and other special characters
    embeddings = tf.Variable(tf.convert_to_tensor(random_embeddings, dtype=tf.float32), name="saved_embeddings")
    emb_mat = tf.concat((embeddings, emb_mat), 0) # The four special rows come first, then the loaded embeddings
    
    # NOTE: these two assignments rebind the helper function names to the dictionaries they return, so this
    # cell cannot be re-run without first re-running the cells that define the helpers.
    ids_to_embed_vocab = ids_to_embed_vocab(embed_vocab_data)
    ids_to_full_vocab = ids_to_full_vocab(full_vocab_data)
    
    # A call to the iterator for inputs
    
    full_words_, embed_words_, txt_size_, label_size_, labels_in_, labels_out_ = iterator.get_next()
    
    # Model instantiation
    
    global_step = tf.Variable(0, name='global_step',trainable=False)
    model = Model(dropout, num_units, num_layers, 
                  forget_bias, embed_words_, full_words_, 
                  txt_size_, label_size_, labels_in_, labels_out_, global_step)
    
    # Initialise variables
    
    init = tf.global_variables_initializer()
    t_variables = tf.trainable_variables()
    saver = tf.train.Saver(var_list=t_variables) # Saver for variables. Not full graph.
        
    with tf.Session() as sess:
        
        train_writer = tf.summary.FileWriter(log_path, sess.graph)
        
        try:
            # Restore variables if present.
            if restore_path is None:
                sess.run(init)
            else:
                # global_step is not trainable, so it is not in the saver and must be initialised separately.
                sess.run(global_step.initializer)
                saver.restore(sess, restore_path)
                print("Model restored.")
            
            # Initialise the vocab tables
            sess.run(tf.tables_initializer())
            counter = 0
            
            # Training loop.
            for epoch in range(epochs):
                losses = []
                epoch_start = time.time()
                sess.run(iterator.initializer)
                
                while True:
                    try:
                        _, summary, loss = sess.run([model.train_ops, model.merged, model.loss])
                        
                        train_writer.add_summary(summary, counter)
                        train_writer.flush()
                        losses.append(loss) # Collected for the epoch average
                        counter += 1
                        
                        if counter % log_freq == 0:
                            
                            # Get the values from model.
                            # NOTE(review): this is a second sess.run on tensors fed by iterator.get_next(), so it
                            # appears to pull a fresh batch (not the one just trained on) which is then not used
                            # for a training step - confirm whether that is intended. If the iterator is exhausted
                            # here, the OutOfRangeError below ends the epoch as usual.
                            preds, full_words_in, labels_out, mask_labels = sess.run([model.preds,
                                                                                      model.full_words_in, 
                                                                                      model.labels_out, 
                                                                                      model.mask_labels])
                            
                            # Pick one of the entries in the fetched batch. The bound is the actual batch length,
                            # because the last batch of an epoch can hold fewer than batch_size entries.
                            j = np.random.randint(0, len(full_words_in))
                            
                            full_sent = []
                            target_sent = []
                            predicted_sent = []
                            
                            for i, word_id in enumerate(full_words_in[j]):
                                if mask_labels[j][i] != 1:
                                    continue # Padding position
                                word = ids_to_full_vocab[word_id]
                                full_sent.append(word)
                                # Words with a non-zero label make up the predicted / target sentence.
                                if preds[j][i] != 0:
                                    predicted_sent.append(word)
                                if labels_out[j][i] != 0:
                                    target_sent.append(word)
                            
                            print("Input sentence is:")
                            print(" ".join(full_sent))
                            print("Target sentence is:")
                            print(" ".join(target_sent))
                            print("Predicted sentence is:")
                            print(" ".join(predicted_sent))
                            
                    except tf.errors.OutOfRangeError:
                        # The iterator is exhausted: the epoch is over.
                        break
                
                elapsed_time = (time.time() - epoch_start)
                print("Epoch run time: %s" % elapsed_time)
                if losses:
                    average_loss = sum(losses) / len(losses)
                    print("Average epoch loss: %s" % average_loss)
                else:
                    # Avoid dividing by zero when the dataset yielded no batches.
                    print("Average epoch loss: n/a (no batches were processed)")
            
            if save_model:
                saver.save(sess, save_path)
        finally:
            # Release the event file even if training fails part-way.
            train_writer.close()