In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.python.ops.rnn_cell import LSTMCell, LSTMStateTuple
import re
import random
import utils
import os
sess = tf.InteractiveSession()

In [2]:
def get_vocab_size(vocabfile):
    with open(vocabfile, encoding='utf8') as v:
        vocab = v.read()
        vocab = vocab.split('\n')
        size = len(vocab)
    return size

In [3]:
def build_vocab(words):
    word2index = dict()
    index2word = dict()
    with open('processed/vocab.tsv', 'w+') as f:
        index = 0
        for word in words:
            if word not in word2index:
                word2index[word] = index
                index2word[index] = word
                f.write(word + '\n')
                index += 1
    return word2index, index2word

In [4]:
def get_indexed_words(textfile):
    with open(textfile, encoding='utf8') as f:
        text = f.read()
        text = re.sub(r'\n', ' ', text)
        words = text.split(' ')
        
    word2index, index2word = build_vocab(words)
            
    index_words = list()
    for word in words:
        index_words.append(word2index[word])
        
    del words
    return index_words

In [5]:
def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center wrod
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

In [6]:
def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch

In [7]:
def process_data(textfile, skip_window, batch_size):
    #Input: textfile, skip_window=size of context, batch_size
    #Output: iterator object over center and context word grouped together
    #in tuples converted into indexes
    #ex. [(92, 2012), (...), ...]
    #these are then embedded in a two batch sized arrays
    #ex. center words [0, 23, 2435, 123] (batch size=4)
    #and corresponding context words in the same format
    index_words = get_indexed_words(textfile)
    single_gen = generate_sample(index_words, skip_window) 
    #return single_gen
    return get_batch(single_gen, batch_size)

In [8]:
batch_size = 128
vocab_size = get_vocab_size('train.bpe.eng')
embed_size = 128
num_sampled = 64 #number of negative samples
learning_rate = 1.0
num_train_steps = 299199
skip_window=1 #context window
skip_step = 2000
weights_fld = 'processed/'

In [9]:
PAD = 0 #Padding
EOS = 1 #Transfer from encoding to decoding

encoder_hidden_units = 128 #num neurons
decoder_hidden_units = encoder_hidden_units

In [10]:
#input placehodlers
#encoder has size [encoder_max_timestep, batch_size] (these are still the int words, not embeddings!)
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
#contains the lengths for each of the sequence in the batch, we will pad so all the same
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')
#same idea as for encoder inputs
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

In [11]:
#actual input into network needs to be embedded, so of size [max_time, batch_size, input_embedding_size]
#first, we need to create the uniform embedding matrix of size [vocab_size, embedding_size]
embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size], -1.0, 1.0), dtype=tf.float32)

#now, we get the embeddings of the encoder_input, which yields a matrix of the above mentioned size
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)

In [12]:
#start building the encoder
#this returns as many LSTMCells as defined by the variable encoder_hidden_units
with tf.variable_scope("forward"):
    encoder_fw_cell = LSTMCell(encoder_hidden_units)

with tf.variable_scope("backwards", reuse=True):
    encoder_bw_cell = LSTMCell(encoder_hidden_units)

#use bidirectional_RNN to build the encoder architecture
#returns tuples within a tuple of shape ((ouput_fw, output_bw), (output_state_fw, output_state_bw))
#output if time_major == True: [max_time, batch_size, input_embedding_size]
((encoder_fw_outputs,
encoder_bw_outputs),
 (encoder_fw_final_state,
  encoder_bw_final_state)) = (
    tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_fw_cell,
                                    cell_bw=encoder_bw_cell,
                                    inputs=encoder_inputs_embedded,
                                    sequence_length=encoder_inputs_length,
                                    dtype=tf.float32, time_major=True))

In [13]:
#Concatenates tensors along one dimension.
#output: [max_time, batch_size, embed_size], now concatenated on the embed_size dimension (that is dimension number 2)
#(so embeddings are concatenated)
encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

#letters h and c are commonly used to denote "output value" and "cell state". 
#http://colah.github.io/posts/2015-08-Understanding-LSTMs/ 
#Those tensors represent combined internal state of the cell, and should be passed together. 
#Comes from LSTMStateTuple which holds h (output value) and c (cell state)

encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

#Use concatenated output value and cell state to define the final LSTM encoder state
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

In [15]:
#start building the decoder
decoder_cell = LSTMCell(decoder_hidden_units)

#retrieve encoder_max_time and batch_size from encoder_inputs
encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs))

#allow decoder sequence to be a bit longer than encoder sequence
decoder_lengths = encoder_inputs_length + 10


In [16]:
#create weights and bias for decoder
#weights of size decoder_hidden_units * vocab_size (because of embeddings having size vocab * embed_size, fully connected)
W = tf.Variable(tf.random_uniform([decoder_hidden_units, vocab_size], -1, 1), dtype=tf.float32)
#bias (of size vocab_size, one bias per embedded word)
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32)

In [17]:
#prepare EOS and padding for embedding (of size batch_size, since one is added to each entry in the batch)
eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS')
pad_time_slice = tf.zeros([batch_size], dtype=tf.int32, name='PAD')

#get their embedding
eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)

In [18]:
#start building the loop over all tokens in sentence
#this is the initial case (nothing generated yet)
def loop_fn_initial():
    initial_elements_finished = (0 >= decoder_lengths)  # false at initial step, because 0 is not >= decoder_lengths
    #end of sentence
    initial_input = eos_step_embedded
    #last time steps cell state becomes initial_cell_state of our encoder
    initial_cell_state = encoder_final_state
    #none, since we do not want EOS to be used as output
    initial_cell_output = None
    #none, since nothing was altered
    initial_loop_state = None  # we don't need to pass any additional information
    return (initial_elements_finished,
            initial_input,
            initial_cell_state,
            initial_cell_output,
            initial_loop_state)

In [20]:
#function to get next input after generating the previous
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    def get_next_input():
        #dot product between previous ouput and weights, then + biases
        output_logits = tf.add(tf.matmul(previous_output, W), b)
        #Logits simply means that the function operates on the unscaled output of 
        #earlier layers and that the relative scale to understand the units is linear. 
        #It means, in particular, the sum of the inputs may not equal 1, that the values are not probabilities 
        #(you might have an input of 5).
        #prediction value at current time step

        #Returns the index with the largest value across axes of a tensor.
        prediction = tf.argmax(output_logits, axis=1)
        #embed prediction for the next input
        next_input = tf.nn.embedding_lookup(embeddings, prediction)
        return next_input
    
    elements_finished = (time >= decoder_lengths) # True if sequence is finished, of size batch_size
    
    finished = tf.reduce_all(elements_finished) # -> boolean scalar
    
    #Return padding if finished, return next_input if not finished.
    input = tf.cond(finished, lambda: pad_step_embedded, get_next_input)
    
    #set previous to current
    state = previous_state
    output = previous_output
    loop_state = None
    
    return (elements_finished, 
            input,
            state,
            output,
            loop_state)


In [22]:
def loop_fn(time, previous_output, previous_state, previous_loop_state):
    if previous_state is None:    # if no previous state, then initial case
        assert previous_output is None and previous_state is None #check if these are also none
        return loop_fn_initial()
    else:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)

In [23]:
decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
decoder_outputs = decoder_outputs_ta.stack()

ValueError: Dimensions must be equal, but are 128 and 256 for 'rnn/while/rnn/lstm_cell/lstm_cell/mul' (op: 'Mul') with input shapes: [?,128], [?,256].

In [17]:
class SkipGramModel:
    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        #Initialize all parameters
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        #Define Placeholders for Input and Output
        with tf.name_scope('data'):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name='target_words')
    
    def _create_embedding(self):
        #Define Embedding Matrix
        with tf.name_scope('embed'):
            self.embed_matrix = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0), name='embed_matrix')

    def _create_loss(self):
        #Get the embedding of the center words
        with tf.name_scope('loss'):
            embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, 'embed')
            #Define loss function (Noise Contrastive Estimation)
            nce_weights = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1.0 / embed_size ** 0.5), name='nce_weights')
            nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                                biases=nce_bias,
                                                labels=self.target_words,
                                                inputs=embed,
                                                num_sampled=self.num_sampled,
                                                num_classes=self.vocab_size), name='loss')
    
    def _create_optimizer(self):
        #Define optimizer
        self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)

    def _create_summaries(self):
        with tf.name_scope('summaries'):
            tf.summary.scalar('loss', self.loss)
            tf.summary.histogram('histogram loss', self.loss)
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()


In [18]:
def train_model(model, batch_gen, num_train_steps, weights_fld):
    saver = tf.train.Saver()

    initial_step = 0
    utils.make_dir('checkpoints')

    #Execute
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
        #if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        
        total_loss = 0.0
        writer = tf.summary.FileWriter('./improved_graph/lr' + str(learning_rate), sess.graph)
        initial_step = model.global_step.eval()
        for train_step in range(num_train_steps):
            centers, targets = next(batch_gen)
            loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op], feed_dict={model.center_words: centers, model.target_words: targets})
            writer.add_summary(summary, global_step=train_step)
            total_loss += loss_batch
            if (train_step + 1) % skip_step == 0:
                print('Average loss at step {}: {:5.1f}'.format(train_step, total_loss / skip_step))
                total_loss = 0.0
        saver.save(sess, 'checkpoints/skip-gram', train_step)

        #Get visualization
        #Obtain trained embedding_matrix
        final_embed_matrix = sess.run(model.embed_matrix)

        #Create variable to hold embeddings and get 1000 most common words
        embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        sess.run(embedding_var.initializer)
        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')
        
        #Add embedding to config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name

        #Link to metadata file
        embedding.metadata_path = 'vocab.tsv'

        #Save configuration file
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)

        writer.close()
        summary_writer.close()

In [19]:
def main():
    model = SkipGramModel(vocab_size, embed_size, batch_size, num_sampled, learning_rate)
    model.build_graph()
    batch_gen = process_data('train.bpe.eng', skip_window, batch_size)
    train_model(model, batch_gen, num_train_steps, weights_fld)

In [20]:
if __name__ == '__main__':
    main()

INFO:tensorflow:Summary name histogram loss is illegal; using histogram_loss instead.


KeyboardInterrupt: 