In [None]:
"""
    issues/TODO:
    - learning rate: add momentum
    - variable scopes could be managed better
    - remember to add GO as the prefix
    - document PADDING and EOS
    - GO, PAD, EOS should each be a learned embedding vector
    - when should the decoder stop producing output during testing? -> depends on which bucket the input is in,
        or could be set to a fixed length
    - Decide whether or not to include the word embeddings as trainable variables
    - Decoder inputs should start with GO
    - Stop gradient on decoder input???
"""

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import variable_scope

import datetime

In [None]:
def rnn_seq2seq_step(
        enc_cell,
        dec_cell,
        encoder_inputs,
        decoder_inputs,
        output_targets=None,
        is_sampled=True,
        keep_prob=1.0,
        dtype=tf.float32):
    """Chain one encoder pass and one decoder pass.

    The encoder is run over `encoder_inputs`; its per-step outputs and its
    final state are then handed to `rnn_decoder` together with
    `decoder_inputs` and the decoding options.

    Returns:
        The list of decoder outputs. The decoder's final state is discarded.
    """
    enc_step_outputs, enc_final_state = rnn_encoder(
        enc_cell, encoder_inputs, dtype=dtype)

    # rnn_decoder yields (outputs, final_state); only the outputs are exposed.
    step_predictions, _ = rnn_decoder(
        dec_cell,
        enc_final_state,
        enc_step_outputs,
        decoder_inputs,
        output_targets=output_targets,
        is_sampled=is_sampled,
        keep_prob=keep_prob)

    return step_predictions

In [None]:
def rnn_encoder(cell, encoder_inputs, dtype=tf.float32):
    """Unroll `cell` over `encoder_inputs` using tf.nn.rnn.

    Returns:
        The (outputs, state) pair produced by tf.nn.rnn.
    """
    return tf.nn.rnn(cell, encoder_inputs, dtype=dtype)

In [None]:
def rnn_decoder(cell,
                initial_state,
                encoder_states,
                decoder_inputs,
                output_targets=None,
                is_sampled=True,
                keep_prob=1.0
               ):
    """Unroll the decoder cell step by step and score every step with `MLP`.

    For every step after the first, the raw decoder input is multiplied by a
    weight before being fed to the cell:
      * `is_sampled=True`  -> the decoder's own previous output (used for
        curriculum learning and at test time);
      * `is_sampled=False` -> the true label of the previous step, taken from
        `output_targets`.

    Args:
        cell: callable `cell(inp, state) -> (output, state)`.
        initial_state: state fed to the first step.
        encoder_states: sequence indexed per decoding step; element `i` is
            passed to `MLP` together with the cell output of step `i`.
        decoder_inputs: sequence of per-step decoder inputs.
        output_targets: per-step true labels; required when `is_sampled` is
            False.
        is_sampled: whether the previous output (True) or the previous true
            label (False) weights the next input.
        keep_prob: dropout keep probability forwarded to `MLP`.

    Returns:
        `(outputs, state)`: the list of per-step `MLP` outputs and the final
        cell state.

    Raises:
        ValueError: if `is_sampled` is False and `output_targets` is None.
    """
    if not is_sampled and output_targets is None:
        # The original raised `InputError`, which is neither a builtin nor
        # defined in this notebook, so the guard itself failed with NameError.
        raise ValueError('Input labels should be provided when decoder is not sampled')

    state = initial_state
    outputs = []
    prev = None
    for i, inp in enumerate(decoder_inputs):
        if i > 0:
            if is_sampled:
                # feed previous output (curriculum learning / testing)
                inp = inp * prev
            else:
                # feed the true label of the previous sentence
                inp = inp * output_targets[i - 1]

            # Every step after the first must share the variables created by
            # the first step.
            variable_scope.get_variable_scope().reuse_variables()

        output, state = cell(inp, state)

        output = MLP(output, encoder_state=encoder_states[i], keep_prob=keep_prob)

        outputs.append(output)

        # Remember this step's output so it can weight the next step's input.
        if is_sampled:
            prev = output

    return outputs, state

In [None]:
def sequence_maximum_likelihood_loss(logits, targets):
    """Mean binary cross-entropy over all steps of all sequences in the batch.

    For every (logit, target) pair the loss is
    `-(target * log(logit) + (1 - target) * log(1 - logit))`, with a small
    epsilon added inside each log for numerical safety. Because the log is
    taken directly, `logits` are expected to already be probabilities.

    Returns:
        The mean over every per-step loss value.
    """
    def _step_loss(prob, label):
        positive_term = label * tf.log(prob + 1e-9)
        negative_term = (1 - label) * tf.log(1 - prob + 1e-9)
        return - (positive_term + negative_term)

    per_step_losses = [_step_loss(prob, label)
                       for prob, label in zip(logits, targets)]
    return tf.reduce_mean(per_step_losses)

In [None]:
# TODO: two layer could be simplified to one layer
def MLP(decoder_state, encoder_state, keep_prob=1.0):
    """Score one decoding step from the decoder and encoder states.

    The two states are concatenated along axis 1, passed through dropout, a
    ReLU hidden layer and a sigmoid output layer. All weights are read from
    the module-level `variable_dict`.

    Returns:
        The sigmoid-activated output of the final layer.
    """
    hidden_w, hidden_b, out_w, out_b = [
        variable_dict[key]
        for key in ("h1_weights", "h1_bias", "logit_weights", "logit_bias")
    ]

    features = tf.nn.dropout(
        tf.concat(1, [decoder_state, encoder_state]), keep_prob)

    hidden = tf.nn.relu(tf.add(tf.matmul(features, hidden_w), hidden_b))

    pre_activation = tf.add(tf.matmul(hidden, out_w), out_b)
    return tf.nn.sigmoid(pre_activation)

In [None]:
def convolutional_sentence_encoder(input_x, keep_prob):
    """Encode sentences of word ids into fixed-size vectors with a CNN.

    Word ids are looked up in `weights['word_embeddings']`, a trailing channel
    axis is added, and one convolution + ReLU + max-pool branch is applied per
    entry of the module-level `filter_sizes`. Branch `k` (1-based) uses
    `weights['wck']` and `biases['bck']` and names its ops `convk` / `poolk`,
    so those entries must exist for every filter size. Each pooling window
    spans `sentence_length - filter_size + 1` rows. The pooled branches are
    concatenated along axis 3, flattened to
    `len(filter_sizes) * num_filters` columns and passed through dropout.

    Args:
        input_x: tensor of word ids.
        keep_prob: dropout keep probability applied to the final embedding.

    Returns:
        The sentence embedding tensor (op name 'final_sentence_embedding').
    """
    word_embeddings = weights['word_embeddings']
    sentence_tensor = tf.nn.embedding_lookup(word_embeddings, input_x)
    sentence_tensor = tf.expand_dims(sentence_tensor, -1, name='expanded_sentence_tensor')

    # One conv/pool branch per filter size instead of hand-copied blocks.
    pooled_branches = []
    for branch, filter_size in enumerate(filter_sizes, 1):
        conv = tf.nn.conv2d(
            sentence_tensor,
            weights['wc{0}'.format(branch)],
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv{0}".format(branch)
        )
        conv = tf.add(conv, biases['bc{0}'.format(branch)])
        conv = tf.nn.relu(conv)
        pooled = tf.nn.max_pool(
            conv,
            # pool over every position the VALID convolution produced
            ksize=[1, sentence_length - filter_size + 1, 1, 1],
            strides=[1, 1, 1, 1],
            padding='VALID',
            name="pool{0}".format(branch))
        pooled_branches.append(pooled)

    num_total_filters = len(filter_sizes) * num_filters
    pool_h = tf.concat(3, pooled_branches)
    pool_h = tf.reshape(pool_h, [-1, num_total_filters])
    pool_h = tf.nn.dropout(pool_h, keep_prob=keep_prob, name='final_sentence_embedding')

    return pool_h

In [None]:
""" 
    Setup Variables and Placeholders for Convolutional Sentence Encoder
"""
# Hyper-parameters of the convolutional sentence encoder
num_filters = 100
filter_sizes = [3, 4, 5]

word_embedding_size = 150
sentence_length = 50 # fixed length
vocab_size = 42579

# Filter layout expected by tf.nn.conv2d: [height, width, in_channels, out_channels]
filter_shape_1 = [filter_sizes[0], word_embedding_size, 1, num_filters]
filter_shape_2 = [filter_sizes[1], word_embedding_size, 1, num_filters]
filter_shape_3 = [filter_sizes[2], word_embedding_size, 1, num_filters]

# Store layers weight & bias.
# NOTE: keep the creation order of these variables; unnamed variables get
# auto-generated names that depend on it.
weights = {
    'wc1': tf.Variable(tf.random_uniform(filter_shape_1, minval=-0.05, maxval=0.05)),
    'wc2': tf.Variable(tf.random_uniform(filter_shape_2, minval=-0.05, maxval=0.05)),
    'wc3': tf.Variable(tf.random_uniform(filter_shape_3, minval=-0.05, maxval=0.05)),
    # The bounds used to be passed positionally as (1., -1.), i.e. with
    # minval and maxval swapped; they are now named and in the right order.
    'word_embeddings': tf.Variable(
        tf.random_uniform([vocab_size, word_embedding_size], minval=-1., maxval=1.),
        name='word_embeddings_150_6_20')
}

biases = {
    'bc1': tf.Variable(tf.random_uniform([num_filters], minval=-0.05, maxval=0.05)),
    'bc2': tf.Variable(tf.random_uniform([num_filters], minval=-0.05, maxval=0.05)),
    'bc3': tf.Variable(tf.random_uniform([num_filters], minval=-0.05, maxval=0.05))
}

In [None]:
# Training hyper-parameters
batch_size = 5 # size of training batch
sentence_embedding_size = 300 # encoder input size
doc_embedding_size = 750 # hidden layer size
output_size = 1
learning_rate = 1e-3
momentum_beta_1 = 0.99
momentum_beta_2 = 0.999


def _uniform_variable(shape):
    """Create a variable initialised with random_uniform(minval=-0.05, maxval=0.05)."""
    return tf.Variable(tf.random_uniform(shape, minval=-0.05, maxval=0.05))


# Decoder-side parameters and the two LSTM cells.
# The variables are created in the same order as before on purpose.
variable_dict = {}
variable_dict["h1_weights"] = _uniform_variable([doc_embedding_size * 2, doc_embedding_size])
variable_dict["h1_bias"] = _uniform_variable([doc_embedding_size])
variable_dict["logit_weights"] = _uniform_variable([doc_embedding_size, output_size])
variable_dict["logit_bias"] = _uniform_variable([output_size])
variable_dict["encoder_cell"] = tf.nn.rnn_cell.BasicLSTMCell(doc_embedding_size, state_is_tuple=True)
variable_dict["decoder_cell"] = tf.nn.rnn_cell.BasicLSTMCell(doc_embedding_size, state_is_tuple=True)

# One input placeholder and one label placeholder per bucket size. The input
# carries one more sentence slot than the labels (size + 1 versus size).
_BUCKET_SIZES = (10, 20, 30, 40, 50)

placeholders = dict(
    ("bucket_{0}".format(size),
     tf.placeholder(tf.int32,
                    shape=[None, size + 1, sentence_length],
                    name='input_bucket_{0}'.format(size)))
    for size in _BUCKET_SIZES)

placeholders.update(
    ("sentence_labels_{0}".format(size),
     tf.placeholder(tf.float32,
                    shape=[None, size],
                    name='label_bucket_{0}'.format(size)))
    for size in _BUCKET_SIZES)

placeholders["feedfw_sampling"] = tf.placeholder(tf.bool, name='feedforward_sampling_flag')
placeholders["keep_prob"] = tf.placeholder(tf.float32, name='dropout_keep_probability')

In [None]:
# TODO decoder input is encoder shifted to the right once
def sentence_extractor(bucket_id):
    """Build the sentence-extraction graph for one bucket size.

    Every sentence of the bucket's input placeholder is embedded with
    `convolutional_sentence_encoder`. The embeddings from index 1 onwards
    feed the encoder and all but the last feed the decoder. Both decoding
    modes are built and `tf.cond` selects between them at run time through
    the `feedfw_sampling` placeholder.

    Returns:
        `(decoder_outputs, decoder_targets)`, where the targets are the
        bucket's label placeholder split into `bucket_id` pieces along axis 1.
    """
    # run-time switch: feed the decoder its own previous outputs or the labels
    use_sampling = placeholders["feedfw_sampling"]
    keep_prob = placeholders["keep_prob"]

    # (batch size, document size, sentence size) -> document-major -> rows of word ids
    word_ids = placeholders["bucket_{0}".format(bucket_id)]
    word_ids = tf.reshape(tf.transpose(word_ids, perm=[1, 0, 2]),
                          [-1, sentence_length])

    sentence_embeddings = [
        convolutional_sentence_encoder(sentence_ids, keep_prob)
        for sentence_ids in tf.split(0, bucket_id + 1, word_ids)
    ]

    # per-step label tensors
    label_placeholder = placeholders["sentence_labels_{0}".format(bucket_id)]
    decoder_targets = tf.split(1, bucket_id, label_placeholder)

    encoder_cell = variable_dict["encoder_cell"]
    decoder_cell = variable_dict["decoder_cell"]

    def _decode(**mode_options):
        return rnn_seq2seq_step(
            encoder_cell,
            decoder_cell,
            sentence_embeddings[1:],
            sentence_embeddings[:-1],
            keep_prob=keep_prob,
            **mode_options
        )

    decoder_outputs = tf.cond(
        use_sampling,
        lambda: _decode(),
        lambda: _decode(is_sampled=False, output_targets=decoder_targets))
    return decoder_outputs, decoder_targets

# Build the computation graph for each bucket of inputs
bucket10outputs, bucket10targets = sentence_extractor(10)
bucket20outputs, bucket20targets = sentence_extractor(20)

bucket_outputs = dict(bucket_10=bucket10outputs, bucket_20=bucket20outputs)

# One loss tensor per bucket
bucket10loss = sequence_maximum_likelihood_loss(bucket10outputs, bucket10targets)
bucket20loss = sequence_maximum_likelihood_loss(bucket20outputs, bucket20targets)

bucket_losses = dict(bucket_10=bucket10loss, bucket_20=bucket20loss)

# A single Adam optimizer and step counter shared by every bucket
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(
    learning_rate=learning_rate,
    beta1=momentum_beta_1,
    beta2=momentum_beta_2)


def _train_op_for(loss):
    """Return the op that applies the gradients of `loss` and bumps `global_step`."""
    grads_and_vars = optimizer.compute_gradients(loss)
    return optimizer.apply_gradients(grads_and_vars, global_step=global_step)


train_ops = {}
train_ops['bucket_10'] = _train_op_for(bucket10loss)
train_ops['bucket_20'] = _train_op_for(bucket20loss)

init = tf.initialize_all_variables()

In [None]:
def _rounded_match_rate(outputs, targets):
    """Fraction of entries where the rounded output equals the target."""
    matches = tf.equal(tf.round(outputs), targets)
    return tf.reduce_mean(tf.cast(matches, tf.float32))


accuracies = {
    'bucket_10': _rounded_match_rate(bucket10outputs, bucket10targets),
    'bucket_20': _rounded_match_rate(bucket20outputs, bucket20targets)
}

In [None]:
def train_step(sess, x_batch, y_batch, feedforward_sampling=False, keep_prob=0.5):
    """Run one optimisation step and log its training summaries.

    The bucket is derived from the batch itself: the number of sentences in
    the first document minus one.

    parameters:
    - sess: session used to run the graph
    - x_batch: 3 dimensional list of size (batch size, document number of sentence, and sentence size)
    - y_batch: 2 dimensional list of size (batch size, document number of sentence); marks whether
                a sentence is included in the summary
    - feedforward_sampling: set True to let the decoder feed on its own previous outputs, as needed
                for curriculum learning
    - keep_prob: dropout keep probability fed to the graph

    returns: (step, loss, acc) as evaluated by this run
    """
    bucket_id = len(x_batch[0]) - 1
    bucket_key = 'bucket_{0}'.format(bucket_id)

    feed = {}
    feed[placeholders[bucket_key]] = x_batch
    feed[placeholders['sentence_labels_{0}'.format(bucket_id)]] = y_batch
    feed[placeholders["feedfw_sampling"]] = feedforward_sampling
    feed[placeholders["keep_prob"]] = keep_prob

    fetches = [
        train_ops[bucket_key],
        global_step,
        bucket_losses[bucket_key],
        train_summary_ops['{0}'.format(bucket_id)],
        accuracies[bucket_key],
    ]
    _, step, loss, summaries, acc = sess.run(fetches, feed)

    train_summary_writer.add_summary(summaries, step)
    train_summary_writer.flush()

    return step, loss, acc

In [None]:
def eval_test_step(sess, x_batch, y_batch, feedforward_sampling=True, keep_prob=1.0):
    """Evaluate loss and accuracy on one batch without running a train op.

    The bucket is derived from the batch: the number of sentences in the
    first document minus one. By default the decoder feeds on its own
    previous outputs and the keep probability is 1.0.

    returns: (loss, acc) for the batch
    """
    bucket_id = len(x_batch[0]) - 1
    bucket_key = 'bucket_{0}'.format(bucket_id)

    feed = {}
    feed[placeholders[bucket_key]] = x_batch
    feed[placeholders['sentence_labels_{0}'.format(bucket_id)]] = y_batch
    feed[placeholders["feedfw_sampling"]] = feedforward_sampling
    feed[placeholders["keep_prob"]] = keep_prob

    # global_step is still fetched, as before, but its value is not used here
    fetches = [bucket_losses[bucket_key], global_step, accuracies[bucket_key]]
    loss, _step, acc = sess.run(fetches, feed)

#     dev_summary_writer.add_summary(summaries, step)
#     dev_summary_writer.flush()

    return loss, acc

In [None]:
import batch_generator_alt as bg
def generate_batch(bucket, batch_size, batch_type, include_filenames=False):
    """Fetch one batch from the batch generator and split it into columns.

    Each entry returned by `bg.get_batch_with_filenames` is indexed as
    `entry[0]` (input), `entry[1]` (target) and `entry[2]` (file name).

    Args:
        bucket: bucket identifier forwarded to the batch generator.
        batch_size: number of documents requested.
        batch_type: data split name forwarded to the batch generator.
        include_filenames: also return the list of file names.

    Returns:
        `(inputs, targets)`, or `(inputs, targets, filenames)` when
        `include_filenames` is true. Every element is a real list, so callers
        can safely use `len()` and indexing on it.
    """
    # Materialise the batch once: it is traversed up to three times below.
    batch = list(bg.get_batch_with_filenames(bucket, batch_size, batch_type))

    # List comprehensions instead of map(): map() is lazy on Python 3, which
    # would break the callers' len(x_batch) and x_batch[0].
    random_batch = [entry[0] for entry in batch]
    random_batch_target = [entry[1] for entry in batch]

    if include_filenames:
        batch_filenames = [entry[2] for entry in batch]
        return random_batch, random_batch_target, batch_filenames

    return random_batch, random_batch_target

In [None]:
# Initialize the TensorFlow session and run the variable initializer.
# `init` is the initialize_all_variables op built together with the graph, so
# this cell has to run after the graph-construction cell.
# Path used by the (currently disabled) word-embedding restore below.
w_embedding_path = 'tf_variables/word_embeddings_150_6_20.var'
sess = tf.Session()
sess.run(init)
# Optional: restore only the word embeddings from disk.
# var_saver = tf.train.Saver({"word_embeddings_150_6_20": weights['word_embeddings']})
# var_saver.restore(sess, w_embedding_path)

# Optional: restore every trainable variable from an earlier checkpoint.
# checkpoint_path = 'checkpoints_sentence_extractor_2/stored_variables.ckpt.epoch2.40000'
# var_saver_2 = tf.train.Saver(tf.trainable_variables())
# var_saver_2.restore(sess, checkpoint_path)

In [None]:
"""
    Initialize the summaries writers
"""
import os
import time

# Output directory for models and summaries (one sub-directory per run,
# named after the current Unix timestamp)
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))

# Summaries for loss and accuracy of each training bucket
loss_summary10 = tf.scalar_summary("maximum-likelihood loss", bucket10loss)
loss_summary20 = tf.scalar_summary("maximum-likelihood loss", bucket20loss)
accuracy10 = tf.scalar_summary("label accuracy", accuracies['bucket_10'])
accuracy20 = tf.scalar_summary("label accuracy", accuracies['bucket_20'])

# Train Summaries, keyed by the bucket id as a string (see train_step)
train_summary_ops = {
    '10': tf.merge_summary([loss_summary10, accuracy10]),
    '20': tf.merge_summary([loss_summary20, accuracy20])
}
train_summary_dir = os.path.join(out_dir, "summaries", "train")
train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

# Validation metrics are averaged in Python over all validation batches and
# then fed back through these placeholders to be written as summaries.
current_val_loss = tf.placeholder(tf.float32, name='validation_loss')
current_val_acc = tf.placeholder(tf.float32, name='validation_acc')
loss_summary = tf.scalar_summary("maximum-likelihood loss", current_val_loss)
# This used to be `accuracy_summary = accuracy10 = ...`, which silently
# rebound `accuracy10` (the bucket-10 training summary) to the validation one.
accuracy_summary = tf.scalar_summary("label accuracy", current_val_acc)

# Dev summaries
dev_summary_op = tf.merge_summary([loss_summary, accuracy_summary])
dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

In [None]:
import sys
import os

# define the iterative training steps here
report_every = 50
evaluate_every = 1000
checkpoint_every = 10000

num_epoch = 3

checkpoint_path = 'checkpoints_sentence_extractor_1/stored_variables.ckpt'
var_saver_2 = tf.train.Saver(tf.trainable_variables())

# Saver.save fails when the parent directory of the checkpoint is missing,
# so create it up front rather than crashing at the first checkpoint.
checkpoint_dir = os.path.dirname(checkpoint_path)
if checkpoint_dir and not os.path.isdir(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# running totals since the last progress report
total_loss = 0.
total_acc = 0.
step_counter = 0.

buckets = [10, 20]
eval_buckets = [10, 20]

# per-epoch probability of training a batch with feed-forward sampling
is_sampled_weights = [0., 0., 0.]

# Progress output is sent to the process' stdout (/dev/stdout) instead of the
# notebook while training runs. try/finally guarantees the notebook stream is
# restored and the extra handle closed even when training is interrupted.
temp_stdout = sys.stdout
terminal_stdout = open('/dev/stdout', 'w')
sys.stdout = terminal_stdout
try:
    for epoch, r_weight in zip(range(num_epoch), is_sampled_weights):
        r_elements = [True, False]
        r_weights = [r_weight, 1.0 - r_weight]
        sampled_number = 0
        docs_trained = 0
        for bucket in buckets:
            while bg.has_more(bucket, 'training'):
                # batch_size is the notebook-level training batch size (5)
                x_batch, y_batch = generate_batch(bucket, batch_size, 'training')

                if len(x_batch) == 0:
                    continue # signaling if the batch is empty

                # weighted random choice if sampled or not
                sampled = np.random.choice(r_elements, p=r_weights)

                current_step, current_loss, current_acc = train_step(
                    sess, x_batch, y_batch, feedforward_sampling=sampled)
                total_acc += current_acc
                total_loss += current_loss
                step_counter += 1
                docs_trained += len(x_batch)
                if sampled:
                    sampled_number += len(x_batch)

                if current_step % report_every == 0:
                    print("ep {}: bucket: {}, training step {}, loss avg {:g}, accuracy: {:g}, trained: {}, sampled: {}".format(
                        epoch, bucket, current_step,
                        total_loss / step_counter,
                        total_acc / step_counter, docs_trained, sampled_number))
                    total_loss = 0
                    total_acc = 0
                    step_counter = 0

                if current_step % evaluate_every == 0:
                    eval_loss = 0.
                    eval_acc = 0.
                    eval_counter = 0.
                    for eval_bucket in eval_buckets:
                        print('Evaluation on validation data bucket {0}:'.format(eval_bucket))
                        while bg.has_more(eval_bucket, 'validation'):
                            x_val_batch, y_val_batch = generate_batch(eval_bucket, batch_size, 'validation')

                            if len(x_val_batch) == 0:
                                continue # signaling if the batch is empty

                            val_loss, val_acc = eval_test_step(sess, x_val_batch, y_val_batch)
                            eval_loss += val_loss
                            eval_acc += val_acc
                            eval_counter += 1

                    if eval_counter > 0:
                        print("validation loss avg {:g}, accuracy: {:g}".format(eval_loss / eval_counter,
                                                                                eval_acc / eval_counter))

                        val_summaries = sess.run(dev_summary_op, feed_dict={
                                current_val_loss: eval_loss / eval_counter,
                                current_val_acc: eval_acc / eval_counter
                            })
                        dev_summary_writer.add_summary(val_summaries, current_step)
                        dev_summary_writer.flush()
                    else:
                        # No validation batch was evaluated: averaging would
                        # divide by zero, so skip the report for this round.
                        print("validation skipped: no validation batches available")

                    bg.reset_indices('validation')

                if current_step % checkpoint_every == 0:
                    ckpt_path = var_saver_2.save(sess, checkpoint_path+'.epoch'+str(epoch)+'.'+str(current_step))
                    print("Saved model checkpoint to {}\n".format(ckpt_path))

        time_str = datetime.datetime.now().isoformat()
        print("{}: epoch {} completed. docs trained: {}. sampled: {}".format(time_str, epoch,
                                                                                docs_trained, sampled_number))

        bg.reset_indices('training')
finally:
    sys.stdout = temp_stdout
    terminal_stdout.close()

print('\nOptimization completed!\n')

In [None]:
# is_sampled_weights = [0.01, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
# for epoch, weight in zip(range(num_epoch), is_sampled_weights):
#     elements = [True, False]
#     weights = [weight, 1.0 - weight]
#     counter1 = 0
#     counter2 = 0
#     for i in range(10000):
#         rand = np.random.choice(elements, p=weights)
#         if rand:
#             counter1 += 1
#         else:
#             counter2 += 1
#     print 'counter1: {}'.format(counter1)
#     print 'counter2: {}'.format(counter2)
# y_batch = np.zeros((len(x_batch), 20))
# input_dict = {placeholders['bucket_{0}'.format(20)]: x_batch,
#               placeholders['sentence_labels_{0}'.format(20)]: y_batch,
#               placeholders["feedfw_sampling"]: True,
#               placeholders["keep_prob"]: 1.0}

# res = sess.run(bucket20outputs, feed_dict=input_dict)
# output_labels = np.transpose(np.squeeze(np.array(res)))
# if len(x_batch) > 0:
#     input_dict = {placeholders['bucket_20']: x_batch, 
#                      placeholders['sentence_labels_20']: y_batch,
#                      placeholders["feedfw_sampling"]: True,
#                      placeholders["keep_prob"]: 1.0}

#     outs, rounds, tars, acc = sess.run([bucket20outputs, outputs20rounded, bucket20targets, accuracy20], feed_dict=input_dict)
#     for out, ro, tar in zip(outs, rounds, tars):
#         print str(out[0]) + '->' + str(ro[0]) + '->' + str(tar[0])
#     print acc[0]

In [None]:
import summary as sm
import os

# Predictions are written here, one '<source file name>.pred' file per document.
results_dir = 'se_test_results_1/'
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

for bucket in buckets:
    while bg.has_more(bucket, 'test'):
        test_batch, _, batch_filenames = generate_batch(bucket, 10, 'test', include_filenames=True)

        if len(test_batch) == 0:
            continue # signaling if the batch is empty

        # The label placeholder must be fed, but labels are not used when the
        # decoder runs with feed-forward sampling, so zeros are enough.
        y_filler = np.zeros((len(test_batch), bucket))
        input_dict = {placeholders['bucket_{0}'.format(bucket)]: test_batch,
              placeholders['sentence_labels_{0}'.format(bucket)]: y_filler,
              placeholders["feedfw_sampling"]: True,
              placeholders["keep_prob"]: 1.0}

        out_probs = sess.run(bucket_outputs['bucket_{}'.format(bucket)], feed_dict=input_dict)

        # out_probs is a list with one (batch, 1) array per sentence position.
        # Drop only the trailing size-1 axis: a bare np.squeeze would also
        # collapse the batch axis for a single-document batch, which is why
        # such batches used to be skipped and never got a prediction file.
        out_probs = np.squeeze(np.array(out_probs), axis=-1).T

        for filename, doc_probs in zip(batch_filenames, out_probs):
            write_path = results_dir + filename.split('/')[-1] + '.pred'
            res = sm.get_summary(filename, doc_probs, True)
            with open(write_path, 'w') as f:
                f.write(res)

bg.reset_indices('test')

In [None]:
# sess.close()