---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [1]:
import numpy as np
import tensorflow as tf
import string
import collections

In [2]:
dtype = tf.float32

GO = '>'
EOS = '$'

special_chars = [GO, EOS]
#safe_chars = list(' abc')
safe_chars = list(' ' + string.ascii_lowercase)
alphabet = special_chars + safe_chars
alphabet_size = len(alphabet)

char_to_id_map = {c: i for i, c in enumerate(alphabet)}


def char_to_id(char):
    try:
        return char_to_id_map[char]
    except IndexError:
        print('Unknown character: {}'.format(char))
        return 0


def id_to_char(char_id):
    return alphabet[char_id]


def next_batch(batch_size=1000):
    sentence_size = np.random.randint(low=4, high=11)
    batch = np.zeros([batch_size, sentence_size, alphabet_size])
    chars = np.random.choice(safe_chars, size=batch_size * sentence_size)
    for i, char in enumerate(chars):
        batch_i, sentence_i = divmod(i, sentence_size)
        batch[batch_i, sentence_i, char_to_id(char)] = 1.0
    return batch, batch[:, ::-1, :]


def probabilities_to_sentence(probs):
    mask = np.sum(probs, 1) != 0.0
    sentence_length = np.sum(mask)
    masked_probs = probs[:sentence_length]
    sentence = ''.join([id_to_char(c) for c in np.argmax(masked_probs, 1)])
    zeros = '0' * (len(probs) - sentence_length)
    return sentence + zeros


def batch_to_sentences(batch):
    return [probabilities_to_sentence(probs) for probs in batch]


batch, labels = next_batch(5)
print('batch: ', batch_to_sentences(batch))
print('labels:', batch_to_sentences(labels))

batch:  ['nywdqq', 'tpejrh', 'bqxsva', 'skwc o', 'dzdwxp']
labels: ['qqdwyn', 'hrjept', 'avsxqb', 'o cwks', 'pxwdzd']


In [3]:
lstm_size = 128
initializer_scale = 0.1
learning_rate = 0.001
optimizer = 'Adam'
clip_gradients = 5.0
dropout_keep_prob = 0.6


class Model():
    def __init__(self, mode):
        assert mode in ['train', 'eval', 'inference']
        self.mode = mode

    def build_graph(self):
        self._build_inputs()
        self._build_encoder()
        self._build_decoder()
        if self.mode != 'inference':
            self._build_loss()
        if self.mode == 'train':
            self._build_train_op()

    def _build_inputs(self):
        self.sentences = tf.placeholder(
            dtype, [None, None, alphabet_size],
            name='sentences')
        self.labels = tf.placeholder(
            dtype, [None, None, alphabet_size],
            name='labels')
        with tf.variable_scope('batch_size'):
            self.batch_size = tf.shape(self.sentences)[0]

    def _build_encoder(self):
        with tf.variable_scope('encoder'):
            self.encoder_inputs = tf.identity(self.sentences, name='inputs')
            self.encoder_num_rolls = tf.reduce_sum(
                self.encoder_inputs, axis=[1, 2], name='num_rolls')
            _, self.encoder_output_state = tf.nn.dynamic_rnn(
                cell=self._make_cell(),
                inputs=self.encoder_inputs,
                sequence_length=self.encoder_num_rolls,
                dtype=dtype)

    def _build_decoder(self):
        with tf.variable_scope('decoder'):
            with tf.variable_scope('inputs'):
                self.decoder_inputs = self._prepend_go(
                    self.labels, name='inputs')
            self.decoder_num_rolls = tf.reduce_sum(
                self.decoder_inputs, axis=[1, 2], name='num_rolls')
            self.decoder_input_state = self.encoder_output_state
            self.decoder_outputs, self.decoder_output_state = tf.nn.dynamic_rnn(
                cell=self._make_cell(),
                inputs=self.decoder_inputs,
                sequence_length=self.decoder_num_rolls,
                initial_state=self.decoder_input_state)

        with tf.variable_scope('logits'):
            outputs_flat = tf.reshape(
                self.decoder_outputs, [-1, lstm_size], name='outputs_flat')
            self.logits_flat = tf.contrib.layers.fully_connected(
                inputs=outputs_flat,
                num_outputs=alphabet_size,
                activation_fn=None,
                weights_initializer=tf.random_uniform_initializer(
                    minval=-initializer_scale,
                    maxval=initializer_scale))
            with tf.variable_scope('shape'):
                shape = tf.shape(self.decoder_inputs)
                batch_size = shape[0]
                num_rolls = shape[1]
                logits_shape = [batch_size, num_rolls, alphabet_size]
            self.logits = tf.reshape(
                self.logits_flat, logits_shape, name='logits')

        with tf.variable_scope('probs'):
            self.probs = tf.nn.softmax(self.logits, name='probs')

    def _build_loss(self):
        with tf.variable_scope('loss'):
            labels_eos = self._append_eos(self.labels, 'labels_eos')
            labels_eos_flat = tf.reshape(
                labels_eos, [-1, alphabet_size], name='labels_eos_flat')
            mask = tf.reduce_sum(labels_eos_flat, axis=1, name='mask')
            unmasked_losses = tf.nn.softmax_cross_entropy_with_logits(
                labels=labels_eos_flat, logits=self.logits_flat, name='unmasked_losses')
            batch_loss = tf.reduce_mean(
                unmasked_losses * mask, name='batch_loss')
            tf.losses.add_loss(batch_loss)
            self.loss = tf.losses.get_total_loss()

    def _build_train_op(self):
        global_step = tf.contrib.framework.get_or_create_global_step()
        self.train_op = tf.contrib.layers.optimize_loss(
            loss=self.loss,
            global_step=global_step,
            learning_rate=learning_rate,
            optimizer=optimizer,
            clip_gradients=clip_gradients)

    def _make_cell(self):
        cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        keep_prob = dropout_keep_prob if self.mode == 'train' else 1.0
        cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
        return cell

    def _prepend_go(self, labels, name=None):
        return tf.concat([
            self._broadcast_char(GO, name='special_go'),
            labels,
        ], axis=1, name=name)

    def _append_eos(self, labels, name=None):
        return tf.concat([
            labels,
            self._broadcast_char(EOS, name='special_eos'),
        ], axis=1, name=name)

    def _broadcast_char(self, char, name=None):
        with tf.variable_scope(name or 'broadcast_char'):
            result = tf.one_hot(
                char_to_id(char), alphabet_size, dtype=dtype)
            result = tf.tile(result, [self.batch_size])
            result = tf.reshape(
                result, [self.batch_size, 1, alphabet_size], name=name)
            return result

In [4]:
train_dir = '/tmp/lstm_seq2seq'


def train(num_steps=None, max_variation=None):
    graph = tf.Graph()
    with graph.as_default():
        model = Model(mode='train')
        model.build_graph()
        hooks = [
            tf.train.StopAtStepHook(num_steps=num_steps)
            if num_steps is not None else None,
            StopWhenPlateauHook(model.loss, max_variation=max_variation)
            if max_variation is not None else None,
            tf.train.NanTensorHook(model.loss),
            LoggerHook(model),
        ]
        hooks = [hook for hook in hooks if hook is not None]
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=train_dir,
                save_checkpoint_secs=60,
                hooks=hooks) as sess:
            while not sess.should_stop():
                batch_sentences, batch_labels = next_batch()
                _, batch_loss = sess.run(
                    [model.train_op, model.loss],
                    feed_dict={
                        model.sentences: batch_sentences,
                        model.labels: batch_labels,
                    })


class LoggerHook(tf.train.SessionRunHook):
    def __init__(self, model):
        self.model = model

    def begin(self):
        self.valid_sentences, self.valid_labels = next_batch(100)
        self.valid_model = Model(mode='eval')
        with tf.name_scope('eval'), tf.variable_scope(tf.get_variable_scope(), reuse=True):
            self.valid_model.build_graph()

    def before_run(self, run_context):
        return tf.train.SessionRunArgs({
            'step': tf.contrib.framework.get_global_step(),
            'batch_loss': self.model.loss,
            'valid_loss': self.valid_model.loss,
            'valid_probs': self.valid_model.probs,
        }, feed_dict={
            self.valid_model.sentences: self.valid_sentences,
            self.valid_model.labels: self.valid_labels,
        })

    def after_run(self, run_context, run_values):
        step = run_values.results['step']
        if not (step % 100 == 0 or step == 1):
            return
        batch_loss = run_values.results['batch_loss']
        valid_loss = run_values.results['valid_loss']
        valid_probs = run_values.results['valid_probs']
        valid_predictions = np.argmax(valid_probs, axis=2)[:, :-1]
        num_ok = np.sum(np.all(valid_predictions == np.argmax(
            self.valid_labels, axis=2), axis=1))
        print('step: {:<5} batch loss: {:<8.4f} valid loss: {:<8.4f} valid "accuracy": {:<8.4f}'.format(
            step, batch_loss, valid_loss, num_ok / len(valid_predictions)))
        n = 5
        print('inputs: ', batch_to_sentences(
            self.valid_sentences[:n, :, :]))
        print('labels: ', [
              x + '$' for x in batch_to_sentences(self.valid_labels[:n, :, :])])
        print('outputs:', batch_to_sentences(valid_probs[:n, :, :]))


class StopWhenPlateauHook(tf.train.SessionRunHook):
    """Hook that requests stop when the metric reaches a plateau."""

    def __init__(self, metric, *, max_variation, num_steps=50):
        self._metric = metric
        self._max_variation = max_variation
        self._history = collections.deque(maxlen=num_steps)

    def before_run(self, run_context):
        return tf.train.SessionRunArgs(self._metric)

    def after_run(self, run_context, run_values):
        metric = run_values.results
        self._history.append(metric)
        if len(self._history) < self._history.maxlen:
            return
        variation = max(self._history) - min(self._history)
        if variation < self._max_variation:
            run_context.request_stop()


train(num_steps=20)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/lstm_seq2seq/model.ckpt.
step: 1     batch loss: 3.3697   valid loss: 6.7395   valid "accuracy": 0.0000  
inputs:  ['egysc', ' qcnf', 'dwzza', 'ljist', 'v wir']
labels:  ['csyge$', 'fncq $', 'azzwd$', 'tsijl$', 'riw v$']
outputs: ['abytty', 'msajba', 'owwhub', 'hzttyy', 'oyybaa']
INFO:tensorflow:Saving checkpoints for 20 into /tmp/lstm_seq2seq/model.ckpt.


In [5]:
def inspect():
    np.set_printoptions(precision=3, suppress=True)
    graph = tf.Graph()
    with graph.as_default():
        model = Model(mode='inference')
        model.build_graph()
        with tf.train.MonitoredSession(
                session_creator=tf.train.ChiefSessionCreator(
                    checkpoint_dir=train_dir)) as sess:
            batch_sentences, batch_labels = next_batch(batch_size=1)
            results = sess.run(
                {
                    '1 encoder_inputs': model.encoder_inputs,
                    '2 encoder_num_rolls': model.encoder_num_rolls,
                    '3 decoder_inputs': model.decoder_inputs,
                    '4 decoder_num_rolls': model.decoder_num_rolls,
                    '5 decoder_outputs': model.decoder_outputs,
                    'batch_probs': model.probs,
                },
                feed_dict={
                    model.sentences: batch_sentences,
                    model.labels: batch_labels,
                })
            print('inputs: ', batch_to_sentences(batch_sentences))
            print('labels: ', batch_to_sentences(batch_labels))
            print('outputs:', batch_to_sentences(results['batch_probs']))
            print()
            for k, v in sorted(results.items()):
                print(k)
                print(v)
                print()


inspect()

inputs:  ['osymrydjqz']
labels:  ['zqjdyrmyso']
outputs: ['$$$$$$$$$$$']

1 encoder_inputs
[[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
    0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  

In [6]:
train(num_steps=5000, max_variation=0.01)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 21 into /tmp/lstm_seq2seq/model.ckpt.
step: 100   batch loss: 3.0244   valid loss: 6.1390   valid "accuracy": 0.0000  
inputs:  ['nlhomzjyu', 'pwmycamse', 'syhpapzyq', ' neoohcdm', 'tfopzzebe']
labels:  ['uyjzmohln$', 'esmacymwp$', 'qyzpaphys$', 'mdchooen $', 'ebezzpoft$']
outputs: ['zzzzzzzz$$', 'tttttt$$$$', 'kkkkk$$$$$', 'nnnnnnnb$$', 'ftttttt$$$']
INFO:tensorflow:global_step/sec: 1.92986
INFO:tensorflow:Saving checkpoints for 139 into /tmp/lstm_seq2seq/model.ckpt.
step: 200   batch loss: 2.3228   valid loss: 4.9937   valid "accuracy": 0.0000  
inputs:  ['nlhomzjyu', 'pwmycamse', 'syhpapzyq', ' neoohcdm', 'tfopzzebe']
labels:  ['uyjzmohln$', 'esmacymwp$', 'qyzpaphys$', 'mdchooen $', 'ebezzpoft$']
outputs: ['nnnnnnnnn$', 'aaaaaaaa$$', 'ppppppppp$', 'nnnnnnnnn$', 'pppppttt$$']
INFO:tensorflow:global_step/sec: 2.10587
INFO:tensorflow:Saving checkpoints for 263 into /tmp/lstm_seq2seq/model.ckpt.
step: 300

In [7]:
def encode_char(c):
    result = np.zeros([alphabet_size])
    result[char_to_id(c)] = 1.0
    return result


def encode_sentence(sentence):
    return np.stack([encode_char(c) for c in sentence])


def decode_char(probs):
    if np.all(probs == 0.0):
        return '0'
    return id_to_char(np.argmax(probs))


def run_greedy(sentence):
    graph = tf.Graph()
    with graph.as_default():
        model = Model(mode='inference')
        model.build_graph()
        with tf.train.MonitoredSession(
                session_creator=tf.train.ChiefSessionCreator(checkpoint_dir=train_dir)) as sess:
            state = sess.run(
                model.encoder_output_state,
                feed_dict={
                    model.sentences: [encode_sentence(sentence)],
                })
            c = GO
            result = []
            while True:
                state, probs = sess.run(
                    [model.decoder_output_state, model.probs],
                    feed_dict={
                        model.decoder_input_state: state,
                        model.decoder_inputs: [encode_sentence(c)],
                    })
                probs = np.squeeze(probs)
                c = decode_char(probs)
                if c == EOS:
                    break
                result.append(c)
                print('{c} p={p:.2f}'.format(c=c, p=np.max(probs)))
            return ''.join(result)


for sentence in ['abc', 'tiny little', 'pony', 'the quick brown fox', string.ascii_lowercase]:
    print('sentence: {}'.format(sentence))
    print('target:   {}'.format(''.join(reversed(sentence))))
    print('output:   {}'.format(run_greedy(sentence)))
    print()

sentence: abc
target:   cba
c p=1.00
b p=1.00
a p=1.00
output:   cba

sentence: tiny little
target:   elttil ynit
e p=1.00
l p=0.97
t p=1.00
t p=0.96
i p=0.98
l p=1.00
  p=1.00
y p=1.00
n p=1.00
i p=0.99
t p=1.00
output:   elttil ynit

sentence: pony
target:   ynop
y p=1.00
n p=1.00
o p=1.00
p p=1.00
output:   ynop

sentence: the quick brown fox
target:   xof nworb kciuq eht
x p=1.00
o p=0.51
f p=1.00
n p=0.61
  p=1.00
w p=1.00
o p=0.96
r p=0.98
c p=1.00
output:   xofn worc

sentence: abcdefghijklmnopqrstuvwxyz
target:   zyxwvutsrqponmlkjihgfedcba
z p=0.99
y p=0.99
x p=1.00
w p=1.00
v p=0.99
u p=0.60
t p=0.93
s p=1.00
r p=0.76
p p=0.98
q p=0.78
o p=1.00
output:   zyxwvutsrpqo



In [8]:
def run_beam(sentence, num_options=10):
    graph = tf.Graph()
    with graph.as_default():
        model = Model(mode='inference')
        model.build_graph()
        with tf.train.MonitoredSession(
                session_creator=tf.train.ChiefSessionCreator(checkpoint_dir=train_dir)) as sess:
            state = sess.run(
                model.encoder_output_state,
                feed_dict={
                    model.sentences: [encode_sentence(sentence)],
                })
            Option = collections.namedtuple(
                'Option', ['p', 'sentence', 'state'])
            options = [Option(p=1.0, sentence=GO, state=state)]
            while True:
                new_options = []
                for option in options:
                    c = option.sentence[-1]
                    if c == EOS:
                        new_options.append(option)
                        continue
                    state, probs = sess.run(
                        [model.decoder_output_state, model.probs],
                        feed_dict={
                            model.decoder_input_state: option.state,
                            model.decoder_inputs: [encode_sentence(c)],
                        })
                    probs = np.squeeze(probs)
                    for i, p in enumerate(probs):
                        c = id_to_char(i)
                        new_option = Option(
                            p=option.p * p,
                            sentence=option.sentence + c,
                            state=state)
                        new_options.append(new_option)
                options = list(
                    sorted(new_options, key=lambda x: -x.p))[:num_options]
                if all(option.sentence[-1] == EOS for option in options):
                    break
            return [(option.sentence[1:-1], option.p) for option in options]


for sentence in ['abc', 'tiny little', 'pony', 'the quick brown fox', string.ascii_lowercase]:
    print('sentence:        {}'.format(sentence))
    print('target:          {}'.format(''.join(reversed(sentence))))
    for option, prob in run_beam(sentence):
        print('output: p={:.4f} {}'.format(prob, option))
    print()

sentence:        abc
target:          cba
output: p=0.9936 cba
output: p=0.0016 bca
output: p=0.0006 cbap
output: p=0.0005 cbas
output: p=0.0004 cbay
output: p=0.0003 cba 
output: p=0.0003 cbam
output: p=0.0002 cbad
output: p=0.0002 cbao
output: p=0.0002 cbav

sentence:        tiny little
target:          elttil ynit
output: p=0.8990 elttil ynit
output: p=0.0131 eltitl yinyv
output: p=0.0063 elttli ynltv
output: p=0.0045 elttil ynti
output: p=0.0035 elttli ynlt
output: p=0.0025 elttil ynitg
output: p=0.0024 ehtltgztsnm
output: p=0.0022 eltitl qyiwj
output: p=0.0017 eltitl xiyp
output: p=0.0016 eltitl yinym

sentence:        pony
target:          ynop
output: p=0.9966 ynop
output: p=0.0023 yonp
output: p=0.0003 ynpo
output: p=0.0003 oynp
output: p=0.0001 yno
output: p=0.0000 ynopz
output: p=0.0000 nyop
output: p=0.0000 ynopc
output: p=0.0000 wyopc
output: p=0.0000 nyopn

sentence:        the quick brown fox
target:          xof nworb kciuq eht
output: p=0.2889 xofn worc
output: p=0.1846