
Commit

Switch to the Keras LSTM/GRU implementation
Recent versions of TensorFlow Keras automatically switch
between the cuDNN and plain TensorFlow implementations. The trained
parameters work regardless of the selected implementation.

The conditions for using the cuDNN implementation are documented at:

https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM

They boil down to: (1) an NVIDIA GPU is available, and (2) certain
hyperparameters (e.g. activations) are set to specific values. When the
cuDNN implementation is selected, this results in a nice speedup.
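
As an illustration of those conditions (not part of this commit; the layer
size, input shape, and variable names are made up), here is a minimal sketch
of an LSTM configured so the cuDNN kernel can be picked, assuming a
TF 2.x-style Keras API (exposed as tf.compat.v2.keras on 1.15):

import tensorflow as tf

# Hypothetical input: batches of variable-length sequences with 100 features.
inputs = tf.keras.Input(shape=(None, 100))

# These are the Keras defaults; changing any of them (or masking with
# sequences that are not right-padded) forces the generic implementation.
lstm = tf.keras.layers.LSTM(
    50,
    activation='tanh',               # cuDNN requires tanh
    recurrent_activation='sigmoid',  # cuDNN requires sigmoid
    recurrent_dropout=0.0,           # recurrent dropout disables cuDNN
    unroll=False,
    use_bias=True,
    return_sequences=True)

outputs = lstm(inputs)  # cuDNN kernel is used when an NVIDIA GPU is visible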

The TensorFlow requirement is bumped to 1.15.0. This setup fails on
1.14.0 with a constant-folding error in Grappler:

tensorflow/tensorflow#29525
danieldk committed Nov 7, 2019
1 parent 8e5b956 commit 11b2d27
Showing 6 changed files with 57 additions and 139 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -13,7 +13,7 @@ matrix:
       python: 3.6
       before_script:
         - cd sticker-graph
-        - pip install tensorflow==1.14.0 toml
+        - pip install tensorflow==1.15.0 toml
       script:
         - python setup.py test
         - ./sticker-write-rnn-graph --rnn_layers 2 --hidden_size 100 testdata/sticker.shapes rnn.graph
4 changes: 2 additions & 2 deletions sticker-graph/setup.py
@@ -9,11 +9,11 @@
     license='BlueOak-1.0.0',
     tests_require=[
         'numpy',
-        'tensorflow == 1.13.1',
+        'tensorflow == 1.15.0',
         'toml',
     ],
     install_requires=[
-        'tensorflow == 1.13.1',
+        'tensorflow == 1.15.0',
         'toml',
     ],
     packages=['sticker_graph'],
15 changes: 10 additions & 5 deletions sticker-graph/sticker_graph/model.py
@@ -199,11 +199,16 @@ def subword_reprs(self):
         byte_lens = tf.reshape(subword_lens, [-1])
 
         with tf.compat.v1.variable_scope("byte_rnn"):
-            _, fw, bw = bidi_rnn_layers(self.is_training, byte_reprs, num_layers=self.args.subword_layers, output_size=self.args.subword_hidden_size,
-                                        output_keep_prob=self.args.subword_keep_prob, seq_lens=byte_lens, gru=self.args.subword_gru, residual_connections=self.args.subword_residual)
-
-            # Concat forward/backward states.
-            subword_reprs = tf.concat([fw[-1].h, bw[-1].h], axis=-1)
+            subword_reprs = bidi_rnn_layers(
+                self.is_training,
+                byte_reprs,
+                num_layers=self.args.subword_layers,
+                output_size=self.args.subword_hidden_size,
+                output_keep_prob=self.args.subword_keep_prob,
+                seq_lens=byte_lens,
+                gru=self.args.subword_gru,
+                residual_connections=self.args.subword_residual,
+                return_sequences=False)
 
         return tf.reshape(subword_reprs, [bytes_shape[0], bytes_shape[1],
                                           subword_reprs.shape[-1]])
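For intuition, a small standalone sketch (not code from this repository;
batch and layer sizes are invented, and a TF 2.x eager environment is
assumed) showing why return_sequences=False can replace the manual tf.concat
of the final forward/backward states above: Bidirectional already
concatenates the two directions' last hidden states.

import numpy as np
import tensorflow as tf

# Toy batch: 4 sequences of 10 steps with 8 features each.
inputs = np.random.rand(4, 10, 8).astype(np.float32)

# return_sequences=False: each direction emits only its final hidden state,
# and Bidirectional concatenates them, giving shape [batch, 2 * units].
bi_last = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(16, return_sequences=False))

# return_sequences=True: the full sequence of states, [batch, time, 2 * units].
bi_full = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(16, return_sequences=True))

print(bi_last(inputs).shape)  # (4, 32)
print(bi_full(inputs).shape)  # (4, 10, 32)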
89 changes: 43 additions & 46 deletions sticker-graph/sticker_graph/rnn.py
@@ -1,25 +1,6 @@
 import tensorflow as tf
 
-import sticker_graph.vendored
-
-
-def dropout_wrapper(
-        cell,
-        is_training,
-        output_keep_prob=1.0,
-        state_keep_prob=1.0):
-    output_keep_prob = tf.cond(
-        pred=is_training,
-        true_fn=lambda: tf.constant(output_keep_prob),
-        false_fn=lambda: tf.constant(1.0))
-    state_keep_prob = tf.cond(
-        pred=is_training,
-        true_fn=lambda: tf.constant(state_keep_prob),
-        false_fn=lambda: tf.constant(1.0))
-    return tf.compat.v1.nn.rnn_cell.DropoutWrapper(
-        cell,
-        output_keep_prob=output_keep_prob,
-        state_keep_prob=state_keep_prob)
+from sticker_graph.keras_vendored import GRU, LSTM
 
 
 def bidi_rnn_layers(
@@ -28,32 +9,48 @@ def bidi_rnn_layers(
         num_layers=1,
         output_size=50,
         output_keep_prob=1.0,
-        state_keep_prob=1.0,
-        seq_lens=None,
         gru=False,
-        residual_connections=False):
+        residual_connections=False,
+        return_sequences=True,
+        seq_lens=None):
     if gru:
-        cell = tf.compat.v1.nn.rnn_cell.GRUCell
+        rnn_layer = GRU
     else:
-        cell = tf.compat.v1.nn.rnn_cell.LSTMCell
-
-    fw_cells = [
-        dropout_wrapper(
-            cell=cell(output_size),
-            is_training=is_training,
-            state_keep_prob=state_keep_prob,
-            output_keep_prob=output_keep_prob) for i in range(num_layers)]
-
-    bw_cells = [
-        dropout_wrapper(
-            cell=cell(output_size),
-            is_training=is_training,
-            state_keep_prob=state_keep_prob,
-            output_keep_prob=output_keep_prob) for i in range(num_layers)]
-    return sticker_graph.vendored.stack_bidirectional_dynamic_rnn(
-        fw_cells,
-        bw_cells,
-        inputs,
-        dtype=tf.float32,
-        sequence_length=seq_lens,
-        residual_connections=residual_connections)
+        rnn_layer = LSTM
+
+    # Compute mask
+    mask = None
+    if seq_lens is not None:
+        mask = tf.sequence_mask(
+            seq_lens, maxlen=tf.shape(
+                inputs)[1])
+
+    layer = inputs
+    for i in range(num_layers):
+        # Keep a reference to the previous layer for residual connections.
+        prev_layer = layer
+
+        layer_return_sequences = True
+        if i == num_layers - 1:
+            layer_return_sequences = return_sequences
+
+        # Bidirectional RNN + state output dropout.
+        layer = tf.compat.v2.keras.layers.Bidirectional(
+            rnn_layer(
+                output_size,
+                return_sequences=layer_return_sequences))(
+            layer,
+            mask=mask)
+        layer = tf.compat.v2.keras.layers.Dropout(
+            1.0 -
+            output_keep_prob)(
+            layer,
+            training=is_training)
+
+        # Add a residual connection if requested. A residual connection
+        # is not added for the first layer, since input/output sizes
+        # may mismatch.
+        if i != 0 and residual_connections:
+            layer = layer + prev_layer
+
+    return layer
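
To make the new signature concrete, a hypothetical call follows (the
placeholders, sizes, and the sticker_graph.rnn import path are assumptions
for illustration). Note that output_keep_prob is still a keep probability;
the function converts it to the drop rate Keras expects via
1.0 - output_keep_prob.

import tensorflow as tf

from sticker_graph.rnn import bidi_rnn_layers

# Invented placeholders: a batch of padded sequences plus their lengths.
inputs = tf.compat.v1.placeholder(tf.float32, shape=[None, None, 100])
seq_lens = tf.compat.v1.placeholder(tf.int32, shape=[None])
is_training = tf.compat.v1.placeholder(tf.bool, shape=[])

# Two stacked bidirectional LSTMs over the full sequence, with residual
# connections between the stacked layers.
hidden_states = bidi_rnn_layers(
    is_training,
    inputs,
    num_layers=2,
    output_size=100,
    output_keep_prob=0.85,
    seq_lens=seq_lens,
    gru=False,
    residual_connections=True)

# return_sequences defaults to True, so the result has shape
# [batch, time, 2 * output_size].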
2 changes: 1 addition & 1 deletion sticker-graph/sticker_graph/rnn_model.py
@@ -14,7 +14,7 @@ def __init__(
 
         self.setup_placeholders()
 
-        hidden_states, _, _ = bidi_rnn_layers(
+        hidden_states = bidi_rnn_layers(
             self.is_training,
             self.inputs,
             num_layers=args.rnn_layers,
84 changes: 0 additions & 84 deletions sticker-graph/sticker_graph/vendored.py
@@ -17,90 +17,6 @@
 import tensorflow as tf
 
 
-def stack_bidirectional_dynamic_rnn(cells_fw,
-                                    cells_bw,
-                                    inputs,
-                                    initial_states_fw=None,
-                                    initial_states_bw=None,
-                                    dtype=None,
-                                    sequence_length=None,
-                                    parallel_iterations=None,
-                                    time_major=False,
-                                    scope=None,
-                                    residual_connections=False):
-    """
-    NOTE:
-    This is a modified copy of tf.contrib.rnn.stack_bidirectional_dynamic_rnn
-    that adds the option to have residual skip connections. It has been taken
-    from https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/contrib/rnn/python/ops/rnn.py.
-    If residual connections is True, the input of a layer is summed with the
-    output of the layer. In order to allow for inputs with other dimensionality
-    than that of the concatenated RNN states, the input to the first layer is
-    not summed to its output.
-    """
-    if not cells_fw:
-        raise ValueError(
-            "Must specify at least one fw cell for BidirectionalRNN.")
-    if not cells_bw:
-        raise ValueError(
-            "Must specify at least one bw cell for BidirectionalRNN.")
-    if not isinstance(cells_fw, list):
-        raise ValueError(
-            "cells_fw must be a list of RNNCells (one per layer).")
-    if not isinstance(cells_bw, list):
-        raise ValueError(
-            "cells_bw must be a list of RNNCells (one per layer).")
-    if len(cells_fw) != len(cells_bw):
-        raise ValueError(
-            "Forward and Backward cells must have the same depth.")
-    if (initial_states_fw is not None and
-            (not isinstance(initial_states_fw, list) or
-             len(initial_states_fw) != len(cells_fw))):
-        raise ValueError(
-            "initial_states_fw must be a list of state tensors (one per layer).")
-    if (initial_states_bw is not None and
-            (not isinstance(initial_states_bw, list) or
-             len(initial_states_bw) != len(cells_bw))):
-        raise ValueError(
-            "initial_states_bw must be a list of state tensors (one per layer).")
-
-    states_fw = []
-    states_bw = []
-    prev_layer = inputs
-
-    with tf.compat.v1.variable_scope(scope or "stack_bidirectional_rnn"):
-        for i, (cell_fw, cell_bw) in enumerate(zip(cells_fw, cells_bw)):
-            initial_state_fw = None
-            initial_state_bw = None
-            if initial_states_fw:
-                initial_state_fw = initial_states_fw[i]
-            if initial_states_bw:
-                initial_state_bw = initial_states_bw[i]
-
-            with tf.compat.v1.variable_scope("cell_%d" % i):
-                shortcut = prev_layer
-                outputs, (state_fw, state_bw) = tf.compat.v1.nn.bidirectional_dynamic_rnn(
-                    cell_fw,
-                    cell_bw,
-                    prev_layer,
-                    initial_state_fw=initial_state_fw,
-                    initial_state_bw=initial_state_bw,
-                    sequence_length=sequence_length,
-                    parallel_iterations=parallel_iterations,
-                    dtype=dtype,
-                    time_major=time_major)
-                # Concat the outputs to create the new input.
-                prev_layer = tf.concat(outputs, 2)
-                if i != 0 and residual_connections:
-                    prev_layer += shortcut
-
-            states_fw.append(state_fw)
-            states_bw.append(state_bw)
-
-    return prev_layer, tuple(states_fw), tuple(states_bw)
-
-
 def _create_file_writer_generic_type(logdir,
                                      name="logdir",
                                      max_queue=None,
