<a href="https://colab.research.google.com/github/rajdeepd/tensorflow_2.0_book_code/blob/master/ch09/Test_REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!sudo apt-get install -y xvfb ffmpeg
!pip install 'imageio==2.4.0'
#!pip install pyvirtualdisplay
!pip install tf-agents

Reading package lists... Done
Building dependency tree       
Reading state information... Done
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 31 not upgraded.
Need to get 784 kB of archives.
After this operation, 2,270 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8 [784 kB]
Fetched 784 kB in 3s (227 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected pack

In [2]:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl.testing import parameterized
from absl.testing.absltest import mock

import tensorflow as tf
import tensorflow_probability as tfp

from tf_agents.agents.reinforce import reinforce_agent
from tf_agents.networks import actor_distribution_rnn_network
from tf_agents.networks import network
from tf_agents.networks import utils as network_utils
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.utils import nest_utils

In [3]:
_obs_spec = tensor_spec.TensorSpec([2], tf.float32)
_time_step_spec = ts.time_step_spec(_obs_spec)
_action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)

In [4]:
class DummyActorNet(network.Network):

  def __init__(self,
               input_tensor_spec,
               output_tensor_spec,
               unbounded_actions=False,
               stateful=False):
    # When unbounded_actions=True, we skip the final tanh activation and the
    # action shift and scale. This allows us to compute the actor and critic
    # losses by hand more easily.
    # If stateful=True, the network state has the same shape as
    # `input_tensor_spec`. Otherwise it is empty.
    state_spec = (tf.TensorSpec(input_tensor_spec.shape, tf.float32)
                  if stateful else ())
    super(DummyActorNet, self).__init__(
        input_tensor_spec=input_tensor_spec,
        state_spec=state_spec,
        name='DummyActorNet')
    single_action_spec = tf.nest.flatten(output_tensor_spec)[0]
    activation_fn = None if unbounded_actions else tf.nn.tanh
    self._output_tensor_spec = output_tensor_spec
    self._dummy_layers = [
        tf.keras.layers.Dense(
            single_action_spec.shape.num_elements() * 2,
            activation=activation_fn,
            kernel_initializer=tf.constant_initializer([[2, 1], [1, 1]]),
            bias_initializer=tf.constant_initializer(5),
        ),
    ]

  def call(self, observations, step_type, network_state):
    del step_type

    states = tf.cast(tf.nest.flatten(observations)[0], tf.float32)
    for layer in self._dummy_layers:
      states = layer(states)

    single_action_spec = tf.nest.flatten(self._output_tensor_spec)[0]
    # action_spec is TensorSpec([1], ...) so make sure there's an outer dim.
    actions = states[..., 0]
    stdevs = states[..., 1]
    actions = tf.reshape(actions, [-1] + single_action_spec.shape.as_list())
    stdevs = tf.reshape(stdevs, [-1] + single_action_spec.shape.as_list())
    actions = tf.nest.pack_sequence_as(self._output_tensor_spec, [actions])
    stdevs = tf.nest.pack_sequence_as(self._output_tensor_spec, [stdevs])

    distribution = nest_utils.map_structure_up_to(
        self._output_tensor_spec,
        tfp.distributions.MultivariateNormalDiag,
        actions,
        stdevs)
    return distribution, network_state

In [5]:
agent = reinforce_agent.ReinforceAgent(
        _time_step_spec,
        _action_spec,
        actor_network=DummyActorNet(
            _obs_spec, _action_spec, unbounded_actions=False),
        optimizer=None,
    )


In [6]:
observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
time_steps = ts.restart(observations, batch_size=2)
actions = tf.constant([[0], [1]], dtype=tf.float32)
actions_distribution = agent.collect_policy.distribution(
        time_steps).action
returns = tf.constant([1.9, 1.0], dtype=tf.float32)

loss = agent.policy_gradient_loss(
  actions_distribution, actions, time_steps.is_last(), returns, 1) 

In [7]:
import sys

tf.print("loss:", loss, output_stream=sys.stdout)

loss: 3.61492157


 Test that `policy_gradient_loss` reacts correctly to rewards when there are:
 * A single MDP episode
 * Returns on the `tf.StepType.FIRST` transitions

F, L, M = `ts.StepType.{FIRST, MID, LAST}` in the chart below.

```
Experience looks like this:
Trajectories: (F, L) -> (L, F)
observation : [1, 2]    [1, 2]
action      :   [0]       [1]
reward      :    3         0
~is_boundary:    1         0
is_last     :    1         0
valid reward:   3*1       4*0
```

The second action & reward should be masked out due to being on a boundary (step_type=(L, F)) transition.

The expected_loss is > 0.0 in this case, only LAST should be excluded.

In [8]:
step_type = tf.constant([ts.StepType.FIRST, ts.StepType.LAST])
reward = tf.constant([3, 4], dtype=tf.float32)
discount = tf.constant([1, 0], dtype=tf.float32)
observations = tf.constant([[1, 2], [1, 2]], dtype=tf.float32)
time_steps = ts.TimeStep(step_type, reward, discount, observations)

actions = tf.constant([[0], [1]], dtype=tf.float32)
actions_distribution = agent.collect_policy.distribution(
        time_steps).action
returns = tf.constant([3.0, 0.0], dtype=tf.float32)

# Returns on the StepType.FIRST should be counted.
expected_loss = 10.8935775757
loss = agent.policy_gradient_loss(
        actions_distribution, actions, time_steps.is_last(), returns, 1)
tf.print("loss:", loss, output_stream=sys.stdout)

loss: 4.25681543


Bandit Episodes

Sample which shows how train reacts correctly to experience when there is only a single Bandit episode.  Bandit episodes are encoded differently than MDP episodes.  They have only a single transition with `step_type=StepType.FIRST` and `next_step_type=StepType.LAST`.

```
F, L, M = ts.StepType.{FIRST, MID, LAST} in the chart below.

Experience looks like this:
Trajectories: (F, L)
observation : [1, 2]
action      :   [0]
reward      :    3
~is_boundary:    0
is_last     :    1
valid reward:   3*1
```
The single bandit transition is valid and not masked.

The expected_loss is `> 0.0` in this case.

In [13]:
step_type = tf.constant([ts.StepType.FIRST])
next_step_type = tf.constant([ts.StepType.LAST])
reward = tf.constant([3], dtype=tf.float32)
discount = tf.constant([0], dtype=tf.float32)
observations = tf.constant([[1, 2]], dtype=tf.float32)
actions = tf.constant([[0]], dtype=tf.float32)

experience = nest_utils.batch_nested_tensors(trajectory.Trajectory(
        step_type, observations, actions, (), next_step_type, reward, discount))

# Rewards should be counted.
expected_loss = 10.8935775757

#if tf.executing_eagerly():
#      loss = lambda: agent.train(experience)
#else:
loss = lambda: agent.train(experience)
tf.print("loss:", loss, output_stream=sys.stdout)

loss: <function <lambda> at 0x7f96207265f0>
