In [2]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import pyvirtualdisplay
import reverb

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.policies import random_py_policy
import argparse

import gym

num_iterations = 10000 # @param {type:"integer"}

initial_collect_steps = 50  # @param {type:"integer"}
collect_steps_per_iteration =   1# @param {type:"integer"}
replay_buffer_max_length = 1000  # @param {type:"integer"}

batch_size = 32  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 100  # @param {type:"integer"}

In [3]:
t_env = tf_py_environment.TFPyEnvironment(suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0}))
e_env = tf_py_environment.TFPyEnvironment(suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0}))

In [4]:
tp_env = suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0})
ep_env = suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0})
env = suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0})

In [5]:
print('Observation Spec:')
print(t_env.time_step_spec().observation)


Observation Spec:
BoundedTensorSpec(shape=(36,), dtype=tf.float32, name='observation', minimum=array(0., dtype=float32), maximum=array(3., dtype=float32))


In [6]:
tp_env.reset()

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1.], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

In [7]:
print(suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0}).time_step_spec().observation)
print(suite_gym.load('gym_go:go-v1', gym_kwargs={'size':3,'komi':0}).action_spec())

BoundedArraySpec(shape=(36,), dtype=dtype('float32'), name='observation', minimum=0.0, maximum=3.0)
BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=8)


In [8]:
t_env.reset()

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1, 36), dtype=float32, numpy=
array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
        1., 1., 1., 1.]], dtype=float32)>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>})

In [9]:
t_env.observation_spec()

BoundedTensorSpec(shape=(36,), dtype=tf.float32, name='observation', minimum=array(0., dtype=float32), maximum=array(3., dtype=float32))

In [10]:
time_step = tp_env.step((0,0))
print(time_step)

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1.], dtype=float32),
 'reward': array(0., dtype=float32),
 'step_type': array(1, dtype=int32)})


In [11]:
print('Reward Spec:')
print(t_env.time_step_spec().reward)


Reward Spec:
TensorSpec(shape=(), dtype=tf.float32, name='reward')


In [12]:
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(t_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])


In [14]:
def observation_and_action_constraint_splitter(obs):
    return obs[:-9],obs[-9:]

In [15]:
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    t_env.time_step_spec(),
    t_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    observation_and_action_constraint_splitter=observation_and_action_constraint_splitter,
    train_step_counter=train_step_counter)

agent.initialize()


TypeError: 'BoundedTensorSpec' object is not subscriptable
  In call to configurable 'DqnAgent' (<class 'tf_agents.agents.dqn.dqn_agent.DqnAgent'>)

In [20]:
def compute_avg_return(environment, policy, num_episodes=10):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      print("action step")
      action_step = policy.action(time_step)
      print("env step")
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]


In [21]:
random_policy = random_tf_policy.RandomTFPolicy(t_env.time_step_spec(),
                                                t_env.action_spec(),observation_and_action_constraint_splitter=observation_and_action_constraint_splitter)


In [22]:
compute_avg_return(e_env, random_policy, num_eval_episodes)


action step


ValueError: Received a mix of batched and unbatched Tensors, or Tensors are not compatible with Specs.  num_outer_dims: 1.
Saw tensor_shapes:
   TimeStep(
{'discount': TensorShape([1]),
 'observation': {'legal_moves': TensorShape([1, 9]),
                 'observation': TensorShape([1, 27])},
 'reward': TensorShape([1]),
 'step_type': TensorShape([1])})
And spec_shapes:
   TimeStep(
{'discount': TensorShape([]),
 'observation': {'legal_moves': TensorShape([]),
                 'observation': TensorShape([27])},
 'reward': TensorShape([]),
 'step_type': TensorShape([])})

In [27]:
table_name = 'uniform_table'
replay_buffer_signature = tensor_spec.from_spec(
      agent.collect_data_spec)
replay_buffer_signature = tensor_spec.add_outer_dim(
    replay_buffer_signature)

table = reverb.Table(
    table_name,
    max_size=replay_buffer_max_length,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1),
    signature=replay_buffer_signature)

reverb_server = reverb.Server([table])

replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
    agent.collect_data_spec,
    table_name=table_name,
    sequence_length=2,
    local_server=reverb_server)

rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
  replay_buffer.py_client,
  table_name,
  sequence_length=2)


In [33]:
tp_env.action_spec()

AttributeError: 'BoundedArraySpec' object has no attribute 'item'

In [34]:
action_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
py_driver.PyDriver(
    env,
    random_py_policy.RandomPyPolicy(time_step_spec=tp_env.time_step_spec(),
    action_spec=action_spec,
    [rb_observer],
    max_steps=initial_collect_steps).run(tp_env.reset())


SyntaxError: positional argument follows keyword argument (<ipython-input-34-1d4aefee4ebc>, line 7)