# Acme test

## Train D4PG on MountainCarContinuous

In [6]:
from acme import environment_loop
from acme import specs
from acme import wrappers
from acme.agents.tf import d4pg
from acme.tf import networks
from acme.tf import utils as tf2_utils
from acme.utils import loggers
import numpy as np
import sonnet as snt
import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [7]:
import gym

In [8]:
# Imports required for visualization
import pyvirtualdisplay
import imageio
import base64

In [29]:
# Create a hidden 1400x900 virtual display, then start it so that rendering
# has a display to draw to.
display = pyvirtualdisplay.Display(size=(1400, 900), visible=0)
display.start()

In [41]:
# Use a continuous-control task: the policy network built further down ends in
# a tanh layer mapped onto the action spec, i.e. it emits real-valued actions.
# NOTE(review): the outputs recorded in this notebook (4-dim observation,
# Discrete(2) action space, AssertionError during training) look like they came
# from a CartPole run -- re-run all cells to refresh them.
environment = gym.make('MountainCarContinuous-v0')
# Alternative environment, kept for reference:
# environment = gym.make('CartPole-v1')

In [42]:
# Reset the raw Gym environment and keep the initial observation for inspection.
state = environment.reset()

In [43]:
# Show the shape of the initial observation.
state.shape

(4,)

In [44]:
# Show the raw values of the initial observation.
state

array([ 0.01637666,  0.03503753,  0.00148239, -0.0016279 ], dtype=float32)

In [45]:
# Inspect the action space of the raw Gym environment.
# NOTE(review): the agent built below emits real-valued actions, so this is
# expected to be a continuous space -- confirm after re-running.
environment.action_space

Discrete(2)

In [49]:
# Smoke-test a single step with an action sampled from the environment's own
# action space. A hard-coded integer such as `0` is only a valid action for
# discrete spaces; sampling works for discrete and continuous spaces alike.
environment.step(environment.action_space.sample())

(array([ 0.01707742, -0.16010565,  0.00144983,  0.29152238], dtype=float32),
 1.0,
 False,
 {})

In [50]:
# Wrap the Gym environment with Acme's GymWrapper. Later cells consume the
# results of reset()/step() as timestep objects (`.observation`, `.last()`).
# NOTE: this rebinds `environment`, so re-running the cell wraps it again.
environment = wrappers.GymWrapper(environment)

In [51]:
# Check that the Gym action space is still reachable through the wrapper.
environment.action_space

Discrete(2)

In [52]:
# Make sure the environment outputs single-precision floats.
# NOTE: this rebinds `environment` again; re-running the cell adds another
# wrapper layer on top of the current one.
environment = wrappers.SinglePrecisionWrapper(environment)

In [53]:
# Grab the spec of the (wrapped) environment. Its `actions` entry is used below
# to size the policy output and to scale the tanh-squashed actions.
environment_spec = specs.make_environment_spec(environment)

In [54]:
# Display the full spec (observations, actions, rewards, discounts).
environment_spec

EnvironmentSpec(observations=BoundedArray(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], maximum=[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]), actions=DiscreteArray(shape=(), dtype=int32, name=action, minimum=0, maximum=1, num_values=2), rewards=Array(shape=(), dtype=dtype('float32'), name='reward'), discounts=BoundedArray(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0))

In [55]:
#@title Build agent networks

# Width of the policy output: the product of the action spec's shape entries.
num_dimensions = np.prod(environment_spec.actions.shape, dtype=int)

# Observations are not embedded by a learned module; the stateless
# `batch_concat` utility is used as the shared observation network.
observation_network = tf2_utils.batch_concat

# Deterministic policy: LayerNorm MLP torso, then a near-zero-initialised
# linear layer with one unit per action dimension, then a tanh mapped onto the
# action spec.
policy_layers = [
    networks.LayerNormMLP((256, 256, 256), activate_final=True),
    networks.NearZeroInitializedLinear(num_dimensions),
    networks.TanhToSpec(environment_spec.actions),
]
policy_network = snt.Sequential(policy_layers)

# Distributional critic: the multiplexer joins observations and actions, the
# MLP processes them, and the head is configured with 51 atoms over
# [-150, 150].
critic_layers = [
    networks.CriticMultiplexer(),
    networks.LayerNormMLP((512, 512, 256), activate_final=True),
    networks.DiscreteValuedHead(vmin=-150., vmax=150., num_atoms=51),
]
critic_network = snt.Sequential(critic_layers)

In [56]:
# Show the number of action dimensions the policy network will output.
num_dimensions

1

In [57]:
# Two terminal loggers, one for the agent and one for the environment loop,
# each constructed with time_delta=10.
agent_logger = loggers.TerminalLogger(time_delta=10.0, label='agent')
env_loop_logger = loggers.TerminalLogger(time_delta=10.0, label='env_loop')

# Assemble the D4PG agent from the spec and the networks defined earlier.
# Checkpointing is switched off.
agent = d4pg.D4PG(
    environment_spec=environment_spec,
    observation_network=observation_network,
    policy_network=policy_network,
    critic_network=critic_network,
    sigma=1.0,
    checkpoint=False,
    logger=agent_logger,
)

# Create a loop connecting this agent to the environment created above.
env_loop = environment_loop.EnvironmentLoop(
    environment,
    agent,
    logger=env_loop_logger,
)

In [58]:
# Run `num_episodes` training episodes (50 here).
# Rerun this cell until the agent has learned the given task.
env_loop.run(num_episodes=50)

AssertionError: array([0.16814864], dtype=float32) (<class 'numpy.ndarray'>) invalid

In [38]:
# Create a simple helper function to render a frame from the current state of
# the environment.

def render(env):
  """Render the current state of `env` as an RGB array.

  Args:
    env: An environment wrapper that exposes the object it wraps through its
      `environment` attribute.

  Returns:
    The result of calling `render(mode='rgb_array')` on `env.environment`.
  """
  wrapped = env.environment
  return wrapped.render(mode='rgb_array')


def display_video(frames, filename='temp.mp4'):
  """Save `frames` as a video file and return an inline HTML player for it.

  Relies on the module-level `imageio`, `base64` and `IPython` imports being
  in scope when the function is called.

  Args:
    frames: Iterable of frames; each one is handed to the imageio writer's
      `append_data`.
    filename: Path of the video file to write and then read back.

  Returns:
    An `IPython.display.HTML` object whose markup is a `<video>` element with
    the file's bytes embedded as a base64 `data:` URI.
  """

  # Write video
  with imageio.get_writer(filename, fps=60) as video:
    for frame in frames:
      video.append_data(frame)

  # Read the encoded file back. The context manager closes the handle
  # immediately instead of leaving it to the garbage collector.
  with open(filename, 'rb') as video_file:
    b64_video = base64.b64encode(video_file.read())

  # The element is explicitly closed so that any output rendered after it is
  # not treated as fallback content of the <video> tag.
  video_tag = ('<video  width="320" height="240" controls alt="test" '
               'src="data:video/mp4;base64,{0}"></video>').format(
                   b64_video.decode())

  return IPython.display.HTML(video_tag)

In [39]:
import IPython

In [40]:
# Begin a fresh episode and capture the frame for its initial state.
timestep = environment.reset()
frames = []
frames.append(render(environment))

# Keep stepping until the environment reports the last timestep of the episode.
while not timestep.last():
  # Ask the agent for an action given the latest observation, then apply it.
  action = agent.select_action(timestep.observation)
  timestep = environment.step(action)

  # Record what the environment looks like after this step.
  frames.append(render(environment))

# Encode all collected frames and show the resulting clip inline.
display_video(np.array(frames))

