In [1]:
from blob_env.blob_env import *
from util import *

Using TensorFlow backend.


In [2]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

In [3]:
from gym.envs.registration import register
from gym.envs.registration import register

register(
    id='Blob2d-v1',
    entry_point='blob_env.blob_env:BlobEnv')

In [4]:
env = BlobEnv()
example_env = BlobEnv()
tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load('Blob2d-v1'))
eval_env = tf_py_environment.TFPyEnvironment(suite_gym.load('Blob2d-v1'))

In [53]:
num_iterations = 200000 # @param {type:"integer"}

initial_collect_steps = 100  # @param {type:"integer"} 
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

batch_size = 64  # @param {type:"integer"}
learning_rate = .1  # @param {type:"number"}
log_interval = 1000  # @param {type:"integer"}

num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

In [54]:
fc_layer_params = (100,)

q_net = q_network.QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    fc_layer_params=fc_layer_params)

In [55]:
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

In [56]:
eval_policy = agent.policy
collect_policy = agent.collect_policy

In [57]:
random_policy = random_tf_policy.RandomTFPolicy(tf_env.time_step_spec(),
                                                tf_env.action_spec())

In [58]:
all_returns = []
def compute_avg_return(environment, policy, num_episodes=100):

  total_return = 0.0
  for _ in range(num_episodes):
    episode_returns = []

    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]


# See also the metrics module for standard implementations of different metrics.
# https://github.com/tensorflow/agents/tree/master/tf_agents/metrics

In [59]:
eval_env.reset()
compute_avg_return(eval_env, random_policy, num_eval_episodes)

-1.0

In [60]:
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_max_length)

In [61]:
def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  traj = trajectory.from_transition(time_step, action_step, next_time_step)

  # Add trajectory to the replay buffer
  buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
  for _ in range(steps):
    collect_step(env, policy, buffer)

collect_data(tf_env, random_policy, replay_buffer, initial_collect_steps)

# This loop is so common in RL, that we provide standard implementations. 
# For more details see the drivers module.
# https://www.tensorflow.org/agents/api_docs/python/tf_agents/drivers

In [62]:
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)
iterator = iter(dataset)

In [None]:
num_iterations = 100000 # @param {type:"integer"}
try:
  %%time
except:
  pass

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]
losses = []

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  collect_data(tf_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
    losses.append(train_loss)

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)

step = 1000: loss = 22.408761978149414
step = 1000: Average Return = -1.0
step = 2000: loss = 12.162713050842285
step = 2000: Average Return = 1.600000023841858
step = 3000: loss = 9.425508499145508
step = 3000: Average Return = -1.0
step = 4000: loss = 12.46407699584961
step = 4000: Average Return = -1.0
step = 5000: loss = 5.286794185638428
step = 5000: Average Return = -1.0
step = 6000: loss = 18.408531188964844
step = 6000: Average Return = -30.899999618530273
step = 7000: loss = 1.582027792930603
step = 7000: Average Return = -1.0
step = 8000: loss = 3.266300678253174
step = 8000: Average Return = -1.0
step = 9000: loss = 1.809601068496704
step = 9000: Average Return = -1.0
step = 10000: loss = 5.318526268005371
step = 10000: Average Return = -1.0
step = 11000: loss = 14.190179824829102
step = 11000: Average Return = 1.600000023841858
step = 12000: loss = 3.689124822616577
step = 12000: Average Return = -30.899999618530273
step = 13000: loss = 23.070192337036133
step = 13000: Aver

In [None]:
iterations = range(0, num_iterations + 1, eval_interval)
plt.plot(iterations, returns)
plt.plot(iterations, losses)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=1,bottom=-5)

In [None]:
def embed_mp4(filename):
  """Embeds an mp4 file in the notebook."""
  video = open(filename,'rb').read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

In [None]:
video_py_env = suite_gym.load('Blob2d-v1')
video_env = tf_py_environment.TFPyEnvironment(video_py_env)

def create_policy_eval_video(policy, filename, num_episodes=50, fps=5):
  filename = filename + ".mp4"
  with imageio.get_writer(filename, fps=fps) as video:
    for _ in range(num_episodes):
      time_step = video_env.reset()
      video.append_data(video_py_env.render())
      while not time_step.is_last():
        action_step = policy.action(time_step)
        time_step = video_env.step(action_step.action)
        video.append_data(video_py_env.render())
  return embed_mp4(filename)




create_policy_eval_video(agent.policy, "trained-agent")