# Setup

In [243]:
import tensorflow as tf

from tf_agents.agents import DqnAgent
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.drivers import dynamic_step_driver
from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import py_metrics
from tf_agents.metrics import tf_metrics
from tf_agents.networks import sequential as sq
from tf_agents.policies import py_tf_eager_policy
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer

In [257]:
# Two separate CartPole instances: one is stepped while gathering training
# experience, the other is reserved for evaluation so that evaluating never
# disturbs the state of the training environment.
train_py_env = suite_gym.load('CartPole-v1')
eval_py_env = suite_gym.load('CartPole-v1')

# Wrap the Python environments so they can be driven with TensorFlow tensors.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

In [245]:
# Shape of a single observation; used as `input_shape` for the first network layer.
# NOTE: despite its name this holds the shape tuple, not the space object.
observation_space = train_py_env.observation_space.shape

# Size of the discrete action set, read from the environment's action space.
nr_actions = train_py_env.action_space.n

# Hyperparameters

In [256]:
# Length of the training loop.
num_iterations = 20000  # @param {type:"integer"}

# Experience collection and replay storage.
initial_collect_steps = 100  # @param {type:"integer"}
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 100000  # @param {type:"integer"}

# Optimisation and progress logging.
batch_size = 64  # @param {type:"integer"}
learning_rate = 1e-3  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

# Periodic evaluation.
num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}

# Model

In [246]:
# Q-network: maps one observation to one Q-value per discrete action.
#
# Only the TF-Agents `Sequential` network is kept, since `model` is what gets
# passed to `DqnAgent` as `q_network`. The earlier `tf.keras.Sequential`
# definition was overwritten immediately and never used, so it is removed.
model = sq.Sequential(layers=[
    # Hidden layer. The ReLU matters: two Dense layers with no activation in
    # between collapse into a single linear map of the observation.
    tf.keras.layers.Dense(8, activation='relu', input_shape=observation_space),
    # Linear output head, sized from the environment's action count rather
    # than a hard-coded 2, so it stays correct if the environment changes.
    tf.keras.layers.Dense(nr_actions),
])

## Optimizer

In [248]:
# Use the `learning_rate` hyperparameter instead of repeating the literal, so
# that changing it in the hyperparameter cell actually affects training.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Agent

In [250]:
# DQN agent built from the training environment's specs, using `model` as the
# Q-network and `optimizer` for its updates. Every other constructor argument
# is left at the library default.
agent = DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=model,
    optimizer=optimizer,
)

# Must be called once before the agent is used for training.
agent.initialize()

# Policy

In [254]:
# Uniformly random policy over the training environment's action spec; it does
# not depend on the agent at all.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec())

# The two policies exposed by the agent: `collect_policy` for gathering
# experience and `policy` for evaluation.
collect_policy = agent.collect_policy
eval_policy = agent.policy

# Metrics and evaluation

In [255]:
def compute_avg_return(environment, policy, num_episodes=10):
  """Run `policy` for several episodes and return the mean undiscounted return.

  Args:
    environment: object exposing `reset()` and `step(action)`, each returning
      a time step with `is_last()` and `reward`.
    policy: object whose `action(time_step)` returns something with an
      `.action` attribute that is fed to `environment.step`.
    num_episodes: number of full episodes to roll out.

  Returns:
    Element 0 of `.numpy()` on the averaged return. This assumes rewards are
    tensor-like with a leading batch dimension (batch size 1) -- TODO confirm
    for environments other than a `TFPyEnvironment`.
  """
  cumulative = 0.0
  for _episode in range(num_episodes):
    step = environment.reset()
    undiscounted = 0.0

    # Roll the episode forward until the environment reports its final step.
    while True:
      if step.is_last():
        break
      chosen = policy.action(step)
      step = environment.step(chosen.action)
      undiscounted = undiscounted + step.reward

    cumulative = cumulative + undiscounted

  mean_return = cumulative / num_episodes
  return mean_return.numpy()[0]

# Train

In [None]:
# Evaluate the agent's policy once before any training so the history of
# returns starts with a baseline entry.
returns = []
avg_return = compute_avg_return(
    eval_env, agent.policy, num_episodes=num_eval_episodes)
returns.append(avg_return)

In [258]:
# Training loop: collect experience into a replay buffer, sample minibatches
# from it, and update the agent.
#
# The previous version of this cell could not run: `PyDriver` was created
# without its required `observers` argument, no replay buffer existed, and
# `iterator` was never defined.

# Restart the step counter so the log/eval intervals line up with this run.
agent.train_step_counter.assign(0)

# Storage for collected trajectories, laid out per the agent's collect spec.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

# Seed the buffer with random-policy experience so the first minibatches can
# be sampled before the agent has collected anything itself.
initial_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=initial_collect_steps)
time_step, _ = initial_driver.run()

# `num_steps=2`: each sample is a pair of adjacent steps, which the DQN loss
# needs (current step plus next step) to form its TD target.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)
iterator = iter(dataset)

# Driver that steps the training environment with the agent's collect policy
# and writes every trajectory into the replay buffer.
collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)

# Continue from where the seeding run stopped, so the buffer holds one
# unbroken stream of steps.
policy_state = agent.collect_policy.get_initial_state(train_env.batch_size)

for _ in range(num_iterations):

  # Collect a few steps and save them to the replay buffer.
  time_step, policy_state = collect_driver.run(
      time_step=time_step, policy_state=policy_state)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)