In [1]:
import numpy as np
import tensorflow as tf
import gym

from agent import DQN
from agent import train
from replay import UniformReplayBuffer, PrioritizedReplayBuffer
from replay import Transition
from observer import AverageObserver, MaximumObserver

### Settings and parameters

In [2]:
# ---------------------------------------------------------------------------
# Run setup
# ---------------------------------------------------------------------------
# Gym environment id (handed to gym.make below)
env_name = "CartPole-v1"

# how many episodes to train for
num_episodes = 1000

# upper bound on the number of steps in a single episode
max_steps_per_episode = 10000

# reporting interval forwarded to train()
log_interval = 50

# ---------------------------------------------------------------------------
# Experience replay
# ---------------------------------------------------------------------------
# True -> PrioritizedReplayBuffer (PER), False -> UniformReplayBuffer
use_prioritized_experience_buffer = True

# maximum number of items the replay buffer holds
replay_buffer_capacity = 10000

# number of experiences sampled per batch
batch_size = 64

# ---------------------------------------------------------------------------
# Network update schedule
# ---------------------------------------------------------------------------
# period between updates of the online network
online_update_period = 1

# period between synchronizations of the target network
# NOTE(review): with a period of 1 the target network presumably mirrors the
# online network after every update -- confirm this is intended.
target_sync_period = 1

# whether the target network is soft-updated
use_soft_update = False

# update rate (tau) applied when soft update is enabled
target_update_tau = 1

# ---------------------------------------------------------------------------
# Learning hyperparameters
# ---------------------------------------------------------------------------
# step size given to the optimizer
learning_rate = 5e-4

# discount factor
# NOTE(review): 1 means future rewards are not discounted -- confirm intended.
gamma = 1

# exploration rate for the epsilon-greedy policy
epsilon = 0.1

# NOTE(review): no random seed is set in the visible cells, so repeated runs
# may not reproduce the recorded rewards.

# loss function: Keras mean-squared-error loss object
loss_function = tf.keras.losses.MeanSquaredError()

# optimizer: Adam configured with the learning rate chosen above
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

### Environment

In [3]:
# Build the Gym environment selected by env_name.
env = gym.make(env_name)

# Report the observation shape and the number of discrete actions.
observation_shape = env.observation_space.shape
num_actions = env.action_space.n
print("shape of State Space -> {}".format(observation_shape))
print("Size of Action Space ->  {}".format(num_actions))

shape of State Space -> (4,)
Size of Action Space ->  2


### DQN

In [4]:
# Agent configuration passed to DQN().
#   type    : "dqn" or "ddqn", default is "ddqn"
#   network : type is one of "dense", "conv", "lstm"; hidden_layers is (32, 32)
#   gamma / epsilon : taken from the values defined earlier in the notebook
config = dict(
    type="dqn",
    network=dict(
        type="dense",
        hidden_layers=(32, 32),
    ),
    gamma=gamma,
    epsilon=epsilon,
)

# Build the agent from the config, the environment's observation shape and
# its number of actions, using the optimizer and loss created earlier.
dqn = DQN(
    config,
    env.observation_space.shape,
    env.action_space.n,
    loss_function=loss_function,
    optimizer=optimizer,
)

# Replay buffer: prioritized or uniform depending on the settings flag; both
# are sized by replay_buffer_capacity.
# NOTE(review): alpha=0.6 is presumably the PER prioritization exponent --
# confirm against replay.py.
b_dqn = (
    PrioritizedReplayBuffer(size=replay_buffer_capacity, alpha=0.6)
    if use_prioritized_experience_buffer
    else UniformReplayBuffer(size=replay_buffer_capacity)
)

# Observers handed to train(): one AverageObserver and one MaximumObserver,
# each constructed with 10.
obv_dqn = [
    AverageObserver(10),
    MaximumObserver(10),
]

In [None]:
# Train the agent on env using the replay buffer; whatever train() returns is
# kept in dqn_rewards.
dqn_rewards = train(
    env,
    dqn,
    b_dqn,
    # episode budget and reporting
    num_episodes=num_episodes,
    max_steps_per_episode=max_steps_per_episode,
    log_interval=log_interval,
    observer=obv_dqn,
    # replay sampling
    batch_size=batch_size,
    # network update schedule
    online_update_period=online_update_period,
    target_sync_period=target_sync_period,
    use_soft_update=use_soft_update,
    target_update_tau=target_update_tau,
)

period: 50, average reward: 10.000, maximum reward: 11.000
period: 100, average reward: 11.700, maximum reward: 16.000
period: 150, average reward: 17.800, maximum reward: 28.000
period: 200, average reward: 36.600, maximum reward: 61.000
period: 250, average reward: 249.000, maximum reward: 429.000
