In [None]:
import numpy as np
from datetime import datetime

from env import Env
from agent import SimpleAgent
from replay_memory import SimpleReplayMemory


# ---------------------------------------------------------------------------
# Training cell: build the environment, the agent and the replay memory, then
# run the episode loop, storing every transition and periodically learning
# from the memory.
# ---------------------------------------------------------------------------
env = Env(action_size=4, history_length=4, pixel_obs=False)
agent = SimpleAgent(
    env,
    atoms=51,
    V_min=-10.0,
    V_max=10.0,
    batch_size=64,
    multi_step=3,
    discount=0.99,
    norm_clip=10.0,
    lr=1e-4,
    adam_eps=1.5e-4,
    input_size=8,
    hidden_size=256,
    noisy_std=0.1,
    eps_decay=5e-4,
    min_epsilon=0.01,
)
mem = SimpleReplayMemory(
    100_000,
    env.window,
    agent.discount,
    agent.n,
    priority_weight=0.4,
    priority_exponent=0.5,
)

# --- Schedule / hyper-parameters -------------------------------------------
episodes = 10_000             # number of episodes to run
replay_frequency = 2          # learn (and reset noise) every this many steps
reward_clip = 1.0             # rewards are clamped to [-clip, clip]; <= 0 disables
max_steps = 50_000_000        # only used below to size the priority-weight ramp
learning_start_step = 5_000   # no learning before this many environment steps
target_update = 2_000         # target network is synced every this many steps

# Per-step increment that moves mem.priority_weight linearly from its initial
# value to 1 over the (max_steps - learning_start_step) learning steps.
priority_weight_increase = (1 - mem.priority_weight) / (
    max_steps - learning_start_step
)

# --- Bookkeeping -------------------------------------------------------------
rewards = []      # raw (unclipped) reward of every single step
ep_rewards = []   # raw cumulative reward of every finished episode

print(f"{datetime.now()}, start training")
steps = 0  # global environment-step counter across all episodes
for episode_ix in range(1, episodes + 1):
    observation = env.reset()
    ep_reward = 0
    ep_steps = 0
    done = False

    while not done:
        # The noise reset is keyed on the step count *before* this step is taken.
        if steps % replay_frequency == 0:
            agent.reset_noise()

        action = agent.act(observation)
        next_observation, reward, done, info = env.step(action)

        # Statistics are recorded with the raw reward, before any clipping.
        steps += 1
        ep_steps += 1
        ep_reward += reward
        rewards.append(reward)

        # Only the value written to the replay memory is clipped and rescaled.
        if reward_clip > 0:
            reward = min(reward, reward_clip)
            reward = max(reward, -reward_clip)
            reward = reward / reward_clip

        # The stored transition pairs the observation the action was chosen
        # from with the reward / done flag produced by that action.
        mem.append(observation, action, reward, done)

        if steps >= learning_start_step:
            # Ramp the priority weight towards 1, capping it there.
            mem.priority_weight = min(
                mem.priority_weight + priority_weight_increase, 1
            )

            if steps % replay_frequency == 0:
                agent.learn(mem)

            if steps % target_update == 0:
                agent.update_target_net()

        observation = next_observation

    ep_rewards.append(ep_reward)

    # NOTE: `% 1` is always 0, so this logs every episode; raise the modulus
    # to log less frequently.
    if episode_ix == 1 or episode_ix % 1 == 0:
        print(f"{datetime.now()}, episode:{episode_ix:2d}, step:{steps:5d}, reward:{ep_reward:10.4f}")

print(f"{datetime.now()}, end training")
env.close()

In [None]:
# ---------------------------------------------------------------------------
# Evaluation cell: play 10 rendered episodes with the agent's epsilon pinned
# to 0.01 and print the length and cumulative reward of each one.
# ---------------------------------------------------------------------------
agent.epsilon = 0.01

# The try/finally guarantees env.close() runs even when an episode is cut
# short (e.g. the kernel is interrupted while rendering, or render()/step()
# raises); previously the environment was left open in that case.
try:
    for _ in range(10):
        observation, ep_reward, ep_steps, done = env.reset(), 0, 0, False
        while not done:
            # Draw the current frame before acting so every step is shown.
            env.wrapped_env.render()
            action = agent.act_e_greedy(observation)
            next_observation, reward, done, info = env.step(action)
            ep_reward += reward
            ep_steps += 1
            observation = next_observation
        print(f"{datetime.now()}, ep_steps:{ep_steps:5d}, reward:{ep_reward:10.4f}")
finally:
    env.close()