In [None]:
import numpy as np
from datetime import datetime

from env import Env
from agent import Agent
from replay_memory import ReplayMemory


# ---------------------------------------------------------------------------
# Training cell: builds the environment, agent and replay memory, then runs
# the episode loop. Everything after the environment is created runs inside
# try/finally so the environment is closed even when the run is interrupted
# (KeyboardInterrupt from the notebook "stop" button) or raises.
# ---------------------------------------------------------------------------

# --- Training schedule / hyper-parameters ---
episodes = int(10e3)              # number of episodes to run
replay_frequency = 4              # learn (and resample noise) every N environment steps
reward_clip = 2.0                 # rewards are clamped to [-clip, clip] then divided by clip; <= 0 disables
max_steps = int(50e6)             # only used below to derive the priority-weight annealing slope
learning_start_step = int(20e3)   # no learning / annealing before this many steps
target_update = int(8e3)          # target network is refreshed every N steps

# Per-step raw rewards and per-episode raw returns (both unclipped).
rewards = []
ep_rewards = []

env = Env(action_size=4, history_length=4)
try:
    agent = Agent(env, atoms=51, V_min=-10.0, V_max=10.0, batch_size=32, multi_step=3, discount=0.99,
                  norm_clip=10.0, lr=6.25e-5, adam_eps=1.5e-4, hidden_size=256, noisy_std=0.1,
                  eps_decay=1e-6, min_epsilon=0.1)
    mem = ReplayMemory(int(1e6), env.window, agent.discount, agent.n,
                       priority_weight=0.4, priority_exponent=0.5)

    # Per-step increment that moves mem.priority_weight linearly from its
    # initial value to 1 over (max_steps - learning_start_step) steps.
    priority_weight_increase = (1 - mem.priority_weight) / (max_steps - learning_start_step)

    print(f"{datetime.now()}, start training")
    steps = 0
    for episode_ix in range(1, episodes + 1):
        observation, ep_reward, ep_steps, done = env.reset(), 0, 0, False
        while not done:
            # Resample the agent's noise on the same cadence as learning updates.
            if steps % replay_frequency == 0:
                agent.reset_noise()

            action = agent.act_e_greedy(observation)
            next_observation, reward, done, info = env.step(action)

            # Statistics are tracked on the raw (unclipped) reward.
            rewards.append(reward)
            ep_reward += reward
            ep_steps += 1
            steps += 1

            # The transition stored for learning uses the clipped reward, scaled to [-1, 1].
            if reward_clip > 0:
                reward = max(min(reward, reward_clip), -reward_clip) / reward_clip
            mem.append(observation, action, reward, done)

            if steps >= learning_start_step:
                # Anneal the priority weight towards 1, never exceeding it.
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)
                if steps % replay_frequency == 0:
                    agent.learn(mem)
                if steps % target_update == 0:
                    agent.update_target_net()

            observation = next_observation

        ep_rewards.append(ep_reward)
        # Progress line for the first episode and every 200th one after that.
        if episode_ix == 1 or episode_ix % 200 == 0:
            print(f"{datetime.now()}, episode:{episode_ix:2d}, step:{steps:5d}, reward:{ep_reward:10.4f}")
    print(f"{datetime.now()}, end training")
finally:
    # Always release the environment, including on interrupt or error.
    env.close()

In [None]:
# Close the environment bound to `env`. The training cell above already calls
# env.close() as its final statement, so this cell is presumably a manual
# fallback for runs that were stopped before reaching that line.
env.close()

In [None]:
# ---------------------------------------------------------------------------
# Evaluation cell: plays one episode in a fresh environment using agent.act()
# and prints the episode's own step count and total reward.
#
# Requires `agent` to exist in the kernel (created by the training cell).
# Per-step rewards are collected in `eval_step_rewards`, not in the training
# `rewards` list, so re-running this cell does not alter training statistics.
# ---------------------------------------------------------------------------
env = Env(action_size=4, history_length=4)
eval_step_rewards = []
try:
    observation, ep_reward, ep_steps, done = env.reset(), 0, 0, False
    while not done:
        action = agent.act(observation)
        next_observation, reward, done, info = env.step(action)
        eval_step_rewards.append(reward)
        ep_reward += reward
        ep_steps += 1
        observation = next_observation
    # Report this episode's length, not the training loop's counters.
    print(f"{datetime.now()}, evaluation, step:{ep_steps:5d}, reward:{ep_reward:10.4f}")
finally:
    # Release the environment even if the episode fails part-way through.
    env.close()

In [None]:
eval_rewards = []
eval_episode_rewards = []

for T in range(1, T_max + 1):
    if T >= learn_start:
#         if episode_count % 10 == 0 and done:
#             agent.eval()
            
#             eval_episode_reward = 0.0
#             eval_episode_steps = 0
#             eval_done = True
#             while True:
#                 if eval_done:
#                     eval_state, eval_done = env.reset(), False
#                 eval_action = agent.act_e_greedy(eval_state)
#                 eval_state, eval_reward, eval_done = env.step(eval_action)
#                 eval_rewards.append(eval_reward)
#                 eval_episode_reward += eval_reward
#                 eval_episode_steps += 1
#                 if eval_done:
#                     eval_episode_rewards.append(eval_episode_reward)
#                     print(f"{datetime.now()}, T:{T}, Eval_Episode:{episode_count}, Steps:{eval_episode_steps}, "
#                           f"Avg. Reward:{eval_episode_reward/eval_episode_steps:.4f}, Total Reward:{eval_episode_reward}")
#                     break
#             agent.train()