In [None]:
import numpy as np
from datetime import datetime

from env import SimpleEnv
from agent import SimpleAgent
from replay_memory import SimpleReplayMemory


# --- Setup -------------------------------------------------------------------
# The same input_size (8) is handed to both the agent and the replay memory.
env = SimpleEnv(action_size=4)
agent = SimpleAgent(
    env,
    batch_size=64,
    discount=0.99,
    lr=5e-4,
    input_size=8,
    hidden_size=256,
    eps_decay=5e-4,
    min_epsilon=0.01,
)
mem = SimpleReplayMemory(capacity=int(1e5), input_size=8)

# --- Training configuration --------------------------------------------------
episodes = 10_000        # number of training episodes
target_update = 1_000    # call agent.update_target_net() every this many env steps
log_every = 1            # print a progress line every `log_every` episodes

rewards = []     # reward of every individual environment step
ep_rewards = []  # summed reward of every finished episode

# --- Training loop -----------------------------------------------------------
print(f"{datetime.now()}, start training")
steps = 0  # environment steps taken across all episodes
for episode_ix in range(1, episodes + 1):
    observation, ep_reward, ep_steps, done = env.reset(), 0, 0, False
    while not done:
        action = agent.act_e_greedy(observation)
        next_observation, reward, done, info = env.step(action)
        rewards.append(reward)
        ep_reward += reward
        ep_steps += 1
        steps += 1

        # Store the transition, then let the agent learn from the memory on
        # every single environment step.
        mem.append(observation, action, reward, done, next_observation)
        agent.learn(mem)

        # The target-net update is keyed on the global step counter, not on
        # the per-episode one.
        if steps % target_update == 0:
            agent.update_target_net()
        observation = next_observation
    ep_rewards.append(ep_reward)

    # The first episode is always reported; afterwards every `log_every`-th.
    # (The previous `% 1` check was always true, i.e. equivalent to log_every = 1.)
    if episode_ix == 1 or episode_ix % log_every == 0:
        print(f"{datetime.now()}, episode:{episode_ix:2d}, step:{steps:5d}, reward:{ep_reward:10.4f}")
print(f"{datetime.now()}, end training")
# NOTE: `env` is intentionally left open here because the evaluation cell
# below resets and renders the same environment and closes it when finished.

In [None]:
# Roll out the agent for a few rendered episodes and report the episode length
# and summed reward of each one. No learning and no replay-memory writes
# happen in this cell.
#
# NOTE(review): actions are still chosen through `act_e_greedy`, so whatever
# epsilon the agent currently holds also applies during these rollouts —
# confirm whether a purely greedy policy is wanted for evaluation.
eval_episodes = 10  # number of rendered evaluation episodes

try:
    for _ in range(eval_episodes):
        observation, ep_reward, ep_steps, done = env.reset(), 0, 0, False
        while not done:
            # Rendering goes through the wrapper's inner `env` attribute.
            env.env.render()
            action = agent.act_e_greedy(observation)
            next_observation, reward, done, info = env.step(action)
            ep_reward += reward
            ep_steps += 1
            observation = next_observation
        print(f"{datetime.now()}, ep_steps:{ep_steps:5d}, reward:{ep_reward:10.4f}")
finally:
    # Always release the environment, including when the rollout is
    # interrupted from the notebook or an exception is raised mid-episode.
    env.close()