In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import shutil
from datetime import datetime

from env import Env, SimpleEnv, ObstacleTowerEnv
from agent import Agent, SimpleAgent
from replay_memory import ReplayMemory, SimpleReplayMemory


def print_and_log(text):
    """Print ``text`` to stdout and append it as a line to the run's log file.

    The log file is ``log.txt`` inside the module-level ``model_dir``, which
    must be defined before the first call. The file is opened in append mode
    on every call and closed again before returning, so no file handle stays
    open between calls.

    Args:
        text: Object to log; it is rendered by ``print`` in both places.
    """
    print(text)
    # The context manager closes the handle deterministically instead of
    # leaving an open file object for the garbage collector to clean up.
    with open(f'{model_dir}/log.txt', 'a') as log_file:
        print(text, file=log_file)


# Each run gets its own output directory, named after the Unix timestamp
# (whole seconds) at which this cell was executed.
training_timestamp = str(int(time.time()))
model_dir = f'trained_models/model_{training_timestamp}/'

# exist_ok=True creates the directory (and any missing parents) without a
# separate existence check, so there is no gap between "check" and "create".
os.makedirs(model_dir, exist_ok=True)

# Keep a snapshot of the notebook next to the run's outputs. The source path
# is relative to the current working directory; copy2 raises if it is missing.
shutil.copy2('./qlearning.ipynb', model_dir)

# Environment, learner and replay buffer. The keyword arguments are passed
# through unchanged; their meaning is defined by the project modules
# (env.py, agent.py, replay_memory.py), which are not part of this notebook.
env = ObstacleTowerEnv(action_size=54, history_length=16)
agent = Agent(
    env,
    atoms=51,
    V_min=-20.0,
    V_max=20.0,
    batch_size=64,
    multi_step=3,
    discount=0.99,
    norm_clip=10.0,
    lr=5e-4,
    adam_eps=1.5e-4,
    hidden_size=256,
    noisy_std=0.1,
)
mem = ReplayMemory(
    50_000,
    env.window,
    agent.discount,
    agent.n,
    priority_weight=0.4,
    priority_exponent=0.5,
)

# Training schedule.
episodes = 20_000              # number of episodes the training loop runs
replay_frequency = 6           # step modulus for agent.reset_noise() / agent.learn()
reward_clip = 2.0              # rewards are clamped to +/- this, then divided by it
max_steps = 1_000_000          # only used below, to size the priority-weight schedule
learning_start_step = 5_000    # learning begins once this many steps were taken
target_update = 2_000          # step modulus for agent.update_target_net()

# Per-step increment sized so that adding it once per step between
# learning_start_step and max_steps takes mem.priority_weight up to 1.
priority_weight_increase = (1 - mem.priority_weight) / (max_steps - learning_start_step)

# Accumulators filled by the training loop.
rewards, ep_rewards, ep_steps = [], [], []

# Training loop: runs `episodes` episodes, stores every transition in `mem`,
# and trains `agent` from `mem` once `learning_start_step` steps were taken.
# NOTE(review): `max_steps` does not bound this loop; in the code shown here
# it only sizes `priority_weight_increase`. Confirm that is intended.

# Number of episodes between progress log lines; episode 1 is always logged.
log_interval = 1

print_and_log(f"{datetime.now()}, start training")
steps = 0  # environment steps taken so far, across all episodes
for episode_ix in range(1, episodes+1):
    observation, ep_reward, ep_step, done = env.reset(), 0, 0, False
    while not done:
        # Checked against the step count *before* this step is taken.
        if steps % replay_frequency == 0:
            agent.reset_noise()
        action = agent.act(observation)
        next_observation, reward, done, info = env.step(action)
        # Statistics use the raw reward; clipping below only affects what is
        # written to the replay memory.
        rewards.append(reward)
        ep_reward += reward
        ep_step += 1
        steps += 1
        if reward_clip > 0:
            # Clamp to [-reward_clip, reward_clip], then scale into [-1, 1].
            reward = max(min(reward, reward_clip), -reward_clip) / reward_clip
        # The stored observation is the one the action was chosen from.
        mem.append(observation, action, reward, done)
        if steps >= learning_start_step:
            # Raise the priority weight a little every step, capped at 1.
            mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)
            if steps % replay_frequency == 0:
                agent.learn(mem)
            if steps % target_update == 0:
                agent.update_target_net()
        observation = next_observation
    ep_rewards.append(ep_reward)
    # Cumulative step count at the end of the episode (used as the plot x-axis).
    ep_steps.append(steps)
    if episode_ix == 1 or episode_ix % log_interval == 0:
        print_and_log(f"{datetime.now()}, episode:{episode_ix:4d}, step:{steps:5d}, reward:{ep_reward:10.4f}")
print_and_log(f"{datetime.now()}, end training")

In [None]:
# Close the training environment and save the agent into the run directory.
env.close()
agent.save(model_dir)

# Trailing mean of the episode rewards over the last `reward_window` episodes,
# always including the episode the point belongs to. Early points use a
# shorter window because fewer episodes are available.
reward_window = 150
avg_ep_rewards = [
    np.mean(ep_rewards[max(0, i - reward_window + 1):i + 1])
    for i in range(len(ep_rewards))
]

# Raw per-episode reward (faint) and its trailing mean (bold), plotted against
# the cumulative step count at the end of each episode.
plt.style.use('default')
fig, axes = plt.subplots(figsize=(10, 6))
axes.set_ylim([0, 5])
axes.plot(ep_steps, ep_rewards, alpha=0.5)
axes.plot(ep_steps, avg_ep_rewards, linewidth=3)
axes.set_xlabel('steps')
axes.set_ylabel('episode reward')
fig.savefig(f"{model_dir}/training.png")
plt.show()

In [None]:
# Evaluation: play a fixed number of rendered episodes with the trained agent
# and log the length and total reward of each one.
eval_episodes = 10

env = ObstacleTowerEnv(action_size=54, history_length=16)
try:
    # NOTE(review): presumably switches the agent to evaluation mode - confirm in agent.py.
    agent.eval()
    for _ in range(eval_episodes):
        observation, ep_reward, ep_step, done = env.reset(), 0, 0, False
        env.render()
        while not done:
            action = agent.act_e_greedy(observation)
            next_observation, reward, done, info = env.step(action, render=True)
            ep_reward += reward
            ep_step += 1
            observation = next_observation
        print_and_log(f"{datetime.now()}, eval ep_step:{ep_step:5d}, reward:{ep_reward:10.4f}")
finally:
    # Close the environment created above even if an episode raises or the
    # cell is interrupted, so it is not left open.
    env.close()