## Hyperparameters

In [None]:
from datetime import datetime, timezone

# --- Run identity ------------------------------------------------------
env_id = "BreakoutNoFrameskip-v4"
algorithm = "ddqn"
max_episode = 7_000

# --- Reproducibility ---------------------------------------------------
seed = 42

# --- Replay memory and optimisation ------------------------------------
minibatch_size = 32
replay_memory_size = 100_000
replay_start_size = 50_000
action_repeat = 4
agent_history_length = 4
update_frequency = 4
target_network_update_frequency = 10_000
learning_rate = 1e-5
discount_factor = 0.99

# --- Epsilon schedule --------------------------------------------------
initial_exploration = 1.0
final_exploration = 0.1
final_exploration_frame = 1e5

# --- Output locations --------------------------------------------------
# Timestamp taken once so checkpoints and logs of one run share a suffix.
now = datetime.now().strftime("%Y%m%d-%H%M%S")
_run_tag = "{}-{}".format(algorithm, now)
# The trailing "{}" is left unfilled on purpose: it is completed later via
# weights_dir_tmpl.format(<checkpoint name>).
weights_dir_tmpl = "/".join(("checkpoints", _run_tag, "{}"))
log_dir = "/".join(("runs", _run_tag))

In [None]:
import sys, os
sys.path.append(os.path.pardir)

import numpy as np
import tensorflow as tf

from collections import deque
from utils.agent import Agent
from utils.models import create_dqn_model
from utils.policies import epsilon_greedy_policy
from utils.algorithms import train_ddqn
from utils.experience_replay import ExperienceReplay
from atari.wrapper import wrapper

# Seed NumPy and TensorFlow before any model or environment is constructed.
np.random.seed(seed)
tf.random.set_seed(seed)

# Environment and replay memory, configured from the hyperparameter cell.
env = wrapper(
    env_id,
    skip=action_repeat,
    stack=agent_history_length,
    seed=seed,
)
buffer = ExperienceReplay(
    memory_size=replay_memory_size,
    batch_size=minibatch_size,
)

# Build the agent's parts one at a time (network first, then optimizer) and
# hand them to Agent together with the epsilon schedule.
_n_actions = env.action_space.n
_q_network = create_dqn_model(_n_actions)
_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
agent = Agent(
    model=_q_network,
    num_actions=_n_actions,
    behavior_policy=epsilon_greedy_policy,
    train=train_ddqn,
    optimizer=_optimizer,
    epsilon_init=initial_exploration,
    epsilon_fin=final_exploration,
    epsilon_fin_frame=final_exploration_frame,
)

# All tf.summary calls below go to this writer by default.
writer = tf.summary.create_file_writer(log_dir)
writer.set_as_default()

# List of visible GPU devices; used only for its truthiness when logging.
is_gpu = tf.config.list_physical_devices('GPU')

# Accumulators that are reported and reset by the training loop.
metric_q = tf.keras.metrics.Mean(name="Avg. Q-value", dtype=tf.float32)
metric_loss = tf.keras.metrics.Mean(name="Avg. Loss", dtype=tf.float32)
metric_rewards = tf.keras.metrics.Sum(name="Sum. Rewards", dtype=tf.float32)

## Training

In [None]:
# Main training loop.
#
# Runs environment episodes until `max_episode` "real" episodes have been
# completed. Each step stores a transition in the replay buffer; once
# `replay_start_size` steps have elapsed the agent is trained on sampled
# minibatches and its target network is refreshed periodically. Metrics are
# written to TensorBoard and printed whenever `env.was_real_done` is set.
max_reward = -float('inf')

running = True
step_count = 0
episode_count = 0

while running:
    done = False
    state = env.reset()

    # Truthiness test instead of `done is False`: an identity comparison
    # would end the loop after a single step if the environment returned a
    # non-builtin boolean (e.g. numpy.bool_) for `done`.
    while not done:
        step_count += 1

        # Act with epsilon forced to 1.0 until the buffer holds
        # `replay_start_size` transitions; afterwards the agent uses its
        # own epsilon.
        if buffer.get_history_length() < replay_start_size:
            action = agent.get_action(state, 1.0)
        else:
            action = agent.get_action(state)

        state_next, reward, done, info = env.step(action)
        buffer.put(state, [action], state_next, [reward], [done])

        if step_count >= replay_start_size:
            # NOTE(review): this performs `update_frequency` training steps
            # per environment step. If the intent was "one update every
            # `update_frequency` steps", this needs changing -- confirm.
            for _ in range(update_frequency):
                (state_sample, action_sample, state_next_sample,
                 rewards_sample, done_sample) = buffer.get()
                loss, q_max = agent.train(
                    state_sample, action_sample, state_next_sample,
                    rewards_sample, done_sample)
                metric_loss(loss)
                metric_q(q_max)

            if step_count % target_network_update_frequency == 0:
                agent.update_model_target()

        metric_rewards(reward)
        state = state_next

    # `done` can become true without `was_real_done` being set; in that case
    # the metrics keep accumulating across the next reset.
    # NOTE(review): presumably `was_real_done` marks true game over as
    # opposed to a lost life -- confirm against atari.wrapper.
    if env.was_real_done:
        episode_count += 1

        # Read each metric once; the values do not change until the reset
        # at the bottom of this branch.
        episode_reward = metric_rewards.result()
        episode_loss = metric_loss.result()
        episode_q = metric_q.result()

        # `<=` means a tie with the best reward so far also saves again.
        if max_reward <= episode_reward:
            max_reward = episode_reward
            agent.save_model(weights_dir_tmpl.format(max_reward))

        if max_episode <= episode_count:
            agent.save_model(weights_dir_tmpl.format("latest"))
            running = False

        tf.summary.scalar("Performance/Q-value", episode_q, step=episode_count)
        tf.summary.scalar("Performance/Loss", episode_loss, step=episode_count)
        tf.summary.scalar("Performance/Rewards", episode_reward, step=episode_count)
        tf.summary.scalar("Etc./Epsilon", agent.epsilon, step=episode_count)
        if is_gpu:
            # NOTE(review): get_memory_usage may be deprecated or missing in
            # newer TensorFlow releases -- verify against the installed
            # version.
            tf.summary.scalar(
                "Etc./GPU usages",
                tf.config.experimental.get_memory_usage("GPU:0"),
                step=episode_count)

        # Rewards now uses `{:.4f}` like the other float fields; the previous
        # `{:4f}` set a minimum width of 4 rather than 4 decimal places.
        print("Episode: {} | Loss: {:.4f} | Q-value: {:.4f} | Rewards: {:.4f} | epsilon: {:.4f} | memory size: {:}".format(
            episode_count, episode_loss, episode_q, episode_reward, agent.epsilon, buffer.get_history_length()))

        metric_q.reset_states()
        metric_loss.reset_states()
        metric_rewards.reset_states()