OpenAI Gym에서 "Breakout-v4" 게임을 플레이하기 위해서는 `atari-py`를 설치해야 한다.

```bash
pip install gym atari-py
```

## 이미지 전처리

In [None]:
import gym
import matplotlib.pyplot as plt

# Create the Breakout environment and take the first raw observation from a reset.
env = gym.make("Breakout-v4")
obs = env.reset()

# Display the raw observation as an image.
plt.matshow(obs)
plt.show()

# Print the shape of the raw observation.
print(obs.shape)

In [None]:
import tensorflow as tf

def preprocessing(image):
    """Turn a raw RGB frame into a standardized 84 x 84 grayscale image.

    The steps are: grayscale conversion, resize to 110 x 84, crop an
    84 x 84 window starting at row 17, per-image standardization, and
    finally removal of all size-1 dimensions.

    Args:
        image: RGB frame accepted by ``tf.image.rgb_to_grayscale``
            (presumably the raw Atari observation -- confirm with caller).

    Returns:
        The processed frame with singleton dimensions squeezed out.
    """
    gray = tf.image.rgb_to_grayscale(image)
    downsampled = tf.image.resize(gray, size=[110, 84])
    # Keep the 84 rows starting at row 17, which roughly cover the playing area.
    playfield = tf.image.crop_to_bounding_box(
        downsampled,
        offset_height=17,
        offset_width=0,
        target_height=84,
        target_width=84,
    )
    standardized = tf.image.per_image_standardization(playfield)
    return tf.squeeze(standardized)

# Run the preprocessing pipeline on the observation captured above and print the resulting shape.
preprocessed = preprocessing(obs)
print(preprocessed.shape)

# Display the preprocessed frame.
plt.matshow(preprocessed)
plt.show()

## 모델 생성

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.activations import relu


def create_model(output_size, trainable=True):
    """Build the convolutional Q-network.

    Architecture: an 84 x 84 x 4 input, a 16-filter 8 x 8 convolution with
    stride 4, a 32-filter 4 x 4 convolution with stride 2 (both ReLU), a
    256-unit ReLU dense layer, and a linear dense output layer.

    Args:
        output_size: Number of units in the final linear layer (one per action).
        trainable: Forwarded to the Keras ``Model`` constructor.

    Returns:
        A Keras ``Model`` mapping the stacked-frame input to ``output_size`` values.
    """
    # (filters, kernel_size, strides) for each convolutional hidden layer, in order.
    conv_specs = (
        (16, (8, 8), 4),
        (32, (4, 4), 2),
    )

    inputs = Input(shape=(84, 84, 4))

    features = inputs
    for filters, kernel_size, strides in conv_specs:
        features = Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            activation=relu,
        )(features)

    # Fully-connected head: 256 rectifier units followed by a linear output per action.
    features = Flatten()(features)
    features = Dense(units=256, activation=relu)(features)
    outputs = Dense(units=output_size)(features)

    return Model(inputs=inputs, outputs=outputs, trainable=trainable)


# Build the Q-network with one output per action in the environment's action space
# and print its layer summary.
model = create_model(env.action_space.n)
model.summary()

In [None]:
# Build a dummy network input: the same preprocessed frame repeated 4 times along axis 2,
# then a leading batch axis added.
sample = tf.stack([preprocessed]*4, axis=2)
sample = tf.expand_dims(sample, axis=0)
print(sample.shape)

# Forward pass through the model to inspect its output for the dummy input.
q_value = model(sample)
print(q_value)

## Optimizer

In [None]:
from tensorflow.keras.optimizers import RMSprop

# RMSprop constructed with no arguments, i.e. the library's default hyperparameters.
optimizer = RMSprop()

## Behavior policy

In [None]:
# Current exploration rate; the training loop decays it in place.
epsilon = 1.0
# Lower bound the training loop clamps epsilon to.
epsilon_min = 0.1
# Upper bound; used here only to compute the decay interval.
epsilon_max = 1.0
# Total amount epsilon is reduced by over the whole decay schedule.
epsilon_interval = epsilon_max - epsilon_min
# Number of decay steps: each step subtracts epsilon_interval / epsilon_greedy_frames.
epsilon_greedy_frames = 1000000.0
# While frame_count is at or below this value the training loop acts with epsilon = 1;
# it is also the number of stored transitions required before gradient updates begin.
epsilon_random_frames = 50000

def epsilon_greedy_policy(model, state, epsilon):
    """Pick an action with an epsilon-greedy rule.

    Args:
        model: Network whose ``output_shape`` is a 2-tuple ending in the action count.
        state: Batched input passed to ``model``; only the first row's action is used.
        epsilon: Probability threshold for taking a uniformly random action.

    Returns:
        A scalar int64 tensor holding the chosen action index.
    """
    _batch_dim, action_count = model.output_shape

    explore = tf.random.uniform(shape=[]) < epsilon
    if not explore:
        # Greedy branch: index of the largest Q-value for the first batch entry.
        greedy_actions = tf.math.argmax(model(state), axis=1, output_type=tf.int64)
        return greedy_actions[0]

    # Exploration branch: uniformly random action index in [0, action_count).
    return tf.random.uniform(shape=[], maxval=action_count, dtype=tf.int64)

# Smoke test of both branches: epsilon = 1 exercises the random branch,
# epsilon = 0 exercises the greedy (argmax) branch.
print(epsilon_greedy_policy(model, sample, 1))
print(epsilon_greedy_policy(model, sample, 0))

## Hyperparameter

In [None]:
# Size of the action space; used as the one-hot depth in train() and the model output size.
num_actions = env.action_space.n
# Number of transitions sampled from the replay buffers per gradient step.
batch_size = 32
# Discount factor applied to the bootstrapped future value in train().
gamma = 0.99

# Number of environment steps each chosen action is repeated for, and the number of
# frames stacked into one state (it is the maxlen of obs_history below).
skip_frame = 4
# Maximum number of transitions kept in each replay buffer.
replay_memory_length = 1000000
# The target network is synced whenever frame_count is a multiple of this value.
update_target_network = 10000
# The outer training loop stops starting new episodes once frame_count exceeds this.
max_frame_count = 10000000

from collections import deque

# Rolling window of the most recent preprocessed frames; stacked to form a state.
obs_history         = deque(maxlen=skip_frame)
# Parallel replay buffers: index i across all five deques describes one transition.
state_history       = deque(maxlen=replay_memory_length)
action_history      = deque(maxlen=replay_memory_length)
rewards_history     = deque(maxlen=replay_memory_length)
state_next_history  = deque(maxlen=replay_memory_length)
done_history        = deque(maxlen=replay_memory_length)

## 학습

In [None]:
from tensorflow.keras.losses import MSE

# Define our metrics
# Running mean of the training loss.
rl_loss = tf.keras.metrics.Mean('loss', dtype=tf.float32)
# Running sum of rewards (note: a Sum metric, despite the 'avg. rewards' label).
rl_rewards = tf.keras.metrics.Sum('avg. rewards', dtype=tf.float32)
# Running mean of the Q-value statistic recorded by train().
rl_q_values = tf.keras.metrics.Mean('avg. Q', dtype=tf.float32)


@tf.function
def train(state_sample, action_sample, rewards_sample, state_next_sample, done_sample):
    """Run one Q-learning gradient step on a sampled minibatch.

    The target for each transition is
    ``reward + gamma * max_a Q_target(next_state, a)``, with the bootstrap
    term zeroed for terminal transitions. The loss is the mean squared error
    between that target and the online network's Q-value for the action taken.

    Args:
        state_sample: Batch of states fed to ``model``.
        action_sample: Batch of integer action indices (one per transition).
        rewards_sample: Batch of rewards (one per transition).
        state_next_sample: Batch of successor states fed to ``model_target``.
        done_sample: Batch of terminal flags (1 for terminal, 0 otherwise);
            accepted with shape ``(batch,)`` or ``(batch, 1)``.

    Side effects:
        Updates the ``rl_loss`` and ``rl_q_values`` metrics and applies one
        optimizer step to ``model``'s trainable variables.
    """
    # Normalise dtypes/shapes so the arithmetic below broadcasts per sample.
    rewards = tf.reshape(tf.cast(rewards_sample, tf.float32), [-1])
    not_done = 1.0 - tf.reshape(tf.cast(done_sample, tf.float32), [-1, 1])

    # Build the updated Q-values for the sampled future states.
    # Use the target model for stability; terminal transitions contribute 0.
    future_rewards = model_target(state_next_sample) * not_done
    # Q value = reward + discount factor * expected future reward.
    # The max must be taken per sample (axis=1); reducing over the whole batch
    # would make every transition bootstrap from the same global maximum.
    q_values_target = rewards + gamma * tf.reduce_max(future_rewards, axis=1)

    # Create a mask so we only calculate loss on the updated Q-values
    masks = tf.one_hot(action_sample, num_actions)

    with tf.GradientTape() as tape:
        # Q-values of the online network for the sampled states
        q_values = model(state_sample)

        # Apply the masks to the Q-values to get the Q-value for action taken
        q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
        # Calculate loss between new Q-value and old Q-value
        loss = MSE(q_values_target, q_action)

    rl_loss(loss)

    # Record the largest Q-value predicted anywhere in this batch.
    q_max = tf.reduce_max(q_values)
    rl_q_values(q_max)

    # Backpropagation
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
# Load the TensorBoard notebook extension and open it on the "logs" directory.
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
import numpy as np
from datetime import datetime

# Each run logs to its own timestamped directory under logs/DQN/.
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/DQN/' + current_time
summary_writer = tf.summary.create_file_writer(log_dir)

# Online network and a frozen copy used as the target network; both start
# from identical weights.
model = create_model(num_actions)
model_target = create_model(num_actions, trainable=False)
model_target.set_weights(model.get_weights())

frame_count = 0
episode_count = 0

seed = 42
env.seed(seed)

while frame_count <= max_frame_count:
    episode_count += 1

    obs = env.reset()
    obs = preprocessing(obs)

    # Fill the frame window with copies of the first frame so the initial
    # state has the full stack depth.
    for _ in range(skip_frame):
        obs_history.append(obs)

    state = tf.stack(obs_history, axis=2)

    is_done = False

    # Truthiness test instead of `is False`, so a non-builtin boolean returned
    # by the environment still terminates the loop correctly.
    while not is_done:
        frame_count += 1

        inputs = tf.expand_dims(state, axis=0)

        if frame_count <= epsilon_random_frames:
            # Warm-up phase: act fully at random.
            action = epsilon_greedy_policy(model, inputs, 1)
        else:
            action = epsilon_greedy_policy(model, inputs, epsilon)
            # Decay probability of taking random action
            epsilon -= epsilon_interval / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min)

        # Repeat the chosen action for skip_frame environment steps.
        reward = 0
        for _ in range(skip_frame):
            if not is_done:
                # env.render()
                obs_next, _reward, is_done, _ = env.step(action)
                obs_next = preprocessing(obs_next)
                # Only count rewards from steps that were actually taken;
                # adding _reward after termination would re-count the last one.
                reward += _reward
            # After termination the last frame is repeated to keep the window full.
            obs_history.append(obs_next)

        state_next = tf.stack(obs_history, axis=2)
        done = 1.0 if is_done else 0.0

        # Store the transition in the parallel replay buffers.
        state_history.append(state)
        action_history.append(action)
        rewards_history.append(reward)
        state_next_history.append(state_next)
        done_history.append([done])

        state = state_next

        # Start learning only once enough transitions have been collected.
        if len(done_history) > epsilon_random_frames:
            # Passing the length directly samples the same indices without
            # first materialising an array of every index.
            indices = np.random.choice(len(done_history), size=batch_size)

            state_sample        = tf.stack([state_history[i] for i in indices])
            action_sample       = tf.stack([action_history[i] for i in indices])
            rewards_sample      = tf.stack([rewards_history[i] for i in indices])
            state_next_sample   = tf.stack([state_next_history[i] for i in indices])
            done_sample         = tf.stack([done_history[i] for i in indices])

            train(state_sample, action_sample, rewards_sample, state_next_sample, done_sample)

            with summary_writer.as_default():
                tf.summary.scalar('DQN/Loss', rl_loss.result(), step=frame_count)
                tf.summary.scalar('DQN/Average Q', rl_q_values.result(), step=frame_count)


        if frame_count % update_target_network == 0:
            # update the target network with new weights
            model_target.set_weights(model.get_weights())

        rl_rewards(reward)

    # Per-episode summaries: accumulated reward and the current epsilon.
    with summary_writer.as_default():
        tf.summary.scalar('DQN/Average Rewards', rl_rewards.result(), step=episode_count)
        tf.summary.scalar('DQN/Epsilon', data=epsilon, step=episode_count)

    # Metrics are tracked per episode, so clear them before the next one.
    rl_loss.reset_states()
    rl_q_values.reset_states()
    rl_rewards.reset_states()

In [None]:
import gym

# Fresh environment for running the learned policy.
env = gym.make("Breakout-v4")

# Plays episodes back-to-back with the greedy policy (epsilon = 0). There is no
# exit condition, so the cell runs until it is interrupted.
while True:

    obs = env.reset()
    obs = preprocessing(obs)

    # Fill the frame window with copies of the first frame.
    for _ in range(skip_frame):
        obs_history.append(obs)

    state = tf.stack(obs_history, axis=2)

    is_done = False

    # Truthiness test instead of `is False`, so a non-builtin boolean returned
    # by the environment still ends the episode.
    while not is_done:
        inputs = tf.expand_dims(state, axis=0)
        action = epsilon_greedy_policy(model, inputs, 0)

        obs_next, _, is_done, _ = env.step(action)
        obs_next = preprocessing(obs_next)
        obs_history.append(obs_next)

        # The next state is the updated window of most recent frames.
        state_next = tf.stack(obs_history, axis=2)
        state = state_next