# DQN

## 참고 논문
- Playing Atari with Deep Reinforcement Learning
- Human-level control through deep reinforcement learning
- [Frame Skipping and Pre-Processing for Deep Q-Networks on Atari 2600 Games](https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/)

## Action
- 0: None
- 1: Fire
- 2: Right
- 3: Left

## Preinstallation

OpenAI Gym에서 "Breakout-v4" 게임을 플레이하기 위해서는 `atari-py`를 설치해야 한다.

```bash
pip install gym atari-py
```

## 게임 환경에 대한 이미지 전처리

In [None]:
import gym
import matplotlib.pyplot as plt

# Roll out one fixed action in Breakout and display the raw frame observed
# after every 4th emulator step (4 columns -> 16 steps in total).
env = gym.make("Breakout-v4")

env.reset()
# Action 1 is "Fire" in the action table at the top of this notebook.
# (The original also drew a random action first, but it was overwritten
# immediately, so that dead assignment has been removed.)
action = 1

fig, axs = plt.subplots(ncols=4, figsize=(16, 16*3))

step = 0
for col in range(4):
    # Repeat the action 4 times; only the last observation is displayed.
    for _ in range(4):
        step += 1
        obs, _, _, info = env.step(action)

    ax = axs[col]
    ax.set_axis_off()
    ax.set_title("step: {}, lives: {}".format(step, info['ale.lives']))
    ax.imshow(obs)

# Shape of the last raw observation.
print(obs.shape)
plt.show()

In [None]:
import numpy as np
from PIL import Image


def preprocessing(image, crop_box=(0, 17, 160, 17 + 177), size=(84, 84)):
    """Turn a raw game frame into a normalized gray-scale image.

    The frame is cropped, converted to 8-bit gray-scale, resized and finally
    divided by 255.

    Args:
        image: Frame array accepted by ``PIL.Image.fromarray`` (the notebook
            passes the observation returned by the Gym environment).
        crop_box: ``(left, upper, right, lower)`` pixel box to keep. The
            default keeps a 160 x 177 region starting at row 17.
        size: ``(width, height)`` of the resized image. Defaults to 84 x 84.

    Returns:
        ``np.float32`` array of shape ``(height, width)``.
    """
    image = Image.fromarray(image)

    # Keep only the region selected by crop_box.
    image = image.crop(box=crop_box)

    # Convert to single-channel 8-bit gray-scale ("L" mode).
    image = image.convert("L")

    # Down-sample to the requested output size.
    image = image.resize(size=size)

    # Scale the 8-bit pixel values by 1/255.
    return np.asarray(image, dtype=np.float32) / 255.0

In [None]:
# Same roll-out as the raw-frame preview, but each displayed frame is passed
# through preprocessing() first.
env = gym.make("Breakout-v4")

env.reset()
# One random action, repeated for the whole preview.
action = env.action_space.sample()

fig, axs = plt.subplots(ncols=4, figsize=(16, 16))

step = 0
for col in range(4):
    # Repeat the action 4 times; only the last observation is kept.
    for _ in range(4):
        step += 1
        obs, _, _, _ = env.step(action)

    # Apply the preprocessing step to the kept frame.
    preprocessed = preprocessing(obs)

    ax = axs[col]
    ax.set_axis_off()
    ax.set_title("step: {}".format(step))
    # The preprocessed frame is a 2-D gray-scale array; without an explicit
    # colormap matplotlib would render it in false color.
    ax.imshow(preprocessed, cmap="gray")

# Shape of the last raw (un-preprocessed) observation.
print(obs.shape)
plt.show()

## 모델 생성

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.activations import relu


def create_model(num_actions, trainable=True):
    """Build the convolutional Q-network.

    Architecture: an 84 x 84 x 4 input, three ReLU convolutions
    (32 filters 8x8 stride 4, 64 filters 4x4 stride 2, 64 filters 3x3
    stride 1), a 512-unit ReLU dense layer and a linear output layer.

    Args:
        num_actions: Number of units in the linear output layer (one output
            per action).
        trainable: Forwarded to the ``Model`` constructor.

    Returns:
        The assembled Keras ``Model``.
    """
    # (filters, kernel side length, stride) of each convolutional layer.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    inputs = Input(shape=(84, 84, 4))

    features = inputs
    for n_filters, kernel_side, stride in conv_specs:
        features = Conv2D(
            filters=n_filters,
            kernel_size=(kernel_side, kernel_side),
            strides=stride,
            activation=relu,
        )(features)

    # Fully-connected head: 512 rectifier units, then one linear output per action.
    features = Flatten()(features)
    features = Dense(units=512, activation=relu)(features)
    q_outputs = Dense(units=num_actions)(features)

    return Model(inputs=inputs, outputs=q_outputs, trainable=trainable)


# Build a Q-network with one output per action of the environment and print
# its layer summary.
model = create_model(env.action_space.n)
model.summary()

In [None]:
# Build a sample state by stacking the last preprocessed frame 4 times along
# a new last axis.
state = np.stack([preprocessed]*4, axis=2)
# Add a leading batch axis before feeding the state to the model.
sample = np.expand_dims(state, axis=0)
print(sample.shape)

# Forward pass of the model on the sample; prints the network output.
q_value = model(sample)
print(q_value)

## Optimizer

In [None]:
from tensorflow.keras.optimizers import Adam

# Learning rate handed to the Adam optimizer below.
learning_rate = 0.0001

# Optimizer used by train() to apply gradients to the online network.
optimizer = Adam(learning_rate=learning_rate)

## Behavior policy

In [None]:
# Exploration schedule. The training loop starts from `epsilon` and, once the
# random warm-up is over, subtracts epsilon_interval / epsilon_greedy_frames
# per step, never going below epsilon_min.
epsilon = 1.0
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
epsilon_greedy_frames = 1000000.0
# NOTE(review): not referenced anywhere in the visible code; the training
# loop uses replay_start_size for the random warm-up instead.
epsilon_random_frames = 50000

def epsilon_greedy_policy(model, state, epsilon):
    """Pick an action with an epsilon-greedy rule.

    Args:
        model: Object exposing a two-element ``output_shape`` whose second
            entry is the number of actions; it is called on the batched
            state in the greedy branch.
        state: A single state without a batch axis; the batch axis is added
            here.
        epsilon: Probability threshold for taking a uniformly random action.

    Returns:
        Index of the selected action.
    """
    _batch_dim, action_count = model.output_shape
    batched_state = np.expand_dims(state, axis=0)

    explore = np.random.sample() < epsilon
    if explore:
        return np.random.choice(action_count)

    # Greedy branch: evaluate the network on the CPU and take the arg-max.
    with tf.device('/CPU:0'):
        q_values = model(batched_state, training=False)
        return np.argmax(q_values, axis=1)[0]

# Smoke test of both branches: epsilon = 1 always takes the random branch,
# epsilon = 0 always takes the greedy (arg-max) branch.
print(epsilon_greedy_policy(model, state, 1))
print(epsilon_greedy_policy(model, state, 0))

## Hyperparameter

In [None]:
# Sizes used by the training loop.
num_actions                     = env.action_space.n
# Number of frames stacked into one state (also the maxlen of obs_history).
agent_history_length            = 4
# Number of emulator steps the chosen action is repeated for each stacked frame.
action_repeat                   = 4
minibatch_size                  = 32
# Maximum number of transitions kept in each replay deque below.
replay_memory_size              = 1000000
# Number of stored transitions required before training starts; the training
# loop also acts fully randomly while frame_count is below this value.
replay_start_size               = 50000

# NOTE(review): the training loop uses this as the number of gradient updates
# performed per agent step, not as "one update every N steps" -- confirm intent.
update_frequency                = 4
# The target network is synced every this many agent steps.
target_network_update_frequency = 10000
discount_factor                 = 0.99


from collections import deque

# Sliding window holding the most recent preprocessed frames.
obs_history         = deque(maxlen=agent_history_length)
# Replay memory, stored as five parallel deques indexed by transition.
# NOTE(review): each state is an 84x84x4 float32 array, so filling all
# replay_memory_size slots would need far more RAM than typical machines
# have -- confirm the intended capacity.
state_history       = deque(maxlen=replay_memory_size)
action_history      = deque(maxlen=replay_memory_size)
rewards_history     = deque(maxlen=replay_memory_size)
state_next_history  = deque(maxlen=replay_memory_size)
done_history        = deque(maxlen=replay_memory_size)

## 학습

In [None]:
# Running metrics for logging; created on the CPU device.
with tf.device('/CPU:0'):
    # Define our metrics
    # NOTE(review): this metric is named 'Avg. Rewards' but is a Sum, i.e. it
    # accumulates the total of the values fed to it until it is reset.
    rl_rewards  = tf.keras.metrics.Sum('Avg. Rewards', dtype=tf.float32)
    rl_loss     = tf.keras.metrics.Mean('Avg. Loss', dtype=tf.float32)
    rl_q_values = tf.keras.metrics.Mean('Avg. Q-value', dtype=tf.float32)

In [None]:
from tensorflow.keras.losses import MSE

@tf.function
def train(state_sample, action_sample, rewards_sample, state_next_sample, done_sample):
    """Perform one gradient step of the online network on a minibatch.

    The regression target is ``reward + discount_factor * max_a Q_target(s', a)``
    where the target-network outputs are multiplied by ``1 - done`` first.
    The loss is the MSE between that target and the online Q-value of the
    action that was taken.

    Args:
        state_sample: Batch of states fed to the online network ``model``.
        action_sample: Batch of action indices (one-hot encoded here).
        rewards_sample: Batch of rewards.
        state_next_sample: Batch of next states fed to ``model_target``.
        done_sample: Batch of done flags; assumed to broadcast against the
            target-network output (the training loop stores them as
            one-element lists) -- TODO confirm.

    Returns:
        Tuple ``(loss, max_q)``: the loss value and the largest Q-value the
        online network produced for this batch.
    """
    states = tf.convert_to_tensor(state_sample)
    actions = tf.convert_to_tensor(action_sample)
    rewards = tf.convert_to_tensor(rewards_sample)
    next_states = tf.convert_to_tensor(state_next_sample)
    dones = tf.convert_to_tensor(done_sample)

    # Target side: evaluated outside the tape, so no gradient flows through it.
    not_done = 1 - dones
    next_q = model_target(next_states, training=False) * not_done
    best_next_q = tf.reduce_max(next_q, axis=1)
    td_target = rewards + discount_factor * best_next_q

    # One-hot mask selecting the Q-value of the action actually taken.
    action_mask = tf.one_hot(actions, num_actions)
    with tf.GradientTape() as tape:
        q_values = model(states)
        chosen_q = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = MSE(td_target, chosen_q)

    # Backpropagate through the online network only.
    weights = model.trainable_variables
    gradients = tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(gradients, weights))

    return loss, tf.reduce_max(q_values)

In [None]:
# IPython magics: load the TensorBoard notebook extension and open it on the
# `logs` directory (the summary writer below writes under logs/DQN/).
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
from datetime import datetime

# Create a timestamped log directory so each run gets its own TensorBoard
# event files.
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/DQN/' + current_time
summary_writer = tf.summary.create_file_writer(log_dir)

In [None]:
import numpy as np

# Online Q-network (trained by train()) and target network, which starts as a
# copy of the online weights and is re-synced periodically below.
model = create_model(num_actions)
model_target = create_model(num_actions, trainable=False)
model_target.set_weights(model.get_weights())

max_frame_count = 10000000

# frame_count is incremented once per agent decision (one pass of the inner
# while-loop), not once per emulator frame.
frame_count = 0
episode_count = 0

# The budget is only checked between episodes, so the last episode may run
# past max_frame_count.
while frame_count <= max_frame_count:
    # Life counter used to detect a lost life; assumed to start at 5.
    lives = 5
    info = {'ale.lives': 5}
    episode_count += 1

    # turn_done=True makes the first step (re)fill the frame history.
    turn_done = True
    episode_done = False

    obs = env.reset()
    obs = preprocessing(obs)

    # Counts emulator-step attempts in this episode; not read elsewhere in
    # this cell.
    f_count = 0

    while episode_done is False:
        frame_count += 1

        if turn_done:
            # Start of an episode or right after a lost life: fill the whole
            # history with copies of the current frame.
            for _ in range(agent_history_length):
                obs_history.append(obs)
            state = np.stack(obs_history, axis=2)
            turn_done = False

        if frame_count <= replay_start_size:
            # Warm-up phase: act fully at random (epsilon forced to 1).
            action = epsilon_greedy_policy(model, state, 1)
        else:
            action = epsilon_greedy_policy(model, state, epsilon)
            # Linear epsilon decay, clamped at epsilon_min.
            epsilon -= epsilon_interval / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min)

        # One agent step: collect agent_history_length new frames, each
        # obtained after repeating the action up to action_repeat times.
        reward_signal = 0
        for _slot in range(agent_history_length):
            for _ in range(action_repeat):
                f_count += 1
                if lives > info['ale.lives']:
                    # A life was lost: stop stepping for the rest of this
                    # agent step and mark the turn as finished.
                    turn_done = True
                    break
                obs, reward, episode_done, info = env.step(action)
                obs = preprocessing(obs)
                reward_signal += reward

            obs_history.append(obs)

        # Clip the accumulated reward to {-1, 0, +1}.
        reward_signal = np.sign(reward_signal)

        # Losing a life or ending the episode overrides the reward with -1.
        if turn_done or episode_done:
            reward_signal = -1

        state_next = np.stack(obs_history, axis=2)
        done = 1.0 if turn_done or episode_done else 0.0

        reward_signal = np.float32(reward_signal)
        done = np.float32(done)

        # Store the transition in the parallel replay deques. `done` is
        # wrapped in a list so the sampled batch has a trailing axis of 1.
        state_history.append(state)
        action_history.append(action)
        rewards_history.append(reward_signal)
        state_next_history.append(state_next)
        done_history.append([done])

        lives = info['ale.lives']
        state = state_next

        with tf.device('/CPU:0'):
            rl_rewards(reward_signal)

        if len(done_history) > replay_start_size:
            # update_frequency gradient updates per agent step.
            for _ in range(update_frequency):
                # Uniform sampling with replacement. Passing the length as an
                # int avoids converting a range of up to replay_memory_size
                # elements into an array on every draw.
                indices = np.random.choice(len(done_history), size=minibatch_size)

                state_sample        = np.stack([state_history[i] for i in indices])
                action_sample       = np.stack([action_history[i] for i in indices])
                rewards_sample      = np.stack([rewards_history[i] for i in indices])
                state_next_sample   = np.stack([state_next_history[i] for i in indices])
                done_sample         = np.stack([done_history[i] for i in indices])

                loss, q_max = train(state_sample, action_sample, rewards_sample, state_next_sample, done_sample)

                with tf.device('/CPU:0'):
                    rl_loss(loss)
                    rl_q_values(q_max)

        if frame_count % target_network_update_frequency == 0:
            # Update the target network with the current online weights.
            model_target.set_weights(model.get_weights())

        # Per-step scalars. All three are indexed by frame_count so that each
        # written value gets its own step on the TensorBoard x-axis.
        with summary_writer.as_default():
            tf.summary.scalar('DQN/Avg. Loss', rl_loss.result(), step=frame_count)
            tf.summary.scalar('DQN/Avg. Q-value', rl_q_values.result(), step=frame_count)
            tf.summary.scalar('DQN/Epsilon', data=epsilon, step=frame_count)

    # Per-episode scalar: total of the reward signals fed to rl_rewards.
    with summary_writer.as_default():
        tf.summary.scalar('DQN/Avg. Rewards', rl_rewards.result(), step=episode_count)

    print("Episode: {} | Loss: {} | Q-value: {} | Rewards: {} | epsilon: {}".format(
        episode_count, rl_loss.result(), rl_q_values.result(), rl_rewards.result(), epsilon
    ))

    # Start the next episode with fresh metric accumulators.
    rl_loss.reset_states()
    rl_q_values.reset_states()
    rl_rewards.reset_states()


In [None]:
# Inspect replay transitions 10..19: print action, reward and done flag, then
# plot the four stacked frames of the state and of the next state.
for idx in range(10, 20):
    print(action_history[idx], rewards_history[idx], done_history[idx])

    # First figure: frames of the state; second figure: frames of the next state.
    for frame_stack in (state_history[idx], state_next_history[idx]):
        fig, axs = plt.subplots(ncols=4, figsize=(16, 16))
        for channel, ax in enumerate(axs):
            ax.set_axis_off()
            ax.imshow(frame_stack[:, :, channel])

    plt.show()

import gym

env = gym.make("Breakout-v4")

# Play episodes with the greedy policy (epsilon = 0). The outer loop never
# terminates on its own; interrupt the cell to stop it.
while True:

    obs = env.reset()
    obs = preprocessing(obs)

    # Fill the frame history with copies of the first frame. The original
    # referenced an undefined name `skip_frame` here; the history length is
    # agent_history_length.
    for _ in range(agent_history_length):
        obs_history.append(obs)

    state = np.stack(obs_history, axis=2)

    is_done = False

    while is_done is False:
        # Pass the un-batched state: epsilon_greedy_policy adds the batch
        # axis itself, so expanding it here as well would produce a 5-D input.
        action = epsilon_greedy_policy(model, state, 0)

        obs_next, reward, is_done, _ = env.step(action)
        obs_next = preprocessing(obs_next)
        obs_history.append(obs_next)

        # The next state is the updated frame history.
        state = np.stack(obs_history, axis=2)

In [None]:
import gym

env = gym.make("Breakout-v4")

# Render episodes played with epsilon = 1 (every action is random) and print
# the reward of each step. Runs until the cell is interrupted.
while True:
    first_frame = preprocessing(env.reset())

    # Fill the frame history with four copies of the first frame.
    obs_history.extend([first_frame] * 4)
    state = np.stack(obs_history, axis=2)

    is_done = False
    while is_done is False:
        env.render()
        action = epsilon_greedy_policy(model, state, 1)

        obs_next, reward, is_done, _ = env.step(action)
        obs_history.append(preprocessing(obs_next))

        # The updated history becomes the state for the next step.
        state = np.stack(obs_history, axis=2)
        print(reward)