In [None]:
import numpy as np
from PIL import Image


def preprocessing(image, crop_box=(0, 17, 160, 17 + 177), size=(84, 84)):
    """Turn a raw frame into a normalized gray-scale array for the network.

    The frame is cropped, converted to gray-scale, resized and scaled to
    the range [0, 1].

    Parameters
    ----------
    image : array-like
        Raw frame accepted by ``PIL.Image.fromarray`` (presumably an
        (H, W, 3) uint8 RGB Atari frame -- TODO confirm against the caller).
    crop_box : tuple of int, optional
        ``(left, upper, right, lower)`` pixel box kept from the frame.
        The default keeps a 160 x 177 region starting 17 rows from the top.
    size : tuple of int, optional
        ``(width, height)`` of the returned image. Defaults to 84 x 84.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(height, width)`` with values in [0, 1].
    """
    frame = Image.fromarray(image)

    # Crop to the region of interest (by default a 160 x 177 window that
    # roughly captures the playing area).
    frame = frame.crop(box=tuple(crop_box))

    # Convert the RGB representation to 8-bit gray-scale.
    frame = frame.convert("L")

    # Down-sample to the requested size (84 x 84 by default).
    frame = frame.resize(size=tuple(size))

    # Normalize 8-bit intensities to [0, 1].
    pixels = np.asarray(frame, dtype=np.float32)
    pixels /= 255.0

    return pixels

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import LSTM
from tensorflow.keras.activations import relu


def create_model(num_actions, trainable=True):
    """Build a convolutional + LSTM Q-network over a sequence of frames.

    The three ``Conv2D`` layers are applied directly to the 5-D input, which
    relies on Keras treating the extra leading axis as part of the batch
    (NOTE(review): confirm this for the installed Keras version).

    Note: a later cell in this notebook defines ``create_model`` again using
    ``TimeDistributed`` wrappers; after "Run All" that later definition is
    the one in effect.

    Parameters
    ----------
    num_actions : int
        Number of units of the linear output layer (one per action).
    trainable : bool, optional
        Forwarded to the ``Model`` constructor.

    Returns
    -------
    tensorflow.keras.Model
        Model mapping ``(batch, 10, 84, 84, 1)`` inputs to
        ``(batch, num_actions)`` outputs.
    """
    # The input is a sequence of 10 single-channel 84 x 84 frames.
    inputs = Input(shape=(10, 84, 84, 1))
    # The first hidden layer convolves 32 filters of 8 x 8 with stride 4,
    # with the input image and applies a rectifier nonlinearity.
    hidden1 = Conv2D(
        filters=32,
        kernel_size=(8, 8),
        strides=4,
        activation=relu
    )(inputs)
    # The second hidden layer convolves 64 filters of 4 x 4 with stride 2,
    # again followed by a rectifier nonlinearity.
    hidden2 = Conv2D(
        filters=64,
        kernel_size=(4, 4),
        strides=2,
        activation=relu
    )(hidden1)
    # This is followed by a third convolutional layer that convolves 64 filters of 3 x 3,
    # with stride 1 followed by a rectifier.
    hidden3 = Conv2D(
        filters=64,
        kernel_size=(3, 3),
        strides=1,
        activation=relu
    )(hidden2)

    # Flatten only the per-frame axes and KEEP the time axis, so the LSTM
    # receives one feature vector per frame. Collapsing the time axis into
    # the features would hand the LSTM a single time step and remove any
    # recurrence over the frame sequence.
    time_steps = hidden3.shape[1]
    elems_size = int(tf.math.reduce_prod(hidden3.shape[2:]))
    flatten = tf.reshape(hidden3, [-1, time_steps, elems_size])
    # The first fully connected layer is replaced by a recurrent LSTM layer of the same size.
    hidden4 = LSTM(units=512)(flatten)
    # The output layer is a fully-connected linear layer with a single output for each valid action.
    outputs = Dense(units=num_actions)(hidden4)

    return Model(inputs=inputs, outputs=outputs, trainable=trainable)

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import LSTM
from tensorflow.keras.activations import relu


def create_model(num_actions, trainable=True, history_length=10):
    """Build the DRQN: per-frame convolutions followed by an LSTM.

    Every frame of the input sequence goes through the same three
    convolutional layers (via ``TimeDistributed``), is flattened to a
    feature vector, and the resulting sequence is fed to an LSTM whose
    final output drives a linear layer with one unit per action.

    Parameters
    ----------
    num_actions : int
        Number of units of the linear output layer (one per action).
    trainable : bool, optional
        Forwarded to the ``Model`` constructor.
    history_length : int, optional
        Number of frames in the input sequence. Defaults to 10, the value
        that was previously hard-coded.

    Returns
    -------
    tensorflow.keras.Model
        Model mapping ``(batch, history_length, 84, 84, 1)`` inputs to
        ``(batch, num_actions)`` outputs.
    """
    # The input is a sequence of single-channel 84 x 84 frames.
    inputs = Input(shape=(history_length, 84, 84, 1))
    # The first hidden layer convolves 32 filters of 8 x 8 with stride 4,
    # with the input image and applies a rectifier nonlinearity.
    hidden1 = TimeDistributed(Conv2D(
        filters=32,
        kernel_size=(8, 8),
        strides=4,
        activation=relu
    ))(inputs)
    # The second hidden layer convolves 64 filters of 4 x 4 with stride 2,
    # again followed by a rectifier nonlinearity.
    hidden2 = TimeDistributed(Conv2D(
        filters=64,
        kernel_size=(4, 4),
        strides=2,
        activation=relu
    ))(hidden1)
    # This is followed by a third convolutional layer that convolves 64 filters of 3 x 3,
    # with stride 1 followed by a rectifier.
    hidden3 = TimeDistributed(Conv2D(
        filters=64,
        kernel_size=(3, 3),
        strides=1,
        activation=relu
    ))(hidden2)

    # One flat feature vector per frame; the time axis is preserved.
    flatten = TimeDistributed(Flatten())(hidden3)
    # The first fully connected layer is replaced by a recurrent LSTM layer of the same size.
    hidden4 = LSTM(units=512)(flatten)
    # The output layer is a fully-connected linear layer with a single output for each valid action.
    outputs = Dense(units=num_actions)(hidden4)

    return Model(inputs=inputs, outputs=outputs, trainable=trainable)

In [None]:
import gym
import matplotlib.pyplot as plt

# Smoke test: build the environment and the network, then push one
# hand-made input through the model to check that the shapes line up.
env = gym.make("Breakout-v4")
obs = env.reset()

model = create_model(env.action_space.n)
model.summary()

preprocessed = preprocessing(obs)

# Repeat the first preprocessed frame 10 times and add a trailing channel
# axis, then add a leading batch axis of size 1.
state = np.expand_dims([preprocessed]*10, axis=-1)
sample = np.expand_dims(state, axis=0)

# Bare last expression so the notebook displays the model output.
model(sample)

## Optimizer

In [None]:
from tensorflow.keras.optimizers import Adadelta

# Optimizer hyperparameters.
learning_rate = 0.1
# NOTE: despite its name, this value is passed to Adadelta as `rho` below.
momentum = 0.95

# Optimizer used by the training step to apply gradients to the online model.
optimizer = Adadelta(learning_rate=learning_rate, rho=momentum)

## Behavior policy

In [None]:
# Exploration schedule for the epsilon-greedy behavior policy.
# `epsilon` is the current exploration rate; the training loop decreases it
# by `epsilon_interval / epsilon_greedy_frames` per frame and never lets it
# drop below `epsilon_min`.
epsilon = 1.0
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
epsilon_greedy_frames = 1000000.0

def epsilon_greedy_policy(model, state, epsilon):
    """Choose an action with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is
    returned; otherwise the index of the largest model output for
    ``state`` is returned.

    Parameters
    ----------
    model : callable with an ``output_shape`` attribute
        ``output_shape`` must unpack into two values, the second being the
        number of actions. Called as ``model(batch, training=False)``.
    state : array-like
        A single (un-batched) state; a leading axis of size 1 is added
        before it is passed to the model.
    epsilon : float
        Exploration probability.

    Returns
    -------
    int-like
        Index of the selected action.
    """
    batch_dim, action_count = model.output_shape
    batched_state = np.expand_dims(state, axis=0)

    explore = np.random.sample() < epsilon
    if not explore:
        # Greedy branch: evaluate the network on the CPU and take the
        # best-scoring action of the single batch element.
        with tf.device('/CPU:0'):
            q_values = model(batched_state, training=False)
            return np.argmax(q_values, axis=1)[0]

    return np.random.choice(action_count)

# Smoke test of both branches: epsilon=1 always takes the random branch,
# epsilon=0 always takes the greedy (argmax) branch.
print(epsilon_greedy_policy(model, state, 1))
print(epsilon_greedy_policy(model, state, 0))

In [None]:
# ---- Agent / replay hyperparameters ----
num_actions                     = env.action_space.n
# Number of frames kept in `obs_history` (the length of one input sequence).
agent_history_length            = 10
# Not referenced by the other cells visible in this notebook.
action_repeat                   = 0
minibatch_size                  = 32
# Maximum number of entries kept in each replay deque below.
replay_memory_size              = 1000000
# Number of initial frames played with a fully random policy before any
# training happens. (This name used to be assigned twice in a row; only the
# value in effect is kept. A larger value such as 50000 would collect more
# random experience before learning starts.)
replay_start_size               = 1000

# Number of minibatch updates the training loop runs per frame.
update_frequency                = 4
# The target network is synchronized every this many frames.
target_network_update_frequency = 10000
discount_factor                 = 0.99


from collections import deque

# Sliding window of the most recent preprocessed frames.
obs_history         = deque(maxlen=agent_history_length)
# Replay memory: five parallel deques. Each entry appended by the training
# loop is a list of per-step values, and the same index refers to the same
# entry in all five.
state_history       = deque(maxlen=replay_memory_size)
action_history      = deque(maxlen=replay_memory_size)
rewards_history     = deque(maxlen=replay_memory_size)
state_next_history  = deque(maxlen=replay_memory_size)
done_history        = deque(maxlen=replay_memory_size)

In [None]:
# Running metrics for logging, created on the CPU device.
with tf.device('/CPU:0'):
    # Define our metrics
    # NOTE: `rl_rewards` is a Sum (a running total), although its label
    # reads 'Avg. Rewards'.
    rl_rewards  = tf.keras.metrics.Sum('Avg. Rewards', dtype=tf.float32)
    rl_loss     = tf.keras.metrics.Mean('Avg. Loss', dtype=tf.float32)
    rl_q_values = tf.keras.metrics.Mean('Avg. Q-value', dtype=tf.float32)

In [None]:
from tensorflow.keras.losses import MSE

@tf.function
def train(state_sample, action_sample, rewards_sample, state_next_sample, done_sample):
    """Run one gradient step of Q-learning on a minibatch.

    Uses the module-level ``model`` (online network), ``model_target``
    (target network), ``optimizer``, ``discount_factor`` and ``num_actions``.

    Parameters
    ----------
    state_sample, state_next_sample
        Batches of states fed to ``model`` and ``model_target`` respectively.
    action_sample
        Batch of integer action indices, one-hot encoded to select the
        Q-value of the action that was taken.
    rewards_sample
        Batch of rewards, added element-wise to the discounted bootstrap.
    done_sample
        Batch of done flags (1.0 = terminal). Presumably shaped
        ``(batch, 1)`` so that it broadcasts over the action axis -- the
        training loop in this notebook stores each flag as ``[done]``.

    Returns
    -------
    tuple
        ``(loss, max_q)``: the MSE loss and the largest Q-value the online
        network predicted for the batch.
    """
    # Bootstrap from the target network; Q-values of terminal transitions
    # are zeroed out before taking the per-sample maximum.
    next_q = model_target(state_next_sample, training=False)
    not_done = 1 - done_sample
    best_next_q = tf.reduce_max(next_q * not_done, axis=1)
    td_target = rewards_sample + discount_factor * best_next_q

    action_mask = tf.one_hot(action_sample, num_actions)
    with tf.GradientTape() as tape:
        q_values = model(state_sample)
        # Keep only the Q-value of the action actually taken.
        q_taken = tf.reduce_sum(q_values * action_mask, axis=1)
        loss = MSE(td_target, q_taken)

    # Backpropagation through the online network only.
    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss, tf.reduce_max(q_values)

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
from datetime import datetime

# Create a TensorBoard summary writer in a fresh directory under
# logs/DRQN/, named after the current local time (YYYYmmdd-HHMMSS).
current_time = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/DRQN/' + current_time
summary_writer = tf.summary.create_file_writer(log_dir)

In [None]:
import numpy as np

# ---- DRQN training loop ----
# Online network (trained) and target network (periodically synchronized
# copy that provides the bootstrap values in `train`).
model = create_model(num_actions)
model_target = create_model(num_actions, trainable=False)
model_target.set_weights(model.get_weights())

max_frame_count = 10000000

frame_count = 0
episode_count = 0

while frame_count <= max_frame_count:
    lives = 5
    info = {'ale.lives': 5}
    episode_count += 1

    # A "turn" is the play between two life losses; every episode starts
    # with a fresh turn.
    turn_done = True
    episode_done = False

    obs = env.reset()
    obs = preprocessing(obs)

    # Truthiness test rather than `is False`, so a non-`bool` done flag
    # (e.g. a NumPy boolean) cannot end the episode after a single step.
    while not episode_done:
        frame_count += 1

        if turn_done:
            # New turn: start fresh per-turn transition buffers and fill the
            # frame window with copies of the current frame.
            state_step, action_step, state_next_step, rewards_step, done_step = [], [], [], [], []
            for _ in range(agent_history_length):
                obs_history.append(obs)
            state = np.expand_dims(obs_history, axis=-1)
            turn_done = False

        if frame_count <= replay_start_size:
            # Warm-up phase: act fully at random.
            action = epsilon_greedy_policy(model, state, 1)
        else:
            action = epsilon_greedy_policy(model, state, epsilon)
            # Linear epsilon decay, floored at epsilon_min.
            epsilon -= epsilon_interval / epsilon_greedy_frames
            epsilon = max(epsilon, epsilon_min)

        reward_signal = 0

        # env.render()
        obs, reward, episode_done, info = env.step(action)
        obs = preprocessing(obs)
        reward_signal += reward

        if lives > info['ale.lives']:
            turn_done = True

        if turn_done or episode_done:
            # Penalize losing a life / ending the episode. (The original
            # `-= -1` added +1 here, i.e. rewarded the agent for dying.)
            reward_signal -= 1

        # Clip the reward to {-1, 0, +1}.
        reward_signal = np.sign(reward_signal)

        # Slide the frame window forward so that the next state actually
        # contains the newly observed frame; without this append the next
        # state would be identical to the current one.
        obs_history.append(obs)
        state_next = np.expand_dims(obs_history, axis=-1)
        done = 1.0 if turn_done or episode_done else 0.0

        reward_signal = np.float32(reward_signal)
        done = np.float32(done)

        state_step.append(state)
        action_step.append(action)
        rewards_step.append(reward_signal)
        state_next_step.append(state_next)
        # Stored as a 1-element list so a batch of flags has shape
        # (batch, 1) and broadcasts over the action axis in `train`.
        done_step.append([done])

        if turn_done or episode_done:
            # Commit the finished turn to the replay memory now. The
            # per-turn buffers are recreated at the start of the next turn,
            # so committing only at episode end would silently drop every
            # turn except the last one.
            state_history.append(state_step)
            action_history.append(action_step)
            rewards_history.append(rewards_step)
            state_next_history.append(state_next_step)
            done_history.append(done_step)

        lives = info['ale.lives']
        state = state_next

        with tf.device('/CPU:0'):
            rl_rewards(reward_signal)

        # Train only after the warm-up phase and once at least one turn has
        # been stored (sampling from an empty memory would raise).
        if frame_count > replay_start_size and len(done_history) > 0:
            for _ in range(update_frequency):
                # Pick random stored turns, then one random step from each.
                indices = np.random.randint(len(done_history), size=minibatch_size)

                state_sample, action_sample, rewards_sample, state_next_sample, done_sample = [], [], [], [], []
                for idx in indices:
                    step_idx = np.random.randint(len(done_history[idx]))
                    state_sample.append(state_history[idx][step_idx])
                    action_sample.append(action_history[idx][step_idx])
                    rewards_sample.append(rewards_history[idx][step_idx])
                    state_next_sample.append(state_next_history[idx][step_idx])
                    done_sample.append(done_history[idx][step_idx])

                state_sample = np.array(state_sample)
                action_sample = np.array(action_sample)
                rewards_sample = np.array(rewards_sample)
                state_next_sample = np.array(state_next_sample)
                done_sample = np.array(done_sample)

                loss, q_max = train(state_sample, action_sample, rewards_sample, state_next_sample, done_sample)

                with tf.device('/CPU:0'):
                    rl_loss(loss)
                    rl_q_values(q_max)

        if frame_count % target_network_update_frequency == 0:
            # update the target network with new weights
            model_target.set_weights(model.get_weights())

        with summary_writer.as_default():
            tf.summary.scalar('DRQN/Avg. Loss', rl_loss.result(), step=frame_count)
            tf.summary.scalar('DRQN/Avg. Q-value', rl_q_values.result(), step=frame_count)
            tf.summary.scalar('DRQN/Epsilon', data=epsilon, step=frame_count)
            # tf.summary.scalar(
            #     'DRQN/GPU usages',
            #     data=tf.config.experimental.get_memory_usage("GPU:0"),
            #     step=frame_count
            # )

    # End of episode: log the episode's summed reward signal, print a
    # one-line report, and reset the running metrics for the next episode.
    with summary_writer.as_default():
        tf.summary.scalar('DRQN/Avg. Rewards', rl_rewards.result(), step=episode_count)

    print("Episode: {} | Loss: {} | Q-value: {} | Rewards: {} | epsilon: {}".format(
        episode_count, rl_loss.result(), rl_q_values.result(), rl_rewards.result(), epsilon
    ))

    rl_loss.reset_states()
    rl_q_values.reset_states()
    rl_rewards.reset_states()