In [None]:
# pip install gym atari-py

In [None]:
import gym
import numpy as np
import tensorflow as tf

from PIL import Image
from collections import deque

In [None]:
class Model(tf.keras.Model):
    """Q-network: two convolutional layers followed by two dense layers.

    The network receives a batch of channel-first inputs and produces
    ``output_size`` values per batch element (one per action).
    """

    def __init__(self, output_size):
        """Create the layers of the network.

        Args:
            output_size: Number of units of the final dense layer, i.e. how
                many values the network returns per input.
        """
        super(Model, self).__init__()

        self.output_size = output_size

        self.cnn1 = tf.keras.layers.Conv2D(
            filters=16,
            kernel_size=(8, 8), strides=(4, 4),
            activation=tf.keras.activations.relu)
        self.cnn2 = tf.keras.layers.Conv2D(
            filters=32,
            kernel_size=(4, 4), strides=(2, 2),
            activation=tf.keras.activations.relu)
        # Flatten holds no weights; it is built once here instead of being
        # re-instantiated on every forward pass.
        self.flatten = tf.keras.layers.Flatten()
        self.dense1 = tf.keras.layers.Dense(256, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(self.output_size)

    def call(self, inputs):
        """Run a forward pass.

        Args:
            inputs: Rank-4 tensor-like laid out as
                (batch, channels, height, width). Any numeric dtype is
                accepted; it is cast to float32.

        Returns:
            Tensor of shape (batch, output_size) with no final activation.
        """
        inputs = tf.cast(inputs, dtype=tf.float32)
        # Move axis 1 to the end: the conv layers are used with their
        # default channels-last layout.
        inputs = tf.transpose(inputs, [0, 2, 3, 1])
        x = self.cnn1(inputs)
        x = self.cnn2(x)
        x = self.flatten(x)
        x = self.dense1(x)
        return self.dense2(x)

In [None]:
def preprocessing(observation):
    """Turn a raw frame into an 84x84 grayscale array.

    The frame is converted to 8-bit grayscale ("L" mode), resized to
    84 wide by 110 high, and then an 84-row window starting at row 18 is
    cut out of it.

    Args:
        observation: Array accepted by ``PIL.Image.fromarray``.

    Returns:
        numpy array view of the cropped 84x84 image.
    """
    target_width, target_height = 84, 110
    top = 18

    frame = (
        Image.fromarray(observation)
        .convert("L")
        .resize((target_width, target_height))
    )

    # PIL crop boxes are (left, upper, right, lower); the window is square,
    # so its height equals the target width.
    crop_box = (0, top, target_width, top + target_width)
    return np.asarray(frame.crop(crop_box))

In [None]:
def epsilon_greedy_policy(model, state, epsilon=0.1):
    """Pick an action with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index in
    ``[0, model.output_size)`` is returned; otherwise the index of the
    largest value in the first row of ``model(state)`` is returned.

    Args:
        model: Callable with an ``output_size`` attribute; ``model(state)``
            must yield a 2-D array-like of shape (batch, output_size).
        state: Input forwarded unchanged to ``model``.
        epsilon: Exploration probability.

    Returns:
        The chosen action as a plain Python ``int`` on both branches, so
        callers can pass it to the environment or store it in a replay
        buffer without caring which branch produced it.
    """
    action_space = model.output_size
    if np.random.sample() < epsilon:
        return int(np.random.randint(0, action_space))

    # np.asarray also accepts eager tensors; only the first batch row is used.
    q_values = np.asarray(model(state))
    return int(np.argmax(q_values[0]))

In [None]:
def train(model, target_model, mini_batch, optimizer, discount_factor=0.9):
    """Perform one Q-learning gradient step on a mini-batch of transitions.

    Args:
        model: Network being optimised; called on the batch of states.
        target_model: Network used to evaluate the next states for the
            bootstrap target. It is only read here, never updated.
        mini_batch: Iterable of ``(state, action, reward, next_state, done)``
            tuples. ``state`` and ``next_state`` are expected to carry a
            leading axis of length 1 (only element 0 is used); ``action``
            must be convertible with ``int()``.
        optimizer: Optimizer exposing ``apply_gradients``.
        discount_factor: Multiplier applied to the bootstrapped next-state
            value.

    Returns:
        The mean squared error between the target values and the values
        predicted by ``model`` for the taken actions.
    """
    state, action, reward, next_state, not_done = [], [], [], [], []

    for _state, _action, _reward, _next_state, _done in mini_batch:
        state.append(_state[0])
        # Actions may be stored as Python ints or scalar tensors; normalise.
        action.append([int(_action)])
        reward.append([_reward])
        next_state.append(_next_state[0])
        # 0 for terminal transitions so the bootstrap term is masked out.
        not_done.append([0.0 if _done else 1.0])

    # Plain constants are enough: none of the batch data is trainable, so
    # there is no reason to allocate tf.Variable objects on every step.
    state = tf.constant(np.stack(state))
    action = tf.constant(action, dtype=tf.int32)
    reward = tf.constant(reward, dtype=tf.float32)
    next_state = tf.constant(np.stack(next_state))
    not_done = tf.constant(not_done, dtype=tf.float32)

    # Target is computed outside the tape, so no gradient flows through it.
    next_q = tf.math.reduce_max(target_model(next_state), axis=1, keepdims=True)
    target_q = reward + discount_factor * next_q * not_done
    # Squeeze only the trailing axis so a batch of one keeps shape (1,).
    target_q = tf.squeeze(target_q, axis=1)

    with tf.GradientTape() as tape:
        # Select, per batch row, the value of the action that was taken.
        q = tf.gather_nd(model(state), action, batch_dims=1)
        loss = tf.losses.mean_squared_error(target_q, q)

    variables = model.trainable_weights
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss

In [None]:
env = gym.make('Breakout-v0')

# Number of discrete actions; used as the output size of both networks.
action_space = env.action_space.n

# Number of environment steps each chosen action is repeated for, and also
# the number of frames kept in observation_buffer.
skipframe = 4
batch_size = 32
max_episode = 100000
replay_memory_maxlen = int(1e+6)
learning_rate=0.000001
# Initial exploration rate; the training loop decays it per step.
epsilon = 1.0
# Pass the configured rate explicitly: without this argument RMSprop runs
# with its library default and the learning_rate setting has no effect.
opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)

model = Model(action_space)
target_model = Model(action_space)

# Transitions (state, action, reward, next_state, done) for experience replay.
replay_buffer = deque(maxlen=replay_memory_maxlen)
# Most recent preprocessed frames; stacked to form the network input.
observation_buffer = deque(maxlen=skipframe)

In [None]:
try:
    # Call both networks once on a dummy input so their weights are created
    # before they are copied from model to target_model.
    model(tf.ones((1, 4, 84, 84)))
    target_model(tf.ones((1, 4, 84, 84)))
    target_model.set_weights(model.get_weights())

    step = 1

    for episode in range(max_episode):
        episode_done = False
        observation_buffer.clear()
        # Life counter compared against etc['ale.lives'] to detect a lost
        # life (assumes the game starts with 5 lives -- TODO confirm).
        lives = 5
        # Losses are collected over the whole episode so the mean printed at
        # the end covers every training step, not just the last one.
        loss = []

        observation = env.reset()
        print("EPISODE:", episode + 1, observation.shape)
        observation = preprocessing(observation)

        # Fill the frame stack with copies of the first frame.
        while len(observation_buffer) < observation_buffer.maxlen:
            observation_buffer.append(observation)

        while not episode_done:
            state = np.expand_dims(observation_buffer, axis=0)

            # Linearly decayed exploration rate with a floor of 0.1. The
            # action is normalised to a Python int before it is sent to the
            # environment and stored in the replay buffer.
            action = int(epsilon_greedy_policy(
                model, state, max(0.1, epsilon - 0.9/1000000*step)))
            reward_sum = 0
            done = False

            for _ in range(skipframe):
                if not done:
                    env.render()
                    next_observation, reward, episode_done, etc = env.step(action)
                    next_observation = preprocessing(next_observation)
                    done = episode_done
                    # Only rewards from steps that were actually taken count;
                    # losing a life is penalised with -1 instead.
                    reward_sum += -1 if etc['ale.lives'] != lives else reward
                    lives = etc['ale.lives']
                # After the episode ended the last frame is repeated so the
                # stack still advances by skipframe entries.
                observation_buffer.append(next_observation)

            next_state = np.expand_dims(observation_buffer, axis=0)
            replay_buffer.append((state, action, reward_sum, next_state, done))

            # Start learning only once the buffer holds enough transitions.
            if len(replay_buffer) > 10000:
                mini_batch = [
                    replay_buffer[index]
                    for index in np.random.choice(len(replay_buffer), batch_size, replace=False) ]
                loss.append(train(model, target_model, mini_batch, opt))

            # Periodically sync the target network with the online network.
            if step % 1000 == 0 and model.get_weights():
                target_model.set_weights(model.get_weights())

            step += 1

        if loss:
            print(np.mean(loss))
        model.save_weights('./checkpoints/my_checkpoint')
finally:
    env.close()

In [None]:
try:
    # Call the network once on a dummy input so its weights exist before the
    # checkpoint is loaded into them.
    model(tf.ones((1, 4, 84, 84)))
    model.load_weights('./checkpoints/my_checkpoint')

    for episode in range(max_episode):
        episode_done = False
        observation_buffer.clear()

        observation = env.reset()
        observation = preprocessing(observation)

        # Fill the frame stack with copies of the first frame.
        while len(observation_buffer) < observation_buffer.maxlen:
            observation_buffer.append(observation)

        while not episode_done:
            env.render()
            state = np.expand_dims(observation_buffer, axis=0)
            # epsilon=0 means purely greedy; cast so the environment always
            # receives a plain Python int rather than a tensor.
            action = int(epsilon_greedy_policy(model, state, 0))
            next_observation, reward, episode_done, etc = env.step(action)
            next_observation = preprocessing(next_observation)
            observation_buffer.append(next_observation)
finally:
    env.close()