# Use DQN to Run Pong

In [None]:
%matplotlib inline

import sys
import itertools
import imp
import logging

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import nn
from tensorflow import losses
from tensorflow import optimizers
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models

# Reload the logging module before configuring it, presumably so that
# basicConfig is applied to a fresh root logger even when the notebook kernel
# has already installed handlers.
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12 (the file-level `import imp` should be dropped too
# when targeting 3.12+).
import importlib

importlib.reload(logging)
# Send DEBUG-and-above records to stdout with a short time-of-day prefix.
logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

Environment

In [None]:
# Create the Pong environment and seed it for reproducibility.
env = gym.make('PongDeterministic-v4')
env.seed(0)
# Log every attribute of the environment object, then of its spec.
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)
for key, value in vars(env.spec).items():
    logging.info('%s: %s', key, value)

Replayer

In [None]:
class DQNReplayer:
    """Fixed-capacity experience replay buffer backed by a pandas DataFrame.

    Transitions are written into a ring of ``capacity`` rows; once the ring is
    full the oldest rows are overwritten.
    """

    def __init__(self, capacity):
        """Allocate an empty table with one row per storable transition."""
        fields = ['state', 'action', 'reward', 'next_state', 'done']
        self.memory = pd.DataFrame(index=range(capacity), columns=fields)
        self.capacity = capacity
        self.count = 0  # number of rows currently holding a transition
        self.i = 0      # row that the next call to store() writes to

    def store(self, *args):
        """Write one transition (one value per column) into the next slot."""
        self.memory.loc[self.i] = args
        # Advance the write cursor, wrapping around at the end of the table.
        self.i = (self.i + 1) % self.capacity
        # The fill level grows by one per call until it saturates at capacity.
        if self.count >= self.capacity:
            self.count = self.capacity
        else:
            self.count = self.count + 1

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly, with replacement.

        Returns:
            A tuple with one stacked numpy array per column, in column order.
        """
        indices = np.random.choice(self.count, size=size)
        batch = []
        for field in self.memory.columns:
            batch.append(np.stack(self.memory.loc[indices, field]))
        return tuple(batch)

Agent

In [None]:
class DQNAgent:
    """Deep Q-Network agent that acts on stacks of four grayscale frames.

    The agent keeps an online network (``evaluate_net``) that is trained and
    used for acting, and a target network (``target_net``) that supplies the
    bootstrap values and is synchronised with the online network at the start
    of every training episode.
    """

    def __init__(self, env):
        """Create the replay buffer and both Q-networks.

        Args:
            env: environment whose ``action_space.n`` gives the number of
                discrete actions.
        """
        self.action_n = env.action_space.n
        self.replayer = DQNReplayer(10000)
        self.evaluate_net = self.build_net(verbose=True)
        # clone_model only reproduces the architecture; the clone is created
        # with freshly initialised weights, so they must be copied explicitly.
        self.target_net = models.clone_model(self.evaluate_net)
        self._sync_target_net()

    def _sync_target_net(self):
        """Copy the online network's weights into the target network."""
        self.target_net.set_weights(self.evaluate_net.get_weights())

    def build_net(self, verbose=False):
        """Build and compile the convolutional Q-network.

        The network maps a (210, 160, 4) frame stack to one Q-value per
        action (``self.action_n`` outputs).

        Args:
            verbose: when true, print the Keras model summary.

        Returns:
            The compiled Keras model.
        """
        model = models.Sequential()
        model.add(layers.Conv2D(32, 8, strides=4, activation=nn.relu,
                                input_shape=(210, 160, 4)))
        model.add(layers.Conv2D(64, 4, strides=2, activation=nn.relu))
        model.add(layers.Conv2D(64, 3, strides=1, activation=nn.relu))
        model.add(layers.Flatten())
        model.add(keras.layers.Dense(16, activation=nn.relu))
        model.add(keras.layers.Dense(8, activation=nn.relu))
        # One output per action. A hard-coded width would break indexing by
        # action in learn() and hide the remaining actions from the greedy
        # policy whenever the environment has a different action count.
        model.add(keras.layers.Dense(self.action_n))
        model.compile(loss=losses.mse, optimizer=optimizers.Adam())
        if verbose:
            model.summary()
        return model

    def reset(self, mode=None):
        """Start a new episode.

        Args:
            mode: ``'train'`` enables exploration, replay storage and
                learning; any other value gives a purely greedy agent.
        """
        self.mode = mode
        # Flat list of [state, reward, done, action] per step; needed in both
        # train and test mode because the frame stack is built from it.
        self.trajectory = []
        if self.mode == 'train':
            # Refresh the target network from the online network without
            # rebuilding the model every episode.
            self._sync_target_net()

    def step(self, observation, reward, done):
        """Choose an action for the current observation.

        In train mode the previous transition is also stored in the replay
        buffer, and a learning update runs once the buffer is at least 95%
        full.

        Args:
            observation: image array; it is averaged over its last axis to
                get a single-channel frame.
            reward: reward received for the previous action.
            done: whether the episode has ended at this observation.

        Returns:
            The index of the selected action.
        """
        # Convert to one grayscale channel scaled to roughly [-0.5, 0.5).
        obs = observation.mean(axis=-1, keepdims=True) / 256. - 0.5
        if len(self.trajectory) < 4:
            # First step of the episode: fill the stack with the same frame.
            state = obs.repeat(4, axis=-1)
        else:
            # Drop the oldest frame of the previous stack, append the new one.
            prev_state = self.trajectory[-4]
            state = np.concatenate([prev_state[:, :, 1:], obs], axis=-1)

        if self.mode == 'train' and np.random.rand() < 0.001:
            # epsilon-greedy policy in train mode
            action = np.random.randint(self.action_n)
        else:
            qs = self.evaluate_net.predict(state[np.newaxis], verbose=0)
            action = np.argmax(qs)

        self.trajectory += [state, reward, done, action,]
        if self.mode == 'train':
            if len(self.trajectory) >= 8:
                # The last two steps hold (s, a) and (s', r, done).
                state, _, _, act, next_state, reward, done, _ = \
                        self.trajectory[-8:]
                self.replayer.store(state, act, reward, next_state, done)
            if self.replayer.count >= self.replayer.capacity * 0.95:
                # skip first few episodes for speed
                self.learn()
        return action

    def close(self):
        """Release resources at the end of an episode (nothing to do)."""
        pass

    def learn(self):
        """Run one training step on a batch of 64 replayed transitions."""
        # replay
        states, actions, rewards, next_states, dones = self.replayer.sample(64)

        # train
        next_qs = self.target_net.predict(next_states, verbose=0)
        next_max_qs = next_qs.max(axis=-1)
        # One-step TD target with discount 0.99; no bootstrap on terminal
        # transitions.
        us = rewards + 0.99 * (1. - dones) * next_max_qs
        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets = self.evaluate_net.predict(states, verbose=0)
        targets[np.arange(us.shape[0]), actions] = us
        self.evaluate_net.fit(states, targets, verbose=0)


# Instantiate the agent for the environment created above; the constructor
# builds the Q-networks and prints the model summary.
agent = DQNAgent(env)

In [None]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` interacting with ``env``.

    The agent is asked for an action on every observation, including the
    terminal one, so that it can see the final reward and ``done`` flag.

    Args:
        env: environment with ``reset()``, ``step(action)`` returning a
            4-tuple, and ``render()``.
        agent: object with ``reset(mode=...)``, ``step(observation, reward,
            done)`` and ``close()``.
        max_episode_steps: optional cap on the number of environment steps;
            a falsy value disables the cap.
        mode: passed through to ``agent.reset``.
        render: when true, render the environment after every agent step.

    Returns:
        ``(episode_reward, elapsed_steps)``: the summed reward and the number
        of environment steps taken.
    """
    observation = env.reset()
    agent.reset(mode=mode)
    reward, done = 0., False
    total_reward = 0.
    step_count = 0
    truncated = False
    while not truncated:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            break
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        step_count += 1
        # Stop early once the optional step budget is used up.
        truncated = bool(max_episode_steps) and step_count >= max_episode_steps
    agent.close()
    return total_reward, step_count


logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    # Train on the unwrapped environment, reusing the wrapper's step limit
    # as the per-episode cap.
    episode_reward, elapsed_steps = play_episode(
        env.unwrapped, agent,
        max_episode_steps=env._max_episode_steps, mode='train')
    episode_rewards.append(episode_reward)
    logging.debug('train episode %d: reward = %.2f, steps = %d',
                  episode, episode_reward, elapsed_steps)
    # Stop once the mean reward of the latest (up to) 50 episodes tops 20.5.
    if np.mean(episode_rewards[-50:]) > 20.5:
        break
    episode += 1
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    # No mode is passed, so the agent runs without exploration or learning.
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards += [episode_reward]
    logging.debug('test episode %d: reward = %.2f, steps = %d',
                  episode, episode_reward, elapsed_steps)
# Summarise the 100 evaluation episodes as mean ± standard deviation.
mean_reward, std_reward = np.mean(episode_rewards), np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', mean_reward, std_reward)