# Deep Q-Learning

References:
- https://www.nervanasys.com/demystifying-deep-reinforcement-learning/

## Create Pong environment

In [1]:
import gym
import numpy as np

In [2]:
class PongEnv(object):
    """Wrapper around gym's ``Pong-v0`` that emits small frame-difference observations.

    Every raw frame is cropped, downsampled and binarised by
    ``_preprocess_image``. ``step`` returns the difference between the current
    and the previous preprocessed frame, and ``reset`` returns an all-zero
    observation of the same shape. Only two emulator actions are exposed; the
    agent-facing action index is translated through ``_action_map``.
    """

    def __init__(self):
        self._env = gym.make("Pong-v0")
        # self._env.get_action_meanings()
        # ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
        self._action_map = [2, 3] # only two actions 'RIGHT' and 'LEFT' matter
        # Prime ``_prev_state`` so ``observation_space`` and ``step`` work right away.
        self.reset()

    def _preprocess_image(self, img):
        """Convert 210x160x3 uint8 frame into 40x40x1 float
        Based on https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5#file-pg-pong-py-L30

        The input array is left untouched: the binarised image is built as a
        new array instead of being written through slices (views) of ``img``.

        Args:
            img: raw frame; rows 35..194 are kept and only channel 0 is used.

        Returns:
            float32 array with a trailing channel axis, 1.0 where the sampled
            pixel is neither 0 nor one of the background values (144, 109),
            0.0 elsewhere.
        """
        playfield = img[35:195] # crop
        sampled = playfield[::4, ::4, 0] # downsample by factor of 4, first channel only
        # Background (types 1 and 2) and already-zero pixels map to 0,
        # everything else (paddles, ball) maps to 1.
        is_object = (sampled != 0) & (sampled != 144) & (sampled != 109)
        binary = is_object.astype(np.float32)
        return np.expand_dims(binary, axis=2) # add single channel to make it TF friendly

    def reset(self):
        """Reset the emulator and return an all-zero initial observation."""
        s = self._env.reset()
        s = self._preprocess_image(s)
        self._prev_state = s
        # No previous frame exists yet, so the first "difference" is all zeros.
        return np.zeros_like(s)

    def step(self, action):
        """Apply agent action index ``action`` and return ``(frame_diff, reward, done, info)``."""
        a = self._action_map[action]
        s, r, done, info = self._env.step(a)
        s = self._preprocess_image(s)
        x = s - self._prev_state
        self._prev_state = s
        return x, r, done, info

    @property
    def action_space(self):
        """Discrete space with one entry per element of ``_action_map``."""
        return gym.spaces.discrete.Discrete(len(self._action_map))

    @property
    def observation_space(self):
        """Box describing the observations returned by ``reset`` and ``step``.

        Observations are differences of two binary frames (or zeros right after
        ``reset``), so every element lies in [-1, 1]. The bounds are therefore
        fixed rather than taken from the min/max of the most recent frame.
        """
        return gym.spaces.box.Box(-1.0, 1.0, self._prev_state.shape)

In [3]:
# env = PongEnv()  # alternative environment: the Pong wrapper defined above
# The rest of the notebook runs against CartPole.
env = gym.make('CartPole-v0')

[2016-12-07 22:37:58,599] Making new env: CartPole-v0


## Create agent

In [4]:
from os import path
import numpy as np
import random
import tensorflow as tf
from tensorflow.contrib import slim

In [5]:
def build_small_fc_q_net(states, n_actions):
    """Small fully connected Q-network: flatten -> 64 ReLU units -> linear output.

    Args:
        states: input tensor, flattened before the first dense layer.
        n_actions: number of units in the output layer.

    Returns:
        Output tensor of the final (activation-free) layer, scope 'layer2'.
    """
    net = slim.flatten(states)
    net = slim.fully_connected(net, 64, scope='layer1', activation_fn=tf.nn.relu)
    # Output layer has no activation.
    return slim.fully_connected(net, n_actions, scope='layer2', activation_fn=None)

def build_large_fc_q_net(states, n_actions):
    """'Large' fully connected Q-network: flatten -> 200 ReLU -> 100 ReLU -> linear output.

    Args:
        states: input tensor, flattened before the first dense layer.
        n_actions: number of units in the output layer.

    Returns:
        Output tensor of the final (activation-free) layer, scope 'layer3'.
    """
    hidden_layers = [('layer1', 200), ('layer2', 100)]

    net = slim.flatten(states)
    for scope_name, width in hidden_layers:
        net = slim.fully_connected(net, width,
                                   activation_fn=tf.nn.relu, scope=scope_name)

    # Output layer has no activation.
    return slim.fully_connected(net, n_actions,
                                activation_fn=None, scope='layer3')

def build_cnn_q_net(states, n_actions):
    """Convolutional Q-network: two ReLU conv layers followed by a linear output layer.

    Layers, in order:
      * 'conv1': 32 filters, 8x8 kernel, stride 4, SAME padding, ReLU
      * 'conv2': 16 filters, 8x8 kernel, stride 2, SAME padding, ReLU
      * 'fc':    ``n_actions`` units on the flattened conv output, no activation

    Args:
        states: input tensor fed to the first convolution.
        n_actions: number of units in the output layer.

    Returns:
        Output tensor of the 'fc' layer.
    """
    # Settings shared by both convolutional layers.
    conv_kwargs = dict(padding='SAME', activation_fn=tf.nn.relu)

    features = slim.conv2d(states, 32, [8, 8], stride=4, scope='conv1', **conv_kwargs)
    features = slim.conv2d(features, 16, [8, 8], stride=2, scope='conv2', **conv_kwargs)

    flat_features = slim.flatten(features)
    return slim.fully_connected(flat_features, n_actions,
                                scope='fc', activation_fn=None)

In [25]:
class Agent(object):
    """Deep Q-learning agent with a replay memory and a separate target network.

    The constructor builds a private ``tf.Graph`` containing:

    * ``q_net`` - the online Q-network, evaluated on ``self.state``;
    * ``q_target_net`` - a second network built by the same builder function,
      evaluated on ``self.state_`` and kept out of training via
      ``tf.stop_gradient``;
    * a mean-squared TD-error loss minimised with Adam;
    * an op that copies every ``q_net`` variable into ``q_target_net``;
    * summaries and a ``Saver`` that write to ``logs_dir``.

    Typical use per environment step is ``predict_action`` -> ``observe`` ->
    ``train``. Call ``summary_stats`` at the end of each episode and ``close``
    once the agent is no longer needed.
    """

    def __init__(self, n_actions, state_shape, discount=0.99, max_reward=1.0,
                 lr_decay_factor=0.9, init_lr=1e-3, lr_decay_steps=1e3, lr_min=1e-7,
                 min_epsilon=0.05, epsilon_decay_duration=1e4,
                 q_net_builder_fn=build_small_fc_q_net, update_q_target_frequency=1,
                 summary_update_frequency=1000,
                 mem_capacity=100000, batch_size=32,
                 logs_dir='./dqn_logs'):
        """Build the graph, start a session and restore the latest checkpoint if any.

        Args:
            n_actions: number of discrete actions (width of the Q-network output).
            state_shape: shape of a single state, without the batch dimension.
            discount: factor applied to the max target Q-value in the TD target.
            max_reward: bound used for the ``clipped`` op; that op is currently
                not part of the loss (see the note in the 'loss' scope).
            lr_decay_factor: decay rate passed to ``tf.train.exponential_decay``.
            init_lr: initial learning rate.
            lr_decay_steps: decay period (in global steps) of the staircase schedule.
            lr_min: lower bound applied to the decayed learning rate.
            min_epsilon: lower bound of the exploration probability.
            epsilon_decay_duration: epsilon is ``1 - global_step / epsilon_decay_duration``
                before the ``min_epsilon`` floor is applied.
            q_net_builder_fn: callable ``(states, n_actions) -> q_values`` used to
                build both the online and the target network.
            update_q_target_frequency: in ``train``, the target network is synced
                whenever the step counter is a multiple of this value.
            summary_update_frequency: in ``train``, summaries are written whenever
                the step counter is a multiple of this value; checkpoints are
                saved at five times this period.
            mem_capacity: maximum number of transitions kept in the replay memory.
            batch_size: number of transitions sampled per training step.
            logs_dir: directory for summaries and checkpoints. A checkpoint found
                there is restored at construction time. Defaults to the
                previously hard-coded ``'./dqn_logs'``.
        """
        self.n_actions = n_actions
        self.discount = discount
        self.max_reward = max_reward
        self.update_q_target_frequency = update_q_target_frequency
        self.summary_update_frequency = summary_update_frequency
        self.mem_capacity = mem_capacity
        self.mem = []
        self.batch_size = batch_size
        self.graph = tf.Graph()
        # Last global-step value seen by predict_action(); gates the periodic work in train().
        self.i = 0
        with self.graph.as_default():
            # add batch dimension
            state_shape = [None] + list(state_shape)

            with tf.name_scope('inputs'):
                self.state = tf.placeholder(tf.float32, shape=state_shape)
                self.action = tf.placeholder(tf.int32, shape=[None])
                self.reward = tf.placeholder(tf.float32, shape=[None])
                self.state_ = tf.placeholder(tf.float32, shape=state_shape)
                self.terminal = tf.placeholder(tf.float32, shape=[None])

            with tf.variable_scope('q_net'):
                self.q = q_net_builder_fn(self.state, n_actions)

            with tf.variable_scope('q_target_net'):
                q_t = q_net_builder_fn(self.state_, n_actions)
                self.q_t = tf.stop_gradient(q_t) # we do not train q_target_net

            with tf.name_scope('loss'):
                # TD target: r + discount * max_a' Q_target(s', a'), with the
                # bootstrap term dropped for terminal transitions.
                q_t_max = tf.reduce_max(self.q_t, reduction_indices=-1)
                q_target = (1. - self.terminal) * self.discount * q_t_max + self.reward

                # Select Q(s, a) for the action actually taken via a one-hot mask.
                mask = tf.one_hot(self.action, n_actions, dtype=tf.float32)
                q_acted = tf.reduce_sum(self.q * mask, reduction_indices=-1)
                # idx = tf.range(0, limit=n_actions*batch_size, delta=n_actions) + self.action
                # q_flat = tf.reshape(self.q, shape=[-1])
                # q_acted = tf.gather(q_flat, idx, validate_indices=True)

                assert q_target.get_shape().is_compatible_with(q_acted.get_shape())

                diff = q_target - q_acted
                tf.histogram_summary("diff", diff)
                # NOTE(review): `clipped` is built but the loss below uses the
                # unclipped `diff` (marked "debug" by the author). Left as-is so
                # the training objective does not change.
                clipped = tf.clip_by_value(diff, -max_reward, max_reward)
                # debug
                self.error = tf.reduce_mean(tf.square(diff))
                tf.scalar_summary("error", self.error)

            with tf.name_scope('update_q_target_net'):
                q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_net')
                q_target_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_target_net')
                assign_ops = []
                assert len(q_net_vars) == len(q_target_net_vars)
                # Pair variables by position and verify that the scope-relative names match.
                for v1, v2 in zip(q_net_vars, q_target_net_vars):
                    v1_name = v1.name.split('/',1)[1] # name without scope
                    v2_name = v2.name.split('/',1)[1]
                    assert v1_name == v2_name
                    assign_op = v2.assign(v1)
                    assign_ops.append(assign_op)
                self.update_q_target_op = tf.group(*assign_ops)

            with tf.name_scope('trainer'):
                self.global_step = tf.Variable(0, trainable=False, name='global_step')

                # The step is advanced explicitly (from predict_action), not by the optimizer.
                global_step_1 = self.global_step + 1
                self.global_step_inc = self.global_step.assign(global_step_1)

                exp_lr = tf.train.exponential_decay(init_lr,
                                                    self.global_step,
                                                    lr_decay_steps,
                                                    lr_decay_factor,
                                                    staircase=True)

                self.lr = tf.maximum(lr_min, exp_lr)
                tf.scalar_summary("learning_rate", self.lr)
                opt = tf.train.AdamOptimizer(self.lr)
                self.train_op = opt.minimize(self.error)

                # piggy back on lr to update epsilon (exploration probability)
                gs_float = tf.cast(self.global_step, tf.float32)
                self.epsilon = 1.0 - gs_float / epsilon_decay_duration # start with 1.0 and decay
                self.epsilon = tf.maximum(min_epsilon, self.epsilon)
                tf.scalar_summary("epsilon", self.epsilon)

            with tf.name_scope('summary'):
                tf.histogram_summary('q_val', self.q)
                tf.histogram_summary('q_target_val', self.q_t)
                # tf.image_summary('states', self.state, max_images=2)

                # Per-episode statistics are fed through placeholders and smoothed
                # with an exponential moving average (see summary_stats()).
                self.total_reward = tf.placeholder(tf.float32, shape=())
                self.game_steps = tf.placeholder(tf.float32, shape=())
                ema = tf.train.ExponentialMovingAverage(0.9)
                self.ema_op = ema.apply([self.total_reward, self.game_steps])
                tf.scalar_summary("total_reward_avg", ema.average(self.total_reward))
                tf.scalar_summary("game_steps_avg", ema.average(self.game_steps))

                with tf.name_scope('q_net_summary'):
                    q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_net')
                    for v in q_net_vars:
                        tf.histogram_summary(v.op.name + '/activations', v)
                        # tf.scalar_summary(v.op.name + '/sparsity', tf.nn.zero_fraction(v))

                self.summary_op = tf.merge_all_summaries()

            init = tf.global_variables_initializer()
            self.saver = tf.train.Saver(max_to_keep=20)

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(init)
        # Start with the target network equal to the online network.
        self.sess.run(self.update_q_target_op)

        self.logs_dir = logs_dir
        self.summary_writer = tf.train.SummaryWriter(self.logs_dir, graph=self.graph)
        self.checkpoint_path = path.join(self.logs_dir, 'model.ckpt')

        # Resume from the most recent checkpoint in logs_dir, if one exists.
        ckpt = tf.train.get_checkpoint_state(self.logs_dir)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
            print('restoring at global_step = %d' % self.sess.run(self.global_step))

    def predict_action(self, state):
        """Choose an action epsilon-greedily and advance the global step.

        Every call increments ``global_step`` (which drives both the epsilon and
        the learning-rate schedules) and stores the new value in ``self.i``.

        Args:
            state: a single state, without the batch dimension.

        Returns:
            A random action index with probability epsilon, otherwise the
            argmax of the online network's Q-values for ``state``.
        """
        ep_val, self.i = self.sess.run([self.epsilon, self.global_step_inc])
        if np.random.rand() < ep_val:
            return np.random.randint(0, self.n_actions)

        state = np.expand_dims(state, 0)
        q_val = self.sess.run(self.q, feed_dict={self.state: state})[0]
        return np.argmax(q_val)

    def observe(self, state, action, reward, state_, terminal):
        """Append one transition to the replay memory, dropping the oldest when over capacity."""
        self.mem.append((state, action, reward, state_, terminal))
        if len(self.mem) > self.mem_capacity:
            self.mem.pop(0)

    def train(self):
        """Run one optimisation step on a random minibatch from the replay memory.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        Depending on the current step counter ``self.i`` it also writes
        summaries, syncs the target network and saves a checkpoint.

        Raises:
            AssertionError: if the loss evaluates to NaN.
        """
        if self.batch_size > len(self.mem):
            return

        samples = random.sample(self.mem, self.batch_size)
        s, a, r, s_, t = zip(*samples)

        s = np.array(s, dtype=np.float32)
        s_ = np.array(s_, dtype=np.float32)

        feed_dict = {self.state: s,
                     self.action: a,
                     self.reward: r,
                     self.state_: s_,
                     self.terminal: t
                    }
        sess = self.sess

        _, err_val = sess.run([self.train_op, self.error], feed_dict)

        assert not np.isnan(err_val), 'Model diverged with loss = NaN'

        if self.i % self.summary_update_frequency == 0:
            summary_str = sess.run(self.summary_op, feed_dict)
            self.summary_writer.add_summary(summary_str, self.i)
            self.summary_writer.flush()

        if self.i % self.update_q_target_frequency == 0:
            sess.run(self.update_q_target_op)

        # Checkpoints are written five times less often than summaries.
        if self.i % (5 * self.summary_update_frequency) == 0:
            self.saver.save(sess,
                            self.checkpoint_path,
                            global_step=self.global_step)

    def summary_stats(self, total_reward, game_steps):
        """Feed one episode's total reward and length into the moving averages."""
        feed_dict = {self.total_reward: total_reward,
                     self.game_steps: game_steps}
        self.sess.run(self.ema_op, feed_dict)

    def close(self):
        """Close the summary writer and the TensorFlow session.

        Call this before discarding an agent (for example before re-running the
        cell that creates a new one) so that the event file and the session's
        resources are released.
        """
        self.summary_writer.close()
        self.sess.close()

In [31]:
# Start from scratch: Agent restores the latest checkpoint it finds in ./dqn_logs,
# so the directory is removed before a fresh agent is created.
!rm -rf ./dqn_logs

In [32]:
# %debug
# Create the agent with default hyperparameters; the number of actions and the
# state shape are taken from the environment's action/observation spaces.
agent = Agent(env.action_space.n, env.observation_space.shape)

In [33]:
# self, n_actions, state_shape, discount=0.99, max_reward=1.0,
#                  lr_decay_factor=0.9, init_lr=1e-3, lr_decay_steps=1e5, lr_min=1e-7,
#                  min_epsilon=0.05, epsilon_decay_duration=1e6,
#                  q_net_builder_fn=build_small_fc_q_net, update_q_target_frequency=1000,
#                  summary_update_frequency=1000,
#                  mem_capacity=1e6, batch_size=64

## Train agent

In [34]:
from tqdm import tqdm

In [35]:
# Play 300 episodes. After every environment step the transition is stored and
# one training step is run. An episode ends when the environment reports `done`
# or once the accumulated reward exceeds 200.
for game in tqdm(range(300)):
    s, steps, total_reward = env.reset(), 0, 0
    done = False
    while not (done or total_reward > 200):
        a = agent.predict_action(s)
        s_, r, done, info = env.step(a)
        agent.observe(s, a, r, s_, done)
        agent.train()

        # Advance to the next state and update the episode bookkeeping.
        s = s_
        total_reward += r
        steps += 1

    # Episode finished: feed its statistics into the agent's moving averages.
    agent.summary_stats(total_reward, steps)

100%|██████████| 300/300 [00:51<00:00,  3.58it/s]


In [36]:
# Total reward of the last episode played by the training loop above.
total_reward

201.0

In [11]:
# # import matplotlib so Gym's render() works nicely with Jupyter
# %matplotlib inline
# import matplotlib.pyplot as plt

In [12]:
# observation, reward, done, info = env.step(2)
# I = env.render(mode='rgb_array')
# plt.imshow(I)