## Deep Deterministic Policy Gradient
Implementation followed: Continuous control with deep reinforcement learning (arXiv:1509.02971v5)
- Memory Relay
- A3C
- Trained with a target net
- Initial exploration policy is quite important to warm up the net

In [1]:
import datetime
print(datetime.datetime.now().isoformat())

2018-04-22T17:07:25.995578


In [2]:
import numpy as np
import tensorflow as tf
from functools import partial

In [3]:
class Actor(object):
    def __init__(self, n_observation, n_action, name='actor_net'):
        self.n_observation = n_observation
        self.n_action = n_action
        self.name = name
        self.sess = None
        self.build_model()
        self.build_train()
        
    def build_model(self):
        activation = tf.nn.elu
        kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
        kernel_regularizer = tf.contrib.layers.l2_regularizer(0.01)
        default_dense = partial(tf.layers.dense,\
                                activation=activation,\
                                kernel_initializer=kernel_initializer,\
                                kernel_regularizer=kernel_regularizer)
        with tf.variable_scope(self.name) as scope:
            observation = tf.placeholder(tf.float32,shape=[None,self.n_observation])
            hid1 = default_dense(observation,32)
            hid2 = default_dense(hid1,64)
            action = default_dense(hid2,self.n_action,activation=tf.nn.tanh,use_bias=False)
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)
        self.observation,self.action,self.trainable_vars = observation,action,trainable_vars
        
    def build_train(self,learning_rate = 0.0001):
        with tf.variable_scope(self.name) as scope:
            action_grads = tf.placeholder(tf.float32,[None,self.n_action])
            var_grads = tf.gradients(self.action,self.trainable_vars,-action_grads)
            train_op = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(var_grads,self.trainable_vars))
        self.action_grads,self.train_op = action_grads,train_op
        
    def predict_action(self,obs_batch):
        return self.action.eval(session=self.sess,feed_dict={self.observation:obs_batch})

    def train(self,obs_batch,action_grads):
        batch_size = len(action_grads)
        self.train_op.run(session=self.sess,feed_dict={self.observation:obs_batch,self.action_grads:action_grads/batch_size})
        
    def set_session(self,sess):
        self.sess = sess
        
    def get_trainable_dict(self):
        return {var.name[len(self.name):]: var for var in self.trainable_vars}

In [4]:
class Critic(object):
    def __init__(self, n_observation, n_action, name='critic_net'):
        self.n_observation = n_observation
        self.n_action = n_action
        self.name = name
        self.sess = None
        self.build_model()
        self.build_train()
        
    def build_model(self):
        activation = tf.nn.elu
        kernel_initializer = tf.contrib.layers.variance_scaling_initializer()
        kernel_regularizer = tf.contrib.layers.l2_regularizer(0.01)
        default_dense = partial(tf.layers.dense,\
                                activation=activation,\
                                kernel_initializer=kernel_initializer,\
                                kernel_regularizer=kernel_regularizer)
        with tf.variable_scope(self.name) as scope:
            observation = tf.placeholder(tf.float32,shape=[None,self.n_observation])
            action = tf.placeholder(tf.float32,shape=[None,self.n_action])
            hid1 = default_dense(observation,32)
            hid2 = default_dense(action,32)
            hid3 = tf.concat([hid1,hid2],axis=1)
            hid4 = default_dense(hid3,128)
            Q = default_dense(hid4,1, activation=None)
            trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=self.name)
        self.observation,self.action,self.Q,self.trainable_vars= observation,action,Q,trainable_vars
    
    def build_train(self,learning_rate=0.001):
        with tf.variable_scope(self.name) as scope:
            Qexpected = tf.placeholder(tf.float32,shape=[None,1])
            loss = tf.losses.mean_squared_error(Qexpected,self.Q)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            train_op = optimizer.minimize(loss)
        self.Qexpected,self.train_op = Qexpected,train_op
        self.action_grads = tf.gradients(self.Q,self.action)[0]
    
    def predict_Q(self,obs_batch,action_batch):
        return self.Q.eval(session=self.sess,\
                           feed_dict={self.observation:obs_batch,self.action:action_batch})
    
    def compute_action_grads(self,obs_batch,action_batch):
        return self.action_grads.eval(session=self.sess,\
                               feed_dict={self.observation:obs_batch,self.action:action_batch})
    def train(self,obs_batch,action_batch,Qexpected_batch):
        self.train_op.run(session=self.sess,\
                          feed_dict={self.observation:obs_batch,self.action:action_batch,self.Qexpected:Qexpected_batch})
    
    def set_session(self,sess):
        self.sess = sess
        
    def get_trainable_dict(self):
        return {var.name[len(self.name):]: var for var in self.trainable_vars}

In [5]:
class AsyncNets(object):
    def __init__(self,class_name):
        class_ = eval(class_name)
        self.net = class_(3,1,name=class_name)
        self.target_net = class_(3,1,name='{}_target'.format(class_name))
        self.TAU = tf.placeholder(tf.float32,shape=None)
        self.sess = None
        self.__build_async_assign()
    
    def __build_async_assign(self):
        net_dict = self.net.get_trainable_dict()
        target_net_dict = self.target_net.get_trainable_dict()
        keys = net_dict.keys()
        async_update_op = [target_net_dict[key].assign((1-self.TAU)*target_net_dict[key]+self.TAU*net_dict[key]) \
                           for key in keys]
        self.async_update_op = async_update_op
    
    def async_update(self,tau=0.01):
        self.sess.run(self.async_update_op,feed_dict={self.TAU:tau})
    
    def set_session(self,sess):
        self.sess = sess
        self.net.set_session(sess)
        self.target_net.set_session(sess)
    
    def get_subnets(self):
        return self.net, self.target_net
    
        

In [6]:
from collections import deque
class Memory(object):
    def __init__(self,memory_size=10000):
        self.memory = deque(maxlen=memory_size)
        self.memory_size = memory_size
        
    def __len__(self):
        return len(self.memory)
    
    def append(self,item):
        self.memory.append(item)
        
    def sample_batch(self,batch_size=256):
        idx = np.random.permutation(len(self.memory))[:batch_size]
        return [self.memory[i] for i in idx]

In [7]:
def UONoise():
    theta = 0.15
    sigma = 0.2
    state = 0
    while True:
        yield state
        state += -theta*state+sigma*np.random.randn()

In [8]:
import gym
from gym import wrappers
max_episode = 500
gamma = 0.99
tau = 0.001
memory_size = 10000
batch_size = 256
memory_warmup = batch_size*3
max_explore_eps = 100
save_path = 'DDPG_net_Class.ckpt'

tf.reset_default_graph()
actorAsync = AsyncNets('Actor')
actor,actor_target = actorAsync.get_subnets()
criticAsync = AsyncNets('Critic')
critic,critic_target = criticAsync.get_subnets()

init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    init.run()
    actorAsync.set_session(sess)
    criticAsync.set_session(sess)
    env = gym.make('Pendulum-v0')
    env = wrappers.Monitor(env,'./tmp/',force=True)
    obs = env.reset()
    iteration = 0
    episode = 0
    episode_score = 0
    episode_steps = 0
    noise = UONoise()
    memory = Memory(memory_size)
    while episode < max_episode:
        print('\riter {}, ep {}'.format(iteration,episode),end='')
        action = actor.predict_action(np.reshape(obs,[1,-1]))[0]
        if episode<max_explore_eps: # exploration policy
            p = episode/max_explore_eps
            action = action*p + (1-p)*next(noise)
        action *= 2 # scale action
        next_obs, reward, done,info = env.step(action)
        memory.append([obs,action,reward,next_obs,done])
        if iteration >= memory_warmup:
            memory_batch = memory.sample_batch(batch_size)
            extract_mem = lambda k : np.array([item[k] for item in memory_batch])
            obs_batch = extract_mem(0)
            action_batch = extract_mem(1)
            reward_batch = extract_mem(2)
            next_obs_batch = extract_mem(3)
            done_batch = extract_mem(4)
            action_next = actor_target.predict_action(next_obs_batch)
            Q_next = critic_target.predict_Q(next_obs_batch,action_next)[:,0]
            Qexpected_batch = reward_batch + gamma*(1-done_batch)*Q_next # target Q value
            Qexpected_batch = np.reshape(Qexpected_batch,[-1,1])
            # train critic
            critic.train(obs_batch,action_batch,Qexpected_batch)
            # train actor
            action_grads = critic.compute_action_grads(obs_batch,action_batch)
            actor.train(obs_batch,action_grads)
            # async update
            actorAsync.async_update(tau)
            criticAsync.async_update(tau)
        episode_score += reward
        episode_steps += 1
        iteration += 1
        if done:
            print(', score {:8f}, steps {}'.format(episode_score,episode_steps))
#             if episode%5 == 0:
                
#                 Q_check = 
            obs = env.reset()
            episode += 1
            episode_score = 0
            episode_steps = 0
            noise = UONoise()
            if episode%100==0:
                saver.save(sess,save_path)
        else:
            obs = next_obs
env.close()

[2018-04-22 17:07:35,412] Making new env: Pendulum-v0
[2018-04-22 17:07:35,434] Starting new video recorder writing to /Users/dattlee/Developer/Final_Year_Project/tmp/openaigym.video.0.10474.video000000.mp4


iter 190, ep 0

[2018-04-22 17:07:39,761] Starting new video recorder writing to /Users/dattlee/Developer/Final_Year_Project/tmp/openaigym.video.0.10474.video000001.mp4


iter 199, ep 0, score -1214.675689, steps 200
iter 399, ep 1, score -997.673005, steps 200
iter 599, ep 2, score -1539.627618, steps 200
iter 799, ep 3, score -1524.000772, steps 200
iter 999, ep 4, score -1042.662316, steps 200
iter 1199, ep 5, score -954.466750, steps 200
iter 1399, ep 6, score -1651.210360, steps 200
iter 1599, ep 7

[2018-04-22 17:07:47,335] Starting new video recorder writing to /Users/dattlee/Developer/Final_Year_Project/tmp/openaigym.video.0.10474.video000008.mp4


, score -1127.422672, steps 200
iter 1799, ep 8, score -1334.673865, steps 200
iter 1999, ep 9, score -1100.311806, steps 200
iter 2199, ep 10, score -1608.512705, steps 200
iter 2399, ep 11, score -1169.280587, steps 200
iter 2599, ep 12, score -1441.181427, steps 200
iter 2799, ep 13, score -1553.228150, steps 200
iter 2999, ep 14, score -1214.210095, steps 200
iter 3199, ep 15, score -1488.731149, steps 200
iter 3399, ep 16, score -1371.057099, steps 200
iter 3599, ep 17, score -1731.397070, steps 200
iter 3799, ep 18, score -1692.031704, steps 200
iter 3999, ep 19, score -871.118443, steps 200
iter 4199, ep 20, score -1042.120064, steps 200
iter 4399, ep 21, score -1286.671239, steps 200
iter 4599, ep 22, score -1662.843445, steps 200
iter 4799, ep 23, score -1569.380840, steps 200
iter 4999, ep 24, score -1111.897616, steps 200
iter 5199, ep 25, score -1734.783088, steps 200
iter 5373, ep 26

[2018-04-22 17:08:07,946] Starting new video recorder writing to /Users/dattlee/Developer/Final_Year_Project/tmp/openaigym.video.0.10474.video000027.mp4


iter 5399, ep 26, score -1506.677845, steps 200
iter 5599, ep 27, score -1652.145745, steps 200
iter 5799, ep 28, score -1464.733009, steps 200
iter 5999, ep 29, score -1501.309866, steps 200
iter 6199, ep 30, score -1543.808192, steps 200
iter 6399, ep 31, score -1288.773206, steps 200
iter 6599, ep 32, score -1317.635515, steps 200
iter 6799, ep 33, score -1398.969117, steps 200
iter 6999, ep 34, score -1255.573886, steps 200
iter 7199, ep 35, score -1317.657525, steps 200
iter 7399, ep 36, score -1314.577563, steps 200
iter 7599, ep 37, score -1265.277316, steps 200
iter 7799, ep 38, score -1285.866488, steps 200
iter 7999, ep 39, score -1212.146464, steps 200
iter 8199, ep 40, score -1171.436901, steps 200
iter 8399, ep 41, score -1283.800316, steps 200
iter 8599, ep 42, score -1020.476557, steps 200
iter 8799, ep 43, score -994.013213, steps 200
iter 8999, ep 44, score -1098.952588, steps 200
iter 9199, ep 45, score -629.126610, steps 200
iter 9399, ep 46, score -1234.618854, step

[2018-04-22 17:08:50,102] Starting new video recorder writing to /Users/dattlee/Developer/Final_Year_Project/tmp/openaigym.video.0.10474.video000064.mp4


iter 12799, ep 63, score -513.585037, steps 200
iter 12999, ep 64, score -818.839757, steps 200
iter 13199, ep 65, score -410.040974, steps 200
iter 13399, ep 66, score -380.771165, steps 200
iter 13599, ep 67, score -257.671634, steps 200
iter 13799, ep 68, score -129.489024, steps 200
iter 13999, ep 69, score -869.282968, steps 200
iter 14199, ep 70, score -128.807583, steps 200
iter 14399, ep 71, score -1330.224335, steps 200
iter 14599, ep 72, score -381.995667, steps 200
iter 14799, ep 73, score -0.390461, steps 200
iter 14999, ep 74, score -285.611740, steps 200
iter 15199, ep 75, score -130.103164, steps 200
iter 15399, ep 76, score -128.114088, steps 200
iter 15599, ep 77, score -504.963379, steps 200
iter 15799, ep 78, score -0.135106, steps 200
iter 15999, ep 79, score -269.577870, steps 200
iter 16199, ep 80, score -380.923231, steps 200
iter 16399, ep 81, score -127.849147, steps 200
iter 16599, ep 82, score -124.345633, steps 200
iter 16799, ep 83, score -130.584580, steps

ValueError: Parent directory of DDPG_net_Class.ckpt doesn't exist, can't save.

In [9]:
gym.upload('./tmp/', api_key='sk_BlwjttPKR6ZsXVrObENYA')

[2017-08-26 21:34:01,863] [Pendulum-v0] Uploading 500 episodes of training data
[2017-08-26 21:34:03,713] [Pendulum-v0] Uploading videos of 8 training episodes (628501 bytes)
[2017-08-26 21:34:05,040] [Pendulum-v0] Creating evaluation object from ./tmp/ with learning curve and training video
[2017-08-26 21:34:05,260] 
****************************************************
You successfully uploaded your evaluation on Pendulum-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_ZVyGQYhVTb67h0Vu6UtOYQ

****************************************************
