# Deep Deterministic Policy Gradient (DDPG)

In [None]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np

## actor model

In [None]:
class Actor(tf.keras.Model):
    """Policy network producing a mean and a spread for every action dimension.

    The input is passed through four ReLU dense layers (64, 32, 16, 8 units).
    Two separate heads then read the final hidden features: ``mu`` uses a
    tanh activation and ``sigma`` uses a softplus activation.
    """

    def __init__(self, action_dims=1):
        super().__init__()

        # Hidden stack, narrowing towards the output heads.
        self.l0 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.l1 = tf.keras.layers.Dense(32, activation=tf.nn.relu)
        self.l2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)
        self.l3 = tf.keras.layers.Dense(8, activation=tf.nn.relu)

        # Output heads: tanh keeps mu in (-1, 1), softplus keeps sigma positive.
        self.mu = tf.keras.layers.Dense(action_dims, activation=tf.nn.tanh)
        self.sigma = tf.keras.layers.Dense(action_dims, activation=tf.nn.softplus)

    def call(self, inputs):
        """Return the ``(mu, sigma)`` pair computed from ``inputs``."""
        features = inputs
        for hidden in (self.l0, self.l1, self.l2, self.l3):
            features = hidden(features)

        return self.mu(features), self.sigma(features)

## critic model

In [None]:
class Critic(tf.keras.Model):
    """Q-network mapping a sequence of input tensors to a single value.

    The tensors in ``inputs`` are joined along the last axis and passed
    through four ReLU dense layers (64, 32, 16, 8 units) followed by a linear
    one-unit output layer.
    """

    def __init__(self):
        super().__init__()

        self.l0 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.l1 = tf.keras.layers.Dense(32, activation=tf.nn.relu)
        self.l2 = tf.keras.layers.Dense(16, activation=tf.nn.relu)
        self.l3 = tf.keras.layers.Dense(8, activation=tf.nn.relu)
        # Linear head: the value estimate is unbounded.
        self.l4 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Return the value estimate for the concatenated ``inputs``."""
        features = tf.keras.layers.concatenate(inputs, axis=-1)
        for hidden in (self.l0, self.l1, self.l2, self.l3):
            features = hidden(features)
        return self.l4(features)

## replay buffer

In [None]:
class Memory():
    """Fixed-size ring buffer of transitions stored as flat float32 rows.

    Each row is the concatenation ``obs, a, [r, done], obs_``. Once every
    slot has been written, new transitions overwrite the oldest ones.

    Attributes:
        mem: ``np.float32`` array of shape ``mem_size`` holding the rows.
        pointer: index one past the most recently written row.
        mem_size: ``(capacity, row_width)`` tuple passed to the constructor.
        full: ``True`` once every slot has been written at least once.
    """

    def __init__(self, mem_size=(1024, 9)):
        self.mem = np.zeros(shape=mem_size, dtype=np.float32)

        self.pointer = 0
        self.mem_size = mem_size
        self.full = False

    def add(self, obs, a, r, done, obs_):
        """Store one transition and report whether the buffer is now full.

        Args:
            obs: 1-D observation before the step.
            a: 1-D action that was taken.
            r: scalar reward.
            done: scalar/bool termination flag.
            obs_: 1-D observation after the step.

        Returns:
            ``True`` as soon as every slot holds a real transition.
        """
        exp = np.concatenate((obs, a, [r, done], obs_), axis=-1).astype(np.float32)
        capacity = self.mem_size[0]

        # Wrap around before writing so the oldest row is overwritten.
        if self.pointer >= capacity:
            self.pointer = 0

        self.mem[self.pointer] = exp
        self.pointer += 1

        # The buffer is full the moment its last slot has been written,
        # not one write later.
        if self.pointer >= capacity:
            self.full = True

        return self.full

    def get_batch(self, batch_size):
        """Return ``batch_size`` rows sampled uniformly with replacement.

        Only rows that have actually been written are eligible, so the
        zero-initialised placeholder rows are never returned.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        filled = self.mem_size[0] if self.full else self.pointer
        if filled == 0:
            raise ValueError("cannot sample a batch from an empty replay buffer")

        idx = np.random.randint(filled, size=batch_size)
        return self.mem[idx]

## DDPG

In [None]:
class DDPG():
    """Actor-critic agent with target networks and a replay buffer.

    The actor outputs ``(mu, sigma)``; actions are drawn from
    ``Normal(mu * action_bound, sigma)`` and clipped to
    ``[-action_bound, action_bound]``. The critic is trained on the one-step
    TD target computed with the target networks, and the actor is trained to
    maximise the critic's value of its own (reparameterised) actions.

    Args:
        mem_size: ``(capacity, row_width)`` of the replay buffer. The row
            width must equal ``2 * obs_dims + action_dims + 2``
            (obs, action, reward, done, next obs).
        action_dims: number of action dimensions.
        obs_dims: number of observation dimensions.
        action_bound: symmetric bound applied to sampled actions.

    Raises:
        ValueError: if ``mem_size[1]`` does not match the transition layout.
    """

    def __init__(self, mem_size=(1024,9), action_dims=1, obs_dims=3, action_bound=2.):
        expected_width = 2 * obs_dims + action_dims + 2
        if mem_size[1] != expected_width:
            raise ValueError(
                "mem_size[1] must be 2 * obs_dims + action_dims + 2 = %d, got %d"
                % (expected_width, mem_size[1])
            )

        self.mem = Memory(mem_size)

        # Honour the requested action dimensionality instead of hard-coding 1.
        self.actor = Actor(action_dims=action_dims)
        self.actor_target = Actor(action_dims=action_dims)

        self.critic = Critic()
        self.critic_target = Critic()

        # Run every network once on dummy data so its variables exist, then
        # start each target network as an exact copy of its online network.
        dummy_obs = tf.zeros((1, obs_dims), dtype=tf.float32)
        dummy_a = tf.zeros((1, action_dims), dtype=tf.float32)

        self.actor(dummy_obs)
        self.actor_target(dummy_obs)
        self.actor_target.set_weights(self.actor.get_weights())

        self.critic([dummy_obs, dummy_a])
        self.critic_target([dummy_obs, dummy_a])
        self.critic_target.set_weights(self.critic.get_weights())

        self.gamma = 0.9
        self.actor_lr = 1e-3
        self.critic_lr = 1e-3
        self.critic_target_update_time = 10
        self.actor_target_update_time = 20
        # Fraction of the target weights kept at each soft update.
        self.tau = 0.8

        # Optimizers are created once so Adam's moment estimates persist
        # across learning steps. Changing actor_lr / critic_lr afterwards
        # does not affect these instances.
        self.critic_optimizer = tf.optimizers.Adam(self.critic_lr)
        self.actor_optimizer = tf.optimizers.Adam(self.actor_lr)

        self.learning_times = 0
        self.obs_dims = obs_dims
        self.action_dims = action_dims
        self.action_bound = action_bound
        self.status = {"buffer_full": False,
                       "learning_times": 0,
                       "update_actor": 0,
                       "update_critic": 0,
                       "reward_mean": 0,
                       "loss_critic": None,
                       "loss_actor": None,
                      }

    def get_action(self, obs, train=False):
        """Sample a clipped action for a single observation.

        Args:
            obs: one observation (without batch axis).
            train: when true, 1.0 is added to the sampling scale for
                extra exploration.

        Returns:
            Tensor of shape ``(action_dims,)`` within the action bound.
        """
        obs = tf.expand_dims(obs, axis=0)
        obs = tf.cast(obs, dtype=tf.float32)
        mu, sigma = self.actor(obs)
        disb = tfp.distributions.Normal(loc=mu * self.action_bound,
                                        scale=sigma + 0.1 + 1. * float(train))
        # sample(1) adds a leading sample axis; drop it and the batch axis.
        a = disb.sample(1)[0,0]
        a = tf.clip_by_value(a, -self.action_bound, self.action_bound)
        return a

    def add_exp(self, obs, a, r, done, obs_):
        """Store one transition; return whether the replay buffer is full."""
        full = self.mem.add(obs, a, r, done, obs_)
        self.status["buffer_full"] = full
        return full

    def learning(self, batch_size):
        """Run one critic update and one actor update on a sampled batch.

        Also triggers the periodic soft updates of the target networks and
        refreshes the ``status`` dictionary.
        """
        mini_batch = self.mem.get_batch(batch_size)

        # Row layout: obs | action | reward | done | next obs.
        a_end = self.obs_dims + self.action_dims
        obs = mini_batch[:, :self.obs_dims]
        a = mini_batch[:, self.obs_dims:a_end]
        r = mini_batch[:, a_end:a_end + 1]
        done = mini_batch[:, a_end + 1:a_end + 2]
        obs_ = mini_batch[:, -self.obs_dims:]

        # Bootstrap value from the target networks (outside any tape, so no
        # gradient flows into them).
        mu_, sigma_ = self.actor_target(obs_)
        disb = tfp.distributions.Normal(loc=mu_ * self.action_bound, scale=sigma_)
        a_ = disb.sample(1)[0]
        # Clip like every other action so the target critic is evaluated
        # inside the range of actions stored in the buffer.
        a_ = tf.clip_by_value(a_, -self.action_bound, self.action_bound)
        q_ = self.critic_target([obs_, a_])

        with tf.GradientTape() as tape_c:
            q = self.critic([obs, a])
            q_target = r + self.gamma * q_ * (1. - done)

            loss_critic = tf.reduce_mean(tf.losses.mse(q_target, q))

        grads_critic = tape_c.gradient(target=loss_critic, sources=self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(grads_critic, self.critic.trainable_variables))

        with tf.GradientTape() as tape_a:
            mu, sigma = self.actor(obs)
            distrib = tfp.distributions.Normal(loc=mu * self.action_bound, scale=sigma)
            a_pred = distrib.sample(1)[0]
            a_pred = tf.clip_by_value(a_pred, -self.action_bound, self.action_bound)
            q = self.critic([obs, a_pred])

            # Maximise Q by minimising its negation.
            loss_actor = -tf.reduce_mean(q)

        grads_actor = tape_a.gradient(target=loss_actor, sources=self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(grads_actor, self.actor.trainable_variables))

        self.__update_critic_target()
        self.__update_actor_target()

        self.learning_times += 1
        self.status["learning_times"] = self.learning_times
        self.status["loss_critic"] = float(loss_critic)
        self.status["loss_actor"] = float(loss_actor)

    @staticmethod
    def _soft_update(target, online, tau):
        """Blend ``online`` weights into ``target``, keeping ``tau`` of the old ones."""
        target.set_weights([
            tau * w_target + (1. - tau) * w
            for w_target, w in zip(target.get_weights(), online.get_weights())
        ])

    def __update_critic_target(self, ):
        """Soft-update the critic target every ``critic_target_update_time`` steps."""
        if self.learning_times % self.critic_target_update_time == 0:
            self._soft_update(self.critic_target, self.critic, self.tau)
            self.status["update_critic"] += 1

    def __update_actor_target(self, ):
        """Soft-update the actor target every ``actor_target_update_time`` steps."""
        if self.learning_times % self.actor_target_update_time == 0:
            self._soft_update(self.actor_target, self.actor, self.tau)
            self.status["update_actor"] += 1

## test in env

In [None]:
import gym

# Create the environment and the agent. Each replay-buffer row stores
# obs (3) + action (1) + reward (1) + done (1) + next obs (3) = 9 values.
env = gym.make("Pendulum-v0")
agent = DDPG(mem_size=(1024, 9), action_dims=1, obs_dims=3)

In [None]:
# Run settings.
episode_len = 200        # environment steps per episode
learn_per_step = 8       # learn once every this many steps
test_per_episode = 5     # evaluate once every this many episodes
test_episodes = 5        # episodes rolled out per evaluation

episode = 0

# Runs until interrupted manually.
while True:
    # ---- one training episode ----
    obs = env.reset()
    step = 0
    while step < episode_len:
        a = agent.get_action(obs, train=True)
        obs_, r, _, info = env.step(a)

        # The environment's done flag is ignored; every transition is
        # stored as non-terminal.
        full = agent.add_exp(obs, a, r, False, obs_)

        # Learning starts only once the replay buffer reports full.
        if full and not step % learn_per_step:
            agent.learning(batch_size=256)
            print(agent.status)

        obs = obs_
        step += 1

    # ---- periodic evaluation without the extra exploration noise ----
    if not episode % test_per_episode:
        print("---- start testing...")
        total_reward = 0
        for _ in range(test_episodes):
            s = env.reset()
            for _ in range(episode_len):
                act = agent.get_action(s)
                s_, rd, _, info = env.step(act)
                env.render()
                total_reward += rd
                s = s_
        agent.status["reward_mean"] = float(total_reward / test_episodes)

    episode += 1

In [None]:
# The training loop above never exits on its own, so this cell is run
# manually after interrupting it.
env.close()