# Synchronous SAC with PyBullet Ant Env  <font color='grey'> (*Self-Contained*) </font>

In [1]:
import os,ray
import numpy as np
import tensorflow as tf
from util import gpu_sess,suppress_tf_warning
np.set_printoptions(precision=2)
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.14.0].


In [2]:
suppress_tf_warning() # suppress warning 

### Ray Workers

In [3]:
# Number of CPU cores handed to the local Ray runtime
n_cpus = 5
# `ignore_reinit_error=True` keeps this cell re-runnable: without it, executing the
# cell a second time in a live kernel raises because Ray is already initialized.
ray.init(num_cpus=n_cpus,ignore_reinit_error=True)
print ("RAY initialized with [%d] cpus."%(n_cpus))

2020-06-18 21:15:01,619	INFO resource_spec.py:212 -- Starting Ray with 16.16 GiB memory available for workers and up to 8.1 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-18 21:15:01,976	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


RAY initialized with [5] cpus.


### SAC Model

In [4]:
class ReplayBuffer:
    """
    Fixed-capacity FIFO (ring) experience buffer for SAC agents.

    Transitions are kept in pre-allocated float32 arrays; once `max_size`
    transitions have been stored, new ones overwrite the oldest slots.
    """
    def __init__(self, odim, adim, size):
        def _zeros(*shape):
            # All storage shares the same dtype
            return np.zeros(shape, dtype=np.float32)
        self.obs1_buf = _zeros(size, odim)  # observations
        self.obs2_buf = _zeros(size, odim)  # next observations
        self.acts_buf = _zeros(size, adim)  # actions
        self.rews_buf = _zeros(size)        # rewards
        self.done_buf = _zeros(size)        # done flags
        self.ptr = 0          # slot the next transition is written to
        self.size = 0         # number of valid transitions currently stored
        self.max_size = size  # capacity
    def store(self, obs, act, rew, next_obs, done):
        """Write one transition at the current slot and advance the ring pointer."""
        slot = self.ptr
        writes = ((self.obs1_buf, obs),
                  (self.obs2_buf, next_obs),
                  (self.acts_buf, act),
                  (self.rews_buf, rew),
                  (self.done_buf, done))
        for buf, value in writes:
            buf[slot] = value
        # Wrap around when the end of the storage is reached
        self.ptr = (slot + 1) % self.max_size
        if self.size < self.max_size:
            self.size += 1
    def sample_batch(self, batch_size=32):
        """Return a dict of `batch_size` transitions drawn uniformly with replacement."""
        picks = np.random.randint(0, self.size, size=batch_size)
        fields = (('obs1', self.obs1_buf),
                  ('obs2', self.obs2_buf),
                  ('acts', self.acts_buf),
                  ('rews', self.rews_buf),
                  ('done', self.done_buf))
        return {key: buf[picks] for key, buf in fields}
    
def create_sac_model(odim=10,adim=2,hdims=[256,256]):
    """
    Soft Actor Critic Model (compatible with Ray)

    Builds the SAC networks (a squashed Gaussian policy and two Q functions, plus
    a separate 'target' copy) in the current default graph and opens a session.

    Args:
        odim: observation dimension (width of the `o_ph` / `o2_ph` placeholders)
        adim: action dimension (width of `a_ph` and of the policy outputs)
        hdims: hidden layer sizes used by the policy and by both Q networks
    Returns:
        model: dict holding the placeholders, the output tensors and the
               variable lists ('main_vars','q_vars','pi_vars','target_vars')
        sess: the `tf.Session` created here (GPU memory `allow_growth` enabled)
    """
    import tensorflow as tf # make it compatible with Ray actors
    
    def mlp(x,hdims=[256,256],actv=tf.nn.relu,out_actv=tf.nn.relu):
        """Dense stack: `hdims[:-1]` layers with `actv`, then `hdims[-1]` units with `out_actv`."""
        # Every kernel in the stack is drawn from a truncated normal with stddev 0.1
        ki = tf.truncated_normal_initializer(stddev=0.1)
        for hdim in hdims[:-1]:
            x = tf.layers.dense(x,units=hdim,activation=actv,kernel_initializer=ki)
        return tf.layers.dense(x,units=hdims[-1],activation=out_actv,kernel_initializer=ki)
    def gaussian_loglik(x,mu,log_std):
        """Log-likelihood of `x` under a diagonal Gaussian, summed over axis 1."""
        EPS = 1e-8 # keeps the division finite when exp(log_std) is tiny
        pre_sum = -0.5*(
            ( (x-mu)/(tf.exp(log_std)+EPS) )**2 +
            2*log_std + np.log(2*np.pi)
        )
        return tf.reduce_sum(pre_sum, axis=1)
    def mlp_gaussian_policy(o,adim=2,hdims=[256,256],actv=tf.nn.relu):
        """Unsquashed Gaussian policy: returns (mean, sampled action, log-prob of the sample)."""
        net = mlp(x=o,hdims=hdims,actv=actv,out_actv=actv) # shared feature trunk
        mu = tf.layers.dense(net,adim,activation=None) # mean head
        log_std = tf.layers.dense(net,adim,activation=None) # log-std head
        # Clip log_std so that std stays within [exp(-10), exp(2)]
        LOG_STD_MIN,LOG_STD_MAX = -10.0,+2.0
        log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) 
        std = tf.exp(log_std) # std 
        # Sample as mu + noise*std so the sample stays differentiable w.r.t. mu and std
        pi = mu + tf.random_normal(tf.shape(mu)) * std  # sampled
        logp_pi = gaussian_loglik(x=pi,mu=mu,log_std=log_std) # log lik
        return mu,pi,logp_pi
    def squash_action(mu,pi,logp_pi):
        """Apply tanh to `mu` and `pi` and correct `logp_pi` for the change of variables."""
        # Squash those unbounded actions
        # 2*(log 2 - pi - softplus(-2*pi)) equals log(1 - tanh(pi)^2), written in
        # a form that avoids evaluating tanh inside a log
        logp_pi -= tf.reduce_sum(2*(np.log(2) - pi -
                                    tf.nn.softplus(-2*pi)), axis=1)
        mu,pi = tf.tanh(mu),tf.tanh(pi)
        return mu, pi, logp_pi
    def mlp_actor_critic(o,a,hdims=[256,256],actv=tf.nn.relu,out_actv=None,
                         policy=mlp_gaussian_policy):
        """
        Build the policy (scope 'pi') and two Q functions (scopes 'q1','q2').
        The action width is read from the last dimension of `a`.
        Note: the `out_actv` argument is accepted but not used below; the Q
        heads always pass `out_actv=None` to `mlp`.
        """
        adim = a.shape.as_list()[-1]
        with tf.variable_scope('pi'): # policy
            mu,pi,logp_pi = policy(o=o,adim=adim,hdims=hdims,actv=actv)
            mu,pi,logp_pi = squash_action(mu=mu,pi=pi,logp_pi=logp_pi)
        # Q head: MLP over [o,a] with one linear output unit, squeezed on axis 1
        def vf_mlp(x): return tf.squeeze(
            mlp(x=x,hdims=hdims+[1],actv=actv,out_actv=None),axis=1)
        with tf.variable_scope('q1'): q1 = vf_mlp( tf.concat([o,a],axis=-1))
        with tf.variable_scope('q2'): q2 = vf_mlp( tf.concat([o,a],axis=-1))
        return mu,pi,logp_pi,q1,q2
    
    def placeholder(dim=None):
        """float32 placeholder of shape (None,dim), or (None,) when `dim` is falsy."""
        return tf.placeholder(dtype=tf.float32,shape=(None,dim) if dim else (None,))
    def placeholders(*args):
        """
        Usage: a_ph,b_ph,c_ph = placeholders(adim,bdim,None)
        """
        return [placeholder(dim) for dim in args]
    def get_vars(scope):
        """Global variables whose name contains `scope` (substring match, not prefix match)."""
        return [x for x in tf.compat.v1.global_variables() if scope in x.name]
    
    # Have own session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    
    # Placeholders
    # o: observation, a: action, o2: next observation, r: reward, d: done flag
    o_ph,a_ph,o2_ph,r_ph,d_ph = placeholders(odim,adim,odim,None,None)
    # Actor critic 
    ac_kwargs = {'hdims':hdims,'actv':tf.nn.relu,'out_actv':None,'policy':mlp_gaussian_policy}
    # First call creates the 'main' variables; q1,q2 are evaluated at the fed action a_ph
    with tf.variable_scope('main'):
        mu,pi,logp_pi,q1,q2 = mlp_actor_critic(o=o_ph,a=a_ph,**ac_kwargs)
    # Same 'main' variables (reuse=True) wired to different inputs
    with tf.variable_scope('main',reuse=True):
        # Q values at the action sampled from the current policy
        _,_,_,q1_pi,q2_pi = mlp_actor_critic(o=o_ph,a=pi,**ac_kwargs)
        # Policy applied to the next observation; only pi_next/logp_pi_next are kept,
        # so a_ph merely supplies the action width here
        _,pi_next,logp_pi_next,_,_ = mlp_actor_critic(o=o2_ph,a=a_ph,**ac_kwargs)
    # Target value
    # Separate variables under 'target'. mlp_actor_critic also builds a 'pi'
    # sub-network here whose outputs are discarded.
    with tf.variable_scope('target'):
        _,_,_,q1_targ,q2_targ = mlp_actor_critic(o=o2_ph,a=pi_next,**ac_kwargs)
        
    # Get variables
    # Collected at this point, so the lists hold only the variables created above
    main_vars,q_vars,pi_vars,target_vars = \
        get_vars('main'),get_vars('main/q'),get_vars('main/pi'),get_vars('target')
    
    model = {'o_ph':o_ph,'a_ph':a_ph,'o2_ph':o2_ph,'r_ph':r_ph,'d_ph':d_ph,
             'mu':mu,'pi':pi,'logp_pi':logp_pi,'q1':q1,'q2':q2,
             'q1_pi':q1_pi,'q2_pi':q2_pi,
             'pi_next':pi_next,'logp_pi_next':logp_pi_next,
             'q1_targ':q1_targ,'q2_targ':q2_targ,
             'main_vars':main_vars,'q_vars':q_vars,'pi_vars':pi_vars,'target_vars':target_vars}
        
    return model,sess

def create_sac_graph(model,lr=1e-3,gamma=0.98,alpha=0.1,polyak=0.995):
    """
    SAC Computational Graph

    Adds the SAC losses, the two Adam train ops and the target-network update
    ops on top of the tensors stored in `model`.

    Args:
        model: dict of tensors and variable lists (reads 'q1','q2','q1_pi','q2_pi',
               'q1_targ','q2_targ','logp_pi','logp_pi_next','r_ph','d_ph',
               'pi_vars','q_vars','main_vars','target_vars')
        lr: learning rate of both Adam optimizers
        gamma: discount factor
        alpha: weight of the entropy (log-prob) term
        polyak: fraction of the old target value kept at each target update
    Returns:
        step_ops: list of tensors/ops to run for one training step
        target_init: op copying every main variable into its target counterpart
    """
    def main_target_pairs():
        # (main variable, target variable) pairs, matched by list position
        return zip(model['main_vars'], model['target_vars'])

    q1,q2,logp_pi = model['q1'],model['q2'],model['logp_pi']

    # Double Q-learning: always use the smaller of the two Q estimates
    q_pi_min = tf.minimum(model['q1_pi'],model['q2_pi'])
    q_targ_min = tf.minimum(model['q1_targ'],model['q2_targ'])

    # Entropy-regularized Bellman backup (no gradient flows through the target)
    not_done = 1-model['d_ph']
    soft_q_next = q_targ_min - alpha*model['logp_pi_next']
    q_backup = tf.stop_gradient(model['r_ph'] + gamma*not_done*soft_q_next)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha*logp_pi - q_pi_min)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    # Policy step
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss,var_list=model['pi_vars'])

    # Value step, forced to run after the policy step
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,var_list=model['q_vars'])

    # Polyak averaging of the target variables, forced to run after the value step
    with tf.control_dependencies([train_value_op]):
        polyak_assigns = []
        for v_main,v_targ in main_target_pairs():
            blended = polyak*v_targ + (1-polyak)*v_main
            polyak_assigns.append(tf.assign(v_targ, blended))
        target_update = tf.group(polyak_assigns)

    # Everything fetched / executed during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi,
                train_pi_op, train_value_op, target_update]

    # Hard copy main -> target, used once right after variable initialization
    copy_assigns = []
    for v_main,v_targ in main_target_pairs():
        copy_assigns.append(tf.assign(v_targ, v_main))
    target_init = tf.group(copy_assigns)

    return step_ops,target_init
    
def get_action(model,sess,o,deterministic=False):
    """
    Run the policy on a single observation `o` and return a single action.

    Fetches `model['mu']` when `deterministic` is set and `model['pi']`
    otherwise; `o` is reshaped into a batch of one before being fed.
    """
    if deterministic:
        act_op = model['mu']
    else:
        act_op = model['pi']
    batch_of_one = o.reshape(1,-1)
    actions = sess.run(act_op, feed_dict={model['o_ph']:batch_of_one})
    # Drop the batch axis again
    return actions[0]

print ("SAC model ready.")

SAC model ready.


### Rollout Worker Class

In [5]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)

    Owns a SAC model, its session and the training graph. `get_weights` /
    `set_weights` exchange the values of `model['main_vars']`.

    Args:
        odim: observation dimension
        adim: action dimension
        lr, gamma, alpha, polyak: SAC hyper-parameters forwarded to
            `create_sac_graph` (defaults match the previously hard-coded values)
    """
    def __init__(self,odim=10,adim=2,lr=1e-3,gamma=0.98,alpha=0.1,polyak=0.995):
        self.odim = odim
        self.adim = adim
        self.model,self.sess = create_sac_model(odim=self.odim,adim=self.adim)
        self.step_ops,self.target_init = \
            create_sac_graph(self.model,lr=lr,gamma=gamma,alpha=alpha,polyak=polyak)
        # Build the weight-loading ops once. Creating `tf.assign` ops inside
        # `set_weights` would add new nodes (and constant copies of the weights)
        # to the graph on every call.
        self.weight_phs,self.assign_weights_op = self._build_weight_setter()
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
    def _build_weight_setter(self):
        """Create one placeholder per main variable and a single op assigning all of them."""
        main_vars = self.model['main_vars']
        weight_phs = [tf.placeholder(dtype=v.dtype.base_dtype,shape=v.get_shape())
                      for v in main_vars]
        assign_op = tf.group(*[tf.assign(v,ph) for v,ph in zip(main_vars,weight_phs)])
        return weight_phs,assign_op
    def get_weights(self):
        """Return the current values of `model['main_vars']` as a list of arrays."""
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    def set_weights(self,weight_vals):
        """
        Load `weight_vals` (same order as `get_weights`) into `model['main_vars']`.

        Raises:
            ValueError: if the number of arrays differs from the number of variables.
        """
        if len(weight_vals) != len(self.weight_phs):
            raise ValueError("Expected [%d] weight arrays but got [%d]."%
                             (len(self.weight_phs),len(weight_vals)))
        # One session call feeds every placeholder; the graph is left unchanged
        self.sess.run(self.assign_weights_op,
                      feed_dict=dict(zip(self.weight_phs,weight_vals)))
    def rollout(self,n_rollout=10,seed=None):
        """
        Evaluate the deterministic action `mu` on `n_rollout` random observations.
        When `seed` is given, the global numpy RNG is re-seeded first.
        """
        if seed is not None:
            np.random.seed(seed=seed)
        o_random = np.random.rand(n_rollout,self.odim)
        mu_vals = self.sess.run(self.model['mu'],feed_dict={self.model['o_ph']:o_random})
        return mu_vals
    
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY

    Owns its own SAC model and session (no training graph). `get_weights` /
    `set_weights` exchange the values of `model['main_vars']`.

    Args:
        worker_id: identifier stored on the worker
        odim: observation dimension
        adim: action dimension
    """
    def __init__(self,worker_id=0,odim=10,adim=2):
        self.worker_id = worker_id
        self.odim = odim
        self.adim = adim
        self.model,self.sess = create_sac_model(odim=self.odim,adim=self.adim)
        # Build the weight-loading ops once. Creating `tf.assign` ops inside
        # `set_weights` would add new nodes (and constant copies of the weights)
        # to the graph on every synchronization.
        self.weight_phs,self.assign_weights_op = self._build_weight_setter()
        self.sess.run(tf.global_variables_initializer())
        
    def _build_weight_setter(self):
        """Create one placeholder per main variable and a single op assigning all of them."""
        main_vars = self.model['main_vars']
        weight_phs = [tf.placeholder(dtype=v.dtype.base_dtype,shape=v.get_shape())
                      for v in main_vars]
        assign_op = tf.group(*[tf.assign(v,ph) for v,ph in zip(main_vars,weight_phs)])
        return weight_phs,assign_op
    def get_weights(self):
        """Return the current values of `model['main_vars']` as a list of arrays."""
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    def set_weights(self,weight_vals):
        """
        Load `weight_vals` (same order as `get_weights`) into `model['main_vars']`.

        Raises:
            ValueError: if the number of arrays differs from the number of variables.
        """
        if len(weight_vals) != len(self.weight_phs):
            raise ValueError("Expected [%d] weight arrays but got [%d]."%
                             (len(self.weight_phs),len(weight_vals)))
        # One session call feeds every placeholder; the graph is left unchanged
        self.sess.run(self.assign_weights_op,
                      feed_dict=dict(zip(self.weight_phs,weight_vals)))
    def rollout(self,n_rollout=10,seed=None):
        """
        Evaluate the deterministic action `mu` on `n_rollout` random observations.
        When `seed` is given, the numpy RNG of this worker process is re-seeded first.
        """
        if seed is not None:
            np.random.seed(seed=seed)
        o_random = np.random.rand(n_rollout,self.odim)
        mu_vals = self.sess.run(self.model['mu'],feed_dict={self.model['o_ph']:o_random})
        return mu_vals
    
print ("Rollout worker classes (with and without RAY) ready.")

Rollout worker classes (with and without RAY) ready.


### Initialize a single <font color='red'> *CentralWorker* </font>

In [6]:
# Observation / action dimensions; the Ray workers below are built with the same values
odim,adim = 10,2
# Clear the default graph before the central worker builds its model into it
tf.reset_default_graph()
R = RolloutWorkerClass(odim=odim,adim=adim)
print ("Single centralized worker initialized.")

Single centralized worker initialized.


### Initialize multiple <font color='blue'> *RayWorkers* </font>

In [7]:
# One Ray actor per worker, each constructed with its own index as `worker_id`
n_workers = 5
workers = [RayRolloutWorkerClass.remote(
    worker_id=i,odim=odim,adim=adim) for i in range(n_workers)]
print ("[%d] workers initialized."%(n_workers))

[5] workers initialized.


### Rollout of <font color='red'> *CentralWorker* </font>

In [8]:
# Fixed seed -> the same random observations on every call, so outputs are comparable
rollout_val = R.rollout(n_rollout=2,seed=0)
print ("Rollout result of the central worker is:\n%s"%(rollout_val))

Rollout result of the central worker is:
[[0.15 0.07]
 [0.02 0.08]]


### Rollout of <font color='blue'> *RayWorkers* </font>

In [9]:
# Submit one rollout per worker (the calls return immediately), then collect all results.
# No weights have been pushed to the workers at this point.
ops = [worker.rollout.remote(n_rollout=2,seed=0) for worker in workers] # non-block
rollout_vals = ray.get(ops)
for r_idx,rollout_val in enumerate(rollout_vals):
    print ("Rollout result of [%d] worker is:\n %s"%(r_idx,rollout_val))

[2m[36m(pid=18456)[0m Instructions for updating:
[2m[36m(pid=18456)[0m Use keras.layers.dense instead.
[2m[36m(pid=18459)[0m Instructions for updating:
[2m[36m(pid=18459)[0m Use keras.layers.dense instead.
[2m[36m(pid=18457)[0m Instructions for updating:
[2m[36m(pid=18457)[0m Use keras.layers.dense instead.
[2m[36m(pid=18460)[0m Instructions for updating:
[2m[36m(pid=18460)[0m Use keras.layers.dense instead.
[2m[36m(pid=18458)[0m Instructions for updating:
[2m[36m(pid=18458)[0m Use keras.layers.dense instead.
[2m[36m(pid=18456)[0m Instructions for updating:
[2m[36m(pid=18456)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(pid=18459)[0m Instructions for updating:
[2m[36m(pid=18459)[0m Call initializer instance with the dtype argument instead of passing it to the constructor
[2m[36m(pid=18457)[0m Instructions for updating:
[2m[36m(pid=18457)[0m Call initializer instance with the dtype a

### Assign the weights of <font color='red'> *CentralWorker* </font> &#10140; <font color='blue'> *RayWorkers* </font>

In [10]:
weights = R.get_weights()
# Store the weights in the Ray object store once and pass the reference, instead of
# serializing the same list separately for every worker.
weights_ref = ray.put(weights)
set_weights_list = [worker.set_weights.remote(weights_ref) for worker in workers] # non-block
# Wait for all workers so that a failed weight update raises here instead of
# passing unnoticed (trailing `;` hides the list of `None` results).
ray.get(set_weights_list);

### After the weight sync, the rollout results of all <font color='blue'> *RayWorkers* </font> should be identical and match the <font color='red'> *CentralWorker* </font>

In [11]:
# Same seed on every worker -> identical observations, so identical weights must
# produce identical actions.
ops = [worker.rollout.remote(n_rollout=2,seed=0) for worker in workers] # non-block
rollout_vals = ray.get(ops)
for r_idx,rollout_val in enumerate(rollout_vals):
    print ("Rollout result of [%d] worker is:\n %s"%(r_idx,rollout_val))
# Check the "should be the same" claim explicitly instead of relying on eyeballing
for r_idx,rollout_val in enumerate(rollout_vals):
    np.testing.assert_allclose(
        rollout_val,rollout_vals[0],rtol=1e-5,atol=1e-6,
        err_msg="Worker [%d] disagrees with worker [0] after the weight sync."%(r_idx))

Rollout result of [0] worker is:
 [[0.15 0.07]
 [0.02 0.08]]
Rollout result of [1] worker is:
 [[0.15 0.07]
 [0.02 0.08]]
Rollout result of [2] worker is:
 [[0.15 0.07]
 [0.02 0.08]]
Rollout result of [3] worker is:
 [[0.15 0.07]
 [0.02 0.08]]
Rollout result of [4] worker is:
 [[0.15 0.07]
 [0.02 0.08]]


### Update <font color='red'> *CentralWorker* </font> with Dummy Data

In [12]:
batch_size = 128
rand = np.random.rand
# Dummy transitions: uniform random observations, actions and rewards.
# Entries are filled in a fixed order so the sequence of random draws is stable.
feed_dict = {}
feed_dict[R.model['o_ph']] = rand(batch_size,R.odim)
feed_dict[R.model['o2_ph']] = rand(batch_size,R.odim)
feed_dict[R.model['a_ph']] = rand(batch_size,R.adim)
feed_dict[R.model['r_ph']] = rand(batch_size)
feed_dict[R.model['d_ph']] = np.zeros(batch_size) # no transition is marked as done
# One training step: policy update, value update, then target update
outs = R.sess.run(R.step_ops, feed_dict) # train 

### Assign the weights of <font color='red'> *CentralWorker* </font> &#10140; <font color='blue'> *RayWorkers* </font>  and Rollout <font color='blue'> *RayWorkers* </font>

In [13]:
weights = R.get_weights()
# Store the updated weights in the Ray object store once and pass the reference,
# instead of serializing the same list separately for every worker.
weights_ref = ray.put(weights)
set_weights_list = [worker.set_weights.remote(weights_ref) for worker in workers] # non-block
# Wait for all workers: a failed weight update must raise here, otherwise the
# rollouts below would silently report results for stale weights.
ray.get(set_weights_list)
ops = [worker.rollout.remote(n_rollout=2,seed=0) for worker in workers] # non-block
rollout_vals = ray.get(ops)
for r_idx,rollout_val in enumerate(rollout_vals):
    print ("Rollout result of [%d] worker is:\n %s"%(r_idx,rollout_val))

Rollout result of [0] worker is:
 [[0.19 0.27]
 [0.07 0.29]]
Rollout result of [1] worker is:
 [[0.19 0.27]
 [0.07 0.29]]
Rollout result of [2] worker is:
 [[0.19 0.27]
 [0.07 0.29]]
Rollout result of [3] worker is:
 [[0.19 0.27]
 [0.07 0.29]]
Rollout result of [4] worker is:
 [[0.19 0.27]
 [0.07 0.29]]


### Shutdown RAY

In [14]:
# Shut the Ray runtime down now that the demo is finished
ray.shutdown()
print ("RAY shutdown.")

RAY shutdown.
