# Synchronous SAC with PyBullet Ant

In [1]:
import datetime,gym,os,pybullet_envs,time,os,ray
import numpy as np
import tensorflow as tf
from util import gpu_sess,suppress_tf_warning
from sac import ReplayBuffer,create_sac_model,create_sac_graph
np.set_printoptions(precision=2)
suppress_tf_warning() # suppress warning 
gym.logger.set_level(40) # gym logger 
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.14.0].


### Rollout Worker

In [2]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)
    """
    def __init__(self,lr=1e-3,gamma=0.99,alpha=0.1,polyak=0.995,seed=1):
        self.seed = seed
        # Each worker should maintain its own environment
        import pybullet_envs,gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) # gym logger 
        
        self.env = gym.make('AntBulletEnv-v0')
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        
        # Create SAC model and computational graph 
        self.model,self.sess = create_sac_model(odim=self.odim,adim=self.adim)
        self.step_ops,self.target_init = \
            create_sac_graph(self.model,lr=lr,gamma=gamma,alpha=alpha,polyak=polyak)
        
        # Initialize model 
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
    
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]

    def get_weights(self):
        """
        Get weights
        """
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    
    def set_weights(self,weight_vals):
        """
        Set weights
        """
        for w_idx,weight_tf_var in enumerate(self.model['main_vars']):
            self.sess.run(tf.assign(weight_tf_var,weight_vals[w_idx]))
            
    def rollout(self,ep_len_rollout=10):
        """
        Rollout
        """
        o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer = \
            np.zeros((ep_len_rollout,self.odim)),np.zeros((ep_len_rollout,self.adim)),\
            np.zeros((ep_len_rollout)),np.zeros((ep_len_rollout,self.odim)),np.zeros((ep_len_rollout))
        o = self.env.reset() # reset environment
        # Loop
        for t in range(ep_len_rollout):
            a = self.get_action(o,deterministic=False) 
            o2,r,d,_ = self.env.step(a)
            # Append
            o_buffer[t,:],a_buffer[t,:],r_buffer[t],o2_buffer[t,:],d_buffer[t] = o,a,r,o2,d
            o = o2
            if d: o = self.env.reset()
        return o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer
    
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY
    """
    def __init__(self,worker_id=0):
        self.worker_id = worker_id
        # Each worker should maintain its own environment
        import pybullet_envs,gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) # gym logger 

        self.env = gym.make('AntBulletEnv-v0')
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        # Create SAC model
        self.model,self.sess = create_sac_model(odim=self.odim,adim=self.adim)
        self.sess.run(tf.global_variables_initializer())
        
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
    
    def get_weights(self):
        """
        Get weights
        """
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    
    def set_weights(self,weight_vals):
        """
        Set weights
        """
        for w_idx,weight_tf_var in enumerate(self.model['main_vars']):
            self.sess.run(tf.assign(weight_tf_var,weight_vals[w_idx]))
            
    def rollout(self,ep_len_rollout=10):
        """
        Rollout
        """
        o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer = \
            np.zeros((ep_len_rollout,self.odim)),np.zeros((ep_len_rollout,self.adim)),\
            np.zeros((ep_len_rollout)),np.zeros((ep_len_rollout,self.odim)),np.zeros((ep_len_rollout))
        o = self.env.reset() # reset environment
        # Loop
        for t in range(ep_len_rollout):
            a = self.get_action(o,deterministic=False) 
            o2,r,d,_ = self.env.step(a)
            # Append
            o_buffer[t,:],a_buffer[t,:],r_buffer[t],o2_buffer[t,:],d_buffer[t] = o,a,r,o2,d
            o = o2
            if d: o = self.env.reset()
        return o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer
    
print ("Rollout worker classes (with and without RAY) ready.")

Rollout worker classes (with and without RAY) ready.


### Initilize PyBullet Ant Environment

In [3]:
env_name = 'AntBulletEnv-v0'
test_env = gym.make(env_name)
_ = test_env.render(mode='human') # enable rendering on test_env
_ = test_env.reset()
for _ in range(3): # dummy run for proper rendering 
    a = test_env.action_space.sample()
    o,r,d,_ = test_env.step(a)
    time.sleep(0.01)
print ("[%s] ready."%(env_name))

[AntBulletEnv-v0] ready.


In [4]:
odim,adim = o.shape[0],a.shape[0]

### Initialize Workers

In [5]:
n_cpu = n_workers = 15
ray.init(num_cpus=n_cpu)
tf.reset_default_graph()
R = RolloutWorkerClass(lr=5e-4,gamma=0.99,alpha=0.1,polyak=0.995,seed=1)
workers = [RayRolloutWorkerClass.remote(worker_id=i) for i in range(n_workers)]
print ("RAY initialized with [%d] cpus and [%d] workers."%
       (n_cpu,n_workers))

2020-06-19 23:39:28,226	INFO resource_spec.py:212 -- Starting Ray with 16.5 GiB memory available for workers and up to 8.27 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-06-19 23:39:28,609	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


RAY initialized with [15] cpus and [15] workers.


In [6]:
time.sleep(1)

### Replay Buffers

In [7]:
replay_buffer = ReplayBuffer(odim=odim,adim=adim,size=int(1e6))
replay_buffer_short = ReplayBuffer(odim=odim,adim=adim,size=int(1e5))

### Loop

In [8]:
total_steps,evaluate_every = 300,20
ep_len_rollout = 1000
batch_size,update_count = 128,1000
num_eval,max_ep_len_eval = 3,1e3

In [9]:
start_time = time.time()
for t in range(int(total_steps)):
    esec = time.time()-start_time
    
    # Synchronize worker weights
    weights = R.get_weights()
    set_weights_list = [worker.set_weights.remote(weights) for worker in workers] 

    # Make rollout and accumulate to Buffers
    ops = [worker.rollout.remote(ep_len_rollout=ep_len_rollout) for worker in workers]
    rollout_vals = ray.get(ops)
    for rollout_val in rollout_vals:
        o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer = rollout_val
        for i in range(ep_len_rollout):
            o,a,r,o2,d = o_buffer[i,:],a_buffer[i,:],r_buffer[i],o2_buffer[i,:],d_buffer[i]
            replay_buffer.store(o, a, r, o2, d) 
            replay_buffer_short.store(o, a, r, o2, d) 

    # Update
    for _ in range(int(update_count)):
        batch = replay_buffer.sample_batch(batch_size//2) 
        batch_short = replay_buffer_short.sample_batch(batch_size//2) 
        feed_dict = {R.model['o_ph']: np.concatenate((batch['obs1'],batch_short['obs1'])),
                     R.model['o2_ph']: np.concatenate((batch['obs2'],batch_short['obs2'])),
                     R.model['a_ph']: np.concatenate((batch['acts'],batch_short['acts'])),
                     R.model['r_ph']: np.concatenate((batch['rews'],batch_short['rews'])),
                     R.model['d_ph']: np.concatenate((batch['done'],batch_short['done']))
                    }
        outs = R.sess.run(R.step_ops, feed_dict)

    # Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0): 
        print ("[Evaluate] step:[%d/%d][%.1f%%] time:[%s]."%
               (t+1,total_steps,t/total_steps*100,
                time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time)))
              )
        for eval_idx in range(num_eval): 
            o,d,ep_ret,ep_len = test_env.reset(),False,0,0
            _ = test_env.render(mode='human') 
            while not(d or (ep_len == max_ep_len_eval)):
                a = R.get_action(o,deterministic=True)
                o,r,d,_ = test_env.step(a)
                _ = test_env.render(mode='human') 
                ep_ret += r # compute return 
                ep_len += 1
            print ("[Evaluate] [%d/%d] ep_ret:[%.4f] ep_len:[%d]"
                %(eval_idx,num_eval,ep_ret,ep_len))
    
print ("Done.")

[Evaluate] step:[1/300][0.0%] time:[00:00:12].
[Evaluate] [0/3] ep_ret:[14.9603] ep_len:[25]
[Evaluate] [1/3] ep_ret:[17.0414] ep_len:[29]
[Evaluate] [2/3] ep_ret:[13.8742] ep_len:[23]
[Evaluate] step:[20/300][6.3%] time:[00:03:49].
[Evaluate] [0/3] ep_ret:[489.2665] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[432.3884] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[568.0658] ep_len:[1000]
[Evaluate] step:[40/300][13.0%] time:[00:08:17].
[Evaluate] [0/3] ep_ret:[800.8129] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[802.1072] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[810.2450] ep_len:[1000]
[Evaluate] step:[60/300][19.7%] time:[00:13:26].
[Evaluate] [0/3] ep_ret:[765.1344] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[802.4225] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[795.4619] ep_len:[1000]
[Evaluate] step:[80/300][26.3%] time:[00:19:16].
[Evaluate] [0/3] ep_ret:[1190.5351] ep_len:[1000]
[Evaluate] [1/3] ep_ret:[1120.6869] ep_len:[1000]
[Evaluate] [2/3] ep_ret:[875.4159] ep_len:[1000]
[Evaluate] step:[100/300][33.0

### Close

In [15]:
test_env.close()

In [10]:
ray.shutdown()

### Test-Run

In [18]:
gym.logger.set_level(40)
env_name = 'AntBulletEnv-v0'
test_env = gym.make(env_name)
_ = test_env.render(mode='human') # enable rendering on test_env
_ = test_env.reset()
for _ in range(3): # dummy run for proper rendering 
    a = test_env.action_space.sample()
    o,r,d,_ = test_env.step(a)
    time.sleep(0.01)
print ("[%s] ready."%(env_name))
o,d,ep_ret,ep_len = test_env.reset(),False,0,0
_ = test_env.render(mode='human') 
while not(d or (ep_len == max_ep_len_eval)):
    a = R.get_action(o,deterministic=True)
    o,r,d,_ = test_env.step(a)
    _ = test_env.render(mode='human') 
    ep_ret += r # compute return 
    ep_len += 1
print ("[Evaluate] ep_ret:[%.4f] ep_len:[%d]"
    %(eval_idx,ep_len))
test_env.close() # close env

[AntBulletEnv-v0] ready.
[Evaluate] ep_ret:[2.0000] ep_len:[1000]
