# Augmented Random Search (ARS) with PyBullet Ant

In [1]:
import datetime,gym,os,pybullet_envs,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from ars import create_ars_model,get_noises_from_weights
from util import gpu_sess,suppress_tf_warning
# Quieten the libraries, then configure how numpy prints arrays
gym.logger.set_level(40) # gym logger 
suppress_tf_warning() # suppress warning 
np.set_printoptions(precision=2) # two decimals when printing arrays
tf_version = tf.__version__
print ("Packaged loaded. TF version is [%s]."%(tf_version))

Packaged loaded. TF version is [1.15.0].


### Rollout Worker

In [2]:
# When True, the evaluation environment is rendered via render(mode='human'),
# both when it is created in get_eval_env() and during the evaluation episodes.
RENDER_ON_EVAL = False

In [3]:
def get_env():
    """
    Create and return a new 'AntBulletEnv-v0' gym environment
    """
    # Imports are kept local so the function carries its own dependencies
    import pybullet_envs
    import gym
    gym.logger.set_level(40) # gym logger 
    env = gym.make('AntBulletEnv-v0')
    return env

def get_eval_env():
    """
    Create the 'AntBulletEnv-v0' environment used for evaluation
    
    Rendering is switched on first when RENDER_ON_EVAL is set. The environment
    is then reset and stepped three times with random actions before it is
    returned.
    """
    import pybullet_envs
    import gym
    gym.logger.set_level(40) # gym logger
    env = gym.make('AntBulletEnv-v0')
    if RENDER_ON_EVAL:
        env.render(mode='human') # enable rendering
    env.reset()
    n_dummy_steps = 3
    for _ in range(n_dummy_steps): # dummy run for proper rendering 
        random_action = env.action_space.sample()
        env.step(random_action)
        time.sleep(0.01)
    return env

In [4]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)
    
    Keeps the central copy of the policy: its weights are read with
    'get_weights()', overwritten with 'set_weights()', and the policy is
    queried with 'get_action()'.
    """
    def __init__(self,seed=1):
        """
        Build the environment, the ARS model and initialize its variables
        
        seed: seed given to both TensorFlow and numpy
        """
        self.seed = seed
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        self.env = get_env()
        self.odim = self.env.observation_space.shape[0]
        self.adim = self.env.action_space.shape[0]
        # Seed BEFORE the model is built: a TF1 graph-level seed only affects
        # ops created after it is set, so seeding once the variables (and their
        # initializers) already exist cannot influence the initial weights.
        # NOTE(review): assumes create_ars_model() builds into the current
        # default graph - confirm.
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        # ARS model 
        self.model,self.sess = create_ars_model(
            odim=self.odim,adim=self.adim,hdims=[128],
            actv=tf.nn.relu,out_actv=tf.nn.tanh)
        # Initialize model 
        self.sess.run(tf.global_variables_initializer())
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
        
    def get_action(self,o):
        """
        Get the policy output 'mu' for a single observation
        
        o: one observation; it is reshaped to a batch with a single row
        Returns the first (only) row of the evaluated 'mu'.
        """
        return self.sess.run(
            self.model['mu'],feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
    
    def get_weights(self):
        """
        Get weights
        
        Returns the current values of self.model['main_vars'], in that order.
        """
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    
    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage
        
        weight_vals: one value per variable in self.model['main_vars'],
                     in the same order as returned by 'get_weights()'
        Raises ValueError if the number of values does not match.
        """
        if self.FIRST_SET_FLAG:
            # Create the placeholders / assign ops only once and reuse them;
            # building new ops on every call would keep growing the graph
            self.FIRST_SET_FLAG = False
            main_vars = self.model['main_vars']
            self.assign_placeholders = [
                tf.placeholder(v.dtype, shape=v.get_shape()) for v in main_vars]
            self.assign_ops = [
                v.assign(ph) for v,ph in zip(main_vars,self.assign_placeholders)]
        if len(weight_vals) != len(self.assign_ops):
            raise ValueError(
                "Expected [%d] weight values but got [%d]."%
                (len(self.assign_ops),len(weight_vals)))
        # Assign every variable in a single session call
        feed_dict = dict(zip(self.assign_placeholders,weight_vals))
        self.sess.run(self.assign_ops,feed_dict=feed_dict)
            
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY
    
    Each worker owns its own environment and its own copy of the ARS model.
    'set_weights()' loads (perturbed) weights and 'rollout()' runs one episode
    with them.
    NOTE(review): no variable initializer is run in this class, so
    'set_weights()' presumably has to be called before the first 'rollout()'
    - confirm against create_ars_model().
    """
    def __init__(self,worker_id=0,ep_len_rollout=1000):
        """
        worker_id: identifier stored on the worker
        ep_len_rollout: maximum number of environment steps per rollout
        """
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        self.env = get_env()
        self.odim = self.env.observation_space.shape[0]
        self.adim = self.env.action_space.shape[0]
        # ARS model 
        self.model,self.sess = create_ars_model(
            odim=self.odim,adim=self.adim,hdims=[128],
            actv=tf.nn.relu,out_actv=tf.nn.tanh)
        
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
        
    def get_action(self,o):
        """
        Get the policy output 'mu' for a single observation
        
        o: one observation; it is reshaped to a batch with a single row
        Returns the first (only) row of the evaluated 'mu'.
        """
        return self.sess.run(
            self.model['mu'],feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
    
    def set_weights(self,weight_vals,noise_vals,noise_sign=+1):
        """
        Set weights without memory leakage
        
        Every variable i of self.model['main_vars'] is set to
        weight_vals[i] + noise_sign*noise_vals[i].
        
        weight_vals: one value per variable in self.model['main_vars']
        noise_vals: one perturbation per variable, matching 'weight_vals'
        noise_sign: multiplier applied to the perturbation (+1 or -1 in the caller)
        Raises ValueError if the number of values does not match.
        """
        if self.FIRST_SET_FLAG:
            # Create the placeholders / assign ops only once and reuse them;
            # building new ops on every call would keep growing the graph
            self.FIRST_SET_FLAG = False
            main_vars = self.model['main_vars']
            self.assign_placeholders = [
                tf.placeholder(v.dtype, shape=v.get_shape()) for v in main_vars]
            self.assign_ops = [
                v.assign(ph) for v,ph in zip(main_vars,self.assign_placeholders)]
        n_vars = len(self.assign_ops)
        if (len(weight_vals) != n_vars) or (len(noise_vals) != n_vars):
            raise ValueError(
                "Expected [%d] weight and noise values but got [%d] and [%d]."%
                (n_vars,len(weight_vals),len(noise_vals)))
        # Assign every perturbed variable in a single session call
        feed_dict = {
            ph:weight+noise_sign*noise
            for ph,weight,noise in zip(self.assign_placeholders,weight_vals,noise_vals)}
        self.sess.run(self.assign_ops,feed_dict=feed_dict)
            
    def rollout(self):
        """
        Rollout
        
        Run one episode from a fresh reset for at most 'ep_len_rollout' steps,
        stopping early when the environment reports done.
        Returns (sum of rewards, number of steps taken).
        """
        # The environment is reset exactly once per rollout
        self.o = self.env.reset()
        r_sum,step = 0,0
        for _ in range(self.ep_len_rollout):
            action = self.get_action(self.o) 
            # The next observation becomes the current one
            self.o,reward,done,_info = self.env.step(action)
            # Accumulate reward
            r_sum += reward
            step += 1
            if done: break
        return r_sum,step
    

### Initialize Env

In [5]:
# Evaluation environment; its spaces give the observation / action dimensions
eval_env = get_eval_env()
odim = eval_env.observation_space.shape[0]
adim = eval_env.action_space.shape[0]
print ("Environment Ready. odim:[%d] adim:[%d]."%(odim,adim))

Environment Ready. odim:[28] adim:[8].


### Hyper-parameters

In [6]:
# Parallelism: as many Ray CPUs as rollout workers
n_workers = 30
n_cpu = n_workers
# Training schedule
total_steps = 2000 # number of update iterations
evaluate_every = 50 # evaluate every this many iterations
ep_len_rollout = 1000 # maximum steps of a single rollout
# Evaluation
num_eval = 3 # evaluation episodes per evaluation
max_ep_len_eval = 1e3 # maximum steps of an evaluation episode
# Running count of environment steps taken by the rollouts
n_env_step = 0
# ARS: step size, noise scale passed to 'get_noises_from_weights', 
# and number of top-ranked directions used in each update
alpha = 0.01
nu = 0.02
b = n_workers//2

### Initialize Workers

In [7]:
# Start Ray, clear the default TF graph, then build the central worker 
# followed by the remote rollout workers
ray.init(num_cpus=n_cpu)
tf.reset_default_graph()
R = RolloutWorkerClass(seed=0)
make_remote_worker = RayRolloutWorkerClass.remote
workers = [
    make_remote_worker(worker_id=w_id,ep_len_rollout=ep_len_rollout)
    for w_id in range(n_workers)
]
print ("RAY initialized with [%d] cpus and [%d] workers."%
       (n_cpu,n_workers))

2020-07-12 10:55:15,598	INFO resource_spec.py:212 -- Starting Ray with 143.02 GiB memory available for workers and up to 65.29 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-12 10:55:16,172	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8267[39m[22m


RAY initialized with [30] cpus and [30] workers.


In [8]:
# Pause for one second before the training loop
# NOTE(review): presumably gives the Ray workers time to start up - confirm
time.sleep(1)

### Loop

In [None]:
def _perturbed_rollouts(weights,noises_list,noise_sign):
    """
    Run one rollout per worker with weights perturbed by noise_sign*noise
    
    weights: current weights of the central worker
    noises_list: one list of per-weight noises for every worker
    noise_sign: +1 or -1, direction of the perturbation
    Returns (array of episode returns, total number of environment steps).
    """
    # The set_weights calls are not waited on explicitly; Ray runs an actor's
    # method calls in the order they were submitted, so each rollout below
    # uses the weights sent here
    for worker,noises in zip(workers,noises_list):
        worker.set_weights.remote(weights,noises,noise_sign=noise_sign)
    results = ray.get([worker.rollout.remote() for worker in workers])
    returns = np.array([rew for rew,_eplen in results],dtype=float)
    n_steps = sum(eplen for _rew,eplen in results)
    return returns,n_steps

# Lower bound for the reward standard deviation used to scale the update
sigma_R_min = 1e-8

start_time = time.time()
for t in range(int(total_steps)):
    
    # Distribute worker weights
    weights = R.get_weights()
    noises_list = [get_noises_from_weights(weights,nu=nu) for _ in range(n_workers)]
        
    # Positive rollouts (noise_sign=+1)
    rollout_pos_vals,n_step_pos = _perturbed_rollouts(weights,noises_list,noise_sign=+1)
    
    # Negative rollouts (noise_sign=-1)
    rollout_neg_vals,n_step_neg = _perturbed_rollouts(weights,noises_list,noise_sign=-1)
    n_env_step += n_step_pos + n_step_neg
    
    # Reward 
    rollout_concat_vals = np.concatenate((rollout_pos_vals,rollout_neg_vals))
    rollout_delta_vals = rollout_pos_vals - rollout_neg_vals 
    rollout_max_vals = np.maximum(rollout_pos_vals,rollout_neg_vals)
    
    # Sort (descending by the better of the two rollouts) and keep the top b
    sort_idx = np.argsort(-rollout_max_vals)
    top_idx = sort_idx[:b]
    
    # Update
    # If every rollout returns the same reward the standard deviation is 0 and
    # the update would become 0/0 = NaN; the floor turns it into a zero step
    sigma_R = max(np.std(rollout_concat_vals),sigma_R_min)
    step_scale = alpha/(b*sigma_R)
    weights_updated = []
    for w_idx,weight in enumerate(weights): # for each weight 
        delta_weight_sum = np.zeros_like(weight)
        for idx in top_idx:
            # reward difference times the noise applied to the current weight
            delta_weight_sum += rollout_delta_vals[idx]*noises_list[idx][w_idx]
        weights_updated.append(weight + step_scale*delta_weight_sum) 
    
    # Set weight
    R.set_weights(weights_updated)
    
    # Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0) or (t == (total_steps-1)): 
        ram_percent = psutil.virtual_memory().percent # memory usage
        print ("[Evaluate] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
               (t+1,total_steps,t/total_steps*100,
                n_env_step,
                time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time)),
                ram_percent)
              )
        for eval_idx in range(num_eval): 
            o,d,ep_ret,ep_len = eval_env.reset(),False,0,0
            if RENDER_ON_EVAL:
                _ = eval_env.render(mode='human') 
            # Run until the episode ends or the evaluation step limit is hit
            while not(d or (ep_len == max_ep_len_eval)):
                a = R.get_action(o)
                o,r,d,_ = eval_env.step(a)
                if RENDER_ON_EVAL:
                    _ = eval_env.render(mode='human') 
                ep_ret += r # compute return 
                ep_len += 1
            print ("[Evaluate] [%d/%d] ep_ret:[%.4f] ep_len:[%d]"
                %(eval_idx,num_eval,ep_ret,ep_len)) 
    
print ("Done.")

[Evaluate] step:[1/2000][0.0%] #step:[1.1e+04] time:[00:00:03] ram:[17.3%].
[Evaluate] [0/3] ep_ret:[48.9050] ep_len:[62]
[Evaluate] [1/3] ep_ret:[25.8082] ep_len:[38]
[Evaluate] [2/3] ep_ret:[26.2239] ep_len:[39]
