# ARS with CustomADTEnvContinuous

In [1]:
import datetime,gym,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from util import gpu_sess,suppress_tf_warning,tic,toc,open_txt,write_txt,OnlineMeanVariance
from ars import create_ars_model,get_noises_from_weights,save_ars_model,restore_ars_model
np.set_printoptions(precision=2)
suppress_tf_warning() # suppress warning 
gym.logger.set_level(40) # gym logger 

from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
from episci.agents.utils.constants import Agents, RewardType
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.15.0].


### Hyperparameters

In [2]:
exp_name = 'ars_adt_cont'
n_cpu = 101
n_workers = 100
total_steps,evaluate_every,print_every = 5000,5,1
ep_len_rollout = 15000 
hdims,actv,out_actv = [128,128],tf.nn.relu,tf.nn.tanh
# alpha:stepsize / nu:exploration std / b: elite set size
alpha,nu,b = 0.01,0.01,(n_workers//4)
seed = 1
# Train
red_list_train = [
    Agents.SPOT_RANDOM,
    Agents.EXPERT_SYSTEM,
    Agents.EXPERT_SYSTEM
]*2
# Evaluation
red_list_eval = [
    Agents.ZOMBIE, 
    Agents.ROSIE, 
    Agents.BUD, 
    Agents.BUD_FSM, 
    Agents.EXPERT_SYSTEM_TRIAL_2, 
    Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4, 
    Agents.EXPERT_SYSTEM
]*n_workers
red_list_eval = red_list_eval[:n_workers]
num_eval,max_ep_len_eval = len(red_list_eval),15e3 # evaluation 

### Logger

In [3]:
txt_path = '../log/%s/log_%s.txt'%(
    exp_name,
    datetime.datetime.now().strftime("%b-%d-%Y-%H:%M:%S"))
f = open_txt(txt_path)
print ("[%s] created."%(txt_path))
time.sleep(1) # wait 

[../log/ars_adt_cont/log_Jul-20-2020-03:08:41.txt] created.


### Environment

In [4]:
action_length = 5 # 50/5 = 10HZ
def get_env(red_distribution=None):
    from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
    from episci.agents.utils.constants import Agents, RewardType
    env_config = {
        "red_distribution": red_distribution,
        "reward_type": RewardType.SHAPED
    }
    return CustomADTEnvContinuous(env_config,action_length=action_length)

### Worker

In [5]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)
    """
    def __init__(self,
                 hdims=[64]*2,actv=tf.nn.relu,out_actv=tf.nn.tanh,
                 seed=1):
        self.seed = seed
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim,self.adim = odim,adim
        # Observation normalization
        self.obs_mu = np.zeros(self.odim)
        self.obs_std = np.ones(self.odim)
        # ARS model 
        self.model,self.sess = create_ars_model(
            odim=self.odim,adim=self.adim,hdims=hdims,
            actv=actv,out_actv=out_actv)
        # Initialize model 
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        self.sess.run(tf.global_variables_initializer())
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
    def set_observation_stats(self,obs_mu,obs_std):
        self.obs_mu = obs_mu
        self.obs_std = obs_std
    def get_action(self,o):
        obs_std = self.obs_std
        obs_std[obs_std<1e-6] = np.inf
        nzd_o = (o-self.obs_mu)/obs_std
        return self.sess.run(
            self.model['mu'],feed_dict={self.model['o_ph']:nzd_o.reshape(1,-1)})[0]
    def get_weights(self):
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    def set_weights(self,weight_vals):
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['main_vars']):
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
        for w_idx,weight_tf_var in enumerate(self.model['main_vars']):
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:weight_vals[w_idx]})
            
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY
    """
    def __init__(self,worker_id=0,
                 hdims=[128],actv=tf.nn.relu,out_actv=tf.nn.tanh,
                 ep_len_rollout=15000):
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim,self.adim = odim,adim
        # Observation normalization
        self.obs_mu = np.zeros(self.odim)
        self.obs_std = np.ones(self.odim)
        # ARS model 
        self.model,self.sess = create_ars_model(
            odim=self.odim,adim=self.adim,hdims=hdims,
            actv=actv,out_actv=out_actv)
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True        
    def set_observation_stats(self,obs_mu,obs_std):
        self.obs_mu = np.copy(obs_mu) # call by value
        self.obs_std = np.copy(obs_std) # call by value
    def get_action(self,o):
        obs_std = self.obs_std
        obs_std[obs_std<1e-6] = np.inf
        nzd_o = (o-self.obs_mu)/obs_std
        return self.sess.run(
            self.model['mu'],feed_dict={self.model['o_ph']:nzd_o.reshape(1,-1)})[0]
    def set_weights(self,weight_vals,noise_vals,noise_sign=+1):
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['main_vars']):
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
        for w_idx,weight_tf_var in enumerate(self.model['main_vars']):
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:
                           weight_vals[w_idx]+noise_sign*noise_vals[w_idx]})
    def rollout(self,
                red_list=[Agents.SPOT_RANDOM,Agents.EXPERT_SYSTEM]):
        """
        Rollout
        """
        obs_buffer,obs_cnt = np.zeros((len(red_list)*self.ep_len_rollout,self.odim)),0
        for r_idx,red in enumerate(red_list): # for each red policy
            # Specify red policy
            self.o,r_sum,n_step = self.env.reset(red=red),0,0 
            for t in range(self.ep_len_rollout):
                self.a = self.get_action(self.o) 
                self.o2,self.r,self.d,_ = self.env.step(self.a)
                # Save next state 
                self.o = self.o2
                # Accumulate reward
                r_sum += self.r
                n_step += 1
                # Stack observation
                obs_buffer[obs_cnt,:] = self.o
                obs_cnt += 1
                if self.d: 
                    break 
        # Compute the average return and steps 
        r_avg = r_sum / len(red_list)
        n_step_avg = n_step / len(red_list)
        obs_buffer = obs_buffer[:obs_cnt,:] # trim observation buffer 
        return r_avg,n_step_avg,obs_buffer
    
    def evaluate(self,red=None):
        o,d,ep_ret,ep_len = self.env.reset(red=red),False,0,0
        while not(d or (ep_len == self.ep_len_rollout)):
            a = self.get_action(o)
            o,r,d,_ = self.env.step(a)
            ep_ret += r # compute return 
            ep_len += 1
        blue_health,red_health = self.env.blue_health,self.env.red_health
        eval_res = [ep_ret,ep_len,blue_health,red_health] # evaluation result 
        return eval_res
    

### Initialize

In [6]:
env = get_env()
adim,odim = env.action_space.shape[0],env.observation_space.shape[0]
print ("Environment Ready. odim:[%d] adim:[%d]."%(odim,adim))
write_txt(f,"Environment Ready. odim:[%d] adim:[%d]."%(odim,adim),
          ADD_NEWLINE=True,DO_PRINT=False)

Environment Ready. odim:[55] adim:[4].


### Observation online normalizer

In [7]:
mv = OnlineMeanVariance()

### Initialize Workers

In [8]:
ray.init(num_cpus=n_cpu)
tf.reset_default_graph()
R = RolloutWorkerClass(hdims=hdims,actv=actv,out_actv=out_actv,seed=seed)
workers = [RayRolloutWorkerClass.remote(
    worker_id=i,hdims=hdims,actv=actv,out_actv=out_actv,
    ep_len_rollout=ep_len_rollout)
           for i in range(n_workers)]
print ("RAY initialized with [%d] cpus and [%d] workers."%
       (n_cpu,n_workers))
write_txt(f,"RAY initialized with [%d] cpus and [%d] workers."%(n_cpu,n_workers),
          ADD_NEWLINE=True,DO_PRINT=False)

2020-07-20 03:08:42,422	INFO resource_spec.py:212 -- Starting Ray with 139.94 GiB memory available for workers and up to 63.98 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-20 03:08:42,877	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


RAY initialized with [101] cpus and [100] workers.


### Loop

In [None]:
start_time = time.time()
n_env_step = 0 # number of environment steps
for t in range(int(total_steps)): # for all steps 
    esec = time.time()-start_time
    
    # Distribute the central weights to distributed workers
    weights = R.get_weights() # weights of the central worker 
    noises_list = []
    for _ in range(n_workers):
        noises_list.append(get_noises_from_weights(weights,nu=nu))
    
    # Positive rollouts using distributed workers
    set_weights_list = [worker.set_weights.remote(weights,noises,noise_sign=+1) 
                        for worker,noises in zip(workers,noises_list)] # set weights
    rollout_ops = [worker.rollout.remote(
        red_list=[Agents.SPOT_RANDOM,Agents.EXPERT_SYSTEM]
    )
           for worker in workers] # do positive rollouts
    res_pos_rollout = ray.get(rollout_ops) # get positive rollout results
    pos_rets,r_idx = np.zeros(n_workers),0
    for pos_ret,ep_len,obs_buffer in res_pos_rollout:
        pos_rets[r_idx] = pos_ret # return
        r_idx = r_idx + 1
        n_env_step += ep_len # accumulate episode length
        for obs in obs_buffer: mv.include(obs) # update observation mean and std
    
    # Negative rollouts using distributed workers
    set_weights_list = [worker.set_weights.remote(weights,noises,noise_sign=-1) 
                        for worker,noises in zip(workers,noises_list)] # set weights
    rollout_ops = [worker.rollout.remote(
        red_list=[Agents.SPOT_RANDOM,Agents.EXPERT_SYSTEM]
    )
           for worker in workers] # do negative rollouts
    res_neg_rollout = ray.get(rollout_ops) # get negative rollout results
    neg_rets,r_idx = np.zeros(n_workers),0
    for neg_ret,ep_len,obs_buffer in res_neg_rollout:
        neg_rets[r_idx] = neg_ret # return
        r_idx = r_idx + 1
        n_env_step += ep_len # accumulate episode length
        for obs in obs_buffer: mv.include(obs) # update observation mean and std
    
    # Compute return statistics
    concat_rets = np.concatenate((pos_rets,neg_rets)) # concatenated returns [2*n_workers]
    ret_deltas = pos_rets - neg_rets # return difference [n_workers]
    max_rets = np.maximum(pos_rets,neg_rets) # maximum returns [n_workers]
    max_ret = np.max(max_rets) # maximum return [1]
    max_ret_delta = np.max(np.abs(ret_deltas)) # maximum return diff [1]
    sort_idx = np.argsort(-max_rets)
    
    # Update
    sigma_R = np.std(concat_rets)
    weights_updated = []
    for w_idx,weight in enumerate(weights): # for each weight 
        delta_weight_sum = np.zeros_like(weight)
        for k in range(b):
            idx_k = sort_idx[k] # sorted index
            rollout_delta_k = ret_deltas[idx_k]
            noises_k = noises_list[idx_k]
            noise_k = (1/nu)*noises_k[w_idx] # noise for current weight
            delta_weight_sum += rollout_delta_k*noise_k
        delta_weight = (alpha/(b*sigma_R))*delta_weight_sum
        weight = weight + delta_weight
        weights_updated.append(weight) 
    
    # Set weights of the central worker 
    R.set_weights(weights_updated)
    
    # Distribute the central weights to the distributed workers
    weights = R.get_weights()
    noises_list = []
    for _ in range(n_workers):
        noises_list.append(get_noises_from_weights(weights,nu=nu))
    
    # Set observation stats
    obs_mean,obs_std = mv.mean,mv.std
    sef_obs_list= [worker.set_observation_stats.remote(obs_mean,obs_std) 
                   for worker in workers] # set observation mean and std
    
    # Print
    if (t == 0) or (((t+1)%print_every) == 0):
        print ("[%d/%d] max_ret:[%.2f] max_ret_delta:[%.2f] sigma_R:[%.2f] "%
               (t,total_steps,max_ret,max_ret_delta,sigma_R))
        write_txt(f,
                  "[%d/%d] max_ret:[%.2f] max_ret_delta:[%.2f] sigma_R:[%.2f] "%
                  (t,total_steps,max_ret,max_ret_delta,sigma_R),
                  ADD_NEWLINE=True,DO_PRINT=False)
    
    # Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0): 
        ram_percent = psutil.virtual_memory().percent # memory usage
        print ("[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
               (t+1,total_steps,t/total_steps*100,
                n_env_step,
                time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                ram_percent)
              )
        write_txt(f,
                  "[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
                  (t+1,total_steps,t/total_steps*100,
                   n_env_step,
                   time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                   ram_percent),
                  ADD_NEWLINE=True,DO_PRINT=False)
        ops = []
        for i_idx in range(num_eval):
            worker,red = workers[i_idx],red_list_eval[i_idx]
            ops.append(worker.evaluate.remote(red=red))
        eval_vals = ray.get(ops)
        
        ep_ret_sum = 0
        for i_idx in range(num_eval):
            red,eval_val = red_list_eval[i_idx],eval_vals[i_idx]
            ep_ret,ep_len,blue_health,red_health = eval_val[0],eval_val[1],eval_val[2],eval_val[3]
            ep_ret_sum += ep_ret
            print (" [%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
                %(i_idx,len(eval_vals),red,ep_ret,ep_len,blue_health,red_health))
            write_txt(f,
                      " [%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
                      %(i_idx,len(eval_vals),red,ep_ret,ep_len,blue_health,red_health),
                      ADD_NEWLINE=True,DO_PRINT=False)
        ep_ret_avg = ep_ret_sum / num_eval
        print ("[Eval. done] time:[%s] ep_ret_avg:[%.3f].\n"%
               (time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                ep_ret_avg)
              )
        write_txt(f,
                  "[Eval. done] time:[%s] ep_ret_avg:[%.3f].\n"%
                  (time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                   ep_ret_avg),
                  ADD_NEWLINE=True,DO_PRINT=False)
        # Save
        npz_path = '../data/net/%s/model_%d.npz'%(expname,t+1)
        save_ars_model(npz_path,R,VERBOSE=False)
        write_txt(f,
                  " [%s] saved."%npz_path,
                  ADD_NEWLINE=True,DO_PRINT=False)
        
    # Loop 
    # break # for debugging 
    
print ("Done.")

[2m[36m(pid=97185)[0m 
[2m[36m(pid=97185)[0m 
[2m[36m(pid=97185)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=97192)[0m 
[2m[36m(pid=97192)[0m 
[2m[36m(pid=97192)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=97196)[0m 
[2m[36m(pid=97196)[0m 
[2m[36m(pid=97196)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=97214)[0m 
[2m[36m(pid=97214)[0m 
[2m[36m(pid=97214)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=97280)[0m 
[2m[36m(pid=97280)[0m 
[2m[36m(pid=97280)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=97206)[0m 
[2m[36m(pid=97206)[0m 
[2m[36m(pid=97206)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=97209)[0m 
[2m[36m(pid=97209)[0m 
[2m[36m(pid=97209)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 0

[0/5000] max_ret:[51.14] max_ret_delta:[101.33] sigma_R:[36.96] 
[Eval. start] step:[1/5000][0.0%] #step:[1.7e+05] time:[day:[01] 00:10:58] ram:[28.4%].
 [0/100] [zombie] ep_ret:[-2.2325] ep_len:[1308]. blue health:[1.00] red health:[1.00]
 [1/100] [rosie] ep_ret:[-4.6957] ep_len:[3001]. blue health:[1.00] red health:[1.00]
 [2/100] [bud] ep_ret:[-6.9573] ep_len:[2890]. blue health:[0.00] red health:[1.00]
 [3/100] [bud_fsm] ep_ret:[-4.8342] ep_len:[1445]. blue health:[0.00] red health:[1.00]
 [4/100] [es_trial2] ep_ret:[-5.1787] ep_len:[2071]. blue health:[0.00] red health:[1.00]
 [5/100] [es_trial3_scrimmage4] ep_ret:[-103.8402] ep_len:[1660]. blue health:[0.00] red health:[1.00]
 [6/100] [expert_system] ep_ret:[-101.8373] ep_len:[765]. blue health:[0.00] red health:[1.00]
 [7/100] [zombie] ep_ret:[-1.5152] ep_len:[959]. blue health:[1.00] red health:[1.00]
 [8/100] [rosie] ep_ret:[-4.8875] ep_len:[3001]. blue health:[1.00] red health:[1.00]
 [9/100] [bud] ep_ret:[-6.0910] ep_len:[20

[1/5000] max_ret:[-13.16] max_ret_delta:[37.28] sigma_R:[2.97] 
[2/5000] max_ret:[-0.34] max_ret_delta:[49.80] sigma_R:[7.78] 
