# PPO with ADT Continuous Env

In [1]:
import datetime,gym,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from util import open_txt,write_txt
from ppo import PPOBuffer,create_ppo_model,create_ppo_graph,update_ppo,\
    save_ppo_model,restore_ppo_model
from episci.environment_wrappers.tactical_action_adt_env_continuous \
    import CustomADTEnvContinuous
from episci.agents.utils.constants import Agents,RewardType,StateInfo
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.15.0].


### Hyperparameters

In [2]:
# Worker
exp_name = 'ppo_adt_cont'
n_cpu = 21
n_workers = 20

# Environment
action_length = 5 # 50/5 = 10HZ
red_list_train = [
    Agents.ZOMBIE,
    Agents.SPOT_RANDOM,
    Agents.EXPERT_SYSTEM
]
# Red agent distribution for evaluation
red_list_eval = [
    Agents.ZOMBIE,
    Agents.ROSIE, 
    Agents.BUD, 
    Agents.BUD_FSM, 
    Agents.EXPERT_SYSTEM
]*n_workers
red_list_eval = red_list_eval[:n_workers]
num_eval = len(red_list_eval) # evaluation

# Steps
total_steps,evaluate_every,print_every = 5000,5,1
ep_len_rollout = 3000 # 15,000/5
buffer_size = int(3000*len(red_list_train))
batch_size = int(2**15) 

# Network configuration
hdims = [64,32,16]
clip_ratio = 0.2
pi_lr = 1e-5 # 3e-4
vf_lr = 1e-4 # 1e-3
epsilon = 1e-5 # 1e-2
# Buffer
gamma = 0.99 # 0.99
lam = 0.95 # 0.95
# Update
train_pi_iters = 1000
train_v_iters = 1000
target_kl = 0.005 # 0.01

### Environment

In [3]:
def get_env(red_distribution=None):
    from episci.environment_wrappers.tactical_action_adt_env_continuous \
        import CustomADTEnvContinuous
    from episci.agents.utils.constants import Agents, RewardType
    env_config = {
        "red_distribution": red_distribution,
        "reward_type": RewardType.SHAPED
    }
    return CustomADTEnvContinuous(env_config,action_length=action_length)

### Logger

In [4]:
txt_path = '../report/log/%s/log_%s.txt'%(
    exp_name,
    datetime.datetime.now().strftime("%b-%d-%Y-%H:%M:%S"))
f = open_txt(txt_path)
print ("[%s] created."%(txt_path))
time.sleep(1)

[../report/log/ppo_adt_cont/log_Aug-19-2020-08:15:07.txt] created.


### Rollout Workers

In [5]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)
    """
    def __init__(self,seed=1):
        self.seed = seed
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) # gym logger 
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        _ = self.env.reset(red=Agents.SPOT_RANDOM)
        
        # Initialize PPO
        self.model,self.sess = create_ppo_model(env=self.env,hdims=hdims,output_actv=tf.nn.tanh)
        self.graph = create_ppo_graph(self.model,
                                      clip_ratio=clip_ratio,pi_lr=pi_lr,vf_lr=vf_lr,epsilon=epsilon)
        # Initialize model 
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        self.sess.run(tf.global_variables_initializer())
        
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
        
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
    
    def get_weights(self):
        """
        Get weights
        """
        weight_vals = self.sess.run(self.model['pi_vars']+self.model['v_vars'])
        return weight_vals
    
    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage
        """
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['pi_vars']+self.model['v_vars']):
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
        for w_idx,weight_tf_var in enumerate(self.model['pi_vars']+self.model['v_vars']):
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:weight_vals[w_idx]})    
    
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY
    """
    def __init__(self,worker_id=0,ep_len_rollout=1000,buffer_size=1000):
        # Parse
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        self.buffer_size = buffer_size
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) # gym logger 
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        _ = self.env.reset(red=Agents.SPOT_RANDOM)
        
        # Replay buffers to pass
        self.o_buffer = np.zeros((self.buffer_size,self.odim))
        self.a_buffer = np.zeros((self.buffer_size,self.adim))
        self.r_buffer = np.zeros((self.buffer_size))
        self.v_t_buffer = np.zeros((self.buffer_size))
        self.logp_t_buffer = np.zeros((self.buffer_size))
        # Create PPO model
        self.model,self.sess = create_ppo_model(env=self.env,hdims=hdims,output_actv=tf.nn.tanh)
        # Initialize model 
        self.sess.run(tf.global_variables_initializer())
        # Buffer
        self.buf = PPOBuffer(odim=self.odim,adim=self.adim,
                             size=buffer_size,gamma=gamma,lam=lam)
        print ("Ray Worker [%d] Ready."%(self.worker_id))
        
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
        
        # Flag to initialize rollout
        self.FIRST_ROLLOUT_FLAG = True
        
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
    
    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage
        """
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['pi_vars']+self.model['v_vars']):
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
        for w_idx,weight_tf_var in enumerate(self.model['pi_vars']+self.model['v_vars']):
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:weight_vals[w_idx]})    
        
    def rollout(self,red_list=[Agents.SPOT_RANDOM,Agents.EXPERT_SYSTEM]):
        """
        Rollout
        """
        if self.FIRST_ROLLOUT_FLAG:
            self.FIRST_ROLLOUT_FLAG = False
            self.o = self.env.reset(red=Agents.SPOT_RANDOM) # reset environment
        # Loop
        r_sum,cnt = 0,0
        for r_idx,red in enumerate(red_list): # for each red policy
            self.o = self.env.reset(red=red) # reset environment
            for t in range(self.ep_len_rollout):
                a,v_t,logp_t = self.sess.run(
                    self.model['get_action_ops'],
                    feed_dict={self.model['o_ph']:self.o.reshape(1,-1)})
                o2, r, d, _ = self.env.step(a[0])
                r_sum += r
                cnt += 1
                # save and log
                self.buf.store(self.o,a,r,v_t,logp_t)
                # Update obs (critical!)
                self.o = o2
                if d:
                    self.buf.finish_path(last_val=0.0)
                    self.o = self.env.reset(red=Agents.SPOT_RANDOM) # reset when done 
            last_val = self.sess.run(self.model['v'],
                                     feed_dict={self.model['o_ph']:self.o.reshape(1,-1)})
            self.buf.finish_path(last_val) # finish path buffer
        r_avg = r_sum / cnt
        return self.buf.get(),r_avg # obs_buf, act_buf, adv_buf, ret_buf, logp_buf
    
    def evaluate(self,red=None):
        """
        Evaluate
        """
        o,d,ep_ret,ep_len = self.env.reset(red=red),False,0,0
        while not(d or (ep_len == self.ep_len_rollout)):
            a = self.get_action(o,deterministic=True)
            o,r,d,_ = self.env.step(a)
            ep_ret += r # compute return 
            ep_len += 1
        blue_health,red_health = self.env.blue_health,self.env.red_health
        eval_res = [ep_ret,ep_len,blue_health,red_health] # evaluation result 
        return eval_res

### Initialize Workers

In [6]:
ray.init(num_cpus=n_cpu)
tf.reset_default_graph()
R = RolloutWorkerClass(seed=0)
workers = [RayRolloutWorkerClass.remote(
    worker_id=i,ep_len_rollout=ep_len_rollout,buffer_size=buffer_size) 
           for i in range(n_workers)]
print ("RAY initialized with [%d] cpus and [%d] workers."%
       (n_cpu,n_workers))

2020-08-19 08:15:08,230	INFO resource_spec.py:212 -- Starting Ray with 134.23 GiB memory available for workers and up to 61.53 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-08-19 08:15:08,732	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


RAY initialized with [21] cpus and [20] workers.


### Loop

In [None]:
start_time = time.time()
n_env_step = 0 # number of environment steps
ep_ret_avg = 0
for t in range(int(total_steps)):
    esec = time.time()-start_time
    
    # 1. Synchronize worker weights
    weights = R.get_weights()
    set_weights_list = [worker.set_weights.remote(weights) for worker in workers] 
    
    # 2. Make rollout and accumulate to Buffers
    t_start = time.time()
    ops = [worker.rollout.remote(
        red_list=red_list_train # <= with the list of pre-defined red agent policies
    ) for worker in workers]
    rollout_vals = ray.get(ops)
    sec_rollout = time.time() - t_start
    
    # Get stats before update
    t_start = time.time() # tic

    # 3. Update the PPO model 
    r_sum = 0
    for r_idx,rval in enumerate(rollout_vals): # concat all buffers from workers
        obs_buf,act_buf,adv_buf,ret_buf,logp_buf,r_rollout_avg = \
            rval[0][0],rval[0][1],rval[0][2],rval[0][3],rval[0][4],rval[1]
        if r_idx == 0:
            obs_bufs,act_bufs,adv_bufs,ret_bufs,logp_bufs = \
                obs_buf,act_buf,adv_buf,ret_buf,logp_buf
        else:
            obs_bufs = np.concatenate((obs_bufs,obs_buf),axis=0)
            act_bufs = np.concatenate((act_bufs,act_buf),axis=0)
            adv_bufs = np.concatenate((adv_bufs,adv_buf),axis=0)
            ret_bufs = np.concatenate((ret_bufs,ret_buf),axis=0)
            logp_bufs = np.concatenate((logp_bufs,logp_buf),axis=0)
        r_sum += r_rollout_avg
    r_avg = r_sum / len(rollout_vals)
    n_val_total = obs_bufs.shape[0] # total buffer size 
    for pi_iter in range(train_pi_iters): # update actor
        rand_idx = np.random.permutation(n_val_total)[:batch_size]
        buf_batches = [obs_bufs[rand_idx],act_bufs[rand_idx],adv_bufs[rand_idx],
                       ret_bufs[rand_idx],logp_bufs[rand_idx]]
        feeds = {k:v for k,v in zip(R.model['all_phs'],buf_batches)}
        _,kl,pi_loss,ent = R.sess.run([R.graph['train_pi'],R.graph['approx_kl'],
                               R.graph['pi_loss'],R.graph['approx_ent']],
                           feed_dict=feeds)        
        if kl > 1.5 * target_kl:
            # print ("  pi_iter:[%d] kl(%.3f) is higher than 1.5x(%.3f)"%(pi_iter,kl,target_kl))
            break
    for _ in range(train_v_iters): # update critic
        rand_idx = np.random.permutation(n_val_total)[:batch_size]
        buf_batches = [obs_bufs[rand_idx],act_bufs[rand_idx],adv_bufs[rand_idx],
                       ret_bufs[rand_idx],logp_bufs[rand_idx]]
        feeds = {k:v for k,v in zip(R.model['all_phs'],buf_batches)}
        R.sess.run(R.graph['train_v'],feed_dict=feeds)
    sec_update = time.time() - t_start # toc
    
    # 4. Synchronize worker weights (after update)
    weights = R.get_weights()
    set_weights_list = [worker.set_weights.remote(weights) for worker in workers] 
    
    # Print
    if (t == 0) or (((t+1)%print_every) == 0): 
        print ("[%d/%d] rollout:[%.1f]s pi_iter:[%d/%d] update:[%.1f]s kl:[%.4f] target_kl:[%.4f]."%
               (t+1,total_steps,sec_rollout,pi_iter,train_pi_iters,sec_update,kl,target_kl))
        print ("   pi_loss:[%.4f], entropy:[%.4f], r_avg[%.4f]"%
               (pi_loss,ent,r_avg))
        write_txt(f,"%.2f, r_train:%.4f, ret_eval:%.4f"%(time.time()-start_time, r_avg, ep_ret_avg),
                  ADD_NEWLINE=True,DO_PRINT=False)
        
    # 5. Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0): 
        ram_percent = psutil.virtual_memory().percent # memory usage
        print ("[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
               (t+1,total_steps,t/total_steps*100,
                n_env_step,
                time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                ram_percent)
              )
        ops = []
        for i_idx in range(num_eval):
            worker,red = workers[i_idx],red_list_eval[i_idx]
            ops.append(worker.evaluate.remote(red=red))
        eval_vals = ray.get(ops)
        ep_ret_sum = 0
        for i_idx in range(num_eval):
            red,eval_val = red_list_eval[i_idx],eval_vals[i_idx]
            ep_ret,ep_len,blue_health,red_health = eval_val[0],eval_val[1],eval_val[2],eval_val[3]
            ep_ret_sum += ep_ret
            print (" [%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
                %(i_idx,len(eval_vals),red,ep_ret,ep_len,blue_health,red_health))
        ep_ret_avg = ep_ret_sum / num_eval
        print ("[Eval. done] time:[%s] ep_ret_avg:[%.3f]."%
               (time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),ep_ret_avg))
        # Save current PPO model
        npz_path = '../report/net/%s/model_%d.npz'%(exp_name,t+1)
        save_ppo_model(npz_path,R,VERBOSE=False)
        

print ("Done.")

[2m[36m(pid=81186)[0m 
[2m[36m(pid=81186)[0m 
[2m[36m(pid=81186)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=81193)[0m 
[2m[36m(pid=81193)[0m 
[2m[36m(pid=81193)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=81180)[0m 
[2m[36m(pid=81180)[0m 
[2m[36m(pid=81180)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=81178)[0m 
[2m[36m(pid=81178)[0m 
[2m[36m(pid=81178)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=81183)[0m 
[2m[36m(pid=81183)[0m 
[2m[36m(pid=81183)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=81182)[0m 
[2m[36m(pid=81182)[0m 
[2m[36m(pid=81182)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=81198)[0m 
[2m[36m(pid=81198)[0m 
[2m[36m(pid=81198)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 0

[6/5000] rollout:[436.7]s pi_iter:[999/1000] update:[69.1]s kl:[0.0042] target_kl:[0.0050].
   pi_loss:[0.0019], entropy:[3.6359], r_avg[-0.0102]
[7/5000] rollout:[413.6]s pi_iter:[999/1000] update:[65.4]s kl:[0.0031] target_kl:[0.0050].
   pi_loss:[-0.0004], entropy:[3.6338], r_avg[-0.0109]
[8/5000] rollout:[407.2]s pi_iter:[999/1000] update:[57.5]s kl:[0.0031] target_kl:[0.0050].
   pi_loss:[-0.0027], entropy:[3.6134], r_avg[-0.0092]
[9/5000] rollout:[444.8]s pi_iter:[999/1000] update:[56.4]s kl:[0.0029] target_kl:[0.0050].
   pi_loss:[-0.0100], entropy:[3.6010], r_avg[-0.0113]
[10/5000] rollout:[422.9]s pi_iter:[999/1000] update:[51.6]s kl:[0.0039] target_kl:[0.0050].
   pi_loss:[-0.0018], entropy:[3.6074], r_avg[-0.0098]
[Eval. start] step:[10/5000][0.2%] #step:[0.0e+00] time:[day:[01] 01:28:17] ram:[19.8%].
 [0/20] [zombie] ep_ret:[0.0213] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[94.9187] ep_len:[3000]. blue health:[1.00] red health:[0.06]
 [2/20

[22/5000] rollout:[435.4]s pi_iter:[999/1000] update:[53.2]s kl:[0.0032] target_kl:[0.0050].
   pi_loss:[0.0032], entropy:[3.5130], r_avg[-0.0076]
[23/5000] rollout:[437.7]s pi_iter:[999/1000] update:[58.5]s kl:[0.0034] target_kl:[0.0050].
   pi_loss:[-0.0003], entropy:[3.5002], r_avg[-0.0066]
[24/5000] rollout:[440.1]s pi_iter:[999/1000] update:[59.6]s kl:[0.0033] target_kl:[0.0050].
   pi_loss:[-0.0045], entropy:[3.4755], r_avg[-0.0088]
[25/5000] rollout:[446.4]s pi_iter:[999/1000] update:[53.6]s kl:[0.0033] target_kl:[0.0050].
   pi_loss:[-0.0032], entropy:[3.4742], r_avg[-0.0085]
[Eval. start] step:[25/5000][0.5%] #step:[0.0e+00] time:[day:[01] 03:38:55] ram:[20.9%].
 [0/20] [zombie] ep_ret:[-0.5462] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-0.3308] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/20] [bud] ep_ret:[100.1368] ep_len:[738]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-1.6515] ep_len:[2655]. blue health:[0.0

[38/5000] rollout:[433.7]s pi_iter:[999/1000] update:[59.7]s kl:[0.0031] target_kl:[0.0050].
   pi_loss:[-0.0013], entropy:[3.3587], r_avg[-0.0056]
[39/5000] rollout:[427.9]s pi_iter:[999/1000] update:[54.7]s kl:[0.0033] target_kl:[0.0050].
   pi_loss:[-0.0022], entropy:[3.3400], r_avg[-0.0090]
[40/5000] rollout:[429.2]s pi_iter:[999/1000] update:[53.7]s kl:[0.0032] target_kl:[0.0050].
   pi_loss:[-0.0054], entropy:[3.3348], r_avg[-0.0073]
[Eval. start] step:[40/5000][0.8%] #step:[0.0e+00] time:[day:[01] 05:51:18] ram:[21.2%].
 [0/20] [zombie] ep_ret:[-0.1829] ep_len:[1033]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-2.8398] ep_len:[794]. blue health:[0.00] red health:[1.00]
 [2/20] [bud] ep_ret:[99.7946] ep_len:[787]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-2.6342] ep_len:[1264]. blue health:[0.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-11.8948] ep_len:[1017]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[-0.5096] 

[55/5000] rollout:[412.4]s pi_iter:[999/1000] update:[57.5]s kl:[0.0035] target_kl:[0.0050].
   pi_loss:[-0.0042], entropy:[3.1620], r_avg[-0.0034]
[Eval. start] step:[55/5000][1.1%] #step:[0.0e+00] time:[day:[01] 07:56:22] ram:[21.2%].
 [0/20] [zombie] ep_ret:[-3.4435] ep_len:[1416]. blue health:[0.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-1.6984] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/20] [bud] ep_ret:[99.9058] ep_len:[537]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-2.6792] ep_len:[1286]. blue health:[0.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-101.1396] ep_len:[1109]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[-0.3681] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [6/20] [rosie] ep_ret:[-1.5730] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [7/20] [bud] ep_ret:[99.5937] ep_len:[819]. blue health:[1.00] red health:[0.00]
 [8/20] [bud_fsm] ep_ret:[-2.9539] ep_len:[1133]. blue health:[0.00] red he

 [0/20] [zombie] ep_ret:[-1.0118] ep_len:[2145]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-2.1002] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/20] [bud] ep_ret:[98.9298] ep_len:[1197]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-3.2815] ep_len:[1083]. blue health:[0.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-3.2258] ep_len:[759]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[99.3605] ep_len:[1821]. blue health:[1.00] red health:[0.00]
 [6/20] [rosie] ep_ret:[-3.2002] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [7/20] [bud] ep_ret:[99.3846] ep_len:[1270]. blue health:[1.00] red health:[0.00]
 [8/20] [bud_fsm] ep_ret:[-3.2108] ep_len:[1386]. blue health:[0.00] red health:[1.00]
 [9/20] [expert_system] ep_ret:[-3.3005] ep_len:[803]. blue health:[0.00] red health:[1.00]
 [10/20] [zombie] ep_ret:[-1.2139] ep_len:[1560]. blue health:[1.00] red health:[1.00]
 [11/20] [rosie] ep_ret:[-3.3945] ep_len:[3000]

[86/5000] rollout:[439.8]s pi_iter:[999/1000] update:[57.5]s kl:[0.0040] target_kl:[0.0050].
   pi_loss:[0.0046], entropy:[2.7788], r_avg[-0.0046]
[87/5000] rollout:[417.6]s pi_iter:[999/1000] update:[62.3]s kl:[0.0036] target_kl:[0.0050].
   pi_loss:[-0.0010], entropy:[2.7666], r_avg[-0.0028]
[88/5000] rollout:[453.9]s pi_iter:[999/1000] update:[56.3]s kl:[0.0048] target_kl:[0.0050].
   pi_loss:[-0.0071], entropy:[2.7654], r_avg[0.0064]
[89/5000] rollout:[425.5]s pi_iter:[999/1000] update:[58.5]s kl:[0.0044] target_kl:[0.0050].
   pi_loss:[0.0040], entropy:[2.7306], r_avg[0.0026]
[90/5000] rollout:[447.3]s pi_iter:[999/1000] update:[53.0]s kl:[0.0042] target_kl:[0.0050].
   pi_loss:[0.0032], entropy:[2.7247], r_avg[0.0006]
[Eval. start] step:[90/5000][1.8%] #step:[0.0e+00] time:[day:[01] 12:58:11] ram:[21.2%].
 [0/20] [zombie] ep_ret:[-40.9827] ep_len:[3000]. blue health:[0.60] red health:[1.00]
 [1/20] [rosie] ep_ret:[-1.9397] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/2

[103/5000] rollout:[453.6]s pi_iter:[999/1000] update:[59.2]s kl:[0.0048] target_kl:[0.0050].
   pi_loss:[-0.0043], entropy:[2.5968], r_avg[-0.0008]
[104/5000] rollout:[427.0]s pi_iter:[999/1000] update:[56.2]s kl:[0.0039] target_kl:[0.0050].
   pi_loss:[-0.0032], entropy:[2.5809], r_avg[0.0010]
[105/5000] rollout:[432.6]s pi_iter:[999/1000] update:[55.4]s kl:[0.0036] target_kl:[0.0050].
   pi_loss:[0.0025], entropy:[2.5713], r_avg[0.0031]
[Eval. start] step:[105/5000][2.1%] #step:[0.0e+00] time:[day:[01] 15:09:35] ram:[21.3%].
 [0/20] [zombie] ep_ret:[-0.7754] ep_len:[1638]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-3.0268] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/20] [bud] ep_ret:[99.3926] ep_len:[790]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-1.1530] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-3.2427] ep_len:[534]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[-2.7497] 

[120/5000] rollout:[455.0]s pi_iter:[999/1000] update:[50.6]s kl:[0.0046] target_kl:[0.0050].
   pi_loss:[-0.0066], entropy:[2.4258], r_avg[-0.0003]
[Eval. start] step:[120/5000][2.4%] #step:[0.0e+00] time:[day:[01] 17:20:27] ram:[21.4%].
 [0/20] [zombie] ep_ret:[-0.3885] ep_len:[1314]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-3.3034] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/20] [bud] ep_ret:[99.5023] ep_len:[647]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-2.6242] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-20.0086] ep_len:[1034]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[-1.9408] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [6/20] [rosie] ep_ret:[-2.0771] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [7/20] [bud] ep_ret:[99.1865] ep_len:[855]. blue health:[1.00] red health:[0.00]
 [8/20] [bud_fsm] ep_ret:[-3.3343] ep_len:[1369]. blue health:[0.00] red h

 [0/20] [zombie] ep_ret:[-1.5578] ep_len:[2600]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-4.1303] ep_len:[2074]. blue health:[0.00] red health:[1.00]
 [2/20] [bud] ep_ret:[99.1318] ep_len:[869]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-3.5631] ep_len:[1313]. blue health:[0.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-3.3193] ep_len:[698]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[-0.7636] ep_len:[1366]. blue health:[1.00] red health:[1.00]
 [6/20] [rosie] ep_ret:[-4.5239] ep_len:[2149]. blue health:[0.00] red health:[1.00]
 [7/20] [bud] ep_ret:[99.2642] ep_len:[1052]. blue health:[1.00] red health:[0.00]
 [8/20] [bud_fsm] ep_ret:[-2.6429] ep_len:[565]. blue health:[0.00] red health:[1.00]
 [9/20] [expert_system] ep_ret:[-3.1953] ep_len:[833]. blue health:[0.00] red health:[1.00]
 [10/20] [zombie] ep_ret:[-2.8703] ep_len:[490]. blue health:[0.00] red health:[1.00]
 [11/20] [rosie] ep_ret:[-5.4211] ep_len:[2750]. b

[151/5000] rollout:[475.3]s pi_iter:[999/1000] update:[57.7]s kl:[0.0028] target_kl:[0.0050].
   pi_loss:[0.0014], entropy:[2.0014], r_avg[-0.0007]
[152/5000] rollout:[453.3]s pi_iter:[999/1000] update:[54.3]s kl:[0.0036] target_kl:[0.0050].
   pi_loss:[-0.0078], entropy:[2.0014], r_avg[0.0011]
[153/5000] rollout:[463.4]s pi_iter:[999/1000] update:[55.2]s kl:[0.0029] target_kl:[0.0050].
   pi_loss:[-0.0088], entropy:[1.9780], r_avg[0.0026]
[154/5000] rollout:[467.5]s pi_iter:[999/1000] update:[57.8]s kl:[0.0033] target_kl:[0.0050].
   pi_loss:[-0.0050], entropy:[1.9779], r_avg[0.0007]
[155/5000] rollout:[464.9]s pi_iter:[999/1000] update:[53.9]s kl:[0.0049] target_kl:[0.0050].
   pi_loss:[0.0012], entropy:[1.9767], r_avg[0.0020]
[Eval. start] step:[155/5000][3.1%] #step:[0.0e+00] time:[day:[01] 22:39:11] ram:[21.6%].
 [0/20] [zombie] ep_ret:[-3.2812] ep_len:[1715]. blue health:[0.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-2.9901] ep_len:[3000]. blue health:[1.00] red health:[1.00]


[168/5000] rollout:[436.2]s pi_iter:[999/1000] update:[54.8]s kl:[0.0036] target_kl:[0.0050].
   pi_loss:[-0.0084], entropy:[1.8569], r_avg[-0.0017]
[169/5000] rollout:[459.4]s pi_iter:[999/1000] update:[63.3]s kl:[0.0040] target_kl:[0.0050].
   pi_loss:[-0.0052], entropy:[1.8357], r_avg[0.0020]
[170/5000] rollout:[470.7]s pi_iter:[999/1000] update:[52.7]s kl:[0.0042] target_kl:[0.0050].
   pi_loss:[-0.0034], entropy:[1.8350], r_avg[0.0028]
[Eval. start] step:[170/5000][3.4%] #step:[0.0e+00] time:[day:[02] 00:56:30] ram:[21.7%].
 [0/20] [zombie] ep_ret:[-0.9655] ep_len:[1412]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[-3.8382] ep_len:[3000]. blue health:[1.00] red health:[1.00]
 [2/20] [bud] ep_ret:[99.4368] ep_len:[706]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[-4.0224] ep_len:[1297]. blue health:[0.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-3.0385] ep_len:[558]. blue health:[0.00] red health:[1.00]
 [5/20] [zombie] ep_ret:[-1.4460]