# SAC with ADT Continuous Env

In [1]:
import datetime,gym,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from util import open_txt,write_txt
from sac import ReplayBuffer,create_sac_model,create_sac_graph,\
    save_sac_model_and_buffers,restore_sac_model_and_buffers
from episci.environment_wrappers.tactical_action_adt_env_continuous \
    import CustomADTEnvContinuous
from episci.agents.utils.constants import Agents,RewardType,StateInfo
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.15.0].


### Hyperparameters

In [2]:
# Worker
exp_name = 'sac_adt_cont_04'
n_cpu = 21
n_workers = 20

In [3]:
# Environment
action_length = 10 # 50/? HZ
# Red agent distribution for training
red_list_train = [
    Agents.ZOMBIE,
    Agents.SPOT_RANDOM,
    Agents.EXPERT_SYSTEM
]
# Red agent distribution for evaluation
red_list_eval = [
    Agents.ZOMBIE,
    Agents.ROSIE, 
    Agents.BUD, 
    Agents.BUD_FSM, 
    Agents.EXPERT_SYSTEM
]*n_workers
red_list_eval = red_list_eval[:n_workers]
num_eval = len(red_list_eval) # evaluation

# Steps
total_steps,evaluate_every,print_every = 5000,5,1
ep_len_rollout = 15000 // action_length 
buffer_size = int(3000*len(red_list_train)) 

# Network configuration
hdims = [32,32]
actv = tf.nn.relu
batch_size,update_count = int(2**15),500 # batchsize / number of updates
lr = 1e-6 # 1e-5
epsilon = 1e-1 # 1e-5
# SAC
gamma = 0.999 #
alpha_q,alpha_pi = 0.01,0.01 # 0.5,0.5
polyak = 0.995 
# Buffer
buffer_sz_long,buffer_sz_short = 1e6,1e5 # 1e5,1e5
# Temperature
temp_min,temp_max = 1.0,1.0 # 0.0,0.5
# Reward Offset
r_offset = 0.0

### Environment

In [4]:
def get_env(red_distribution=None):
    from episci.environment_wrappers.tactical_action_adt_env_continuous \
        import CustomADTEnvContinuous
    from episci.agents.utils.constants import Agents, RewardType
    env_config = {
        "red_distribution": red_distribution,
        "reward_type": RewardType.SHAPED
    }
    return CustomADTEnvContinuous(env_config,action_length=action_length)

### Logger

In [5]:
txt_path = '../report/log/%s/log_%s.txt'%(
    exp_name,
    datetime.datetime.now().strftime("%b-%d-%Y-%H:%M:%S"))
f = open_txt(txt_path)
print ("[%s] created."%(txt_path))
time.sleep(1)

[../report/log/sac_adt_cont_04/log_Aug-19-2020-08:44:55.txt] created.


### Rollout Workers

In [6]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)
    """
    def __init__(self,hdims=[256,256],actv=tf.nn.relu,
                 lr=1e-3,gamma=0.99,alpha_q=0.1,alpha_pi=0.1,polyak=0.995,epsilon=1e-2,
                 seed=1):
        self.seed = seed
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) 
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        _ = self.env.reset(red=Agents.SPOT_RANDOM)
        
        # Create SAC model and computational graph 
        self.model,self.sess = create_sac_model(
            odim=self.odim,adim=self.adim,hdims=hdims,actv=actv)
        self.step_ops,self.target_init = \
            create_sac_graph(self.model,lr=lr,gamma=gamma,alpha_q=alpha_q,alpha_pi=alpha_pi,
                             polyak=polyak,epsilon=epsilon)
        
        # Initialize model 
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
        
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
    
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
    
    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage
        """
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['main_vars']): #+self.model['target_vars']:
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
        for w_idx,weight_tf_var in enumerate(self.model['main_vars']): #+self.model['target_vars']:
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:weight_vals[w_idx]})

    def get_weights(self):
        """
        Get weights
        """
        weight_vals = self.sess.run(self.model['main_vars']) #+self.model['target_vars']
        return weight_vals
    
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY
    """
    def __init__(self,worker_id=0,hdims=[256,256],actv=tf.nn.relu,
                 ep_len_rollout=1000,buffer_size=1000):
        # Parse
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        self.buffer_size = buffer_size
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) 
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        _ = self.env.reset(red=Agents.SPOT_RANDOM)
        
        # Replay buffers to pass
        self.o_buffer = np.zeros((self.buffer_size,self.odim))
        self.a_buffer = np.zeros((self.buffer_size,self.adim))
        self.r_buffer = np.zeros((self.buffer_size))
        self.o2_buffer = np.zeros((self.buffer_size,self.odim))
        self.d_buffer = np.zeros((self.buffer_size))
        
        # Create SAC model
        self.model,self.sess = create_sac_model(
            odim=self.odim,adim=self.adim,hdims=hdims,actv=actv)
        self.sess.run(tf.global_variables_initializer())
        print ("Ray Worker [%d] Ready."%(self.worker_id))
        
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
        
        # Flag to initialize rollout
        self.FIRST_ROLLOUT_FLAG = True
        
    def get_action(self,o,deterministic=False,temperature=1.0):
        """
        Get action (if temperature is 0, it becomes deterministic)
        """
        a_mu = self.sess.run(self.model['mu'],
                             feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
        a_pi = self.sess.run(self.model['pi'],
                             feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
        if deterministic:
            a = a_mu
        else:
            a = temperature*a_pi + (1-temperature)*a_mu
        return a
    
    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage
        """
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['main_vars']): #+self.model['target_vars']:
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
        for w_idx,weight_tf_var in enumerate(self.model['main_vars']): #+self.model['target_vars']:
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:weight_vals[w_idx]})
            
    def rollout(self,temperature=1.0,
                red_list=[Agents.SPOT_RANDOM,Agents.EXPERT_SYSTEM]):
        """
        Rollout
        """
        if self.FIRST_ROLLOUT_FLAG:
            self.FIRST_ROLLOUT_FLAG = False
            self.o = self.env.reset(red=Agents.SPOT_RANDOM) # reset environment
        # Loop
        r_sum,cnt = 0,0
        for r_idx,red in enumerate(red_list): # for each red policy
            self.o = self.env.reset(red=red) # reset environment
            for t in range(self.ep_len_rollout):
                self.a = self.get_action(self.o,deterministic=False,temperature=temperature)
                self.o2,self.r,self.d,_ = self.env.step(self.a)
                r_sum += (self.r+r_offset)
                # Append
                self.o_buffer[cnt,:] = self.o
                self.a_buffer[cnt,:] = self.a
                self.r_buffer[cnt] = self.r
                self.o2_buffer[cnt,:] = self.o2
                self.d_buffer[cnt] = self.d
                cnt += 1
                # Save next state 
                self.o = self.o2
                if self.d: 
                    # self.o = self.env.reset(red=Agents.SPOT_RANDOM) # reset when done 
                    break
        o_buffer = self.o_buffer[:cnt,:]
        a_buffer = self.a_buffer[:cnt,:]
        r_buffer = self.r_buffer[:cnt]
        o2_buffer = self.o2_buffer[:cnt,:]
        d_buffer = self.d_buffer[:cnt]
        r_avg = r_sum / cnt
        return o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer,r_avg
    
    def evaluate(self,red=None):
        """
        Evaluate
        """
        o,d,ep_ret,ep_len = self.env.reset(red=red),False,0,0
        while not(d or (ep_len == self.ep_len_rollout)):
            a = self.get_action(o,deterministic=True)
            o,r,d,_ = self.env.step(a)
            ep_ret += r # compute return 
            ep_len += 1
        blue_health,red_health = self.env.blue_health,self.env.red_health
        eval_res = [ep_ret,ep_len,blue_health,red_health] # evaluation result 
        return eval_res
    

### Initialize Workers

In [7]:
ray.init(num_cpus=n_cpu)
tf.reset_default_graph()
R = RolloutWorkerClass(hdims=hdims,actv=actv,
                       lr=lr,gamma=gamma,alpha_q=alpha_q,alpha_pi=alpha_pi,
                       polyak=polyak,epsilon=epsilon,
                       seed=0)
workers = [RayRolloutWorkerClass.remote(
    worker_id=i,hdims=hdims,actv=actv,
    ep_len_rollout=ep_len_rollout,buffer_size=buffer_size) 
           for i in range(n_workers)]
print ("RAY initialized with [%d] cpus and [%d] workers."%
       (n_cpu,n_workers))

2020-08-19 08:44:56,893	INFO resource_spec.py:212 -- Starting Ray with 148.44 GiB memory available for workers and up to 67.62 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-08-19 08:44:57,558	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8269[39m[22m


RAY initialized with [21] cpus and [20] workers.


### Replay Buffers

In [8]:
replay_buffer_long = ReplayBuffer(odim=R.odim,adim=R.adim,size=int(buffer_sz_long))
replay_buffer_short = ReplayBuffer(odim=R.odim,adim=R.adim,size=int(buffer_sz_short))

### Loop

In [9]:
npz_path = ''
if npz_path:
    restore_sac_model_and_buffers(npz_path=npz_path,R=R,
                                  replay_buffer_long=replay_buffer_long,
                                  replay_buffer_short=replay_buffer_short,
                                  VERBOSE=False,IGNORE_BUFFERS=True)

In [None]:
npz_path_list,ep_ret_avg_list = [],[]
start_time = time.time()
n_env_step = 0 # number of environment steps
ep_ret_avg = 0
for t in range(int(total_steps)):
    esec = time.time()-start_time
    
    # 1. Synchronize worker weights
    weights = R.get_weights()
    set_weights_list = [worker.set_weights.remote(weights) for worker in workers] 
    
    # 2. Make rollout and accumulate to Buffers
    t_start = time.time()
    ops = [worker.rollout.remote(
        temperature=temp_min+(temp_max-temp_min)*np.random.rand(),
        red_list=red_list_train # <= with the list of pre-defined red agent policies
    )
           for worker in workers]
    rollout_vals = ray.get(ops)
    r_sum = 0
    for rollout_val in rollout_vals:
        o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer,r_rollout_avg = rollout_val
        r_sum += r_rollout_avg
        for i in range(o_buffer.shape[0]):
            o,a,r,o2,d = o_buffer[i,:],a_buffer[i,:],r_buffer[i],o2_buffer[i,:],d_buffer[i]
            replay_buffer_long.store(o, a, r, o2, d) 
            replay_buffer_short.store(o, a, r, o2, d) 
            n_env_step += 1
    r_avg = r_sum / len(rollout_vals)
    sec_rollout = time.time() - t_start
    
    # 3. Update the SAC model
    t_start = time.time()
    avg_qs = np.zeros(int(update_count))
    for c_idx in range(int(update_count)):
        batch_long = replay_buffer_long.sample_batch(batch_size//2) 
        batch_short = replay_buffer_short.sample_batch(batch_size//2) 
        feed_dict = {R.model['o_ph']: np.concatenate((batch_long['obs1'],batch_short['obs1'])),
                     R.model['o2_ph']: np.concatenate((batch_long['obs2'],batch_short['obs2'])),
                     R.model['a_ph']: np.concatenate((batch_long['acts'],batch_short['acts'])),
                     R.model['r_ph']: np.concatenate((batch_long['rews'],batch_short['rews'])),
                     R.model['d_ph']: np.concatenate((batch_long['done'],batch_short['done']))
                    }
        outs = R.sess.run(R.step_ops, feed_dict) # update 
        q1_vals,q2_vals = outs[3],outs[4]
        avg_q = 0.5*np.mean(q1_vals)+0.5*np.mean(q2_vals)
        avg_qs[c_idx] = avg_q
    sec_update = time.time() - t_start
    
    # 4. Synchronize worker weights (after update)
    weights = R.get_weights()
    set_weights_list = [worker.set_weights.remote(weights) for worker in workers] 
    
    # Print
    if (t == 0) or (((t+1)%print_every) == 0): 
        print ("[%d/%d] n_env_step:[%.1e] rollout:[%.1f]s update:[%.1f]s r_avg:[%.4f] avg_q:[%.3f]."%
               (t+1,total_steps,n_env_step,sec_rollout,sec_update,r_avg,np.mean(avg_qs)))
        write_txt(f,"%.2f, r_train:%.4f, ret_eval:%.4f"%(time.time()-start_time,r_avg,ep_ret_avg),
                  ADD_NEWLINE=True,DO_PRINT=False)
        
    # 5. Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0): 
        ram_percent = psutil.virtual_memory().percent # memory usage
        print ("[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
               (t+1,total_steps,t/total_steps*100,
                n_env_step,
                time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                ram_percent)
              )
        ops = []
        for i_idx in range(num_eval):
            worker,red = workers[i_idx],red_list_eval[i_idx]
            ops.append(worker.evaluate.remote(red=red))
        eval_vals = ray.get(ops)
        ep_ret_sum = 0
        for i_idx in range(num_eval):
            red,eval_val = red_list_eval[i_idx],eval_vals[i_idx]
            ep_ret,ep_len,blue_health,red_health = eval_val[0],eval_val[1],eval_val[2],eval_val[3]
            ep_ret_sum += ep_ret
            print (" [%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
                %(i_idx,len(eval_vals),red,ep_ret,ep_len,blue_health,red_health))
        ep_ret_avg = ep_ret_sum / num_eval
        print ("[Eval. done] time:[%s] ep_ret_avg:[%.3f].\n"%
               (time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),ep_ret_avg))
        # Save current SAC model and replay buffers 
        npz_path = '../report/net/%s/model_and_buffers_%d.npz'%(exp_name,t+1)
        save_sac_model_and_buffers(npz_path,R,replay_buffer_long,replay_buffer_short,
                                   VERBOSE=False,IGNORE_BUFFERS=True)
        
        # If something went bad, restore 
        npz_path_list.append(npz_path)
        ep_ret_avg_list.append(ep_ret_avg)
        """
        ep_ret_avg_array = np.asanyarray(ep_ret_avg_list)
        ep_ret_avg_max = np.max(ep_ret_avg_array)
        if (ep_ret_avg < 0.5*ep_ret_avg_max) and (len(npz_path_list) >= 2) and (ep_ret_avg_max > 0):
            npz_path = npz_path_list[np.argmax(ep_ret_avg_array)] 
            restore_sac_model_and_buffers(npz_path=npz_path,R=R,
                                          replay_buffer_long=replay_buffer_long,
                                          replay_buffer_short=replay_buffer_short,
                                          VERBOSE=False,IGNORE_BUFFERS=True)
            print ("Restoring [%s] as current[%.2f] is way lower than max[%.2f]"%
                   (npz_path,ep_ret_avg,ep_ret_avg_max))
        """
print ("Done.")

[2m[36m(pid=125427)[0m 
[2m[36m(pid=125427)[0m 
[2m[36m(pid=125427)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=125413)[0m 
[2m[36m(pid=125413)[0m 
[2m[36m(pid=125413)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=125416)[0m 
[2m[36m(pid=125416)[0m 
[2m[36m(pid=125416)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=125414)[0m 
[2m[36m(pid=125414)[0m 
[2m[36m(pid=125414)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=125431)[0m 
[2m[36m(pid=125431)[0m 
[2m[36m(pid=125431)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=125418)[0m 
[2m[36m(pid=125418)[0m 
[2m[36m(pid=125418)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jul 11 2020 05:35:14
[2m[36m(pid=125432)[0m 
[2m[36m(pid=125432)[0m 
[2m[36m(pid=125432)[0m      JSBSim Flight Dynamics Model v1.1

[6/5000] n_env_step:[4.1e+05] rollout:[418.6]s update:[26.5]s r_avg:[-0.0217] avg_q:[0.025].
[7/5000] n_env_step:[4.8e+05] rollout:[411.7]s update:[26.4]s r_avg:[-0.0244] avg_q:[0.025].
[8/5000] n_env_step:[5.4e+05] rollout:[342.6]s update:[29.0]s r_avg:[-0.0195] avg_q:[0.025].
[9/5000] n_env_step:[6.0e+05] rollout:[353.5]s update:[26.6]s r_avg:[-0.0287] avg_q:[0.025].
[10/5000] n_env_step:[6.7e+05] rollout:[359.4]s update:[29.5]s r_avg:[-0.0279] avg_q:[0.025].
[Eval. start] step:[10/5000][0.2%] #step:[6.7e+05] time:[day:[01] 01:14:50] ram:[20.1%].
 [0/20] [zombie] ep_ret:[-0.0011] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.5082] ep_len:[1038]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.1483] ep_len:[1038]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.1581] ep_len:[193]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[-75.9070] ep_len:[675]. blue health:[0.00] red health:[0.76]
 [5/20] [

[26/5000] n_env_step:[1.7e+06] rollout:[328.8]s update:[26.8]s r_avg:[-0.0235] avg_q:[0.024].
[27/5000] n_env_step:[1.8e+06] rollout:[360.3]s update:[32.6]s r_avg:[-0.0227] avg_q:[0.024].
[28/5000] n_env_step:[1.9e+06] rollout:[399.4]s update:[27.0]s r_avg:[-0.0237] avg_q:[0.024].
[29/5000] n_env_step:[1.9e+06] rollout:[383.2]s update:[30.8]s r_avg:[-0.0260] avg_q:[0.024].
[30/5000] n_env_step:[2.0e+06] rollout:[403.4]s update:[29.0]s r_avg:[-0.0246] avg_q:[0.024].
[Eval. start] step:[30/5000][0.6%] #step:[2.0e+06] time:[day:[01] 03:43:53] ram:[21.0%].
 [0/20] [zombie] ep_ret:[0.1087] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.4655] ep_len:[1000]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.0061] ep_len:[1425]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.1338] ep_len:[202]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[-100.2603] ep_len:[323]. blue health:[0.00] red health:[1.00]
 [5/2

[46/5000] n_env_step:[3.1e+06] rollout:[372.8]s update:[28.8]s r_avg:[-0.0247] avg_q:[0.023].
[47/5000] n_env_step:[3.1e+06] rollout:[376.4]s update:[28.8]s r_avg:[-0.0229] avg_q:[0.023].
[48/5000] n_env_step:[3.2e+06] rollout:[363.0]s update:[29.6]s r_avg:[-0.0276] avg_q:[0.023].
[49/5000] n_env_step:[3.3e+06] rollout:[400.6]s update:[29.5]s r_avg:[-0.0235] avg_q:[0.023].
[50/5000] n_env_step:[3.3e+06] rollout:[361.0]s update:[30.6]s r_avg:[-0.0261] avg_q:[0.023].
[Eval. start] step:[50/5000][1.0%] #step:[3.3e+06] time:[day:[01] 06:03:12] ram:[21.2%].
 [0/20] [zombie] ep_ret:[0.2765] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.7938] ep_len:[837]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.0809] ep_len:[1309]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[0.3345] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [4/20] [expert_system] ep_ret:[-70.3978] ep_len:[524]. blue health:[0.00] red health:[1.00]
 [5/20] 

[66/5000] n_env_step:[4.4e+06] rollout:[341.8]s update:[28.1]s r_avg:[-0.0262] avg_q:[0.022].
[67/5000] n_env_step:[4.5e+06] rollout:[385.4]s update:[29.3]s r_avg:[-0.0280] avg_q:[0.022].
[68/5000] n_env_step:[4.5e+06] rollout:[365.9]s update:[27.5]s r_avg:[-0.0269] avg_q:[0.022].
[69/5000] n_env_step:[4.6e+06] rollout:[398.4]s update:[27.4]s r_avg:[-0.0252] avg_q:[0.022].
[70/5000] n_env_step:[4.7e+06] rollout:[332.4]s update:[23.6]s r_avg:[-0.0237] avg_q:[0.022].
[Eval. start] step:[70/5000][1.4%] #step:[4.7e+06] time:[day:[01] 08:24:15] ram:[21.3%].
 [0/20] [zombie] ep_ret:[0.0747] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.1752] ep_len:[986]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.1242] ep_len:[988]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.1040] ep_len:[133]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[-100.1683] ep_len:[218]. blue health:[0.00] red health:[1.00]
 [5/20]

[86/5000] n_env_step:[5.7e+06] rollout:[355.1]s update:[26.4]s r_avg:[-0.0255] avg_q:[0.021].
[87/5000] n_env_step:[5.8e+06] rollout:[324.9]s update:[28.5]s r_avg:[-0.0214] avg_q:[0.021].
[88/5000] n_env_step:[5.9e+06] rollout:[320.5]s update:[33.7]s r_avg:[-0.0263] avg_q:[0.021].
[89/5000] n_env_step:[5.9e+06] rollout:[386.5]s update:[34.2]s r_avg:[-0.0253] avg_q:[0.021].
[90/5000] n_env_step:[6.0e+06] rollout:[367.9]s update:[29.4]s r_avg:[-0.0195] avg_q:[0.021].
[Eval. start] step:[90/5000][1.8%] #step:[6.0e+06] time:[day:[01] 10:47:33] ram:[21.0%].
 [0/20] [zombie] ep_ret:[0.2244] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.3498] ep_len:[529]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.4190] ep_len:[855]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.1212] ep_len:[182]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[-91.6948] ep_len:[705]. blue health:[0.00] red health:[0.91]
 [5/20] 

[106/5000] n_env_step:[7.1e+06] rollout:[375.6]s update:[28.7]s r_avg:[-0.0273] avg_q:[0.021].
[107/5000] n_env_step:[7.1e+06] rollout:[343.1]s update:[29.1]s r_avg:[-0.0257] avg_q:[0.021].
[108/5000] n_env_step:[7.2e+06] rollout:[358.6]s update:[31.9]s r_avg:[-0.0252] avg_q:[0.021].
[109/5000] n_env_step:[7.3e+06] rollout:[382.9]s update:[34.1]s r_avg:[-0.0248] avg_q:[0.021].
[110/5000] n_env_step:[7.3e+06] rollout:[425.9]s update:[30.2]s r_avg:[-0.0265] avg_q:[0.021].
[Eval. start] step:[110/5000][2.2%] #step:[7.3e+06] time:[day:[01] 13:13:12] ram:[21.2%].
 [0/20] [zombie] ep_ret:[100.2398] ep_len:[1484]. blue health:[1.00] red health:[0.00]
 [1/20] [rosie] ep_ret:[100.1432] ep_len:[972]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.5629] ep_len:[996]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.1915] ep_len:[219]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[73.9350] ep_len:[726]. blue health:[0.74] red health:[0.00]
 

[126/5000] n_env_step:[8.4e+06] rollout:[370.5]s update:[24.7]s r_avg:[-0.0287] avg_q:[0.020].
[127/5000] n_env_step:[8.5e+06] rollout:[357.1]s update:[27.2]s r_avg:[-0.0263] avg_q:[0.020].
[128/5000] n_env_step:[8.5e+06] rollout:[390.5]s update:[24.8]s r_avg:[-0.0270] avg_q:[0.020].
[129/5000] n_env_step:[8.6e+06] rollout:[383.3]s update:[26.4]s r_avg:[-0.0259] avg_q:[0.020].
[130/5000] n_env_step:[8.7e+06] rollout:[349.8]s update:[27.6]s r_avg:[-0.0262] avg_q:[0.020].
[Eval. start] step:[130/5000][2.6%] #step:[8.7e+06] time:[day:[01] 15:38:38] ram:[21.4%].
 [0/20] [zombie] ep_ret:[0.0215] ep_len:[1214]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.5686] ep_len:[504]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.1514] ep_len:[609]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.3853] ep_len:[882]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[-100.3040] ep_len:[806]. blue health:[0.00] red health:[1.00]
 

[146/5000] n_env_step:[9.8e+06] rollout:[423.1]s update:[26.8]s r_avg:[-0.0245] avg_q:[0.020].
[147/5000] n_env_step:[9.8e+06] rollout:[406.8]s update:[26.1]s r_avg:[-0.0234] avg_q:[0.020].
[148/5000] n_env_step:[9.9e+06] rollout:[405.0]s update:[24.7]s r_avg:[-0.0265] avg_q:[0.020].
[149/5000] n_env_step:[9.9e+06] rollout:[354.8]s update:[25.0]s r_avg:[-0.0209] avg_q:[0.020].
[150/5000] n_env_step:[1.0e+07] rollout:[432.9]s update:[31.6]s r_avg:[-0.0246] avg_q:[0.020].
[Eval. start] step:[150/5000][3.0%] #step:[1.0e+07] time:[day:[01] 18:08:09] ram:[21.5%].
 [0/20] [zombie] ep_ret:[-0.0622] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.3559] ep_len:[613]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.2059] ep_len:[1170]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[0.1875] ep_len:[1500]. blue health:[0.88] red health:[0.88]
 [4/20] [expert_system] ep_ret:[-77.6116] ep_len:[1500]. blue health:[0.13] red health:[0.90]


[166/5000] n_env_step:[1.1e+07] rollout:[398.1]s update:[29.3]s r_avg:[-0.0258] avg_q:[0.020].
[167/5000] n_env_step:[1.1e+07] rollout:[410.0]s update:[30.8]s r_avg:[-0.0262] avg_q:[0.020].
[168/5000] n_env_step:[1.1e+07] rollout:[402.7]s update:[29.3]s r_avg:[-0.0251] avg_q:[0.020].
[169/5000] n_env_step:[1.1e+07] rollout:[370.5]s update:[29.5]s r_avg:[-0.0273] avg_q:[0.019].
[170/5000] n_env_step:[1.1e+07] rollout:[382.2]s update:[30.8]s r_avg:[-0.0246] avg_q:[0.020].
[Eval. start] step:[170/5000][3.4%] #step:[1.1e+07] time:[day:[01] 20:41:01] ram:[21.6%].
 [0/20] [zombie] ep_ret:[0.2098] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.3726] ep_len:[299]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.2226] ep_len:[686]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.1311] ep_len:[1204]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[100.2223] ep_len:[495]. blue health:[1.00] red health:[0.00]
 

[186/5000] n_env_step:[1.2e+07] rollout:[430.9]s update:[31.8]s r_avg:[-0.0189] avg_q:[0.019].
[187/5000] n_env_step:[1.2e+07] rollout:[453.0]s update:[25.6]s r_avg:[-0.0171] avg_q:[0.019].
[188/5000] n_env_step:[1.3e+07] rollout:[419.6]s update:[27.7]s r_avg:[-0.0261] avg_q:[0.019].
[189/5000] n_env_step:[1.3e+07] rollout:[352.3]s update:[33.5]s r_avg:[-0.0220] avg_q:[0.019].
[190/5000] n_env_step:[1.3e+07] rollout:[386.9]s update:[31.2]s r_avg:[-0.0246] avg_q:[0.019].
[Eval. start] step:[190/5000][3.8%] #step:[1.3e+07] time:[day:[01] 23:15:15] ram:[21.7%].
 [0/20] [zombie] ep_ret:[-0.0785] ep_len:[1500]. blue health:[1.00] red health:[1.00]
 [1/20] [rosie] ep_ret:[100.1360] ep_len:[1098]. blue health:[1.00] red health:[0.00]
 [2/20] [bud] ep_ret:[100.1278] ep_len:[880]. blue health:[1.00] red health:[0.00]
 [3/20] [bud_fsm] ep_ret:[100.3564] ep_len:[705]. blue health:[1.00] red health:[0.00]
 [4/20] [expert_system] ep_ret:[15.2215] ep_len:[1500]. blue health:[0.87] red health:[0.72]
