# SAC with CustomADTEnvContinuous

In [1]:
import datetime,gym,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from util import gpu_sess,suppress_tf_warning,tic,toc
from sac import ReplayBuffer,create_sac_model,create_sac_graph,\
    save_sac_model_and_buffers,restore_sac_model_and_buffers
np.set_printoptions(precision=2)
suppress_tf_warning() # suppress warning 
gym.logger.set_level(40) # gym logger 

from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
from episci.agents.utils.constants import Agents, RewardType
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.15.0].


### Rollout Worker

In [2]:
action_length = 5
def get_env():
    """
    Build a CustomADTEnvContinuous with shaped rewards and a fixed red-agent distribution.
    The episci imports are local to the function (presumably so that it also works
    when it is executed inside a Ray worker process -- confirm).
    """
    from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
    from episci.agents.utils.constants import Agents, RewardType

    # NOTE(review): the values are non-decreasing and end at 1.0, which looks like
    # cumulative thresholds rather than per-agent probabilities -- confirm against the env.
    red_agents = (
        Agents.SPOT_4G,
        Agents.SPOT_5G,
        Agents.SPOT_RANDOM,
        Agents.EXPERT_SYSTEM_TRIAL_2,
        Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4,
        Agents.EXPERT_SYSTEM,
    )
    red_values = (0.15, 0.30, 0.45, 0.6, 0.75, 1.0)
    red_distribution = dict(zip(red_agents, red_values))

    env_config = dict()
    env_config["red_distribution"] = red_distribution
    env_config["reward_type"] = RewardType.SHAPED
    return CustomADTEnvContinuous(env_config,action_length=action_length)

def get_eval_env():
    """
    Build the environment used by the learner and by the test run.

    The configuration (red distribution, shaped reward, action_length) is the same
    as the one used for the rollout workers, so this delegates to get_env() rather
    than keeping a second copy of the settings that could silently drift apart.
    Delegating also removes the dependency on names imported in another cell,
    because get_env() performs its own imports.
    """
    return get_env()


In [3]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)

    Lives in the driver process: owns the SAC model, the training graph
    (step_ops / target_init) and one environment instance. Its weights are read
    with get_weights().
    """
    def __init__(self,hdims=[256,256],actv=tf.nn.relu,
                 lr=1e-3,gamma=0.99,alpha_q=0.1,alpha_pi=0.1,polyak=0.995,epsilon=1e-2,
                 seed=1):
        """
        hdims / actv: hidden layer sizes and activation passed to create_sac_model
        lr, gamma, alpha_q, alpha_pi, polyak, epsilon: forwarded to create_sac_graph
        seed: seed for the TF graph-level RNG and for numpy's global RNG
        """
        self.seed = seed
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40)
        self.env = get_eval_env()
        self.odim = self.env.observation_space.shape[0]
        self.adim = self.env.action_space.shape[0]
        _ = self.env.reset()

        # Seed BEFORE building the graph: in TF1 the graph-level seed is read when an
        # op is created, so seeding after create_sac_model() leaves the initializers
        # and the sampling ops unseeded.
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)

        # Create SAC model and computational graph
        # (hdims is copied so the shared default list can never be mutated downstream)
        self.model,self.sess = create_sac_model(
            odim=self.odim,adim=self.adim,hdims=list(hdims),actv=actv)
        self.step_ops,self.target_init = \
            create_sac_graph(self.model,lr=lr,gamma=gamma,alpha_q=alpha_q,alpha_pi=alpha_pi,
                             polyak=polyak,epsilon=epsilon)

        # Initialize model
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def get_action(self,o,deterministic=False):
        """
        Return the action for a single observation `o`.
        deterministic=True evaluates model['mu'], otherwise model['pi'].
        """
        act_op = self.model['mu'] if deterministic else self.model['pi']
        obs_batch = np.asarray(o).reshape(1,-1) # the placeholder is fed a batch of one
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:obs_batch})[0]

    def get_weights(self):
        """
        Get weights (current values of model['main_vars'], in that order)
        """
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals
    
    
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY

    Each actor owns one environment and an inference-only copy of the SAC model.
    The driver pushes weights with set_weights(), collects transitions with
    rollout() and runs deterministic episodes with evaluate().
    """
    def __init__(self,worker_id=0,hdims=[256,256],actv=tf.nn.relu,
                 ep_len_rollout=1000):
        """
        worker_id: integer used only for logging
        hdims / actv: hidden layer sizes and activation passed to create_sac_model
        ep_len_rollout: number of environment steps collected per rollout() call
        """
        # Parse
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40)
        self.env = get_env()
        self.odim = self.env.observation_space.shape[0]
        self.adim = self.env.action_space.shape[0]
        _ = self.env.reset()

        # Replay buffers to pass (overwritten in place by every rollout() call)
        self.o_buffer = np.zeros((self.ep_len_rollout,self.odim))
        self.a_buffer = np.zeros((self.ep_len_rollout,self.adim))
        self.r_buffer = np.zeros((self.ep_len_rollout))
        self.o2_buffer = np.zeros((self.ep_len_rollout,self.odim))
        self.d_buffer = np.zeros((self.ep_len_rollout))

        # Create SAC model
        # (hdims is copied so the shared default list can never be mutated downstream)
        self.model,self.sess = create_sac_model(
            odim=self.odim,adim=self.adim,hdims=list(hdims),actv=actv)
        self.sess.run(tf.global_variables_initializer())
        print ("Ray Worker [%d] Ready."%(self.worker_id))

        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True

        # Flag to initialize rollout (True means: reset the env before the next rollout)
        self.FIRST_ROLLOUT_FLAG = True

    def get_action(self,o,deterministic=False):
        """
        Return the action for a single observation `o`.
        deterministic=True evaluates model['mu'], otherwise model['pi'].
        """
        act_op = self.model['mu'] if deterministic else self.model['pi']
        obs_batch = np.asarray(o).reshape(1,-1) # the placeholder is fed a batch of one
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:obs_batch})[0]

    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage

        weight_vals: one array per entry of model['main_vars'], in the same order.
        Raises ValueError when the number of arrays does not match.
        """
        main_vars = self.model['main_vars']
        if self.FIRST_SET_FLAG:
            # Build the placeholders / assign ops only once; creating them on every
            # call would keep adding nodes to the graph.
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = [
                tf.placeholder(var.dtype, shape=var.get_shape()) for var in main_vars]
            self.assign_ops = [
                var.assign(ph) for var,ph in zip(main_vars,self.assign_placeholders)]
        if len(weight_vals) != len(self.assign_ops):
            raise ValueError("Expected [%d] weight arrays but got [%d]."%
                             (len(self.assign_ops),len(weight_vals)))
        # All variables are assigned in a single session call
        feed_dict = dict(zip(self.assign_placeholders,weight_vals))
        self.sess.run(self.assign_ops,feed_dict=feed_dict)

    def rollout(self):
        """
        Rollout

        Runs self.ep_len_rollout stochastic steps, continuing the current episode
        across calls and resetting the env whenever an episode ends.
        Returns (o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer,r_avg) where r_avg is
        the mean per-step reward of this rollout.
        """
        if self.FIRST_ROLLOUT_FLAG:
            self.FIRST_ROLLOUT_FLAG = False
            self.o = self.env.reset() # reset environment
        # Loop (length comes from the instance, matching the buffer sizes)
        r_sum = 0
        for t in range(self.ep_len_rollout):
            self.a = self.get_action(self.o,deterministic=False)
            self.o2,self.r,self.d,_ = self.env.step(self.a)
            r_sum += self.r
            # Append
            self.o_buffer[t,:] = self.o
            self.a_buffer[t,:] = self.a
            self.r_buffer[t] = self.r
            self.o2_buffer[t,:] = self.o2
            self.d_buffer[t] = self.d
            # Save next state
            self.o = self.o2
            if self.d: self.o = self.env.reset() # reset when done
        r_avg = r_sum / self.ep_len_rollout
        return self.o_buffer,self.a_buffer,self.r_buffer,self.o2_buffer,self.d_buffer,r_avg

    def evaluate(self,red=None,max_ep_len=None):
        """
        Evaluate

        Runs one deterministic episode against `red` and returns
        [ep_ret,ep_len,blue_health,red_health].
        max_ep_len: step limit; when None the notebook-level `max_ep_len_eval` is used.
        """
        if max_ep_len is None:
            max_ep_len = max_ep_len_eval # notebook-level default
        o,d,ep_ret,ep_len = self.env.reset(red=red),False,0,0
        while not(d or (ep_len >= max_ep_len)):
            a = self.get_action(o,deterministic=True)
            o,r,d,_ = self.env.step(a)
            ep_ret += r # compute return
            ep_len += 1
        blue_health,red_health = self.env.blue_health,self.env.red_health
        eval_res = [ep_ret,ep_len,blue_health,red_health] # evaluation result
        # The evaluation used the same env as rollout(), so self.o no longer matches
        # the env state; force a reset at the start of the next rollout.
        self.FIRST_ROLLOUT_FLAG = True
        return eval_res
    
    
print ("Rollout worker classes (with and without RAY) ready.")

Rollout worker classes (with and without RAY) ready.


### Hyperparameters

In [4]:
# Parallelism
n_cpu = 11      # passed to ray.init(num_cpus=...)
n_workers = 11  # number of Ray rollout actors
# Loop schedule (all counted in outer-loop iterations)
total_steps = 50000
evaluate_every = 50
print_every = 5
ep_len_rollout = 50*30 # 30sec rollout
# Network
hdims = [128,128] # [64,32,16]
actv = tf.nn.relu
# Opponents used for evaluation (one episode per entry)
red_list = [
    Agents.SPOT_4G, Agents.SPOT_5G, Agents.SPOT_RANDOM,
    Agents.EXPERT_SYSTEM_TRIAL_2, Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4, Agents.EXPERT_SYSTEM,
    Agents.SPOT_4G, Agents.SPOT_5G,
    Agents.EXPERT_SYSTEM_TRIAL_2, Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4, Agents.EXPERT_SYSTEM,
]
num_eval = len(red_list)
max_ep_len_eval = 15e3 # step limit of one evaluation episode
# Learning hyp
batch_size = 2**12   # total batch size (half from each replay buffer)
update_count = 50*10 # number of gradient updates per outer iteration
lr = 1e-4 # 1e-3
epsilon = 1e-5
# SAC
gamma = 0.98 # discount 0.99
alpha_q = 0.05
alpha_pi = 0.5 # 0.2
polyak = 0.995 # 0.995
# Buffer
buffer_sz_long = 1e7  # 1e5
buffer_sz_short = 1e6 # 1e5

### Initialize Workers

In [5]:
# Start Ray with explicit memory limits (given in bytes)
ray.init(
    num_cpus=n_cpu,
    memory=5*1024**3,
    object_store_memory=10*1024**3,
    driver_object_store_memory=5*1024**3,
)

# Learner: lives in the driver process and owns the training graph
tf.reset_default_graph()
R = RolloutWorkerClass(
    hdims=hdims,
    actv=actv,
    lr=lr,
    gamma=gamma,
    alpha_q=alpha_q,
    alpha_pi=alpha_pi,
    polyak=polyak,
    epsilon=epsilon,
    seed=0,
)

# Rollout actors: one Ray actor per worker id
workers = [
    RayRolloutWorkerClass.remote(
        worker_id=worker_idx,
        hdims=hdims,
        actv=actv,
        ep_len_rollout=ep_len_rollout,
    )
    for worker_idx in range(n_workers)
]
print ("RAY initialized with [%d] cpus and [%d] workers."%(n_cpu,n_workers))

2020-07-07 10:25:33,527	INFO resource_spec.py:212 -- Starting Ray with 4.98 GiB memory available for workers and up to 10.0 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-07 10:25:33,897	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m


RAY initialized with [11] cpus and [11] workers.


In [6]:
# NOTE(review): presumably gives the Ray actors a moment to finish starting up -- confirm
time.sleep(1)

### Replay Buffers

In [7]:
# Two replay buffers with the learner's observation/action dimensions.
# Every transition is stored in both; they differ only in capacity, and each
# update batch is drawn half from the long and half from the short buffer.
replay_buffer_long = ReplayBuffer(odim=R.odim,adim=R.adim,size=int(buffer_sz_long))
replay_buffer_short = ReplayBuffer(odim=R.odim,adim=R.adim,size=int(buffer_sz_short))

### Loop

In [None]:
# Set RESTORE to True to load the learner and both replay buffers from the
# final checkpoint before the training loop starts.
RESTORE = False
if RESTORE:
    npz_path = '../data/net/adt_cont_tactic/model_and_buffers_final.npz'
    restore_sac_model_and_buffers(npz_path,R,replay_buffer_long,replay_buffer_short,
                                  VERBOSE=False,IGNORE_BUFFERS=False)

In [None]:
def _sync_worker_weights():
    """
    Push the learner's current weights to every Ray worker and wait for completion.
    The weights are placed in the object store once and shared by reference instead of
    being serialized once per worker; ray.get() surfaces any error raised by a worker.
    """
    weights_ref = ray.put(R.get_weights())
    ray.get([worker.set_weights.remote(weights_ref) for worker in workers])

start_time = time.time()
n_env_step = 0 # number of environment steps
for t in range(int(total_steps)):

    # Synchronize worker weights
    _sync_worker_weights()

    # Make rollout and accumulate to Buffers
    t_start = time.time()
    ops = [worker.rollout.remote() for worker in workers]
    rollout_vals = ray.get(ops)
    r_sum = 0
    for rollout_val in rollout_vals:
        o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer,r_rollout_avg = rollout_val
        r_sum += r_rollout_avg
        # Every transition goes into both buffers
        for o,a,r,o2,d in zip(o_buffer,a_buffer,r_buffer,o2_buffer,d_buffer):
            replay_buffer_long.store(o, a, r, o2, d)
            replay_buffer_short.store(o, a, r, o2, d)
            n_env_step += 1
    r_avg = r_sum / len(rollout_vals)
    sec_rollout = time.time() - t_start

    # Update
    t_start = time.time()
    avg_qs = np.zeros(int(update_count))
    for c_idx in range(int(update_count)):
        # Half of each batch comes from the long buffer, half from the short one
        batch_long = replay_buffer_long.sample_batch(batch_size//2)
        batch_short = replay_buffer_short.sample_batch(batch_size//2)
        feed_dict = {R.model['o_ph']: np.concatenate((batch_long['obs1'],batch_short['obs1'])),
                     R.model['o2_ph']: np.concatenate((batch_long['obs2'],batch_short['obs2'])),
                     R.model['a_ph']: np.concatenate((batch_long['acts'],batch_short['acts'])),
                     R.model['r_ph']: np.concatenate((batch_long['rews'],batch_short['rews'])),
                     R.model['d_ph']: np.concatenate((batch_long['done'],batch_short['done']))
                    }
        outs = R.sess.run(R.step_ops, feed_dict)
        q1_vals,q2_vals = outs[3],outs[4]
        avg_q = 0.5*np.mean(q1_vals)+0.5*np.mean(q2_vals)
        avg_qs[c_idx] = avg_q
    sec_update = time.time() - t_start

    # Synchronize worker weights (after update) so that evaluation uses the new policy
    _sync_worker_weights()

    # Print
    if (t == 0) or (((t+1)%print_every) == 0):
        print ("[%d/%d] rollout:[%.1f]s update:[%.1f]s r_avg:[%.4f] avg_q:[%.3f]."%
               (t+1,total_steps,sec_rollout,sec_update,r_avg,np.mean(avg_qs)))

    # Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0):
        ram_percent = psutil.virtual_memory().percent # memory usage
        # Note: '%d' below is the day-of-month of the elapsed time seen as an epoch
        # offset, so it starts at 01 (i.e. 'day:[01]' means less than 24h elapsed).
        print ("[Eval. start] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
               (t+1,total_steps,(t+1)/total_steps*100,
                n_env_step,
                time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                ram_percent)
              )

        LOCAL_EVAL = 0
        if LOCAL_EVAL:
            # Sequential evaluation in the driver, using the learner's own env
            ep_ret_sum = 0
            for eval_idx in range(num_eval):
                red = red_list[eval_idx]
                o,d,ep_ret,ep_len = R.env.reset(red=red),False,0,0
                while not(d or (ep_len >= max_ep_len_eval)):
                    a = R.get_action(o,deterministic=True)
                    o,r,d,_ = R.env.step(a)
                    ep_ret += r # compute return
                    ep_len += 1
                ep_ret_sum += ep_ret
                blue_health,red_health = R.env.blue_health,R.env.red_health
                print (" [%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
                    %(eval_idx,num_eval,red,ep_ret,ep_len, blue_health,red_health))
            ep_ret_avg = ep_ret_sum / num_eval
            print ("[Eval. done] time:[%s] ep_ret_avg:[%.3f]."%
                   (time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                    ep_ret_avg)
                  )
        else: # parallel evaluation with Ray
            ops = []
            for i_idx in range(num_eval):
                # Wrap around so that num_eval may exceed the number of workers
                worker,red = workers[i_idx%len(workers)],red_list[i_idx]
                ops.append(worker.evaluate.remote(red=red))
            eval_vals = ray.get(ops)
            ep_ret_sum = 0
            for i_idx in range(num_eval):
                red,eval_val = red_list[i_idx],eval_vals[i_idx]
                ep_ret,ep_len,blue_health,red_health = eval_val[0],eval_val[1],eval_val[2],eval_val[3]
                ep_ret_sum += ep_ret
                print (" [%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
                    %(i_idx,len(eval_vals),red,ep_ret,ep_len,blue_health,red_health))
            ep_ret_avg = ep_ret_sum / num_eval
            print ("[Eval. done] time:[%s] ep_ret_avg:[%.3f].\n"%
                   (time.strftime("day:[%d] %H:%M:%S", time.gmtime(time.time()-start_time)),
                    ep_ret_avg)
                  )

        # Save current SAC model and replay buffers
        npz_path = '../data/net/adt_cont_tactic/model_and_buffers_%d.npz'%(t+1)
        # Make sure the target folder exists so a long run is not lost at save time
        os.makedirs(os.path.dirname(npz_path),exist_ok=True)
        save_sac_model_and_buffers(npz_path,R,replay_buffer_long,replay_buffer_short,
                                   VERBOSE=False,IGNORE_BUFFERS=True)

print ("Done.")

[2m[36m(pid=22945)[0m 
[2m[36m(pid=22945)[0m 
[2m[36m(pid=22945)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 00:16:24
[2m[36m(pid=22943)[0m 
[2m[36m(pid=22943)[0m 
[2m[36m(pid=22943)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 00:16:24
[2m[36m(pid=22946)[0m 
[2m[36m(pid=22946)[0m 
[2m[36m(pid=22946)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 00:16:24
[2m[36m(pid=22942)[0m 
[2m[36m(pid=22942)[0m 
[2m[36m(pid=22942)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 00:16:24
[2m[36m(pid=22949)[0m 
[2m[36m(pid=22949)[0m 
[2m[36m(pid=22949)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 00:16:24
[2m[36m(pid=22947)[0m 
[2m[36m(pid=22947)[0m 
[2m[36m(pid=22947)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 00:16:24
[2m[36m(pid=22940)[0m 
[2m[36m(pid=22940)[0m 
[2m[36m(pid=22940)[0m      JSBSim Flight Dynamics Model v1.1.0.dev1 Jun 16 2020 0

[110/50000] rollout:[137.9]s update:[8.7]s r_avg:[-0.0334] avg_q:[2.600].
[115/50000] rollout:[127.6]s update:[8.7]s r_avg:[-0.0252] avg_q:[2.543].
[120/50000] rollout:[114.9]s update:[8.7]s r_avg:[-0.0200] avg_q:[2.505].
[125/50000] rollout:[139.7]s update:[8.7]s r_avg:[-0.0314] avg_q:[2.468].
[2m[33m(pid=raylet)[0m E0707 15:38:04.780090 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080be0a0000 was evicted before the raylet could pin it.
[130/50000] rollout:[137.7]s update:[8.8]s r_avg:[-0.0137] avg_q:[2.394].
[135/50000] rollout:[141.4]s update:[8.8]s r_avg:[-0.0140] avg_q:[2.300].
[140/50000] rollout:[137.1]s update:[8.8]s r_avg:[-0.0574] avg_q:[2.139].
[145/50000] rollout:[136.8]s update:[8.7]s r_avg:[-0.0291] avg_q:[2.037].
[150/50000] rollout:[118.6]s update:[8.7]s r_avg:[-0.0189] avg_q:[1.984].
[Eval. start] step:[150/50000][0.3%] #step:[2.5e+06] time:[day:[01] 06:12:05] ram:[21.8%].
 [0/11] [spot_4g] ep_ret:[99.9886] ep_len:[2027]. blue health:[1.00]

[300/50000] rollout:[157.4]s update:[10.0]s r_avg:[-0.0189] avg_q:[1.543].
[Eval. start] step:[300/50000][0.6%] #step:[5.0e+06] time:[day:[01] 12:52:25] ram:[36.6%].
 [0/11] [spot_4g] ep_ret:[25.8581] ep_len:[3001]. blue health:[1.00] red health:[0.75]
 [1/11] [spot_5g] ep_ret:[34.0036] ep_len:[2588]. blue health:[0.00] red health:[0.64]
 [2/11] [spot_random] ep_ret:[100.8095] ep_len:[2252]. blue health:[1.00] red health:[0.00]
 [3/11] [es_trial2] ep_ret:[100.4802] ep_len:[715]. blue health:[1.00] red health:[0.00]
 [4/11] [es_trial3_scrimmage4] ep_ret:[76.6049] ep_len:[2555]. blue health:[0.79] red health:[0.00]
 [5/11] [expert_system] ep_ret:[-100.4231] ep_len:[368]. blue health:[0.00] red health:[1.00]
 [6/11] [spot_4g] ep_ret:[0.1972] ep_len:[3001]. blue health:[1.00] red health:[1.00]
 [7/11] [spot_5g] ep_ret:[100.4054] ep_len:[2967]. blue health:[1.00] red health:[0.00]
 [8/11] [es_trial2] ep_ret:[100.2698] ep_len:[557]. blue health:[1.00] red health:[0.00]
 [9/11] [es_trial3_scr

[455/50000] rollout:[148.8]s update:[9.3]s r_avg:[-0.0169] avg_q:[1.408].
[2m[33m(pid=raylet)[0m E0708 06:35:29.348553 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff010000805e270000 was evicted before the raylet could pin it.
[460/50000] rollout:[136.5]s update:[9.3]s r_avg:[-0.0182] avg_q:[1.389].
[465/50000] rollout:[148.0]s update:[9.5]s r_avg:[-0.0237] avg_q:[1.449].
[470/50000] rollout:[150.3]s update:[9.4]s r_avg:[-0.0259] avg_q:[1.507].
[475/50000] rollout:[147.4]s update:[9.4]s r_avg:[-0.0496] avg_q:[1.481].
[2m[33m(pid=raylet)[0m E0708 07:21:52.043387 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080e6280000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0708 07:24:28.567756 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008001290000 was evicted before the raylet could pin it.
[480/50000] rollout:[146.7]s update:[9.5]s r_avg:[-0.0238] avg_q:[1.486].
[485/50000] rollout:[148.5]s up

[2m[33m(pid=raylet)[0m E0708 14:16:49.731897 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008037360000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0708 14:18:45.477232 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008052360000 was evicted before the raylet could pin it.
[635/50000] rollout:[135.1]s update:[9.5]s r_avg:[-0.0104] avg_q:[1.429].
[2m[33m(pid=raylet)[0m E0708 14:27:43.099149 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080b2360000 was evicted before the raylet could pin it.
[640/50000] rollout:[152.9]s update:[9.5]s r_avg:[-0.0193] avg_q:[1.478].
[645/50000] rollout:[153.8]s update:[9.5]s r_avg:[-0.0035] avg_q:[1.519].
[2m[33m(pid=raylet)[0m E0708 14:59:54.927903 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080af370000 was evicted before the raylet could pin it.
[650/50000] rollout:[150.7]s update:[9.5]s r_avg:[0.0036] avg_q:[1.557].
[Eval

[755/50000] rollout:[148.1]s update:[9.5]s r_avg:[-0.0192] avg_q:[1.598].
[760/50000] rollout:[150.3]s update:[9.5]s r_avg:[-0.0130] avg_q:[1.590].
[765/50000] rollout:[127.9]s update:[9.5]s r_avg:[-0.0249] avg_q:[1.589].
[770/50000] rollout:[156.3]s update:[9.5]s r_avg:[-0.0240] avg_q:[1.600].
[775/50000] rollout:[167.7]s update:[11.4]s r_avg:[-0.0108] avg_q:[1.608].
[780/50000] rollout:[148.4]s update:[9.5]s r_avg:[-0.0146] avg_q:[1.555].
[785/50000] rollout:[184.8]s update:[14.3]s r_avg:[-0.0119] avg_q:[1.507].
[790/50000] rollout:[179.8]s update:[12.9]s r_avg:[-0.0033] avg_q:[1.508].
[795/50000] rollout:[177.2]s update:[12.8]s r_avg:[-0.0251] avg_q:[1.447].
[800/50000] rollout:[187.5]s update:[12.8]s r_avg:[-0.0090] avg_q:[1.414].
[Eval. start] step:[800/50000][1.6%] #step:[1.3e+07] time:[day:[02] 11:32:29] ram:[64.5%].
 [0/11] [spot_4g] ep_ret:[100.7160] ep_len:[2058]. blue health:[1.00] red health:[0.00]
 [1/11] [spot_5g] ep_ret:[4.1415] ep_len:[3001]. blue health:[1.00] red heal

[2m[33m(pid=raylet)[0m E0709 04:26:56.900789 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080bc4e0000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0709 04:26:56.911543 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080c34e0000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0709 04:33:20.934557 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080dd4e0000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0709 04:33:20.967324 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080eb4e0000 was evicted before the raylet could pin it.
[920/50000] rollout:[174.9]s update:[12.8]s r_avg:[-0.0374] avg_q:[1.446].
[2m[33m(pid=raylet)[0m E0709 04:49:26.044723 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff010000804c4f0000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0709 04:49:26.064421 22

[1035/50000] rollout:[179.8]s update:[12.6]s r_avg:[-0.0191] avg_q:[1.390].
[1040/50000] rollout:[178.3]s update:[12.7]s r_avg:[-0.0197] avg_q:[1.350].
[2m[33m(pid=raylet)[0m E0709 11:19:52.039081 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080b6590000 was evicted before the raylet could pin it.
[1045/50000] rollout:[181.9]s update:[12.6]s r_avg:[-0.0238] avg_q:[1.323].
[1050/50000] rollout:[174.6]s update:[12.6]s r_avg:[-0.0170] avg_q:[1.377].
[Eval. start] step:[1050/50000][2.1%] #step:[1.7e+07] time:[day:[03] 01:13:03] ram:[53.1%].
 [0/11] [spot_4g] ep_ret:[101.6927] ep_len:[2746]. blue health:[1.00] red health:[0.00]
 [1/11] [spot_5g] ep_ret:[76.3492] ep_len:[3001]. blue health:[1.00] red health:[0.24]
 [2/11] [spot_random] ep_ret:[100.2805] ep_len:[672]. blue health:[1.00] red health:[0.00]
 [3/11] [es_trial2] ep_ret:[100.3809] ep_len:[977]. blue health:[1.00] red health:[0.00]
 [4/11] [es_trial3_scrimmage4] ep_ret:[-83.3151] ep_len:[3001]. blue healt

[1175/50000] rollout:[165.6]s update:[11.2]s r_avg:[-0.0534] avg_q:[1.430].
[2m[33m(pid=raylet)[0m E0709 18:24:42.589754 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008019650000 was evicted before the raylet could pin it.
[2m[33m(pid=raylet)[0m E0709 18:33:08.468626 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff010000804f650000 was evicted before the raylet could pin it.
[1180/50000] rollout:[163.4]s update:[11.3]s r_avg:[-0.0379] avg_q:[1.446].
[1185/50000] rollout:[170.6]s update:[11.2]s r_avg:[-0.0093] avg_q:[1.417].
[2m[33m(pid=raylet)[0m E0709 19:03:00.875339 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff010000802c660000 was evicted before the raylet could pin it.
[1190/50000] rollout:[175.7]s update:[11.2]s r_avg:[-0.0238] avg_q:[1.335].
[1195/50000] rollout:[173.5]s update:[11.2]s r_avg:[-0.0167] avg_q:[1.387].
[2m[33m(pid=raylet)[0m E0709 19:24:20.386191 22929 node_manager.cc:3118] Plasma object fffff

[2m[33m(pid=raylet)[0m E0710 00:31:50.396562 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080796f0000 was evicted before the raylet could pin it.
[1300/50000] rollout:[193.6]s update:[13.5]s r_avg:[-0.0071] avg_q:[1.305].
[Eval. start] step:[1300/50000][2.6%] #step:[2.1e+07] time:[day:[03] 14:16:35] ram:[55.3%].
 [0/11] [spot_4g] ep_ret:[101.4063] ep_len:[1731]. blue health:[1.00] red health:[0.00]
 [1/11] [spot_5g] ep_ret:[12.8942] ep_len:[3001]. blue health:[1.00] red health:[0.87]
 [2/11] [spot_random] ep_ret:[100.1145] ep_len:[1017]. blue health:[1.00] red health:[0.00]
 [3/11] [es_trial2] ep_ret:[100.3887] ep_len:[510]. blue health:[1.00] red health:[0.00]
 [4/11] [es_trial3_scrimmage4] ep_ret:[-1.1799] ep_len:[3001]. blue health:[1.00] red health:[1.00]
 [5/11] [expert_system] ep_ret:[84.4372] ep_len:[2713]. blue health:[0.84] red health:[0.00]
 [6/11] [spot_4g] ep_ret:[101.4816] ep_len:[1726]. blue health:[1.00] red health:[0.00]
 [7/11] [spot_5g] ep

[1415/50000] rollout:[168.2]s update:[11.5]s r_avg:[-0.0074] avg_q:[1.439].
[1420/50000] rollout:[172.0]s update:[11.8]s r_avg:[-0.0104] avg_q:[1.402].
[2m[33m(pid=raylet)[0m E0710 07:09:31.433171 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff010000801f7a0000 was evicted before the raylet could pin it.
[1425/50000] rollout:[171.3]s update:[11.7]s r_avg:[-0.0081] avg_q:[1.436].
[2m[33m(pid=raylet)[0m E0710 07:28:30.668813 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080be7a0000 was evicted before the raylet could pin it.
[1430/50000] rollout:[185.3]s update:[11.6]s r_avg:[-0.0129] avg_q:[1.534].
[1435/50000] rollout:[198.9]s update:[11.6]s r_avg:[-0.0262] avg_q:[1.521].
[1440/50000] rollout:[155.4]s update:[11.6]s r_avg:[-0.0150] avg_q:[1.516].
[2m[33m(pid=raylet)[0m E0710 08:06:34.340101 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff01000080bf7b0000 was evicted before the raylet could pin it.
[1445/50000] rollou

[1565/50000] rollout:[92.2]s update:[8.8]s r_avg:[0.0039] avg_q:[1.358].
[1570/50000] rollout:[138.7]s update:[8.8]s r_avg:[-0.0121] avg_q:[1.303].
[2m[33m(pid=raylet)[0m E0710 13:59:48.479653 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008005870000 was evicted before the raylet could pin it.
[1575/50000] rollout:[140.2]s update:[8.8]s r_avg:[-0.0088] avg_q:[1.333].
[1580/50000] rollout:[138.4]s update:[8.8]s r_avg:[-0.0125] avg_q:[1.433].
[1585/50000] rollout:[139.7]s update:[8.8]s r_avg:[0.0001] avg_q:[1.352].
[2m[33m(pid=raylet)[0m E0710 14:36:38.781862 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008050880000 was evicted before the raylet could pin it.
[1590/50000] rollout:[100.6]s update:[8.7]s r_avg:[-0.0131] avg_q:[1.300].
[2m[33m(pid=raylet)[0m E0710 14:54:52.250838 22929 node_manager.cc:3118] Plasma object ffffffffffffffffffffffff0100008000890000 was evicted before the raylet could pin it.
[1595/50000] rollout:[140.4]

### Close

In [None]:
# Shut down the Ray runtime started by ray.init(); the learner R lives in the
# driver process and is still used by the cells below.
ray.shutdown()

### Save model weights and replay buffers

In [None]:
# Path to save the npz file 
# Final checkpoint of the learner R; IGNORE_BUFFERS=True is passed here
# (presumably skipping the replay-buffer contents -- confirm in sac.py).
npz_path = '../data/net/adt_cont_tactic/model_and_buffers_final.npz'
save_sac_model_and_buffers(npz_path,R,replay_buffer_long,replay_buffer_short,
                           VERBOSE=False,IGNORE_BUFFERS=True)

### Restore 

In [None]:
# Reload the final checkpoint into R and the two replay buffers.
# NOTE(review): the save cell above passes IGNORE_BUFFERS=True while this call passes
# IGNORE_BUFFERS=False -- verify that the file actually contains buffer data.
npz_path = '../data/net/adt_cont_tactic/model_and_buffers_final.npz'
restore_sac_model_and_buffers(npz_path,R,replay_buffer_long,replay_buffer_short,
                              VERBOSE=False,IGNORE_BUFFERS=False)

### Test Run

In [None]:
# Run one deterministic episode with the learner's policy against a single opponent.
eval_env = get_eval_env()
# Kept under its own name so that the `red_list` used by the training loop
# (whose length defines num_eval) is not overwritten by this cell.
test_red_list = [Agents.SPOT_4G,Agents.SPOT_5G,Agents.SPOT_RANDOM,
                 Agents.EXPERT_SYSTEM_TRIAL_2,Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4,
                 Agents.EXPERT_SYSTEM]
red = test_red_list[0]
try:
    o,d,ep_ret,ep_len = eval_env.reset(red=red),False,0,0
    while not(d or (ep_len >= max_ep_len_eval)):
        a = R.get_action(o,deterministic=True)
        o,r,d,_ = eval_env.step(a)
        ep_ret += r # compute return
        ep_len += 1
    # Report the episode return (the original formatted `eval_idx` here by mistake)
    print ("[Evaluate] ep_ret:[%.4f] ep_len:[%d]"
        %(ep_ret,ep_len))
finally:
    eval_env.close() # close env even if the episode raised