# Evaluation of SAC agent

In [None]:
import datetime,gym,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from util import gpu_sess,suppress_tf_warning,tic,toc,open_txt,write_txt
from sac import ReplayBuffer,create_sac_model,create_sac_graph,\
    save_sac_model_and_buffers,restore_sac_model_and_buffers
# Print NumPy arrays with two digits of floating-point precision
np.set_printoptions(precision=2)
suppress_tf_warning() # silence TensorFlow warnings (helper imported from util)
gym.logger.set_level(40) # raise gym's logging threshold; presumably 40 is the ERROR level -- verify against the installed gym

# Project-specific environment wrapper and enumerations used by the cells below
from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
from episci.agents.utils.constants import Agents, RewardType
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

### Hyperparameters

In [None]:
# Ray: CPUs handed to ray.init and number of remote rollout workers
n_cpu = n_workers = 30

# Checkpoint (.npz) to restore from; a falsy value skips the restore cell
npz_path_restore = '../data/net/sac_adt_cont_a/model_and_buffers_1550.npz'

# Environment: 'action_length' values passed to the two environment factories
# (alternative setting kept from the original notebook: 5,5 # 50/5=10HZ)
action_length = 10 # original note: "50/1=50HZ" -- NOTE(review): the note does not match the value 10; confirm
action_length_eval = 10
ema = 0.9 # exponential moving average of actions

# Evaluation: the list of red opponents is repeated four times
red_list_eval = 4*[
    Agents.ZOMBIE,
    Agents.ROSIE,
    Agents.BUD,
    Agents.BUD_FSM,
    Agents.EXPERT_SYSTEM_TRIAL_2,
    Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4,
    Agents.EXPERT_SYSTEM,
]
num_eval = len(red_list_eval) # one evaluation episode per list entry
max_ep_len_eval = 15e3 # step cap of a single evaluation episode

In [None]:

# Rollouts
total_steps = 50000
burnin_steps = 5
evaluate_every = 50
print_every = 5

ep_len_rollout = 150*10 # 150sec rollout
hdims = [128,128] # hidden layer sizes handed to the workers
actv = tf.nn.relu # activation handed to the workers
# Passed to the environments as "red_distribution"
# NOTE(review): the increasing values ending at 1.0 look like cumulative
# probabilities -- confirm against CustomADTEnvContinuous
red_list_train = {
    Agents.SPOT_4G: 0.15,
    Agents.SPOT_5G: 0.30,
    Agents.SPOT_RANDOM: 0.45,
    Agents.EXPERT_SYSTEM_TRIAL_2: 0.6,
    Agents.EXPERT_SYSTEM_TRIAL_3_SCRIMMAGE_4: 0.75,
    Agents.EXPERT_SYSTEM: 1.0,
}

# Learning hyperparameters
batch_size = 2**16 # batch size
update_count = 500 # number of updates
lr = 1e-4
epsilon = 1e-2

# SAC
gamma = 0.99 # discount
alpha_q = 0.05
alpha_pi = 0.5
polyak = 0.995

# Buffer sizes (cast to int where the buffers are created)
buffer_sz_long = 1e5
buffer_sz_short = 1e4

# Temperature & epsilon greediness
temp_min = temp_max = 1.0
eps_greedy = 0.0

### Define Environments

In [None]:
# Environments
def get_env():
    """
    Create a CustomADTEnvContinuous configured with 'red_list_train' as its
    "red_distribution", the SHAPED reward type and the global 'action_length'.
    """
    # Function-level imports are kept from the original (presumably so the
    # factory is self-contained when it runs inside a Ray worker process)
    from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
    from episci.agents.utils.constants import Agents, RewardType
    config = dict(red_distribution=red_list_train,
                  reward_type=RewardType.SHAPED)
    env = CustomADTEnvContinuous(config,action_length=action_length)
    return env

def get_eval_env():
    """
    Create a CustomADTEnvContinuous configured with 'red_list_train' as its
    "red_distribution", the SHAPED reward type and the global
    'action_length_eval'.
    """
    # Function-level imports are kept from the original (presumably so the
    # factory is self-contained when it runs inside a Ray worker process)
    from episci.environment_wrappers.tactical_action_adt_env_continuous import CustomADTEnvContinuous
    from episci.agents.utils.constants import Agents, RewardType
    config = dict(red_distribution=red_list_train,
                  reward_type=RewardType.SHAPED)
    env = CustomADTEnvContinuous(config,action_length=action_length_eval)
    return env

### Define Workers

In [None]:
# Rollout Worker
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)

    Holds its own evaluation environment, the SAC model with its TF session
    and the SAC update graph, and offers helpers to read and overwrite the
    weights of the main network.
    """
    def __init__(self,hdims=None,actv=tf.nn.relu,
                 lr=1e-3,gamma=0.99,alpha_q=0.1,alpha_pi=0.1,polyak=0.995,epsilon=1e-2,
                 seed=1):
        """
        Build the environment and the SAC model/graph, then initialize variables

        hdims: hidden layer sizes forwarded to create_sac_model ([256,256] if None)
        actv: activation forwarded to create_sac_model
        lr,gamma,alpha_q,alpha_pi,polyak,epsilon: forwarded to create_sac_graph
        seed: seed used for TensorFlow (graph level) and NumPy
        """
        # A list literal as default would be shared between all instances
        if hdims is None:
            hdims = [256,256]
        self.seed = seed
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40)
        self.env = get_eval_env()
        self.odim = self.env.observation_space.shape[0]
        self.adim = self.env.action_space.shape[0]
        _ = self.env.reset()

        # The graph-level seed is only picked up by ops created after it has
        # been set, so it must be in place before the model is built
        tf.set_random_seed(self.seed)

        # Create SAC model and computational graph
        self.model,self.sess = create_sac_model(
            odim=self.odim,adim=self.adim,hdims=hdims,actv=actv)
        self.step_ops,self.target_init = \
            create_sac_graph(self.model,lr=lr,gamma=gamma,alpha_q=alpha_q,alpha_pi=alpha_pi,
                             polyak=polyak,epsilon=epsilon)

        # Initialize model
        self.FIRST_SET_FLAG = True # assign ops of 'set_weights()' not built yet
        np.random.seed(self.seed)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)

    def get_action(self,o,deterministic=False):
        """
        Get action for a single observation

        Evaluates model['mu'] if deterministic, otherwise model['pi'];
        'o' is reshaped to one row and the first row of the result is returned.
        """
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]

    def get_weights(self):
        """
        Get weights (current values of model['main_vars'], in that order)
        """
        weight_vals = self.sess.run(self.model['main_vars'])
        return weight_vals

    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage

        weight_vals: values for model['main_vars'], in the same order
        (as returned by 'get_weights()'). Placeholders and assign ops are
        created once and reused, so repeated calls do not grow the graph.
        """
        if self.FIRST_SET_FLAG:
            assign_placeholders,assign_ops = [],[]
            for weight_tf_var in self.model['main_vars']:
                assign_placeholder = tf.placeholder(
                    weight_tf_var.dtype, shape=weight_tf_var.get_shape())
                assign_placeholders.append(assign_placeholder)
                assign_ops.append(weight_tf_var.assign(assign_placeholder))
            self.assign_placeholders = assign_placeholders
            self.assign_ops = assign_ops
            # Clear the flag only once the ops really exist
            self.FIRST_SET_FLAG = False
        # Feed every placeholder and run all assignments in one session call
        feed_dict = {assign_placeholder:weight_vals[w_idx]
                     for w_idx,assign_placeholder in enumerate(self.assign_placeholders)}
        self.sess.run(self.assign_ops,feed_dict=feed_dict)

@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY

    Each actor owns an environment, a SAC model with its TF session and
    fixed-size buffers that are filled by 'rollout()'.
    """
    def __init__(self,worker_id=0,hdims=None,actv=tf.nn.relu,
                 ep_len_rollout=1000,max_ep_len_eval=1000):
        """
        worker_id: id used in the ready message
        hdims: hidden layer sizes forwarded to create_sac_model ([256,256] if None)
        actv: activation forwarded to create_sac_model
        ep_len_rollout: number of environment steps per 'rollout()' call
        max_ep_len_eval: step cap of one 'evaluate()' episode
        """
        # A list literal as default would be shared between all instances
        if hdims is None:
            hdims = [256,256]
        # Parse
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        self.max_ep_len_eval = max_ep_len_eval
        # Each worker should maintain its own environment
        import gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40)
        self.env = get_env()
        self.odim = self.env.observation_space.shape[0]
        self.adim = self.env.action_space.shape[0]
        _ = self.env.reset()

        # Replay buffers to pass (one row per rollout step)
        self.o_buffer = np.zeros((self.ep_len_rollout,self.odim))
        self.a_buffer = np.zeros((self.ep_len_rollout,self.adim))
        self.r_buffer = np.zeros((self.ep_len_rollout))
        self.o2_buffer = np.zeros((self.ep_len_rollout,self.odim))
        self.d_buffer = np.zeros((self.ep_len_rollout))

        # Create SAC model
        self.model,self.sess = create_sac_model(
            odim=self.odim,adim=self.adim,hdims=hdims,actv=actv)
        self.sess.run(tf.global_variables_initializer())
        print ("Ray Worker [%d] Ready."%(self.worker_id))

        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True

        # Flag to initialize rollout
        self.FIRST_ROLLOUT_FLAG = True

    def get_action(self,o,deterministic=False,temperature=1.0):
        """
        Get action (if temperature is 0, it becomes deterministic)

        Returns model['mu'] if deterministic, otherwise the blend
        temperature*pi + (1-temperature)*mu for the single observation 'o'.
        """
        feed_dict = {self.model['o_ph']:o.reshape(1,-1)}
        if deterministic:
            # Only the mean is needed; do not evaluate the sampling op at all
            return self.sess.run(self.model['mu'],feed_dict=feed_dict)[0]
        # One session call yields both the mean and a sample for the same input
        a_mu,a_pi = self.sess.run([self.model['mu'],self.model['pi']],
                                  feed_dict=feed_dict)
        return temperature*a_pi[0] + (1-temperature)*a_mu[0]

    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage

        weight_vals: values for model['main_vars'], in the same order.
        Placeholders and assign ops are created once and reused, so repeated
        calls do not grow the graph.
        """
        if self.FIRST_SET_FLAG:
            assign_placeholders,assign_ops = [],[]
            for weight_tf_var in self.model['main_vars']:
                assign_placeholder = tf.placeholder(
                    weight_tf_var.dtype, shape=weight_tf_var.get_shape())
                assign_placeholders.append(assign_placeholder)
                assign_ops.append(weight_tf_var.assign(assign_placeholder))
            self.assign_placeholders = assign_placeholders
            self.assign_ops = assign_ops
            # Clear the flag only once the ops really exist
            self.FIRST_SET_FLAG = False
        # Feed every placeholder and run all assignments in one session call
        feed_dict = {assign_placeholder:weight_vals[w_idx]
                     for w_idx,assign_placeholder in enumerate(self.assign_placeholders)}
        self.sess.run(self.assign_ops,feed_dict=feed_dict)

    def rollout(self,temperature=1.0,eps_greedy=0.0):
        """
        Rollout

        Runs 'ep_len_rollout' environment steps, continuing from the
        observation left by the previous call (the environment is reset on
        the first call and whenever an episode ends).

        temperature: forwarded to 'get_action()'
        eps_greedy: probability of taking a random action from the action space
        Returns the observation, action, reward, next-observation and done
        buffers followed by the average reward per step.
        """
        if self.FIRST_ROLLOUT_FLAG:
            self.FIRST_ROLLOUT_FLAG = False
            self.o = self.env.reset() # reset environment
        # Loop
        r_sum = 0
        for t in range(self.ep_len_rollout):
            if np.random.rand() < eps_greedy:
                self.a = self.env.action_space.sample() # random sample
            else:
                self.a = self.get_action(self.o,deterministic=False,temperature=temperature)
            self.o2,self.r,self.d,_ = self.env.step(self.a)
            r_sum += self.r
            # Append
            self.o_buffer[t,:] = self.o
            self.a_buffer[t,:] = self.a
            self.r_buffer[t] = self.r
            self.o2_buffer[t,:] = self.o2
            self.d_buffer[t] = self.d
            # Save next state
            self.o = self.o2
            if self.d:
                self.o = self.env.reset() # reset when done
        r_avg = r_sum / self.ep_len_rollout
        return self.o_buffer,self.a_buffer,self.r_buffer,self.o2_buffer,self.d_buffer,r_avg

    def evaluate(self,red=None):
        """
        Evaluate

        Plays one episode with deterministic actions against 'red' (passed to
        env.reset). From the second step on, actions are smoothed with an
        exponential moving average whose factor is the notebook-level global
        'ema'. Returns [ep_ret, ep_len, blue_health, red_health].
        """
        o,d,ep_ret,ep_len = self.env.reset(red=red),False,0,0
        a_prev = None # no previous action before the first step
        # '>=' instead of '==': the cap may be a float (e.g. 15e3) and a
        # non-integral value would otherwise never stop the loop
        while not(d or (ep_len >= self.max_ep_len_eval)):
            a = self.get_action(o,deterministic=True)
            if a_prev is not None:
                a = (ema)*a + (1-ema)*a_prev
            a_prev = a

            o,r,d,_ = self.env.step(a) # set
            ep_ret += r # accumulate reward
            ep_len += 1 # length
        blue_health,red_health = self.env.blue_health,self.env.red_health
        eval_res = [ep_ret,ep_len,blue_health,red_health] # evaluation result
        return eval_res

### Initialize 

In [None]:
# Start Ray with the configured CPU budget and clear the default TF graph
ray.init(num_cpus=n_cpu)
tf.reset_default_graph()

# Local (non-Ray) worker holding the SAC model and update graph
R = RolloutWorkerClass(
    hdims=hdims,actv=actv,
    lr=lr,gamma=gamma,alpha_q=alpha_q,alpha_pi=alpha_pi,
    polyak=polyak,epsilon=epsilon,
    seed=0)

# One Ray actor per worker id
workers = [
    RayRolloutWorkerClass.remote(
        worker_id=worker_id,hdims=hdims,actv=actv,
        ep_len_rollout=ep_len_rollout,max_ep_len_eval=max_ep_len_eval)
    for worker_id in range(n_workers)
]

# Replay Buffers (long one first, then the short one)
replay_buffer_long,replay_buffer_short = [
    ReplayBuffer(odim=R.odim,adim=R.adim,size=int(buffer_size))
    for buffer_size in (buffer_sz_long,buffer_sz_short)
]

In [None]:
# Restore, if necessary
# Skipped when 'npz_path_restore' is empty/None. The checkpoint is loaded into
# the local worker R; IGNORE_BUFFERS=True presumably leaves the two replay
# buffers untouched -- verify against sac.restore_sac_model_and_buffers
if npz_path_restore:
    restore_sac_model_and_buffers(npz_path_restore,R,replay_buffer_long,replay_buffer_short,
                                  VERBOSE=False,IGNORE_BUFFERS=True)

In [None]:
# Synchronize worker weights
# Read the main-network weights of the local worker and push them to every actor
weights = R.get_weights()
set_weights_list = [worker.set_weights.remote(weights) for worker in workers]
# Wait for all actors: ray.get re-raises an exception thrown by a remote
# set_weights call, which would otherwise pass unnoticed and let the
# evaluation run with the workers' initial weights
ray.get(set_weights_list)

### Evaluate

In [None]:
start_time = time.time()
# Launch one remote evaluation per red opponent. Opponents are assigned
# round-robin, so the cell keeps working when num_eval exceeds the number of
# workers (the extra calls simply queue up on the actors).
ops = [workers[i_idx % len(workers)].evaluate.remote(red=red_list_eval[i_idx])
       for i_idx in range(num_eval)] # eval operations
eval_vals = ray.get(ops) # do evaluation here (blocks until all are finished)
ep_ret_sum,n_eval_sum = 0,0
n_outer_loop = 1 # 1
# NOTE: the outer loop only re-reads 'eval_vals'; it does not re-run the evaluation
for o_idx in range(n_outer_loop):
    for i_idx,(red,eval_val) in enumerate(zip(red_list_eval,eval_vals)):
        ep_ret,ep_len,blue_health,red_health = eval_val[0],eval_val[1],eval_val[2],eval_val[3]
        ep_ret_sum += ep_ret
        n_eval_sum += 1
        print (" [%d/%d][%d/%d] [%s] ep_ret:[%.4f] ep_len:[%d]. blue health:[%.2f] red health:[%.2f]"
            %(o_idx,n_outer_loop,i_idx,len(eval_vals),red,ep_ret,ep_len,blue_health,red_health))
ep_ret_avg = ep_ret_sum / n_eval_sum
# Elapsed wall-clock time. The day count is computed explicitly because
# strftime's %d is the day of the month and reads 01 for zero elapsed days.
elapsed_sec = time.time()-start_time
elapsed_str = "day:[%d] %s"%(elapsed_sec//86400,
                             time.strftime("%H:%M:%S", time.gmtime(elapsed_sec)))
print ("[Eval. done] time:[%s] ep_ret_avg:[%.3f].\n"%
       (elapsed_str,ep_ret_avg)
      )

### Close

In [None]:
# Shut down the Ray runtime that was started by ray.init in the Initialize cell
ray.shutdown()