# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic
- On Policy

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [ ] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy
Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [x] Gradient clipping
- [ ] Normalized Reward/Advantage
- [ ] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [x] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

!rm -rf model/task

In [2]:
!rm -rf logs/task

In [3]:
RESET_GLOBAL_EPISODE=True

In [4]:
TRAIN_NAME='task'
LOG_PATH='./logs/'+TRAIN_NAME
MODEL_PATH='./model/' + TRAIN_NAME
RENDER_PATH='./render/' + TRAIN_NAME
GPU_CAPACITY=0.7 # gpu capacity in percentage

In [5]:
import os
import configparser

import signal
import threading
import multiprocessing

import tensorflow as tf
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import state_processor, meta_state_processor
from utility.utils import MovingAverage as MA
from utility.utils import discount_rewards, store_args
from utility.buffer import Trajectory, Replay_buffer

from network.HAC_task import HAC_subcontroller as Network
from network.HAC_task import HAC_meta_controller as Meta_Network

import imageio

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [6]:
# Importing global configuration
config = configparser.ConfigParser()
config.read('config.ini')

## Environment
action_space = config.getint('DEFAULT','ACTION_SPACE')
n_agent = 3 #config.getint('DEFAULT','NUM_AGENT')
n_enemy = 5
map_size = 30# config.getint('DEFAULT','MAP_SIZE')
vision_range = config.getint('DEFAULT','VISION_RANGE')

## Training
total_episodes = 300000#config.getint('TRAINING','TOTAL_EPISODES')
epsilon_meta = 20000
max_ep = config.getint('TRAINING','MAX_STEP')
critic_beta = config.getfloat('TRAINING', 'CRITIC_BETA')
entropy_beta = config.getfloat('TRAINING', 'ENTROPY_BETA')
gamma = config.getfloat('TRAINING', 'DISCOUNT_RATE')

decay_lr = config.getboolean('TRAINING','DECAYING_LR')
lr_a = 1e-5#config.getfloat('TRAINING','LR_ACTOR')
lr_c = 1e-4#config.getfloat('TRAINING','LR_CRITIC')

## Save/Summary
save_network_frequency = config.getint('TRAINING','SAVE_NETWORK_FREQ')
save_stat_frequency = config.getint('TRAINING','SAVE_STATISTICS_FREQ')
moving_average_step = 2 * config.getint('TRAINING','MOVING_AVERAGE_SIZE')

## GPU
gpu_capacity = config.getfloat('GPU_CONFIG','GPU_CAPACITY')
gpu_allowgrow = config.getboolean('GPU_CONFIG', 'GPU_ALLOWGROW')

In [7]:
# Local configuration parameters
minibatch_size = 500
optimization_steps = 5
batch_size = 500

# Env Settings
vision_dx, vision_dy = 2*vision_range+1, 2*vision_range+1
nchannel = 6
in_size = [None,vision_dx,vision_dy,nchannel]
shared_size = [None, 4]  # (Flag location, num allies, num enemy)
nenv = 8  #(int) (multiprocessing.cpu_count())
num_strategy = 3

# Asynch Settings
global_scope = 'global'

## Environment Setting

In [8]:
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
    
#Create a directory to save episode playback gifs to
if not os.path.exists(LOG_PATH):
    os.makedirs(LOG_PATH)
    
if not os.path.exists(RENDER_PATH):
    os.makedirs(RENDER_PATH)

In [9]:
global_rewards = MA(moving_average_step)
global_ep_rewards = MA(moving_average_step)
global_length = MA(moving_average_step)
global_succeed = MA(moving_average_step)
mean_strategy_rewards = [MA(moving_average_step) for _ in range(num_strategy)]
global_episodes = 0

# Launch the session
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_CAPACITY, allow_growth=True)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
progbar = tf.keras.utils.Progbar(total_episodes,interval=1)

## Worker

In [10]:
class Worker(object):
    @store_args
    def __init__(self, name, global_network, global_meta_network, sess, global_step=0):
        # Initialize Environment worker
        print(f'worker: {name} initiated')
        self.env = gym.make("cap-v0").unwrapped
        self.env.num_blue_ugv = n_agent
        self.env.num_red_ugv = n_enemy
        self.sparse_reward = True
        self.env.reset()
        self.env.reset(map_size=map_size,
                       policy_red=policy.roomba.PolicyGen(self.env.get_map, self.env.get_team_red))
        print(f'worker: {name} environment info')
        print(f'    number of blue agents : {len(self.env.get_team_blue)}')
        print(f'    number of red agents  : {len(self.env.get_team_red)}')
        
        # Create Network for Worker
        self.network = Network(local_state_shape=in_size,
                               shared_state_shape=shared_size,
                               action_size=action_space,
                               scope=self.name, lr_actor=lr_a, lr_critic=lr_c,
                               entropy_beta = entropy_beta, critic_beta=1.0,
                               sess=self.sess, global_network=global_network)
        
        self.meta_network = Meta_Network(local_state_shape=[None, map_size, map_size, 8],
                              shared_state_shape=[None, 4],
                              action_size=3,
                              scope='meta_'+self.name,
                              sess=sess, global_network=global_meta_network)
    def get_action(self, local_obs, shared_obs, strategies):
        action = []
        for local, share, strategy in zip(local_obs, shared_obs, strategies):
            a1, _ = self.network.run_network(local[np.newaxis,:], share[np.newaxis,:], strategy)
            action.append(a1[0])
        return action
        
    def work(self, saver, writer):
        global global_rewards, global_episodes, global_length, global_succeed
        global mean_strategy_rewards
        summary = tf.Summary()
        
        self.network.pull_global_all()
        self.meta_network.pull_global_all()
        
        # loop
        with self.sess.as_default(), self.sess.graph.as_default():
            while not coord.should_stop() and global_episodes < total_episodes:
                s0 = self.env.reset()
                s_local_1, s_gps_1, _ = state_processor(s0, self.env.get_team_blue, vision_range, self.env._env,
                                                               flatten=False, partial=False)
                meta_obs1, meta_shared_obs1 = meta_state_processor(self.env._env, game_info=None, map_size=map_size, flatten=False, reverse=False)
            
                # Bootstrap
                if epsilon_meta < global_episodes:
                    strategy, _ = self.meta_network.run_network(meta_obs1, meta_shared_obs1)
                else:
                    #strategy = random.sample(range(3), n_agent)
                    strategy = [1]*n_agent
                a1 = self.get_action(s_local_1, s_gps_1, strategy)

                is_alive = [ag.isAlive for ag in self.env.get_team_blue]
                indv_history = [ [] for _ in range(n_agent) ]
                indv_meta_history = [ [] for _ in range(n_agent) ]
                
                prev_num_enemy = n_enemy
                prev_env_reward = 0
                ep_strategy_reward = [0]*num_strategy

                for step in range(max_ep+1):
                    # Iteration Reset
                    s_local_0=s_local_1
                    s_gps_0=s_gps_1
                    a0 = a1
                    was_alive = is_alive
                    meta_obs0 = meta_obs1
                    meta_shared_obs0 = meta_shared_obs1
                    
                    # Action
                    s1, env_reward, d, info = self.env.step(a0)
                    ep_reward = env_reward - prev_env_reward
                    prev_env_reward = env_reward
                    ep_reward /= 100
                    if step == max_ep and d == False:
                        ep_reward = -1
                        d = True
                        
                    s_local_1, s_gps_1, _ = state_processor(s1, self.env.get_team_blue, vision_range, self.env._env,
                                                               flatten=False, partial=False)
                    meta_obs1, meta_shared_obs1 = meta_state_processor(self.env._env, game_info=info,
                                                               map_size=map_size, flatten=False, reverse=False)
                    
                    # Get Next Action
                    a1 = self.get_action(s_local_1, s_gps_1, strategy)
                    is_alive = info['blue_alive'][-1]

                    # Reward Expansion
                    strategy_reward = [0]*3
                    # Attack
                    num_enemy = sum(info['red_alive'][-1])
                    r = int(prev_num_enemy - num_enemy)
                    prev_num_enemy = num_enemy
                    strategy_reward[0] = r
                    ep_strategy_reward[0] += r
                    # Search 
                    if info['red_flag_caught'][-1]:
                        r= 1
                    elif d:
                        r=-1
                    else:
                        r= 0
                    strategy_reward[1] = r
                    ep_strategy_reward[1] += r
                    # Defend
                    if info['blue_flag_caught'][-1]:
                        r = -1
                    elif d:
                        r = 1
                    else:
                        r = 0
                    strategy_reward[2] = r
                    ep_strategy_reward[2] += r
                    # Push to buffer
                    for idx, agent in enumerate(self.env.get_team_blue):
                        if was_alive[idx]:
                            indv_history[idx].append([[s_local_0[idx], s_gps_0[idx]],
                                                      a0[idx],
                                                      strategy_reward[strategy[idx]]
                                                     ])
                            indv_meta_history[idx].append([[meta_obs0[idx], meta_shared_obs0[idx]],
                                                      strategy[idx],
                                                      ep_reward
                                                     ])
                            
                    if d:
                        aloss = []
                        closs = []
                        etrpy = []
                        v1 = [self.network.get_critic(loc[np.newaxis,:,:,:], shr[np.newaxis,:], str_id)[0]
                              for loc, shr, str_id in zip(s_local_1, s_gps_1, strategy)]
                        v1_meta = [self.meta_network.get_critic(loc[np.newaxis,:], shr[np.newaxis,:])[0] for loc, shr in zip(meta_obs1, meta_shared_obs1)]
                        for history, bootstrap, strategy_id in zip(indv_history, v1, strategy):
                            if len(history) <= 0:
                                continue
                            al, cl, etr = self.process_history(history, bootstrap, strategy_id)
                            aloss.append(al)
                            closs.append(cl)
                            etrpy.append(etr)
                        for history, bootstrap in zip(indv_meta_history, v1_meta):
                            if len(history) <= 0:
                                continue
                            al, cl, etr = self.process_meta_history(history, bootstrap)
                        r_episode = 1 if self.env.blue_win else -1  # Global Reward
                        break
                        
                global_rewards.append(r_episode)
                global_length.append(step)
                global_succeed.append(self.env.blue_win)
                global_episodes += 1
                self.sess.run(global_step_next)
                progbar.update(global_episodes)
                
                for sid in range(num_strategy):
                    mean_strategy_rewards[sid].append(ep_strategy_reward[sid])
                
                if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                    summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
                    summary.value.add(tag='Records/mean_length', simple_value=global_length())
                    summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
                    summary.value.add(tag='summary/entropy', simple_value=np.mean(etrpy))
                    summary.value.add(tag='summary/actor_loss', simple_value=np.mean(aloss))
                    summary.value.add(tag='summary/critic_loss', simple_value=np.mean(closs))
                    for sid in range(num_strategy):
                        summary.value.add(tag=f'strategy/reward_{sid}', simple_value=mean_strategy_rewards[sid]())
                        
                    writer.add_summary(summary,global_episodes)
                    writer.flush()
                    
                if global_episodes % save_network_frequency == 0 and global_episodes != 0:
                    saver.save(self.sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)
                        
    def process_history(self, buffer, bootstrap, strategy_id):
        played_size = len(buffer)

        # Extract matrix    
        local_obs, gps_obs, action, reward = [],[],[],[]
        for mdp in buffer:
            local_obs.append(mdp[0][0])  # 0.0
            gps_obs.append(mdp[0][1])    # 0.1
            action.append(mdp[1])        # 1
            reward.append(mdp[2])


        # Discount Reward and Universal Advantage
        critic = self.network.get_critic(local_obs,
                                         gps_obs,
                                         strategy_id)
        value_ext = np.append(critic, bootstrap)
        td_target = reward + gamma * value_ext[1:]
        advantage = reward + gamma * value_ext[1:] - value_ext[:-1]
        advantage = discount_rewards(advantage,gamma)

        td_target = td_target.tolist()   # 2
        advantage = advantage.tolist()   # 4

        aloss, closs, entropy = self.train(local_obs, gps_obs, action, advantage, td_target, strategy_id)
        return aloss, closs, entropy
        
    def train(self, local_obs, gps_obs, action, advantage, td_target, strategy_id):
        aloss, closs, entropy = self.network.update_global(local_obs, gps_obs,
                                   action, advantage, td_target, strategy_id)
        self.network.pull_global(strategy_id)
        return aloss, closs, entropy
    
    def process_meta_history(self, buffer, bootstrap):
        played_size = len(buffer)

        # Extract matrix    
        local_obs, gps_obs, action, reward = [],[],[],[]
        for mdp in buffer:
            local_obs.append(mdp[0][0])  # 0.0
            gps_obs.append(mdp[0][1])    # 0.1
            action.append(mdp[1])        # 1
            reward.append(mdp[2])


        # Discount Reward and Universal Advantage
        critic = self.meta_network.get_critic(local_obs, gps_obs)
        value_ext = np.append(critic, bootstrap)
        td_target = reward + gamma * value_ext[1:]
        advantage = reward + gamma * value_ext[1:] - value_ext[:-1]
        advantage = discount_rewards(advantage,gamma)

        td_target = td_target.tolist()   # 2
        advantage = advantage.tolist()   # 4

        aloss, closs, entropy = self.train_meta(local_obs, gps_obs, action, advantage, td_target)
        return aloss, closs, entropy
        
    def train_meta(self, local_obs, gps_obs, action, advantage, td_target):
        aloss, closs, entropy = self.meta_network.update_global(local_obs, gps_obs,
                                   action, advantage, td_target)
        self.meta_network.pull_global_all()
        return aloss, closs, entropy
    

## Run

In [None]:
# Global Network
global_step = tf.Variable(0, trainable=False, name=global_scope+'/global_step')
global_step_next = tf.assign_add(global_step, 1)
global_network = Network(local_state_shape=in_size,
                         shared_state_shape=shared_size,
                         action_size=action_space,
                         scope=global_scope,
                         sess=sess)
global_meta_network = Meta_Network(local_state_shape=[None, map_size, map_size, 8],
                              shared_state_shape=[None, 4],
                              action_size=3,
                              scope='meta_'+global_scope,
                              sess=sess)

# Local workers
workers = []
# loop for each workers
for idx in range(nenv):
    name = 'W_%i' % idx
    print(f'worker: {name} initializing')
    workers.append(Worker(name, global_network, global_meta_network, sess, global_step=global_step))
    

worker: W_0 initializing
worker: W_0 initiated
worker: W_0 environment info
    number of blue agents : 3
    number of red agents  : 5


  result = entry_point.load(False)


worker: W_1 initializing
worker: W_1 initiated
worker: W_1 environment info
    number of blue agents : 3
    number of red agents  : 5
worker: W_2 initializing
worker: W_2 initiated
worker: W_2 environment info
    number of blue agents : 3
    number of red agents  : 5
worker: W_3 initializing
worker: W_3 initiated
worker: W_3 environment info
    number of blue agents : 3
    number of red agents  : 5
worker: W_4 initializing
worker: W_4 initiated
worker: W_4 environment info
    number of blue agents : 3
    number of red agents  : 5
worker: W_5 initializing
worker: W_5 initiated
worker: W_5 environment info
    number of blue agents : 3
    number of red agents  : 5
worker: W_6 initializing
worker: W_6 initiated
worker: W_6 environment info
    number of blue agents : 3
    number of red agents  : 5
worker: W_7 initializing
worker: W_7 initiated
worker: W_7 environment info
    number of blue agents : 3
    number of red agents  : 5


In [None]:
# Prepare summary to only record global
#regular_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=global_scope)
#saver2 = tf.train.Saver(var_list = regular_vars, max_to_keep=3)
global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=global_scope) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='meta_'+global_scope)
global_vars.append(global_step)
saver = tf.train.Saver(var_list = global_vars, max_to_keep=3)
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

def initialize_uninitialized_vars(sess):
    from itertools import compress
    global_vars = tf.global_variables()
    is_not_initialized = sess.run([~(tf.is_variable_initialized(var)) \
                                   for var in global_vars])
    not_initialized_vars = list(compress(global_vars, is_not_initialized))

    if len(not_initialized_vars):
        sess.run(tf.variables_initializer(not_initialized_vars))
    

In [None]:
# Restore Weights
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    initialize_uninitialized_vars(sess)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
if RESET_GLOBAL_EPISODE:
    sess.run(tf.assign(global_step, 0))
    
coord = tf.train.Coordinator()
worker_threads = []
global_episodes = sess.run(global_step)

saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)
print('    initial save done')

# Summarize
for var in tf.trainable_variables(scope=global_scope):
    tf.summary.histogram(var.name, var)
merged_summary_op = tf.summary.merge_all()

Initialized Variables
    initial save done
INFO:tensorflow:Summary name global/policy_0/conv_0/weights:0 is illegal; using global/policy_0/conv_0/weights_0 instead.
INFO:tensorflow:Summary name global/policy_0/conv_0/biases:0 is illegal; using global/policy_0/conv_0/biases_0 instead.
INFO:tensorflow:Summary name global/policy_0/conv_1/weights:0 is illegal; using global/policy_0/conv_1/weights_0 instead.
INFO:tensorflow:Summary name global/policy_0/conv_1/biases:0 is illegal; using global/policy_0/conv_1/biases_0 instead.
INFO:tensorflow:Summary name global/policy_0/conv_2/weights:0 is illegal; using global/policy_0/conv_2/weights_0 instead.
INFO:tensorflow:Summary name global/policy_0/conv_2/biases:0 is illegal; using global/policy_0/conv_2/biases_0 instead.
INFO:tensorflow:Summary name global/policy_0/dense_0gps_proc/weights:0 is illegal; using global/policy_0/dense_0gps_proc/weights_0 instead.
INFO:tensorflow:Summary name global/policy_0/dense_0gps_proc/biases:0 is illegal; using gl

In [None]:
for worker in workers:
    job = lambda: worker.work(saver, writer)
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)
coord.join(worker_threads)

 20001/300000 [=>............................] - ETA: 84:59:08

  return [np.random.choice(self.action_size, p=prob/sum(prob)) for prob in a_probs], critic


