# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic
- On Policy

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [ ] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [x] Gradient clipping
- [ ] Normalized Reward/Advantage
- [ ] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [x] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without rendering
    - A Saver that saves the model and weights to the ./model directory
    - A Writer that records the necessary data to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:


!rm -rf logs/HRL/ model/HRL

In [1]:
# Identifier of this training run; every output directory below is derived
# from it.
TRAIN_NAME = 'HRL'
LOG_PATH = f'./logs/{TRAIN_NAME}'        # summary/log directory
MODEL_PATH = f'./model/{TRAIN_NAME}'     # checkpoint directory
RENDER_PATH = f'./render/{TRAIN_NAME}'   # render output directory
# Fraction (0..1) of GPU memory; passed as per_process_gpu_memory_fraction
# when the session is created.
GPU_CAPACITY = 0.25

In [2]:
import os
import configparser

import signal
import threading
import multiprocessing

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import state_processor_v2 as state_processor
from utility.utils import MovingAverage as MA
from utility.utils import discount_rewards, store_args
from utility.buffer import Trajectory, Replay_buffer

from network.HAC import HAC as Network

import imageio

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [3]:
# Importing global configuration
config = configparser.ConfigParser()
config.read('config.ini')

## Environment
action_space = config.getint('DEFAULT','ACTION_SPACE')
n_agent = 4 #config.getint('DEFAULT','NUM_AGENT')
map_size = 50# config.getint('DEFAULT','MAP_SIZE')
vision_range = config.getint('DEFAULT','VISION_RANGE')

## Training
total_episodes = 150000#config.getint('TRAINING','TOTAL_EPISODES')
max_ep = config.getint('TRAINING','MAX_STEP')
critic_beta = config.getfloat('TRAINING', 'CRITIC_BETA')
entropy_beta = config.getfloat('TRAINING', 'ENTROPY_BETA')
gamma = config.getfloat('TRAINING', 'DISCOUNT_RATE')

decay_lr = config.getboolean('TRAINING','DECAYING_LR')
lr_a = 1e-5#config.getfloat('TRAINING','LR_ACTOR')
lr_c = 1e-4#config.getfloat('TRAINING','LR_CRITIC')

## Save/Summary
save_network_frequency = config.getint('TRAINING','SAVE_NETWORK_FREQ')
save_stat_frequency = config.getint('TRAINING','SAVE_STATISTICS_FREQ')
moving_average_step = config.getint('TRAINING','MOVING_AVERAGE_SIZE')

## GPU
gpu_capacity = config.getfloat('GPU_CONFIG','GPU_CAPACITY')
gpu_allowgrow = config.getboolean('GPU_CONFIG', 'GPU_ALLOWGROW')

In [4]:
# Local configuration parameters
minibatch_size = 500
optimization_steps = 5
batch_size = 500

# Env Settings
vision_dx, vision_dy = 2*vision_range+1, 2*vision_range+1
nchannel = 6
in_size = [None,vision_dx,vision_dy,nchannel]
shared_size = [None, 4]  # (Flag location, num allies, num enemy)
nenv = 8#(int) (multiprocessing.cpu_count())

# Asynch Settings
global_scope = 'global'

## Environment Setting

In [5]:
# Make sure the checkpoint, log and render directories exist before training.
# exist_ok=True replaces the racy "check, then create" pattern: the call
# succeeds whether or not the directory is already there.
for _out_dir in (MODEL_PATH, LOG_PATH, RENDER_PATH):
    os.makedirs(_out_dir, exist_ok=True)

In [6]:
# Moving-average trackers and the episode counter shared by all workers.
global_episodes = 0
global_rewards = MA(moving_average_step)
global_ep_rewards = MA(moving_average_step)
global_length = MA(moving_average_step)
global_succeed = MA(moving_average_step)

# Launch the session with a capped, growable share of GPU memory.
gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=GPU_CAPACITY,
    allow_growth=True,
)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

# Console progress bar, advanced once per finished episode.
progbar = tf.keras.utils.Progbar(total_episodes, interval=1)

## Worker

In [7]:
class Worker(object):
    """One asynchronous training worker.

    Each worker owns its own environment, a local goal-conditioned
    actor-critic network (synchronised with ``global_network``), a local
    meta-controller network (synchronised with ``target_network``) and a
    replay buffer used to train the meta-controller's Q-values.

    The constructor arguments are expected to be stored as attributes of the
    same name by ``@store_args`` (``self.name``, ``self.sess`` and
    ``self.target_network`` are read below) -- TODO confirm against
    utility.utils.store_args.
    """

    @store_args
    def __init__(self, name, global_network, target_network, sess, global_step=0):
        """Create the environment, the local networks and the replay buffer.

        Args:
            name: Worker name; also used as the variable scope of its networks.
            global_network: Shared low-level network the local one syncs with.
            target_network: Shared meta network the local meta-controller
                syncs with; also used as the second estimator in ``q_train``.
            sess: TensorFlow session used for every graph execution.
            global_step: Global step variable (not read directly in this class).
        """
        # Initialize Environment worker
        print(f'worker: {name} initiated')
        self.env = gym.make("cap-v0").unwrapped
        self.env.num_blue_ugv = n_agent
        self.env.num_red_ugv = 4
        # NOTE(review): the bare reset presumably exists so that get_map and
        # get_team_red are populated before the red policy is built -- confirm.
        self.env.reset()
        self.env.reset(map_size=map_size,
                       policy_red=policy.zeros.PolicyGen(self.env.get_map, self.env.get_team_red))
        print(f'worker: {name} environment info')
        print(f'    number of blue agents : {len(self.env.get_team_blue)}')
        print(f'    number of red agents  : {len(self.env.get_team_red)}')

        # Create Network for Worker
        # Meta-controller: three outputs, one per goal (flag / survive / attack).
        self.meta_controller = Network(local_state_shape=in_size,
                                       shared_state_shape=shared_size,
                                       action_size=3,
                                       scope=self.name+'_meta', lr_critic=1e-3,
                                       explicit_policy=False,
                                       sess=self.sess, global_network=target_network)

        # Low-level goal-conditioned actor-critic.
        self.network = Network(local_state_shape=in_size,
                               shared_state_shape=shared_size,
                               action_size=action_space,
                               scope=self.name, lr_actor=lr_a, lr_critic=lr_c,
                               entropy_beta = entropy_beta, critic_beta=1.0,
                               sess=self.sess, global_network=global_network)

        self.replay_buffer = Replay_buffer(depth=7, buffer_size=5000)

    def work(self, saver, writer):
        """Run episodes until the coordinator stops or the episode budget ends.

        Per episode: pick the active goal, roll out the low-level policy with
        a goal-dependent reward (0 when the goal condition holds, -1
        otherwise), train the low-level network on each agent's trajectory,
        and train the meta-controller once the replay buffer holds more than
        4000 transitions.

        Reads the module-level ``coord``, ``global_step_next``, ``progbar``
        and the moving-average trackers.

        Args:
            saver: ``tf.train.Saver`` used for periodic checkpoints.
            writer: ``tf.summary.FileWriter`` receiving the statistics.
        """
        global global_rewards, global_episodes, global_length, global_succeed

        # loop
        with self.sess.as_default(), self.sess.graph.as_default():
            while not coord.should_stop() and global_episodes < total_episodes:
                # Rotate the active goal every 10000 episodes. The schedule
                # must follow the running episode counter with integer
                # division; deriving it from the constant total_episodes
                # pinned goal_id to 0.0 for the whole run.
                goal_id = int((global_episodes // 10000) % 3)

                s0 = self.env.reset()
                s_local_1, s_gps_1, shared_state = state_processor(s0, self.env.get_team_blue, vision_range, self.env._env,
                                                               flatten=False, partial=False)

                # Select Goal
                if goal_id == 0:  # Flag
                    # One independent copy per agent: goals are edited in
                    # place below and must not alias each other or
                    # shared_state.
                    self.goal = [np.copy(shared_state) for _ in range(n_agent)]
                elif goal_id == 1:  # Survive
                    self.goal = [np.array(list(gps[:2])+shared_state[2:].tolist()) for gps in s_gps_1]
                elif goal_id == 2:  # Attack
                    self.goal = [np.array(list(gps[:2])+[shared_state[2],0]) for gps in s_gps_1]

                # Bootstrap
                a1, _ = self.network.run_network(s_local_1, s_gps_1, self.goal)
                is_alive = [ag.isAlive for ag in self.env.get_team_blue]
                indv_history = [ [] for _ in range(len(self.env.get_team_blue)) ]
                for step in range(max_ep+1):
                    # Iteration Reset
                    s_local_0=s_local_1
                    s_gps_0=s_gps_1
                    a0 = a1
                    was_alive = is_alive

                    # Action
                    s1, _, d, _ = self.env.step(a0)
                    s_local_1, s_gps_1, game_state = state_processor(s1, self.env.get_team_blue, vision_range, self.env._env,
                                                            flatten=False, partial=False)

                    # Reward / update goal
                    # The part of each goal that is not the target itself is
                    # refreshed from the current state every step.
                    reward = [-1]*n_agent
                    for idx, each_goal in enumerate(self.goal):
                        if goal_id == 0:  # Flag
                            if each_goal[0] == s_gps_1[idx][0] and each_goal[1] == s_gps_1[idx][1]:
                                reward[idx] = 0
                            each_goal[2:] = game_state[2:]
                        elif goal_id == 1:  # Survive
                            if each_goal[2] == game_state[2]:
                                reward[idx] = 0
                            each_goal[:2] = list(s_gps_1[idx])[:2]
                            each_goal[3] = game_state[3]
                        elif goal_id == 2:  # Attack
                            if game_state[3] == 0:
                                reward[idx] = 0
                            each_goal[:3] = list(s_gps_1[idx])[:2]+[game_state[2]]

                    # Get Next Action
                    a1, v1 = self.network.run_network(s_local_1, s_gps_1, self.goal)
                    is_alive = [ag.isAlive for ag in self.env.get_team_blue]

                    # Force termination on the last allowed step so the
                    # episode is always processed below.
                    if step == max_ep and d == False:
                        d = True

                    # push to buffer
                    for idx, agent in enumerate(self.env.get_team_blue):
                        if was_alive[idx]:
                            indv_history[idx].append([[s_local_0[idx], s_gps_0[idx]],
                                                      a0[idx],
                                                      [s_local_1[idx], s_gps_1[idx]],
                                                      np.copy(self.goal[idx]),
                                                      reward[idx]
                                                     ])
                            # Meta-level transition: the chosen goal plays
                            # the role of the action.
                            self.replay_buffer.append([s_local_0[idx],
                                                       goal_id,
                                                       1 if self.env.blue_win else -1,
                                                       s_local_1[idx],
                                                       game_state
                                                       ])

                    if d:
                        aloss, closs, etrpy = self.process_history(indv_history, v1)
                        r_episode = 1 if self.env.blue_win else -1  # Global Reward
                        break

                if len(self.replay_buffer) > 4000:
                    q, q_loss = self.q_train()
                    # A fresh Summary per write: value.add() appends, so a
                    # reused proto would re-emit every earlier value and grow
                    # without bound.
                    q_summary = tf.Summary()
                    q_summary.value.add(tag='summary/Q_loss', simple_value=q_loss)
                    q_summary.value.add(tag='Q_goal/Q0', simple_value=q[0])
                    q_summary.value.add(tag='Q_goal/Q1', simple_value=q[1])
                    q_summary.value.add(tag='Q_goal/Q2', simple_value=q[2])

                    writer.add_summary(q_summary,global_episodes)
                    writer.flush()

                global_rewards.append(r_episode)
                global_length.append(step)
                global_succeed.append(self.env.blue_win)
                global_episodes += 1
                self.sess.run(global_step_next)
                progbar.update(global_episodes)

                if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                    stat_summary = tf.Summary()
                    stat_summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
                    stat_summary.value.add(tag='Records/mean_length', simple_value=global_length())
                    stat_summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
                    stat_summary.value.add(tag='summary/Entropy', simple_value=etrpy)
                    stat_summary.value.add(tag='summary/actor_loss', simple_value=aloss)
                    stat_summary.value.add(tag='summary/critic_loss', simple_value=closs)
                    writer.add_summary(stat_summary,global_episodes)
                    writer.flush()

                if global_episodes % save_network_frequency == 0 and global_episodes != 0:
                    saver.save(self.sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)

    def process_history(self, indv_buffer, bootstrap):
        """Train the low-level network on every agent's episode trajectory.

        Args:
            indv_buffer: One list per agent of
                ``[[local_obs, gps_obs], action, [local_obs_1, gps_obs_1],
                goal, reward]`` entries. Agents with no entries are skipped.
            bootstrap: Per-agent value estimates for the final state, indexed
                by agent.

        Returns:
            Tuple ``(actor_loss, critic_loss, entropy)``, each averaged over
            the agents that were trained.
        """
        aloss, closs, entropy = [],[],[]
        for idx, buffer in enumerate(indv_buffer):
            if len(buffer) == 0:
                continue

            # Extract matrix
            local_obs = [mdp[0][0] for mdp in buffer]
            gps_obs = [mdp[0][1] for mdp in buffer]
            action = [mdp[1] for mdp in buffer]
            goal = [mdp[3] for mdp in buffer]
            reward = np.asarray([mdp[4] for mdp in buffer])

            # Discount Reward and Universal Advantage
            critic = self.network.get_critic(local_obs,
                                             gps_obs,
                                             goal)

            # Append the bootstrap value so value_ext[1:] is V(s') per step.
            value_ext = np.append(critic, bootstrap[idx])
            td_target = reward + gamma * value_ext[1:]
            advantage = td_target - value_ext[:-1]
            advantage = discount_rewards(advantage,gamma)

            td_target = td_target.tolist()
            advantage = advantage.tolist()

            al, cl, entr = self.train(local_obs, gps_obs, action, advantage, goal, td_target)
            aloss.append(al)
            closs.append(cl)
            entropy.append(entr)
        return np.mean(aloss), np.mean(closs), np.mean(entropy)

    def train(self, local_obs, gps_obs, action, advantage, goal, td_target):
        """Update the global low-level network, then re-sync the local copy.

        Returns:
            The ``(actor_loss, critic_loss, entropy)`` triple returned by
            ``update_global``.
        """
        al, cl, entr = self.network.update_global(local_obs, gps_obs,
                                   action, advantage, goal, td_target)
        self.network.pull_global()
        return al, cl, entr

    def q_train(self):
        """Train the meta-controller with double Q-learning targets.

        Shuffles the replay buffer and performs ``optimization_steps``
        updates on batches of ``batch_size`` transitions. The next action is
        chosen by the local meta-controller and evaluated by the target
        network.

        Returns:
            Tuple ``(mean_q, mean_loss)``: the ``q`` output of
            ``update_global`` averaged over all processed samples (axis 0),
            and the mean loss over the optimization steps.
        """
        ## Q learning Training Batch
        self.replay_buffer.shuffle()
        mean_q_loss = []
        q_tables = []
        for _ in range(optimization_steps):
            experience = self.replay_buffer.pop(batch_size)
            sl0 = [exp[0] for exp in experience]
            action = [exp[1] for exp in experience]
            reward = [exp[2] for exp in experience]
            sl1 = [exp[3] for exp in experience]
            goal = [exp[4] for exp in experience]

            # Use the worker's own session rather than the module-level one,
            # matching every other graph execution in this class.
            Q1 = self.sess.run(self.meta_controller.q,
                               feed_dict={self.meta_controller.state_input_: np.stack(sl1),
                                          self.meta_controller.goal_state_: np.stack(goal)}
                               )
            Q2 = self.sess.run(self.target_network.q,
                               feed_dict={self.target_network.state_input_: np.stack(sl1),
                                          self.target_network.goal_state_: np.stack(goal)}
                               )
            # Double Q: argmax from the online estimate, value from the target.
            doubleQ = np.array([Q[idx] for Q, idx in zip(Q2, np.argmax(Q1,axis=-1))])
            Q_target = reward + gamma * doubleQ
            q, q_loss = self.meta_controller.update_global(sl0, None, action, None, goal, None, sl1, Q_target)
            mean_q_loss.append(q_loss)
            q_tables.extend(q.tolist())
        q_tables = np.array(q_tables)
        return np.mean(q_tables, axis=0), np.mean(mean_q_loss)
    

## Run

In [8]:
# Global Network
global_step = tf.Variable(0, trainable=False, name='global_step')
global_step_next = tf.assign_add(global_step, 1)
global_network = Network(local_state_shape=in_size,
                         shared_state_shape=shared_size,
                         action_size=action_space,
                         scope=global_scope,
                         sess=sess)
target_network = Network(local_state_shape=in_size,
                         shared_state_shape=shared_size,
                         action_size=3,
                         scope=global_scope+'_meta',
                         explicit_policy=False,
                         sess=sess)

# Local workers
workers = []
# loop for each workers
for idx in range(nenv):
    name = 'W_%i' % idx
    print(f'worker: {name} initializing')
    workers.append(Worker(name, global_network, target_network, sess, global_step=global_step))
saver = tf.train.Saver(max_to_keep=3)
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)
    
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
    
coord = tf.train.Coordinator()
worker_threads = []
global_episodes = sess.run(global_step)

saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)

# Summarize
for var in tf.trainable_variables(scope=global_scope):
    tf.summary.histogram(var.name, var)
merged_summary_op = tf.summary.merge_all()

worker: W_0 initializing
worker: W_0 initiated
worker: W_0 environment info
    number of blue agents : 4
    number of red agents  : 4


  result = entry_point.load(False)


worker: W_1 initializing
worker: W_1 initiated
worker: W_1 environment info
    number of blue agents : 4
    number of red agents  : 4
worker: W_2 initializing
worker: W_2 initiated
worker: W_2 environment info
    number of blue agents : 4
    number of red agents  : 4
worker: W_3 initializing
worker: W_3 initiated
worker: W_3 environment info
    number of blue agents : 4
    number of red agents  : 4
worker: W_4 initializing
worker: W_4 initiated
worker: W_4 environment info
    number of blue agents : 4
    number of red agents  : 4
worker: W_5 initializing
worker: W_5 initiated
worker: W_5 environment info
    number of blue agents : 4
    number of red agents  : 4
worker: W_6 initializing
worker: W_6 initiated
worker: W_6 environment info
    number of blue agents : 4
    number of red agents  : 4
worker: W_7 initializing
worker: W_7 initiated
worker: W_7 environment info
    number of blue agents : 4
    number of red agents  : 4
INFO:tensorflow:Restoring parameters from ./mode

In [9]:
# Start one training thread per worker and block until all of them finish.
# The bound method is passed directly as the thread target: a
# `lambda: worker.work(...)` would look `worker` up only when the thread
# actually runs, by which time the loop may already have rebound it to a
# later worker.
for worker in workers:
    t = threading.Thread(target=worker.work, args=(saver, writer))
    t.start()
    worker_threads.append(t)
coord.join(worker_threads)



  return [np.random.choice(self.action_size, p=prob/sum(prob)) for prob in a_probs], critic




KeyboardInterrupt: 