# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic experience buffer.
- Off-policy: Training is done at the end of the rollout

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [x] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [x] Gradient clipping
- [x] Normalized Reward/Advantage
- [ ] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [x] Synchronous Training (A2C)
- [ ] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without rendering.
    - A saver that saves the model and weights to the ./model directory.
    - A writer that records the necessary summary data to the ./logs directory.

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- enemy with different policies (zero, patrol)
- stochastic interaction
- Reward -> only 100 for completion (with small observation)

In [1]:
# WARNING: permanently deletes the logs and checkpoints of any previous
# 'B4R4_Rzero_SyncA2C' run, so training starts from scratch. Skip this cell to
# resume from an existing checkpoint. The run name is hard-coded here and has
# to be kept in sync with TRAIN_NAME by hand.
!rm -rf logs/B4R4_Rzero_SyncA2C/ model/B4R4_Rzero_SyncA2C

In [2]:
# Name of this training run; it keys both the log and the checkpoint folders.
TRAIN_NAME = 'B4R4_Rzero_SyncA2C'
LOG_PATH = './logs/{}'.format(TRAIN_NAME)      # summary event files
MODEL_PATH = './model/{}'.format(TRAIN_NAME)   # saved checkpoints
# Fraction (0-1, not a percentage) of GPU memory given to the TF session.
GPU_CAPACITY = 0.35

In [3]:
import os

import signal

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import Experience_buffer, discount_rewards
from utility.vec_env import SubprocVecEnv

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [4]:
# --- Rollout / replay -------------------------------------------------------
total_episodes = 5000000    # only sizes the progress bar; the loop is `while True`
max_ep = 150                # step limit per rollout before the timeout penalty
update_frequency = 2        # train whenever ep % update_frequency == 0
batch_size = 512            # transitions sampled per update
experience_size = 2048      # capacity of the experience buffer

# --- Saving / logging -------------------------------------------------------
save_network_frequency = 1024   # checkpoint when ep is a multiple of this
save_stat_frequency = 128       # write summaries when ep is a multiple of this
moving_average_step = 128       # window of the moving-average trackers

# --- Optimisation -----------------------------------------------------------
LEARNING_RATE_FIX = True        # True keeps both learning rates constant
LEARNINGRATE_AC = 1e-5          # actor learning rate
LEARNINGRATE_CRITIC = 1e-4      # critic learning rate
LR_DECAY = 0.9995               # per-iteration actor decay (when not fixed)
LR_CRITIC_DECAY = 0.995         # per-iteration critic decay (when not fixed)
# NOTE(review): both floors below are larger than the initial rates above, so
# enabling the decay (max(lr * decay, floor)) would raise the rates instead of
# lowering them.
LR_FINAL = 1e-4
LR_CRITIC_FINAL = 5e-4
gamma = 0.99                    # discount on the bootstrapped next-state value
discount_factor = 0.99          # discount passed to discount_rewards

# --- Environment ------------------------------------------------------------
MAP_SIZE = 20
VISION_RANGE = 9                # sets the observation window, hence the network input size
VISION_dX = VISION_dY = 2 * VISION_RANGE + 1
NENV = 8                        # number of parallel environments

## Environment Setting

In [None]:
# Make sure the checkpoint and summary directories exist before the Saver and
# the FileWriter use them. exist_ok=True replaces the race-prone
# "os.path.exists, then os.makedirs" pair and is a no-op when the directory is
# already there.
os.makedirs(MODEL_PATH, exist_ok=True)   # model checkpoints
os.makedirs(LOG_PATH, exist_ok=True)     # summary event files

In [None]:
# action_space: number of discrete actions sampled from during a rollout.
# n_agent: number of agents (presumably per team -- TODO confirm against the env).
action_space, n_agent = 5, 4

## Policy Network

In [None]:
def normalized_columns_initializer(std=1.0):
    """Build a weight initializer whose columns all have Euclidean norm `std`.

    Used for the policy and value output layers. The returned callable follows
    the TensorFlow initializer signature; `dtype` and `partition_info` are
    accepted for compatibility but ignored (the result is always float32).
    """
    def _initializer(shape, dtype=None, partition_info=None):
        weights = np.random.randn(*shape).astype(np.float32)
        # Norm along axis 0, kept with its dimension so it broadcasts back.
        column_norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
        return tf.constant(weights * (std / column_norms))
    return _initializer

In [None]:
class Agent:
    """Actor-critic network with a shared convolutional trunk.

    Three conv layers (with two max-pools) feed two heads: an `actor` head
    that outputs action probabilities and a `critic` head that outputs a
    scalar state value. Each head has its own Adam optimizer, and both
    optimizers also update the shared conv variables.

    Args:
        in_size: shape of the state placeholder, e.g. [None, H, W, C].
        action_size: number of discrete actions (width of the actor output).
        grad_clip_norm: per-tensor norm used to clip the actor gradients. A
            falsy value (None / 0) disables clipping for both optimizers.
        critic_grad_clip_norm: per-tensor norm used to clip the critic
            gradients when clipping is enabled. Defaults to 70, the value
            that was previously hard-coded.
    """

    def __init__(self, in_size, action_size, grad_clip_norm, critic_grad_clip_norm=70):
        # Parameters
        self.grad_clip_norm = grad_clip_norm
        self.critic_grad_clip_norm = critic_grad_clip_norm

        # Learning rates are fed at run time so they can be changed per update.
        self.learning_rate = tf.placeholder(tf.float32, shape=None, name='learning_rate')
        self.learning_rate_critic = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic')

        # Placeholders
        self.state_input = tf.placeholder(shape=in_size,dtype=tf.float32, name='state')
        self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        self.action_OH = tf.one_hot(self.action_holder, action_size)
        # reward_holder only feeds a histogram summary; it is not part of any loss.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32, name='reward')
        self.td_target_holder = tf.placeholder(shape=[None], dtype=tf.float32, name='td_target')
        self.advantage_holder = tf.placeholder(shape=[None], dtype=tf.float32, name='adv')

        # Feed-Forward Network
        # Shared convolutional trunk (variables are auto-named Conv, Conv_1, Conv_2).
        layer = slim.conv2d(self.state_input, 32, [5,5], activation_fn=tf.nn.relu,
                            weights_initializer=layers.xavier_initializer_conv2d(),
                            biases_initializer=tf.zeros_initializer(),
                            padding='VALID')
        layer = slim.max_pool2d(layer, [2,2])
        layer = slim.conv2d(layer, 64, [3,3], activation_fn=tf.nn.relu,
                            weights_initializer=layers.xavier_initializer_conv2d(),
                            biases_initializer=tf.zeros_initializer(),
                            padding='VALID')
        layer = slim.max_pool2d(layer, [2,2])
        layer = slim.conv2d(layer, 64, [2,2], activation_fn=tf.nn.relu,
                            weights_initializer=layers.xavier_initializer_conv2d(),
                            biases_initializer=tf.zeros_initializer(),
                            padding='VALID')
        layer = slim.flatten(layer)

        # Actor stream: hidden layer -> logits -> softmax probabilities.
        with tf.variable_scope('actor'):
            actor = layers.fully_connected(layer, 128,
                                        weights_initializer=normalized_columns_initializer(0.001),
                                        activation_fn=tf.nn.relu)
            self.actor = layers.fully_connected(actor, action_size,
                                        weights_initializer=normalized_columns_initializer(0.001),
                                        activation_fn=None)
            self.output = tf.nn.softmax(self.actor, name='action')
            self.output_argmax = tf.argmax(self.output, axis=1,output_type=tf.int32, name='argmax')

        # Value stream: one linear unit, flattened to a vector of state values.
        with tf.variable_scope('critic'):
            self.critic = layers.fully_connected(layer, 1,
                                                 weights_initializer=normalized_columns_initializer(1.0),
                                                 activation_fn=None)
            self.critic = tf.reshape(self.critic, [-1])

        # Feed Backward
        # - compute the losses, use them to find gradients, and update the network
        # - May need to add a bootstrap value at the end of the value
        # The scope filter matches by name prefix, so 'Conv' also collects
        # Conv_1 and Conv_2: both variable lists contain the whole conv trunk.
        self.actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Conv')+tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor')
        print(self.actor_vars)
        self.critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Conv')+tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic')
        print(self.critic_vars)
        with tf.name_scope('train'):
            # log_softmax on the logits is the numerically stable form of
            # log(softmax(logits)): it stays finite when a probability
            # underflows to 0, where tf.log(self.output) would yield -inf and
            # turn the entropy and the actor loss into NaN/inf.
            self.log_output = tf.nn.log_softmax(self.actor, name='log_action')
            self.entropy = -tf.reduce_sum(self.output * self.log_output, name='entropy')

            # loss critic: mean squared error against the fed TD target.
            self.loss_critic = tf.reduce_mean(tf.square(self.td_target_holder - self.critic))
            self.optimizer_critic = tf.train.AdamOptimizer(self.learning_rate_critic)
            self.grads_critic = self.optimizer_critic.compute_gradients(self.loss_critic, self.critic_vars)
            if self.grad_clip_norm:
                self.grads_critic = [(tf.clip_by_norm(grad, self.critic_grad_clip_norm), var) for grad, var in self.grads_critic]
            self.update_critic = self.optimizer_critic.apply_gradients(self.grads_critic)

            # loss actor: -sum(log pi(a|s) * advantage).
            self.policy_outputs = tf.reduce_sum(self.output * self.action_OH, 1)
            # Log-probability of the taken action, picked out with the one-hot mask.
            self.objective_function = tf.reduce_sum(self.log_output * self.action_OH, 1)
            self.loss_actor = -tf.reduce_sum(self.objective_function*self.advantage_holder) # using holder, no need to stop gradient
            self.optimizer_actor = tf.train.AdamOptimizer(self.learning_rate)
            self.grads_actor = self.optimizer_actor.compute_gradients(self.loss_actor, self.actor_vars)
            if self.grad_clip_norm:
                self.grads_actor = [(tf.clip_by_norm(grad, self.grad_clip_norm), var) for grad, var in self.grads_actor]
            self.update_actor = self.optimizer_actor.apply_gradients(self.grads_actor)

        with tf.name_scope('update'):
            # self.loss is only reported in the summaries; the two optimizers
            # above minimise their own losses separately.
            self.loss = 0.5*self.loss_critic + self.loss_actor# - self.entropy * 0.0001
            self.update_batch = tf.group([self.update_critic,self.update_actor])
            self.extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # Summary
        # Histogram output
        with tf.name_scope('debug_parameters'):
            tf.summary.histogram('output', self.output)
            tf.summary.histogram('actor', self.actor)
            tf.summary.histogram('critic', self.critic)
            tf.summary.histogram('action', self.action_holder)
            tf.summary.histogram('objective_function', self.objective_function)
            tf.summary.histogram('td_target', self.td_target_holder)
            tf.summary.histogram('rewards_in', self.reward_holder)
            tf.summary.histogram('advantage_in', self.advantage_holder)

        with tf.name_scope('gradient'):
            for grad, var in self.grads_critic:
                tf.summary.histogram(var.name+'_grad', grad)
            for grad, var in self.grads_actor:
                tf.summary.histogram(var.name+'_grad', grad)

        # Graph summary Loss
        with tf.name_scope('summary'):
            tf.summary.scalar(name='actor_loss', tensor=self.loss_actor)
            tf.summary.scalar(name='critic_loss', tensor=self.loss_critic)
            tf.summary.scalar(name='total_loss', tensor=self.loss)
            tf.summary.scalar(name='Entropy', tensor=self.entropy)

        with tf.name_scope('weights_bias'):
            # Histogram weights and bias
            for var in slim.get_model_variables():
                tf.summary.histogram(var.op.name, var)

        with tf.name_scope('Learning_Rate'):
            # Learning Rate
            tf.summary.scalar(name='actor_lr', tensor=self.learning_rate)

In [None]:
# Start from an empty default graph so re-running this cell does not pile up
# duplicate ops and variables.
tf.reset_default_graph()

# Build the agent: batches of VISION_dX x VISION_dY observations with 6 channels.
agent_input_shape = [None, VISION_dX, VISION_dY, 6]
myAgent = Agent(in_size=agent_input_shape, action_size=5, grad_clip_norm=50)

# Episode counter stored in the graph (and therefore in the checkpoints); it
# advances by NENV because every rollout plays NENV environments.
with tf.variable_scope('global_step'):
    global_step = tf.Variable(0, trainable=False, name='global_step')
    increment_global_step_op = tf.assign(global_step, global_step + NENV)

# Single op that evaluates every summary registered while building the agent.
merged = tf.summary.merge_all()

[<tf.Variable 'Conv/weights:0' shape=(5, 5, 6, 32) dtype=float32_ref>, <tf.Variable 'Conv/biases:0' shape=(32,) dtype=float32_ref>, <tf.Variable 'Conv_1/weights:0' shape=(3, 3, 32, 64) dtype=float32_ref>, <tf.Variable 'Conv_1/biases:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'Conv_2/weights:0' shape=(2, 2, 64, 64) dtype=float32_ref>, <tf.Variable 'Conv_2/biases:0' shape=(64,) dtype=float32_ref>, <tf.Variable 'actor/fully_connected/weights:0' shape=(64, 128) dtype=float32_ref>, <tf.Variable 'actor/fully_connected/biases:0' shape=(128,) dtype=float32_ref>, <tf.Variable 'actor/fully_connected_1/weights:0' shape=(128, 5) dtype=float32_ref>, <tf.Variable 'actor/fully_connected_1/biases:0' shape=(5,) dtype=float32_ref>]
[<tf.Variable 'Conv/weights:0' shape=(5, 5, 6, 32) dtype=float32_ref>, <tf.Variable 'Conv/biases:0' shape=(32,) dtype=float32_ref>, <tf.Variable 'Conv_1/weights:0' shape=(3, 3, 32, 64) dtype=float32_ref>, <tf.Variable 'Conv_1/biases:0' shape=(64,) dtype=float32_ref>, <tf

## Session

In [None]:
# Open the session with a capped, grow-on-demand share of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_CAPACITY,
                            allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

# Moving-average trackers for episode reward, episode length and success.
ma_reward, ma_length, ma_captured = (MA(moving_average_step) for _ in range(3))

# Checkpoint saver over all global variables, and the summary writer.
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint in MODEL_PATH when one exists; otherwise
# start from freshly initialised variables.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if not (ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)):
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
else:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)

Initialized Variables


In [None]:
def record(summary_):
    """Write the moving-average statistics and a merged summary to the log.

    summary_: an already evaluated summary (the result of running `merged`).
    Both it and the moving averages are tagged with the current value of
    `global_step`; the writer is flushed afterwards.
    """
    with tf.device('/cpu:0'):
        step = sess.run(global_step)

        stats = tf.Summary()
        tracked = (('Records/mean_reward', ma_reward),
                   ('Records/mean_length', ma_length),
                   ('Records/mean_succeed', ma_captured))
        for tag, tracker in tracked:
            stats.value.add(tag=tag, simple_value=tracker())

        writer.add_summary(stats, step)
        writer.add_summary(summary_, step)
        writer.flush()

In [None]:
# One CtF environment per worker.
env_list = [gym.make("cap-v0") for _ in range(NENV)]

# The red team plays the policy from policy.zeros, built from the first
# environment's map and red team.
first_env = env_list[0]
red_policy = policy.zeros.PolicyGen(first_env.get_map, first_env.get_team_red)

# Vectorised wrapper that steps all NENV environments together.
envs = SubprocVecEnv(NENV, env_list, map_size=MAP_SIZE, initial_red=red_policy)

Process 0 Initiated
Process 1 Initiated
Process 2 Initiated
Process 3 Initiated
Process 4 Initiated
Process 5 Initiated
Process 6 Initiated
Process 7 Initiated


In [None]:
def policy_rollout(nenv = NENV):
    """Play one episode in each of the `nenv` vectorised environments.

    Every step, the actor is evaluated on all agents of all environments at
    once and one action per agent is sampled from the resulting probabilities.
    Each agent accumulates its own transition list; when its environment
    finishes (or hits the `max_ep` step limit, which is scored as -100), the
    list is post-processed and appended to the episode history.

    A stored transition is the row
        [state, action, discounted_return / 100, td_target, advantage, policy]
    where td_target = gamma * V(s1) + discounted_return / 100 and
    advantage = td_target - V(s0).

    Returns:
        [frames, ep_history, r1s, wons]
        frames: step index at which each environment finished.
        ep_history: stacked transition rows (an empty list if none recorded).
        r1s: the last rewards returned by `envs.step` (with the timeout
            penalty written in for environments that ran out of steps).
        wons: per finished environment, whether its final step reward was 100.
    """
    ss, teams = envs.reset()

    s1_OH = [one_hot_encoder(s0, team, VISION_RANGE) for s0,team in zip(ss, teams)]
    env_live = [True for _ in range(nenv)]
    n_alive = nenv

    prev_rewards = np.zeros(nenv)
    # indv_histories[env][agent] -> list of transitions of that agent
    indv_histories = [[[] for _ in range(len(teams[0]))] for __ in range(len(teams))]
    was_alives = [[ag.isAlive for ag in team] for team in teams]
    frames, wons = [], []
    ep_history = []
    for steps in range(max_ep + 1):
        s0_OH = s1_OH
        s0_OH_flat = np.concatenate(s0_OH) # flatten to run NN

        with tf.device('/cpu:0'):
            a_probs, v0s = sess.run([myAgent.output, myAgent.critic], feed_dict={myAgent.state_input:s0_OH_flat})
        # Re-normalise before sampling so float rounding cannot make
        # np.random.choice reject the probabilities.
        actions = [np.random.choice(action_space, p=prob/sum(prob)) for prob in a_probs]

        # Re-group the flat per-agent results into one row per environment
        beta_policies = np.reshape(a_probs, (nenv,len(teams[0]),-1))
        v0s = np.reshape(v0s, (nenv,-1))
        actions = np.reshape(actions, (nenv,-1)).tolist()

        for idx in range(nenv):
            if not env_live[idx]:
                # Finished environments get an empty action list. The entry is
                # replaced in place: inserting would lengthen the list and
                # shift every later environment's actions away from the
                # states, values and policies that are indexed by id_env.
                actions[idx] = []

        s1s, r1s, ds = envs.step(actions) # Get Response

        s1_OH = [one_hot_encoder(s1, team, VISION_RANGE) for s1,team in zip(s1s, teams)]
        # Per-step reward as the difference of consecutive values from envs.step
        # (presumably cumulative rewards -- TODO confirm against the environment).
        rs = r1s - prev_rewards

        for id_env in range(nenv): # For each environment
            if not env_live[id_env]:
                continue
            s0, a, r, s1, d, team = s0_OH[id_env], actions[id_env], rs[id_env], s1_OH[id_env], ds[id_env], teams[id_env]
            v0, beta_policy = v0s[id_env], beta_policies[id_env]
            indv_history, was_alive  = indv_histories[id_env], was_alives[id_env]

            # Out of steps: end the episode here and score it as a loss.
            if steps == max_ep and d == False:
                r = -100
                r1s[id_env] = -100
                d = True

            # Terminal states bootstrap from 0, otherwise from the critic.
            if d:
                v1 = np.array([0.0 for _ in range(len(team))])
            else:
                v1 = sess.run(myAgent.critic, feed_dict={myAgent.state_input:s1})

            # Only agents that were alive when the action was chosen record
            # the transition.
            for idx in range(len(team)):
                if was_alive[idx]:
                    indv_history[idx].append([s0[idx],a[idx],r,gamma*v1[idx],gamma*v1[idx]-v0[idx],beta_policy[idx]])
            # Store the refreshed flags back into was_alives; assigning to the
            # local name alone would be discarded at the next step.
            was_alives[id_env] = [ag.isAlive for ag in team]

            if d:
                # set this environment as dead
                n_alive -= 1
                env_live[id_env] = False

                # record all individual histories
                for idx, history in enumerate(indv_history):
                    if len(history)==0:
                        continue
                    _history = np.array(history)
                    # Column 2 becomes the discounted return scaled by 1/100;
                    # it is then added to the bootstrapped columns 3 and 4.
                    _history[:,2] = discount_rewards(_history[:,2], discount_factor) / 100.0#, normalize=True)
                    _history[:,3] += _history[:,2]
                    _history[:,4] += _history[:,2]
                    ep_history.extend(_history)
                frames.append(steps)
                wons.append(r==100)

        prev_rewards = r1s
        if n_alive == 0:
            break

    if len(ep_history) > 0:
        ep_history = np.stack(ep_history)

    return [frames, ep_history, r1s, wons]

## Training

In [None]:
if __name__=='__main__':
    # Continue counting from the (possibly restored) global step.
    ep = sess.run(global_step)
    exp_buffer = Experience_buffer(experience_shape=6, buffer_size=experience_size)
    batch_history = []
    progbar = tf.keras.utils.Progbar(total_episodes, width=5, interval=0.5)
    checkpoint_prefix = MODEL_PATH+'/ctf_policy.ckpt'
    try:
        # Runs until interrupted; total_episodes only scales the progress bar.
        while True:
            progbar.update(ep)

            # Play one rollout in every environment and store its transitions.
            frame, history, reward, did_won = policy_rollout()
            exp_buffer.add(history)

            ep_is_nonzero = ep != 0

            # Network update on a sampled mini-batch.
            if ep_is_nonzero and ep % update_frequency == 0:
                batch_history = exp_buffer.sample(batch_size)
                # Batch columns: 0 state, 1 action, 2 reward, 3 td target, 4 advantage.
                feed_dict = {
                    myAgent.learning_rate: LEARNINGRATE_AC,
                    myAgent.learning_rate_critic: LEARNINGRATE_CRITIC,
                    myAgent.state_input: np.stack(batch_history[:,0]),
                    myAgent.action_holder: batch_history[:,1],
                    myAgent.reward_holder: batch_history[:,2],
                    myAgent.td_target_holder: batch_history[:,3],
                    myAgent.advantage_holder: batch_history[:,4],
                }
                with tf.device('/gpu:0'):
                    sess.run([myAgent.update_batch], feed_dict=feed_dict)
                # Flush the buffer after the update (presumably this empties
                # it -- confirm in Experience_buffer).
                exp_buffer.flush()

            # Learning-rate decay, clamped at the configured final values.
            if not LEARNING_RATE_FIX:
                LEARNINGRATE_AC = max(LEARNINGRATE_AC*LR_DECAY, LR_FINAL)
                LEARNINGRATE_CRITIC = max(LEARNINGRATE_CRITIC*LR_CRITIC_DECAY, LR_CRITIC_FINAL)

            # Feed the moving averages with this rollout's results.
            ma_reward.extend(reward)
            ma_length.extend(frame)
            ma_captured.extend(did_won)

            # Summaries reuse the feed of the most recent update, so they are
            # only written once at least one batch has been sampled.
            if ep_is_nonzero and ep % save_stat_frequency == 0 and len(batch_history) > 0:
                summary_ = sess.run(merged, feed_dict=feed_dict)
                record(summary_)

            # Periodic checkpoint.
            if ep_is_nonzero and ep % save_network_frequency == 0:
                saver.save(sess, checkpoint_prefix, global_step=global_step)

            # Each rollout accounts for NENV episodes, both in the Python
            # counter and in the graph's global step.
            ep += NENV
            sess.run(increment_global_step_op)

    except KeyboardInterrupt:
        # Manual stop: save a final checkpoint before leaving.
        print('\n\nManually stopped the training (KeyboardInterrupt)')
        saver.save(sess, checkpoint_prefix, global_step=global_step)
        print("save: ", sess.run(global_step), 'episodes')

   1672/5000000 [.....] - ETA: 927:39:20