# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Notes
- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without render
    - A Saver that saves the model and weights to the ./model directory
    - A Writer that records the necessary data to ./logs
- This notebook does not run the CtF game with the trained RL policy. Using the network is scripted separately in policy/policy_RL1.py.
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc

!rm -rf logs/B4R4_Rzero_VANILLA/ model/B4R4_Rzero_VANILLA

In [1]:
# Fraction (0-1) of GPU memory this process may claim; 0.125 is 12.5 %.
# It is handed to tf.GPUOptions(per_process_gpu_memory_fraction=...) later on.
GPU_CAPACITY = 0.125

# Name of this training run; it is appended to both output folders below.
TRAIN_NAME = 'B4R4_Rzero_VANILLA'

# Checkpoints and summary logs each live in a per-run sub-directory.
MODEL_PATH = './model/' + TRAIN_NAME
LOG_PATH = './logs/' + TRAIN_NAME

In [2]:
import os

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
from datetime import datetime
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy.
import policy.patrol 
import policy.random
import policy.simple # custon written policy
import policy.policy_RL
import policy.zeros

# Data Processing Module
from DataModule import one_hot_encoder
from Utils import MovingAverage as MA
from Utils import Experience_buffer, discount_rewards

## Hyperparameters

In [3]:
# ---- Training schedule -------------------------------------------------
total_episodes = 500000   # train until the episode counter passes this value
max_ep = 150              # last frame index of an episode before the time-out penalty
update_frequency = 50     # episodes between two applications of the accumulated gradients
batch_size = 2000         # number of samples requested from the experience buffer
experience_size = 10000   # not referenced by the cells shown in this notebook

# ---- Saving / logging cadence (in episodes) ----------------------------
save_network_frequency = 1000   # checkpoint interval
save_stat_frequency = 100       # summary-writing interval
moving_average_step = 100       # argument given to the MovingAverage trackers

# ---- Model and environment parameters ----------------------------------
LEARNING_RATE = 1e-3
gamma = 0.99        # discount factor passed to discount_rewards
MAP_SIZE = 10       # map_size argument of env.reset
VISION_RANGE = 5    # range argument of one_hot_encoder
# Width and height of the network input window.
VISION_dX = VISION_dY = 2 * VISION_RANGE + 1

## Environment Setting

In [4]:
# Create the checkpoint directory used by the Saver. exist_ok=True makes the
# call idempotent and avoids the check-then-create race of os.path.exists().
os.makedirs(MODEL_PATH, exist_ok=True)

# Create the directory the summary FileWriter logs into.
os.makedirs(LOG_PATH, exist_ok=True)

In [None]:
# Number of discrete actions an agent can choose from; policy_rollout samples
# action indices in range(action_space).
action_space = 5

# Build the environment and keep the number of blue (learning) agents around.
env = gym.make("cap-v0")
n_agent = len(env.get_team_blue)
# Preview of the board, disabled: plt.imshow(env.render(mode='rgb_array'))

# The red team is driven by the policy.zeros policy generator.
policy_red = policy.zeros.PolicyGen(env.get_map, env.get_team_red)

# Report the team sizes.
print('red number : ', len(env.get_team_red))
print('blue number : ', len(env.get_team_blue))

red number :  4
blue number :  4


## Network Setting

In [None]:
class agent():
    """Convolutional policy network trained with a policy-gradient loss.

    The graph maps a batch of states to action probabilities and exposes the
    pieces needed for gradient accumulation: `gradients` computes clipped
    gradients of the loss, `gradient_holders` are placeholders that receive
    externally accumulated gradients, and `update_batch` applies them.

    Note: the trainable variables are collected with
    tf.trainable_variables(), i.e. from the whole default graph, so the class
    assumes it is the only model built in that graph.
    """

    def __init__(self, lr, in_size, action_size):
        """Build the network, the loss, the gradient ops and the summaries.

        Args:
            lr: Learning rate given to the Adam optimizer.
            in_size: Shape of the state placeholder (the notebook passes
                [None, VISION_dX, VISION_dY, 6]).
            action_size: Number of units of the output layer, one per action.
        """
        def conv_relu(inputs, filters, kernel, scope):
            # ReLU convolution with 'SAME' padding, Xavier weights, zero biases.
            return slim.conv2d(inputs, filters, kernel, activation_fn=tf.nn.relu,
                               weights_initializer=layers.xavier_initializer_conv2d(),
                               biases_initializer=tf.zeros_initializer(),
                               padding='SAME',
                               scope=scope)

        # Feed-forward part: state -> conv/pool stack -> dense -> action logits.
        # The variable scopes are part of the checkpoint format; do not rename.
        self.state_input = tf.placeholder(shape=in_size, dtype=tf.float32, name='state')

        layer = conv_relu(self.state_input, 32, [5, 5], 'conv1')
        layer = slim.max_pool2d(layer, [2, 2])
        layer = conv_relu(layer, 64, [3, 3], 'conv2')
        layer = slim.max_pool2d(layer, [2, 2])
        layer = conv_relu(layer, 64, [2, 2], 'conv3')
        layer = slim.flatten(layer)
        layer = layers.fully_connected(layer, 128,
                                       activation_fn=tf.nn.relu)
        self.dense = layers.fully_connected(layer, action_size,
                                            activation_fn=None,
                                            scope='output_fc')
        self.output = tf.nn.softmax(self.dense, name='action')

        # Training part: the chosen actions and their (discounted) rewards are
        # fed in to compute the loss.
        with tf.name_scope('action_placement'):
            self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
            # Flat index of each sample's chosen action inside the flattened
            # [batch * action_size] output.
            self.indexes = (tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1]
                            + self.action_holder)

        with tf.name_scope('Loss'):
            self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
            # Probability the network assigned to the action actually taken.
            self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
            # Use log_softmax on the logits rather than log(softmax(.)): it is
            # the same quantity but does not produce -inf/NaN when a
            # probability underflows to zero.
            log_prob = tf.nn.log_softmax(self.dense)
            responsible_log_prob = tf.gather(tf.reshape(log_prob, [-1]), self.indexes)
            self.loss = -tf.reduce_sum(responsible_log_prob * self.reward_holder)

        tvars = tf.trainable_variables()
        # One placeholder per trainable variable, fed with accumulated gradients.
        self.gradient_holders = [tf.placeholder(tf.float32, name=str(idx) + '_holder')
                                 for idx, _ in enumerate(tvars)]

        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        with tf.name_scope('gradients'):
            # Per-variable gradients, each clipped to an L2 norm of at most 50.
            self.gradients = tf.gradients(self.loss, tvars)
            self.gradients = [tf.clip_by_norm(grad, 50) for grad in self.gradients]

        with tf.name_scope('update'):
            self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))

        # Summaries: histograms of the network outputs and chosen actions.
        with tf.variable_scope('debug_parameters'):
            tf.summary.histogram('output', self.output)
            tf.summary.histogram('actor', self.dense)
            tf.summary.histogram('action', self.action_holder)

        # Scalar summary of the loss.
        with tf.variable_scope('summary'):
            tf.summary.scalar(name='total_loss', tensor=self.loss)

        # Histograms of weights and biases.
        with tf.variable_scope('weights_bias'):
            for var in slim.get_model_variables():
                tf.summary.histogram(var.op.name, var)

        # Histograms of the gradients. The gradients were computed against
        # `tvars`, so they are paired with that same list, and the full
        # gradient tensor is recorded (not just its first slice).
        with tf.variable_scope('gradients'):
            for var, grad in zip(tvars, self.gradients):
                tf.summary.histogram(var.op.name + '/grad', grad)

In [None]:
# Start from an empty default graph so that the agent's variables are the only
# trainable variables in it.
tf.reset_default_graph()

# Build the policy network. The number of outputs comes from `action_space`,
# the same constant policy_rollout samples from, so the two cannot drift apart.
# NOTE(review): 6 is presumably the channel count produced by one_hot_encoder - confirm.
myAgent = agent(lr=LEARNING_RATE,
                in_size=[None, VISION_dX, VISION_dY, 6],
                action_size=action_space)

# Episode counter stored in the graph, so it is saved and restored with the model.
global_step = tf.Variable(0, trainable=False, name='global_step')
increment_global_step_op = tf.assign(global_step, global_step+1)

# Single op that evaluates every summary registered while building the agent.
merged = tf.summary.merge_all()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## Session

In [None]:
# Open the session with a capped, grow-on-demand GPU memory budget.
# (A plain tf.Session() would work too, without the memory cap.)
gpu_options = tf.GPUOptions(allow_growth=True,
                            per_process_gpu_memory_fraction=GPU_CAPACITY)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Trackers for episode reward, episode length and capture success.
ma_reward, ma_length, ma_captured = (MA(moving_average_step) for _ in range(3))

# Checkpoint saver over all global variables, and the summary writer.
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint when there is one, otherwise start fresh.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if not ckpt or not tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
else:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)

INFO:tensorflow:Restoring parameters from ./model/B4R4_Rzero_VANILLA/ctf_policy.ckpt-157000
Load Model :  ./model/B4R4_Rzero_VANILLA/ctf_policy.ckpt-157000


In [None]:
def record(summary_):
    """Write the tracked statistics and a network summary to the log.

    Args:
        summary_: Serialized summary obtained by running the `merged` op.

    The current value of `global_step` is read once and used as the step of
    both summaries, so the two records always share the same step. No device
    scope is used here: tf.device only affects ops created inside it, and this
    function creates none.
    """
    step = sess.run(global_step)

    # Values of the three trackers at the time of the call.
    summary = tf.Summary()
    summary.value.add(tag='Records/mean_reward', simple_value=ma_reward())
    summary.value.add(tag='Records/mean_length', simple_value=ma_length())
    summary.value.add(tag='Records/mean_succeed', simple_value=ma_captured())
    writer.add_summary(summary, step)

    # Histograms / loss produced by the network for the last training batch.
    writer.add_summary(summary_, step)

    writer.flush()

In [None]:
# Gradient accumulator: one array per trainable variable. It is initialised
# with the current variable values only to obtain arrays of the right shapes;
# the training cell calls clear_buffer() before accumulating into it.
gradBuffer = sess.run(tf.trainable_variables())
def clear_buffer():
    """Reset every array in the global `gradBuffer` to zeros.

    Each entry is replaced by a fresh zero array of the same shape and dtype.
    Multiplying by zero is deliberately avoided: `nan * 0` and `inf * 0` are
    NaN, so a single bad gradient would otherwise poison the buffer forever.
    """
    for ix, grad in enumerate(gradBuffer):
        gradBuffer[ix] = np.zeros_like(grad)

In [None]:
def policy_rollout(DETERMINISTIC=True):
    """Play one episode with the current network and collect the experience.

    Args:
        DETERMINISTIC: When true every agent takes its most probable action;
            otherwise actions are sampled from the network's output
            distribution.

    Returns:
        A list `[frame, ep_history, r1, env.blue_win, obs]`:
        the index of the last frame played; the stacked
        `[observation, action, discounted reward]` rows of all agents (an
        empty list when nothing was recorded); the last cumulative reward
        reported by the environment (or -100 on time-out); the environment's
        `blue_win` flag; and the observation used for the last action.
    """
    env.reset(map_size=MAP_SIZE, policy_red=policy_red)
    obs_next = one_hot_encoder(env._env, env.get_team_blue, VISION_RANGE)

    # One history list per blue agent.
    indv_history = [[] for _ in env.get_team_blue]

    was_alive = [ag.isAlive for ag in env.get_team_blue]
    prev_reward = 0
    for frame in range(max_ep + 1):
        obs = obs_next

        # No device scope here: tf.device only affects ops created inside it,
        # and running an existing op creates none.
        act_prob = sess.run(myAgent.output, feed_dict={myAgent.state_input: obs})
        if DETERMINISTIC:
            act = np.argmax(act_prob, axis=1).tolist()
        else:
            # Re-normalise each row so that it sums to one before sampling.
            act = [np.random.choice(action_space, p=act_prob[x]/sum(act_prob[x])) for x in range(n_agent)]

        _, r1, d, _ = env.step(act)

        # The per-step reward is the change of the reported reward.
        r = r1 - prev_reward

        # Time limit reached without the episode ending: fixed penalty.
        if frame == max_ep and not d:
            r = -100
            r1 = -100

        obs_next = one_hot_encoder(env._env, env.get_team_blue, VISION_RANGE)

        # Record the transition only for agents that were alive when acting.
        for idx, alive in enumerate(was_alive):
            if alive:
                indv_history[idx].append([obs[idx], act[idx], r])

        # State transition
        prev_reward = r1
        was_alive = [ag.isAlive for ag in env.get_team_blue]

        if d:
            break

    ep_history = []
    for history in indv_history:
        if not history:
            continue
        # Rows mix an observation array with scalars, so the array must be
        # built with dtype=object (recent NumPy rejects ragged input otherwise).
        _history = np.array(history, dtype=object)
        _history[:, 2] = discount_rewards(_history[:, 2], gamma)
        ep_history.extend(_history)

    if ep_history:
        ep_history = np.stack(ep_history)

    return [frame, ep_history, r1, env.blue_win, obs]

## Training

In [None]:
if __name__=='__main__':
    # Training loop: play an episode, accumulate gradients from a sampled
    # batch, and every `update_frequency` episodes apply the accumulated
    # gradients. Training resumes from the episode count stored in global_step.
    ep = sess.run(global_step)

    exp_buffer = Experience_buffer(experience_shape=3)
    clear_buffer()
    try:
        while ep < total_episodes+1:
            # Run one episode with stochastic action selection.
            frame, history, reward, did_won, _ = policy_rollout(DETERMINISTIC=False)

            # Store the episode, then draw a training batch from the buffer.
            exp_buffer.add(history)
            batch_history = exp_buffer.sample(batch_size)
            if len(batch_history) > 0:
                feed_dict = {myAgent.reward_holder: batch_history[:,2],
                             myAgent.action_holder: batch_history[:,1],
                             myAgent.state_input: np.stack(batch_history[:,0])}
                # Device scopes are omitted on purpose: tf.device only affects
                # ops created inside it and has no effect on sess.run.
                _, grads, summary_ = sess.run([myAgent.loss, myAgent.gradients, merged],
                                              feed_dict=feed_dict)
                if ep % save_stat_frequency == 0 and ep != 0:
                    record(summary_)

                # Accumulate this batch's gradients.
                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

            # Apply the accumulated gradients, then start a fresh accumulation.
            if ep % update_frequency == 0 and ep != 0:
                feed_dict = dict(zip(myAgent.gradient_holders, gradBuffer))
                sess.run(myAgent.update_batch, feed_dict=feed_dict)
                clear_buffer()
                exp_buffer.flush()
                print('train completed: ', ep)

            ma_reward.append(reward)
            ma_length.append(frame)
            ma_captured.append(did_won)

            # Checkpoint every `save_network_frequency` episodes.
            if ep % save_network_frequency == 0 and ep != 0:
                saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)

            ep += 1

            sess.run(increment_global_step_op)

    except KeyboardInterrupt:
        # Manual stop: keep the progress made so far.
        print('\n\nManually stopped the training (KeyboardInterrupt)')
        saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
        print("save: ", sess.run(global_step), 'episodes')

train completed:  157000
train completed:  157050
train completed:  157100
train completed:  157150
train completed:  157200
train completed:  157250
train completed:  157300
train completed:  157350
train completed:  157400
train completed:  157450
train completed:  157500
train completed:  157550
train completed:  157600
train completed:  157650
train completed:  157700
train completed:  157750
train completed:  157800
train completed:  157850
train completed:  157900
train completed:  157950
train completed:  158000
train completed:  158050
train completed:  158100
train completed:  158150
train completed:  158200
train completed:  158250
train completed:  158300
train completed:  158350
train completed:  158400
train completed:  158450
train completed:  158500
train completed:  158550
train completed:  158600
train completed:  158650
train completed:  158700
train completed:  158750
train completed:  158800
train completed:  158850
train completed:  158900
train completed:  158950


train completed:  173400
train completed:  173450
train completed:  173500
train completed:  173550
train completed:  173600
train completed:  173650
train completed:  173700
train completed:  173750
train completed:  173800
train completed:  173850
train completed:  173900
train completed:  173950
train completed:  174000
train completed:  174050
train completed:  174100
train completed:  174150
train completed:  174200
train completed:  174250
train completed:  174300
train completed:  174350
train completed:  174400
train completed:  174450
train completed:  174500
train completed:  174550
train completed:  174600
train completed:  174650
train completed:  174700
train completed:  174750
train completed:  174800
train completed:  174850
train completed:  174900
train completed:  174950
train completed:  175000
train completed:  175050
train completed:  175100
train completed:  175150
train completed:  175200
train completed:  175250
train completed:  175300
train completed:  175350


train completed:  189800
train completed:  189850
train completed:  189900
train completed:  189950
train completed:  190000
train completed:  190050
train completed:  190100
train completed:  190150
train completed:  190200
train completed:  190250
train completed:  190300
train completed:  190350
train completed:  190400
train completed:  190450
train completed:  190500
train completed:  190550
train completed:  190600
train completed:  190650
train completed:  190700
train completed:  190750
train completed:  190800
train completed:  190850
train completed:  190900
train completed:  190950
train completed:  191000
train completed:  191050
train completed:  191100
train completed:  191150
train completed:  191200
train completed:  191250
train completed:  191300
train completed:  191350
train completed:  191400
train completed:  191450
train completed:  191500
train completed:  191550
train completed:  191600
train completed:  191650
train completed:  191700
train completed:  191750


train completed:  206200
train completed:  206250
train completed:  206300
train completed:  206350
train completed:  206400
train completed:  206450
train completed:  206500
train completed:  206550
train completed:  206600
train completed:  206650
train completed:  206700
train completed:  206750
train completed:  206800
train completed:  206850
train completed:  206900
train completed:  206950
train completed:  207000
train completed:  207050
train completed:  207100
train completed:  207150
train completed:  207200
train completed:  207250
train completed:  207300
train completed:  207350
train completed:  207400
train completed:  207450
train completed:  207500
train completed:  207550
train completed:  207600
train completed:  207650
train completed:  207700
train completed:  207750
train completed:  207800
train completed:  207850
train completed:  207900
train completed:  207950
train completed:  208000
train completed:  208050
train completed:  208100
train completed:  208150


train completed:  222600
train completed:  222650
train completed:  222700
train completed:  222750
train completed:  222800
train completed:  222850
train completed:  222900
train completed:  222950
train completed:  223000
train completed:  223050
train completed:  223100
train completed:  223150
train completed:  223200
train completed:  223250
train completed:  223300
train completed:  223350
train completed:  223400
train completed:  223450
train completed:  223500
train completed:  223550
train completed:  223600
train completed:  223650
train completed:  223700
train completed:  223750
train completed:  223800
train completed:  223850
train completed:  223900
train completed:  223950
train completed:  224000
train completed:  224050
train completed:  224100
train completed:  224150
train completed:  224200
train completed:  224250
train completed:  224300
train completed:  224350
train completed:  224400
train completed:  224450
train completed:  224500
train completed:  224550


train completed:  239000
train completed:  239050
train completed:  239100
train completed:  239150
train completed:  239200
train completed:  239250
train completed:  239300
train completed:  239350
train completed:  239400
train completed:  239450
train completed:  239500
train completed:  239550
train completed:  239600
train completed:  239650
train completed:  239700
train completed:  239750
train completed:  239800
train completed:  239850
train completed:  239900
train completed:  239950
train completed:  240000
train completed:  240050
train completed:  240100
train completed:  240150
train completed:  240200
train completed:  240250
train completed:  240300
train completed:  240350
train completed:  240400
train completed:  240450
train completed:  240500
train completed:  240550
train completed:  240600
train completed:  240650
train completed:  240700
train completed:  240750
train completed:  240800
train completed:  240850
train completed:  240900
train completed:  240950


train completed:  255400
train completed:  255450
train completed:  255500
train completed:  255550
train completed:  255600
train completed:  255650
train completed:  255700
train completed:  255750
train completed:  255800
train completed:  255850
train completed:  255900
train completed:  255950
train completed:  256000
train completed:  256050
train completed:  256100
train completed:  256150
train completed:  256200
train completed:  256250
train completed:  256300
train completed:  256350
train completed:  256400
train completed:  256450
train completed:  256500
train completed:  256550
train completed:  256600
train completed:  256650
train completed:  256700
train completed:  256750
train completed:  256800
train completed:  256850
train completed:  256900
train completed:  256950
train completed:  257000
train completed:  257050
train completed:  257100
train completed:  257150
train completed:  257200
train completed:  257250
train completed:  257300
train completed:  257350


train completed:  271800
train completed:  271850
train completed:  271900
train completed:  271950
train completed:  272000
train completed:  272050
train completed:  272100
train completed:  272150
train completed:  272200
train completed:  272250
train completed:  272300
train completed:  272350
train completed:  272400
train completed:  272450
train completed:  272500
train completed:  272550
train completed:  272600
train completed:  272650
train completed:  272700
train completed:  272750
train completed:  272800
train completed:  272850
train completed:  272900
train completed:  272950
train completed:  273000
train completed:  273050
train completed:  273100
train completed:  273150
train completed:  273200
train completed:  273250
train completed:  273300
train completed:  273350
train completed:  273400
train completed:  273450
train completed:  273500
train completed:  273550
train completed:  273600
train completed:  273650
train completed:  273700
train completed:  273750


train completed:  288200
train completed:  288250
train completed:  288300
train completed:  288350
train completed:  288400
train completed:  288450
train completed:  288500
train completed:  288550
train completed:  288600
train completed:  288650
train completed:  288700
train completed:  288750
train completed:  288800
train completed:  288850
train completed:  288900
train completed:  288950
train completed:  289000
train completed:  289050
train completed:  289100
train completed:  289150
train completed:  289200
train completed:  289250
train completed:  289300
train completed:  289350
train completed:  289400
train completed:  289450
train completed:  289500
train completed:  289550
train completed:  289600
train completed:  289650
train completed:  289700
train completed:  289750
train completed:  289800
train completed:  289850
train completed:  289900
train completed:  289950
train completed:  290000
train completed:  290050
train completed:  290100
train completed:  290150


train completed:  304600
train completed:  304650
train completed:  304700
train completed:  304750
train completed:  304800
train completed:  304850
train completed:  304900
train completed:  304950
train completed:  305000
train completed:  305050
train completed:  305100
train completed:  305150
train completed:  305200
train completed:  305250
train completed:  305300
train completed:  305350
train completed:  305400
train completed:  305450
train completed:  305500
train completed:  305550
train completed:  305600
train completed:  305650
train completed:  305700
train completed:  305750
train completed:  305800
train completed:  305850
train completed:  305900
train completed:  305950
train completed:  306000
train completed:  306050
train completed:  306100
train completed:  306150
train completed:  306200
train completed:  306250
train completed:  306300
train completed:  306350
train completed:  306400
train completed:  306450
train completed:  306500
train completed:  306550


train completed:  321000
train completed:  321050
train completed:  321100
train completed:  321150
train completed:  321200
train completed:  321250
train completed:  321300
train completed:  321350
train completed:  321400
train completed:  321450
train completed:  321500
train completed:  321550
train completed:  321600
train completed:  321650
train completed:  321700
train completed:  321750
train completed:  321800
train completed:  321850
train completed:  321900
train completed:  321950
train completed:  322000
train completed:  322050
train completed:  322100
train completed:  322150
train completed:  322200
train completed:  322250
train completed:  322300
train completed:  322350
train completed:  322400
train completed:  322450
train completed:  322500
train completed:  322550
train completed:  322600
train completed:  322650
train completed:  322700
train completed:  322750
train completed:  322800
train completed:  322850
train completed:  322900
train completed:  322950


train completed:  337400
train completed:  337450
train completed:  337500
train completed:  337550
train completed:  337600
train completed:  337650
train completed:  337700
train completed:  337750
train completed:  337800
train completed:  337850
train completed:  337900
train completed:  337950
train completed:  338000
train completed:  338050
train completed:  338100
train completed:  338150
train completed:  338200
train completed:  338250
train completed:  338300
train completed:  338350
train completed:  338400
train completed:  338450
train completed:  338500
train completed:  338550
train completed:  338600
train completed:  338650
train completed:  338700
train completed:  338750
train completed:  338800
train completed:  338850
train completed:  338900
train completed:  338950
train completed:  339000
train completed:  339050
train completed:  339100
train completed:  339150
train completed:  339200
train completed:  339250
train completed:  339300
train completed:  339350
