# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Notes
- This notebook includes:
    - Building the structure of policy driven network.
    - Training with/without render
    - Saver that save model and weights to ./model directory
    - Writer that will record some necessary datas to ./logs
- This notebook does not include running the CtF game with the RL policy. Using the network will be separately scripted in policy/policy_RL1.py.
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import numpy as np

# the modules that you can use to generate the policy.
import policy.patrol 
import policy.random
import policy.simple # custon written policy
import policy.policy_RL

  return f(*args, **kwds)


In [2]:
gamma = 0.99

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

## Environment Setting

In [3]:
env = gym.make("cap-v0") # initialize the environment

policy_red = policy.random.PolicyGen(env.get_map, env.get_team_red)

[33mWARN: Environment '<class 'gym_cap.envs.cap_env.CapEnvGenerate'>' has deprecated methods. Compatibility code invoked.[0m


## Network Setting

In [4]:
class agent():
    def __init__(self, lr, in_size,action_size):
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_input = tf.placeholder(shape=in_size,dtype=tf.float32, name='state')
        conv1 = slim.conv2d(self.state_input, 128, [2,2], scope='conv1')
        conv2 = slim.conv2d(conv1, 128, [3,3], scope='conv2')
        flat  = slim.flatten(conv2)
        dense = slim.fully_connected(flat, action_size,
                                           biases_initializer=None)
        self.output = tf.nn.softmax(dense, name='action')
        #self.chosen_action = tf.argmax(self.output,1, name='action')

        #The next six lines establish the training proceedure. We feed the reward and chosen action into the network
        #to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        
        self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss,tvars)
    

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [5]:
tf.reset_default_graph() # Clear the Tensorflow graph.
myAgent = agent(lr=1e-4,in_size=[None,5,5,8],action_size=5) #Load the agent.
global_step = tf.Variable(0, trainable=False, name='global_step') # global step
increment_global_step_op = tf.assign(global_step, global_step+1)
merged = tf.summary.merge_all()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## Hyperparameters

In [6]:
total_episodes = 5000 #Set total number of episodes to train agent on.
max_ep = 999
update_frequency = 5

## Session

In [7]:
# Launch the session
sess = tf.Session()

# Setup Save and Restore Network
saver = tf.train.Saver(tf.global_variables())

writer = tf.summary.FileWriter('./logs', sess.graph)

ckpt = tf.train.get_checkpoint_state('./model')
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")

Initialized Variables


In [8]:
def one_hot_encoder(state, agents):
    ret = np.zeros((len(agents),5,5,8))
    reorder = {0:0, 1:1, 2:2, 4:3, 6:4, 7:5, 8:6, 9:7}
    
    # Expand the observation with 3-thickness wall
    # - in order to avoid dealing with the boundary
    sx, sy = state.shape
    _state = np.ones((sx+6, sy+6)) * 8 # 8 for obstacle
    _state[3:3+sx, 3:3+sy] = state
    state = _state

    for idx,agent in enumerate(agents):
        # Initialize Variables
        x, y = agent.get_loc()
        x += 3
        y += 3
        vision = state[x-2:x+3, y-2:y+3] # limited view for the agent (5x5)
        for i in range(len(vision)):
            for j in range(len(vision[0])):
                if vision[i][j] != -1:
                    height = reorder[vision[i][j]]
                    ret[idx][i][j][height] = 1
    return ret

## Training

In [10]:
i = 0
total_reward = []
total_lenght = []

action_space = 5
n_agent = len(env.get_team_blue)

gradBuffer = sess.run(tf.trainable_variables())
for ix,grad in enumerate(gradBuffer):
    gradBuffer[ix] = grad * 0
    
while i < total_episodes:
    s = env.reset(map_size=20, render_mode='env', policy_red=policy_red)
    running_reward = 0
    ep_history = []
    for j in range(max_ep):
        observation = one_hot_encoder(s, env.get_team_blue).tolist()
        a = sess.run(myAgent.output, feed_dict={myAgent.state_input:observation})
        #Probabilistically pick an action given our network outputs.
        a = [np.random.choice(action_space, p=a[x]/sum(a[x])) for x in range(n_agent)] # divide by sum : normalize
        
        s1,r,d,_ = env.step(a) #Get our reward for taking an action given a bandit.
        
        # Rendering
        #env.render(mode="fast")
        #time.sleep(0.05)
        
        for obs, act in zip(observation, a):
            ep_history.append([obs,act,r,s1])
        s = s1
        running_reward += r
        if d == True:
            #Update the network.
            ep_history = np.array(ep_history)
            ep_history[:,2] = discount_rewards(ep_history[:,2])
            feed_dict={myAgent.reward_holder:ep_history[:,2],
                       myAgent.action_holder:ep_history[:,1],
                       myAgent.state_input:np.stack(ep_history[:,0])}
            grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
            for idx,grad in enumerate(grads):
                gradBuffer[idx] += grad

            if i % update_frequency == 0 and i != 0:
                feed_dict= dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

            total_reward.append(running_reward)
            total_lenght.append(j)
            break


        #Update our running tally of scores.
    if i % 100 == 0:
        print(np.mean(total_reward[-100:]))
        saver.save(sess, './model/ctf_policy.ckpt', global_step=global_step)
        print("save: ", sess.run(global_step))
    i += 1
    sess.run(increment_global_step_op)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


nan
save:  5000
1970.8992424242424
save:  5100
1935.6195
save:  5200
1821.1425000000004
save:  5300
1297.44
save:  5400
1613.5145000000002
save:  5500
1672.8185
save:  5600
1209.1515000000002
save:  5700
1880.1835000000005
save:  5800
1764.7940000000006
save:  5900
2203.0475
save:  6000
1937.3464999999999
save:  6100
826.9115
save:  6200
1113.938
save:  6300
1741.2955000000002
save:  6400
1132.9115000000002
save:  6500
1014.9725
save:  6600
1889.7480000000003
save:  6700
1422.8020000000001
save:  6800
2171.6305
save:  6900
2770.8794999999996
save:  7000
1489.9085000000005
save:  7100
1622.5770000000005
save:  7200
1542.9265000000003
save:  7300
2776.388
save:  7400
3840.3725
save:  7500
2306.5820000000003
save:  7600
1572.1825000000003
save:  7700
2348.0640000000003
save:  7800
1274.2805
save:  7900
1518.0180000000005
save:  8000
1726.9325000000003
save:  8100
2170.0615
save:  8200
2120.1470000000004
save:  8300
1985.2230000000004
save:  8400
2530.8215000000005
save:  8500
2958.4894999