# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Notes
- This notebook includes:
    - Building the structure of policy driven network.
    - Training with/without render
    - Saver that save model and weights to ./model directory
    - Writer that will record some necessary datas to ./logs
- This notebook does not include running the CtF game with the RL policy. Using the network will be separately scripted in policy/policy_RL1.py.
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy.
import policy.patrol 
import policy.random
import policy.simple # custon written policy
import policy.policy_RL
import policy.zeros

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


## Environment Setting

In [2]:
env = gym.make("cap-v0") # initialize the environment
policy_red = policy.random.PolicyGen(env.get_map, env.get_team_red)

UNKNOWN  = CONST.UNKNOWN # -1
TEAM1_BG = CONST.TEAM1_BACKGROUND # 0
TEAM2_BG = CONST.TEAM2_BACKGROUND # 1
TEAM1_AG = CONST.TEAM1_UGV # 2
TEAM2_AG = CONST.TEAM2_UGV # 4
TEAM1_FL = CONST.TEAM1_FLAG # 6
TEAM2_FL = CONST.TEAM2_FLAG # 7
OBSTACLE = CONST.OBSTACLE # 8
DEAD     = CONST.DEAD # 9
SELECTED = CONST.SELECTED # 10
COMPLETED= CONST.COMPLETED # 11

VISION_RANGE = CONST.UGV_RANGE
VISION_dX    = 2*VISION_RANGE+1
VISION_dY    = 2*VISION_RANGE+1

In [3]:
print('red number : ', len(env.get_team_red))
print('blue number : ', len(env.get_team_blue))
print('vision range : ', VISION_RANGE)

red number :  0
blue number :  4
vision range :  10


## Discount Reward

In [4]:
gamma = 0.97

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    discounted_r = (discounted_r - np.mean(discounted_r)) / (np.std(discounted_r)+1e-8) # normalize
    return discounted_r

def discount_rewards_multiagent(r, n_agents):
    discounted_r = np.reshape(r,(-1,n_agents)) # Reshape the r into vertical matrix with 'n_agents' columns
    for idx in range(n_agents):
        column = discounted_r[:,idx]                     # extract single column
        discounted_r[:,idx] = discount_rewards(column);  # Run discount_reward on the column, and substitute
    return np.reshape(discounted_r,(-1))                 # return the flattened matrix

## Network Setting

In [5]:
class Agent():
    def __init__(self, lr, in_size,action_size):
        #These lines established the feed-forward part of the network. The agent takes a state and produces an action.
        self.state_input = tf.placeholder(shape=in_size,dtype=tf.float32, name='state')
        
        with tf.name_scope('Conv1'):
            cw1 = tf.Variable(tf.truncated_normal([5, 5, in_size[3], 16], stddev=0.1), name="conv_w1")
            cb1 = tf.Variable(tf.constant(1.0, shape=[16]), name="conv_b1")
            layer = tf.nn.conv2d(self.state_input, cw1, strides=[1, 1, 1, 1], padding="SAME")
            tf.summary.histogram("conv_nn1", layer)
            layer = tf.nn.relu(layer + cb1)
            tf.summary.histogram("conv_weights1", cw1)
            tf.summary.histogram("conv_biases1", cb1)
            #tf.summary.histogram("conv_relu1", layer)
            
        with tf.name_scope('Conv2'):
            cw2 = tf.Variable(tf.truncated_normal([3, 3, 16, 16], stddev=0.1), name="conv_w2")
            cb2 = tf.Variable(tf.constant(1.0, shape=[16]), name="conv_b2")
            layer = tf.nn.conv2d(layer, cw2, strides=[1, 1, 1, 1], padding="SAME")
            tf.summary.histogram("conv_nn2", layer)    
            layer = tf.nn.relu(layer + cb2)
            tf.summary.histogram("conv_weights2", cw2)
            tf.summary.histogram("conv_biases2", cb2)
            #tf.summary.histogram("conv_relu2", layer)
        
        #layer = slim.flatten(layer)
        layer  = tf.reshape(layer, [-1, VISION_dX*VISION_dY*16])
        #layer = slim.dropout(layer,keep_prob=0.7)
        with tf.name_scope('FC1'):
            w1 = tf.Variable(tf.truncated_normal([int(layer.shape[1]), 256], stddev=0.1), name="W1")
            b1 = tf.Variable(tf.constant(0.1, shape=[256]), name="B1")
            layer = tf.nn.relu(tf.matmul(layer, w1) + b1)
            tf.summary.histogram("weights_1", w1)
            tf.summary.histogram("biases_1", b1)
            tf.summary.histogram("activations_1", layer)
        
        with tf.name_scope('FC2'):
            w2 = tf.Variable(tf.truncated_normal([256,512], stddev=0.1), name="W2")
            b2 = tf.Variable(tf.constant(0.1, shape=[512]), name="B2")
            layer = tf.nn.relu(tf.matmul(layer, w2) + b2)
            tf.summary.histogram("weights_2", w2)
            tf.summary.histogram("biases_2", b2)
            tf.summary.histogram("activations_2", layer)

        with tf.name_scope('FC3_output'):
            w3 = tf.Variable(tf.truncated_normal([512,5], stddev=0.1), name="W3")
            b3 = tf.Variable(tf.constant(0.1, shape=[5]), name="B3")
            self.output = tf.nn.softmax(tf.matmul(layer, w3) + b3, name='action')
            tf.summary.histogram("weights_3", w3)
            tf.summary.histogram("biases_3", b3)
            tf.summary.histogram("mult_3_action", self.output)

        with tf.name_scope('Records'):
            self.mean_reward = tf.placeholder("float", None)
            self.mean_length = tf.placeholder("float", None)
            self.mean_succeed = tf.placeholder("float", None)
            tf.summary.scalar('mean_reward', self.mean_reward)
            tf.summary.scalar('mean_length', self.mean_length)
            tf.summary.scalar('mean_succeed', self.mean_succeed)
    
        with tf.name_scope('batch_hold'):
            self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
            self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder
            self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
            self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)
        
        with tf.name_scope('loss'):
            self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)

        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        with tf.name_scope('gradients'):
            self.gradients = tf.gradients(self.loss,tvars)

        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))
            #self.update_batch_minimize = optimizer.minimize(self.loss)

In [6]:
tf.reset_default_graph() # Clear the Tensorflow graph.
myAgent = Agent(lr=1e-4,in_size=[None,VISION_dX,VISION_dY,6],action_size=5) #Load the agent.
global_step = tf.Variable(0, trainable=False, name='global_step') # global step
increment_global_step_op = tf.assign(global_step, global_step+1)
merged = tf.summary.merge_all()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## Hyperparameters

In [7]:
total_episodes = 3000 #Set total number of episodes to train agent on.
max_ep = 150
update_frequency = 10
save_network_frequency = 100
save_reward_frequency = 10

batch_size = 512

## One-hot Encoder

In [8]:
def one_hot_encoder(state, agents):
    ret = np.zeros((len(agents),VISION_dX,VISION_dY,6))
    # team 1 : (1), team 2 : (-1), map elements: (0)
    map_channel = {UNKNOWN:0, DEAD:0,
                   TEAM1_BG:1, TEAM2_BG:1,
                   TEAM1_AG:2, TEAM2_AG:2,
                   3:3, 5:3, # UAV, does not need to be included for now
                   TEAM1_FL:4, TEAM2_FL:4,
                   OBSTACLE:5}
    map_color   = {UNKNOWN:1, DEAD:0, OBSTACLE:1,
                   TEAM1_BG:1, TEAM2_BG:-1,
                   TEAM1_AG:1, TEAM2_AG:-1,
                   3:1, 5:-1, # UAV, does not need to be included for now
                   TEAM1_FL:1, TEAM2_FL:-1}
    
    # Expand the observation with 3-thickness wall
    # - in order to avoid dealing with the boundary
    sx, sy = state.shape
    _state = np.ones((sx+2*VISION_RANGE, sy+2*VISION_RANGE)) * OBSTACLE # 8 for obstacle
    _state[VISION_RANGE:VISION_RANGE+sx, VISION_RANGE:VISION_RANGE+sy] = state
    state = _state

    for idx,agent in enumerate(agents):
        # Initialize Variables
        x, y = agent.get_loc()
        x += VISION_RANGE
        y += VISION_RANGE
        vision = state[x-VISION_RANGE:x+VISION_RANGE+1,y-VISION_RANGE:y+VISION_RANGE+1] # extract the limited view for the agent (5x5)
        for i in range(len(vision)):
            for j in range(len(vision[0])):
                if vision[i][j] != -1:
                    channel = map_channel[vision[i][j]]
                    ret[idx][i][j][channel] = map_color[vision[i][j]]
    return ret

## Session

In [9]:
# Launch the session
sess = tf.Session()

total_reward = []
total_length = []
total_captured = []
table_stdaction = []

exploration_steps = 300

# Setup Save and Restore Network
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter('./logs', sess.graph)

ckpt = tf.train.get_checkpoint_state('./model')
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")

Initialized Variables


In [10]:
def record():
    obs = one_hot_encoder(env._env, env.get_team_blue)
    feed_dict = {myAgent.mean_reward:np.mean(total_reward[-save_reward_frequency:]),
                         myAgent.mean_length:np.mean(total_length[-save_reward_frequency:]),
                         myAgent.mean_succeed:np.mean(total_captured[-save_reward_frequency:]),
                         #myAgent.mean_stdaction:np.mean(table_stdaction[-save_reward_frequency:]),
                         myAgent.state_input:obs
                }
    
    summary_str = sess.run(merged, feed_dict=feed_dict)
    writer.add_summary(summary_str, sess.run(global_step))

In [11]:
gradBuffer = sess.run(tf.trainable_variables())
def clear_buffer():
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = np.zeros(grad.shape)#grad * 0

## Training

In [12]:
class Experience_buffer():
    def __init__(self, buffer_size = 50000):
        self.buffer = []
        self.buffer_size = buffer_size
    
    def add(self,experience):
        if len(self.buffer) + len(experience) >= self.buffer_size:
            self.buffer[0:(len(experience)+len(self.buffer))-self.buffer_size] = []
        self.buffer.extend(experience)
            
    def sample(self, size):
        if size > len(self.buffer):
            return np.array(self.buffer)
        else:
            return np.reshape(np.array(random.sample(self.buffer,size)),[size,4])

In [13]:
def policy_rollout(env, explore=False):
    # Run single episode, return the results (number of frame, history, total reward, and game_won)
    s = env.reset(map_size=20, policy_red=policy_red)
    ep_history = []
    indv_history = [[] for _ in range(len(env.get_team_blue))]
    running_reward = 0
    for frame in range(max_ep+1):
        #obs = one_hot_encoder(s, env.get_team_blue).tolist() # observation
        obs = one_hot_encoder(env._env, env.get_team_blue) # train with all map
        if explore: # explorate
            act = np.random.randint(action_space, size=len(env.get_team_blue)).tolist()
        else:
            with tf.device('/cpu:0'):        
                #Probabilistically pick an action given our network outputs.
                act_prob = sess.run(myAgent.output, feed_dict={myAgent.state_input:obs})
                table_stdaction = [act_prob]
                act = [np.random.choice(action_space, p=act_prob[x]/sum(act_prob[x])) for x in range(n_agent)] # divide by sum : normalize
        s1,r,d,_ = env.step(act) #Get our reward for taking an action given a bandit.
        
        #for state, act in zip(obs, a):
        for idx, agent in enumerate(env.get_team_blue):
            if agent.isAlive: indv_history[idx].append([obs[idx],act[idx],r,s1])

        s = s1
        running_reward += r
        frame += 1
        
        if d == True:
            break
        
    for history in indv_history:
        if len(history)==0: continue
        history = np.array(history)
        history[:,2] = discount_rewards(history[:,2])
        ep_history.append(history)
    ep_history = np.vstack(ep_history)

    return [frame, ep_history, running_reward, env.game_won]

In [16]:
ep = 0

action_space = 5
n_agent = len(env.get_team_blue)

exp_buffer = Experience_buffer()
clear_buffer()
try:
    progbar = tf.keras.utils.Progbar(total_episodes,width=5)
    while ep < total_episodes:
        progbar.update(ep) # update progress bar
        
         # Run episode
        frame, history, running_reward, did_won = policy_rollout(env, ep < exploration_steps)
        
        # Add history
        exp_buffer.add(history)
        
        # Performance recordings
        total_reward.append(running_reward/(1+frame))
        total_length.append(frame)
        total_captured.append(did_won)  
        
        with tf.device('/gpu:0'):
            batch_history = exp_buffer.sample(batch_size)
            feed_dict={myAgent.reward_holder:batch_history[:,2],
                       myAgent.action_holder:batch_history[:,1],
                       myAgent.state_input:np.stack(batch_history[:,0])}
            grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
            
        for idx,grad in enumerate(grads):
            gradBuffer[idx] += grad
        
        if ep % update_frequency == 0 and ep != 0:
            with tf.device('/gpu:0'):
                feed_dict = dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
            clear_buffer()

        # Record log
        if ep % save_reward_frequency == 0 and ep != 0:
            record()
        
        # Save weights
        if ep % save_network_frequency == 0 and ep != 0:
            print(' Average r : ', np.mean(total_reward[-save_network_frequency:]))
            saver.save(sess, './model/ctf_policy.ckpt', global_step=global_step)
            print("save weights: ", sess.run(global_step), 'episodes', np.mean(table_stdaction[-10:]))

        ep += 1
        sess.run(increment_global_step_op)
        
except KeyboardInterrupt:
    print('\n\nManually stopped the training (KeyboardInterrupt)');
    plt.plot(total_reward)
    plt.figure()
    plt.plot(total_length)
    plt.figure()
    plt.plot(total_captured)
    saver.save(sess, './model/ctf_policy.ckpt', global_step=global_step)
    record()
    print("save: ", sess.run(global_step), 'episodes')

 100/3000 [.....] - ETA: 1:29:17

InvalidArgumentError: Nan in summary histogram for: Conv1/conv_weights1
	 [[Node: Conv1/conv_weights1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Conv1/conv_weights1/tag, Conv1/conv_w1/read/_91)]]

Caused by op 'Conv1/conv_weights1', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/namsong/.local/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1432, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/namsong/.local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/namsong/.local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/namsong/.local/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-a84c85413ec9>", line 2, in <module>
    myAgent = Agent(lr=1e-4,in_size=[None,VISION_dX,VISION_dY,6],action_size=5) #Load the agent.
  File "<ipython-input-5-00d6c4166aa9>", line 12, in __init__
    tf.summary.histogram("conv_weights1", cw1)
  File "/home/namsong/.local/lib/python3.6/site-packages/tensorflow/python/summary/summary.py", line 187, in histogram
    tag=tag, values=values, name=scope)
  File "/home/namsong/.local/lib/python3.6/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 283, in histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "/home/namsong/.local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/namsong/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/home/namsong/.local/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Nan in summary histogram for: Conv1/conv_weights1
	 [[Node: Conv1/conv_weights1 = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](Conv1/conv_weights1/tag, Conv1/conv_w1/read/_91)]]
