# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Notes
- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with or without rendering.
    - A saver that saves the model and weights to the ./model directory.
    - A writer that records the necessary data to ./logs.
- This notebook does not include running the CtF game with the RL policy. Using the network is scripted separately in policy/policy_RL1.py.
    - cap_test.py has been changed accordingly.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc

In [1]:
# Run identifier; it is appended to the log and checkpoint directory paths below.
TRAIN_NAME='NO_RED_08_ImpSamp'
LOG_PATH='./logs/'+TRAIN_NAME       # where the summary FileWriter writes its event files
MODEL_PATH='./model/' + TRAIN_NAME  # where checkpoints are saved to / restored from
GPU_CAPACITY=0.125 # fraction (0-1) of GPU memory, passed as per_process_gpu_memory_fraction

In [2]:
import os

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy.
import policy.patrol 
import policy.random
import policy.simple # custon written policy
import policy.policy_RL
import policy.zeros

# Data Processing Module
from DataModule import one_hot_encoder, VISION_dX, VISION_dY

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


## Hyperparameters

In [3]:
# Training Related
total_episodes = 20000 #Set total number of episodes to train agent on.
max_ep = 300
update_frequency = 50
batch_size = 2000
experience_size=50000

# Saving Related
save_network_frequency = 100
save_stat_frequency = 50
moving_average_step = 50

## Environment Setting

In [4]:
# Make sure the checkpoint directory (MODEL_PATH) and the summary/log
# directory (LOG_PATH) exist before the saver and writer are created.
for output_dir in (MODEL_PATH, LOG_PATH):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

In [5]:
env = gym.make("cap-v0") # initialize the environment
# Red-team policy object; it is handed to env.reset() at the start of every rollout.
policy_red = policy.random.PolicyGen(env.get_map, env.get_team_red)

# Environment Related
action_space = 5                  # number of discrete actions sampled per agent in the rollout
n_agent = len(env.get_team_blue)  # number of blue agents; one action is sampled for each

# Sanity check of the team sizes.
print('red number : ', len(env.get_team_red))
print('blue number : ', len(env.get_team_blue))

red number :  0
blue number :  4


## Discount Reward

In [6]:
gamma = 0.98  # discount factor applied per time step


def discount_rewards(r):
    """Compute the discounted cumulative reward for every time step.

    For each index t the result is ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``,
    using the module-level ``gamma``.

    Args:
        r: 1D sequence of per-step rewards (list or array). Integer, boolean and
            object-dtype inputs are converted to float64 so that the discounted
            values are not truncated; floating-point arrays keep their dtype.

    Returns:
        A 1D floating-point numpy array with the same length as ``r``.
    """
    rewards = np.asarray(r)
    if not np.issubdtype(rewards.dtype, np.floating):
        # np.zeros_like on an integer array would silently floor every
        # discounted value, so promote non-float inputs first.
        rewards = rewards.astype(np.float64)

    discounted_r = np.zeros_like(rewards)
    running_add = 0.0
    # Walk backwards so each step reuses the already discounted future return.
    for t in reversed(range(rewards.size)):
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    # Normalisation is intentionally left disabled:
    #discounted_r = (discounted_r - np.mean(discounted_r)) / (np.std(discounted_r)+1e-8) # normalize
    return discounted_r

## Network Setting

In [7]:
class agent():
    """Convolutional policy network with a policy-gradient training op.

    The graph is built in the current default TensorFlow graph:
    two 16-filter conv layers (5x5 then 3x3) -> flatten -> dropout ->
    1024-unit fully connected layer -> dropout -> ``action_size``-unit layer ->
    softmax.

    Attributes:
        state_input: float32 placeholder named 'state' with shape ``in_size``.
        dense: pre-softmax output of the last fully connected layer.
        output: softmax action probabilities, op named 'action'.
        action_holder: int32 placeholder, the action taken for each row.
        reward_holder: float32 placeholder, the (discounted) reward for each row.
        sampling_weight_holder: float32 placeholder, per-row weight multiplied
            into the loss.
        responsible_outputs: probability the network assigned to each taken action.
        loss: scalar loss minimised by ``update_batch``.
        update_batch: Adam training op.

    Args:
        lr: learning rate for the Adam optimizer.
        in_size: shape of the state placeholder, e.g. [None, H, W, C].
        action_size: number of discrete actions (width of the output layer).
    """
    def __init__(self, lr, in_size,action_size):
        # These lines establish the feed-forward part of the network.
        # The agent takes a state and produces action probabilities.
        self.state_input = tf.placeholder(shape=in_size,dtype=tf.float32, name='state')
        
        layer = slim.conv2d(self.state_input, 16, [5,5], activation_fn=tf.nn.relu,
                            weights_initializer=layers.xavier_initializer_conv2d(),
                            biases_initializer=tf.zeros_initializer(),
                            padding='SAME',
                            scope='conv1')
        layer = slim.conv2d(layer, 16, [3,3], activation_fn=tf.nn.relu,
                            weights_initializer=layers.xavier_initializer_conv2d(),
                            biases_initializer=tf.zeros_initializer(),
                            padding='SAME',
                            scope='conv2')
        layer = slim.flatten(layer)
        # NOTE(review): slim.dropout is called without `is_training`, so dropout
        # is presumably also active while rolling out the policy -- confirm.
        layer = slim.dropout(layer,keep_prob=0.8)
        layer = slim.fully_connected(layer, 1024, 
                                    weights_initializer=layers.xavier_initializer(),
                                    activation_fn=tf.nn.relu,
                                    biases_initializer=tf.zeros_initializer())
        layer = slim.dropout(layer,keep_prob=0.8)
        # NOTE(review): the ReLU here clamps the logits to be non-negative before
        # the softmax; left unchanged to stay compatible with existing
        # checkpoints, but `activation_fn=None` is the usual choice -- confirm.
        self.dense = slim.fully_connected(layer, action_size,
                                    weights_initializer=layers.xavier_initializer(),
                                    activation_fn=tf.nn.relu,
                                    biases_initializer=tf.zeros_initializer(),
                                    scope='output_fc')
        self.output = tf.nn.softmax(self.dense, name='action')
        
        tf.summary.histogram('output', self.output)
        
        with tf.name_scope('weights'):
            for var in slim.get_model_variables():
                tf.summary.histogram(var.op.name, var)

        # The remaining ops establish the training procedure. The reward, the
        # chosen action and the sampling weight are fed into the network to
        # compute the loss, which is then used to update the network.
    
        with tf.name_scope('action_placement'):
            self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
            # Flat index of (row, chosen action) into the flattened output matrix.
            self.indexes = tf.range(0, tf.shape(self.output)[0]) * tf.shape(self.output)[1] + self.action_holder

        with tf.name_scope('loss'):
            self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
            self.sampling_weight_holder = tf.placeholder(shape=[None],dtype=tf.float32)
            self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes) # output
            # Bound the probability away from zero before taking the log:
            # log(0) = -inf turns the loss and every gradient into NaN, which
            # then corrupts all weights on the next Adam step.
            log_prob = tf.log(tf.clip_by_value(self.responsible_outputs, 1e-10, 1.0))
            self.loss = -tf.reduce_mean(log_prob*self.reward_holder*self.sampling_weight_holder)
        
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            self.update_batch = optimizer.minimize(self.loss)

In [8]:
tf.reset_default_graph() # Clear the Tensorflow graph.
# in_size is [batch, VISION_dX, VISION_dY, 6]; action_size matches `action_space` above.
myAgent = agent(lr=1e-5,in_size=[None,VISION_dX,VISION_dY,6],action_size=5) #Load the agent.
# Non-trainable counter; the training loop increments it once per episode and
# it is used as the step for summaries and checkpoint file names.
global_step = tf.Variable(0, trainable=False, name='global_step') # global step
increment_global_step_op = tf.assign(global_step, global_step+1)
# Single op that evaluates every tf.summary registered so far (the agent's histograms).
merged = tf.summary.merge_all()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


## Session

In [9]:
# Launch the session
# Limit this process to GPU_CAPACITY of the GPU memory and allocate it on demand.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_CAPACITY, allow_growth=True)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
#sess = tf.Session()

# Per-episode statistics, appended to by the training loop and averaged in record().
total_reward = []
total_length = []
total_captured = []
total_loss=[]  # one entry per network update (not per episode)

# Setup Save and Restore Network
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint in MODEL_PATH when one exists;
# otherwise start from freshly initialised variables.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")

Initialized Variables


In [10]:
def record(obs):
    """Write the running statistics and the network histograms to the log.

    The mean of the last `moving_average_step` entries of the reward, length,
    capture and loss lists is written as scalar summaries, followed by the
    merged graph summaries evaluated on `obs`. Both are tagged with the
    current value of `global_step`, and the writer is flushed afterwards.

    Args:
        obs: observation batch fed to the agent's state placeholder when
            evaluating the merged summaries.
    """
    window = moving_average_step
    tracked_series = (
        ('Records/mean_reward', total_reward),
        ('Records/mean_length', total_length),
        ('Records/mean_succeed', total_captured),
        ('Loss', total_loss),
    )
    with tf.device('/cpu:0'):
        scalar_summary = tf.Summary()
        for tag, series in tracked_series:
            scalar_summary.value.add(tag=tag, simple_value=np.mean(series[-window:]))
        writer.add_summary(scalar_summary, sess.run(global_step))

        graph_summary = sess.run(merged, feed_dict={myAgent.state_input: obs})
        writer.add_summary(graph_summary, sess.run(global_step))

        writer.flush()

In [11]:
class Experience_buffer():
    """Bounded first-in-first-out store of experience rows for replay sampling.

    Attributes:
        buffer: list of stored experience rows, oldest first.
        buffer_size: maximum number of rows kept in `buffer`.
    """
    def __init__(self, buffer_size = None):
        """Create an empty buffer.

        Args:
            buffer_size: capacity in rows. When omitted, the notebook-level
                `experience_size` is used (looked up at construction time).
        """
        if buffer_size is None:
            buffer_size = experience_size
        self.buffer = []
        self.buffer_size = buffer_size
    
    def add(self,experience):
        """Append the rows of `experience`, evicting the oldest rows on overflow.

        Args:
            experience: iterable of rows (a list or a 2D array); may be empty.
        """
        self.buffer.extend(experience)
        # Trim after extending so the capacity also holds when a single call
        # brings in more rows than the buffer can store.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]
    
    def flush(self):
        """Discard all stored rows while keeping the configured capacity.

        The capacity must not be reset here: with `buffer_size == 0` every
        later `add` would evict everything stored before it.
        """
        self.buffer = []
    
    def sample(self, size=2000):
        """Return stored rows as a numpy array.

        Args:
            size: number of rows to draw without replacement.

        Returns:
            All stored rows (in insertion order) when `size` exceeds the number
            of stored rows; otherwise `size` randomly chosen rows reshaped to
            [size, 5].
        """
        if size > len(self.buffer):
            return np.array(self.buffer)
        else:
            return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])

In [33]:
def policy_rollout(env):
    """Run a single episode with the current policy and collect its history.

    The environment is reset with `map_size=20` and the module-level
    `policy_red`. At every frame one action per blue agent is sampled from the
    network's output probabilities and applied with `env.step`.

    Args:
        env: the environment to roll out in.

    Returns:
        A list `[frame, ep_history, r, env.game_won, obs]` where
        - frame: number of steps that were executed,
        - ep_history: stacked rows `[obs, action, discounted reward,
          sampling weight, done]`, or an empty list when nothing was pushed,
        - r: reward returned by the last step (including the -30 penalty when
          the frame limit is reached without the episode being done),
        - env.game_won: the environment's win flag after the episode,
        - obs: the observation that was fed to the network on the last step.
    """
    # Run single episode, return the results (number of frame, history, total reward, game_won, last obs)
    s = env.reset(map_size=20, policy_red=policy_red)
    #obs = one_hot_encoder(s, env.get_team_blue).tolist() # partial observation
    obs_post = one_hot_encoder(env._env, env.get_team_blue) # Full observation
    
    ep_history = []
    # One list of experience rows per blue agent.
    indv_history = [[] for _ in range(len(env.get_team_blue))]
    
    was_alive = [ag.isAlive for ag in env.get_team_blue]
    prev_reward=0
    
    # Running sum of log-probabilities per agent and action.
    # NOTE(review): this accumulator of logs starts at ones, not zeros -- confirm that is intended.
    cumulative_sampling_weight = np.ones((n_agent,action_space))
    for frame in range(max_ep+1):
        obs = obs_post
        
        with tf.device('/cpu:0'):        
            #Probabilistically pick an action given our network outputs.
            act_prob = sess.run(myAgent.output, feed_dict={myAgent.state_input:obs})
            # NOTE(review): np.log yields -inf for any probability that is exactly 0 -- verify this cannot happen.
            cumulative_sampling_weight += np.log(act_prob)
            act = [np.random.choice(action_space, p=act_prob[x]/sum(act_prob[x])) for x in range(n_agent)] # divide by sum : normalize
            
        s,r,d,_ = env.step(act) # Apply the sampled actions; r is the reward, d the done flag.
        obs_post = one_hot_encoder(env._env, env.get_team_blue) # Full observation
        #obs_post = one_hot_encoder(s, env.get_team_blue).tolist() # partial observation
        
        # If frame is at max and the game is still not done, negative reward
        if frame == max_ep and d == False:
            r -= 30
            
        # Push history for individual that 'was' alive previous frame
        # Weight: current log-probability scaled by the step count, minus the running sum.
        sampling_weight = np.log(act_prob)*(frame+1) - cumulative_sampling_weight
        #sampling_weight = np.exp(sampling_weight)
        for idx, agent in enumerate(env.get_team_blue):
            if was_alive[idx]:
                #if sampling_weight[idx][act[idx]] > 1:
                    #print(act_prob[idx], ' ', cumulative_sampling_weight[idx], ' ', frame, ' ', act[idx])
                indv_history[idx].append([obs[idx],act[idx],r,sampling_weight[idx][act[idx]],d])

        # If reward sequence change, push the history into the episode history.
        # NOTE(review): indv_history is never cleared after being pushed, so every
        # later reward change appends the whole accumulated history again -- confirm
        # this is intended. Rows are only pushed on a reward change, so an episode
        # whose reward never changes contributes nothing to ep_history.
        if prev_reward != r:
            for idx, history in enumerate(indv_history):
                if len(history)==0: continue
                if not was_alive[idx]: continue
                _history = np.array(history)
                # Replace the raw reward column with its discounted cumulative sum.
                _history[:,2] = discount_rewards(_history[:,2])
                ep_history.extend(_history)
        
        # State Transition
        # Rebinding `frame` does not affect the for-loop iteration; it only makes
        # the value returned below a step count instead of the last frame index.
        frame += 1
        prev_reward = r
        was_alive = [ag.isAlive for ag in env.get_team_blue]
        
        if d == True:
            break        
            
    if len(ep_history) > 0:        
        ep_history = np.stack(ep_history)
    return [frame, ep_history, r, env.game_won, obs]

## Training

In [37]:
# Debug inspection: the discounted-reward column (index 2) of the last sampled batch.
# NOTE(review): `batch_history` is only assigned inside the training cell below, so
# this cell fails on a fresh kernel (Restart & Run All); consider removing it.
batch_history[:,2]

array([-0.1477675803187372, -0.07140323473320649, -0.08392846519882961,
       ..., 0.7377961646676511, 65.42558123199925, 3.9461365638917085],
      dtype=object)

In [34]:
# Training loop: roll out one episode per iteration, store its experience,
# update the network every `update_frequency` episodes, and periodically write
# summaries and checkpoints. Interrupting the kernel plots the collected
# statistics and saves a final checkpoint.
if __name__=='__main__':
    ep = 0
    # Last observation batch; stays None until the first rollout has returned so
    # that the interrupt handler below never touches an unbound name.
    obs = None

    exp_buffer = Experience_buffer()
    try:
        progbar = tf.keras.utils.Progbar(total_episodes,width=5)
        while ep < total_episodes+1:
            progbar.update(ep) # update progress bar

            # Run episode
            frame, history, reward, did_won, obs = policy_rollout(env)

            # Add history
            exp_buffer.add(history)

            if ep % update_frequency == 0 and ep != 0:
                batch_history = exp_buffer.sample(batch_size) # Sample from experience replay
                # An empty buffer yields a 1-D empty array that cannot be
                # column-indexed; skip the update in that case.
                if batch_history.ndim == 2 and len(batch_history) > 0:
                    with tf.device('/gpu:0'): 
                        # The sampled rows are object arrays; cast each column to
                        # the dtype of the placeholder it is fed to.
                        feed_dict={myAgent.reward_holder:batch_history[:,2].astype(np.float32),
                                   myAgent.sampling_weight_holder:batch_history[:,3].astype(np.float32),
                                   myAgent.action_holder:batch_history[:,1].astype(np.int32),
                                   myAgent.state_input:np.stack(batch_history[:,0])}
                        loss, _ = sess.run([myAgent.loss, myAgent.update_batch], feed_dict=feed_dict)
                    total_loss.append(loss)
                    exp_buffer.flush()

            total_reward.append(reward)
            total_length.append(frame)
            total_captured.append(did_won)
            if ep % save_stat_frequency == 0 and ep != 0:
                record(obs)

            # save every `save_network_frequency` episodes
            if ep % save_network_frequency == 0 and ep != 0:
                print(' Average r : ', np.mean(total_reward[-save_network_frequency:]))
                saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
                print("save weights: ", sess.run(global_step), 'episodes')

            ep += 1
            sess.run(increment_global_step_op)

    except KeyboardInterrupt:
        print('\n\nManually stopped the training (KeyboardInterrupt)')
        plt.plot(total_reward)
        plt.figure()
        plt.plot(total_length)
        plt.figure()
        plt.plot(total_captured)
        saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
        # Summaries need an observation batch; none exists if the interrupt
        # arrived before the first rollout finished.
        if obs is not None:
            record(obs)
        print("save: ", sess.run(global_step), 'episodes')

   50/20000 [.....] - ETA: 10:02:59

InvalidArgumentError: Nan in summary histogram for: weights/conv1/weights
	 [[{{node weights/conv1/weights}} = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](weights/conv1/weights/tag, conv1/weights/read/_39)]]

Caused by op 'weights/conv1/weights', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/namsong/.local/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 499, in start
    self.io_loop.start()
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1434, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 346, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 259, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 513, in execute_request
    user_expressions, allow_stdin,
  File "/home/namsong/.local/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/namsong/.local/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2817, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2843, in _run_cell
    return runner(coro)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3018, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3183, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/namsong/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-e3289755b914>", line 2, in <module>
    myAgent = agent(lr=1e-5,in_size=[None,VISION_dX,VISION_dY,6],action_size=5) #Load the agent.
  File "<ipython-input-7-ced0632fdac5>", line 40, in __init__
    tf.summary.histogram(var.op.name, var)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/summary/summary.py", line 187, in histogram
    tag=tag, values=values, name=scope)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_logging_ops.py", line 284, in histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3272, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1768, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Nan in summary histogram for: weights/conv1/weights
	 [[{{node weights/conv1/weights}} = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](weights/conv1/weights/tag, conv1/weights/read/_39)]]


In [None]:
%%javascript
// Ask the notebook front end to delete the current kernel session.
// NOTE(review): presumably meant to release the GPU once training has finished -- confirm.
Jupyter.notebook.session.delete();