# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Simple Policy gradient with experience buffer.
- The implementation network is slightly different
    - Better code for mini-batch
    - include self-play for red
    - 19x19 vision
- Add LSTM network, serialize learning

### Sampling
- [x] Mini-batch to update 'average' gradient
- [x] Experience Replay for Random Sampling
- [ ] Importance Sampling

## Notes

- This notebook includes:
    - Building the structure of policy driven network.
    - Training with/without render
    - Saver that saves the model and weights to the ./model directory
    - Writer that records the necessary data to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- enemy with different policies (zero, patrol)
- stochastic interaction

In [1]:
# WARNING: removes the existing logs and checkpoints for the 'VANILLA_lstm' run.
# The path is hard-coded here and must be kept in sync with TRAIN_NAME below.
# Skip this cell when resuming training; otherwise the checkpoint-restore
# branch in the Session cell finds nothing to load.
!rm -rf logs/VANILLA_lstm/ model/VANILLA_lstm

In [2]:
# Name of this training run; it selects the log and checkpoint sub-directories.
TRAIN_NAME = 'VANILLA_lstm'
LOG_PATH = './logs/{}'.format(TRAIN_NAME)      # where the summary FileWriter writes
MODEL_PATH = './model/{}'.format(TRAIN_NAME)   # where the Saver writes checkpoints
# Fraction (not percent) of GPU memory this process may use; it is passed as
# per_process_gpu_memory_fraction when the session is created.
GPU_CAPACITY = 0.125

In [6]:
import os

from itertools import count

import tensorflow as tf
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
from datetime import datetime
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy.
import policy.policy_RL
import policy.random
import policy.zeros

from network.REINFORCE_lstm import REINFORCE as network

# Data Processing Module
from utility.dataModule import one_hot_encoder_v2 as one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import Experience_buffer, discount_rewards

## Hyperparameters

In [4]:
# --- Training schedule ---
total_episode = 120000      # number of episodes requested in the Run cell
max_ep = 150                # frame index at which an unfinished episode is cut off
update_frequency = 50       # episodes between applications of the accumulated gradient
batch_size = 2000           # samples requested from the experience buffer per episode
experience_size = 10000

# --- Checkpoint / logging cadence (in episodes) ---
save_network_frequency = 1024
save_stat_frequency = 128
moving_average_step = 128   # argument given to each MovingAverage tracker

# --- Model and environment parameters ---
LEARNING_RATE = 1e-3
gamma = 0.99                # discount factor handed to discount_rewards
map_size = 20
vision_range = 9
# The vision window is square: (2 * range + 1) cells on each side.
vision_dX = vision_dY = 2 * vision_range + 1
n_channel = 11

# Network input shape; the leading None leaves the batch dimension open.
in_size = [None, vision_dX, vision_dY, n_channel]

## Environment Setting

In [None]:
# Make sure the checkpoint directory (MODEL_PATH) and the summary-log directory
# (LOG_PATH) exist. exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
for _directory in (MODEL_PATH, LOG_PATH):
    os.makedirs(_directory, exist_ok=True)

In [None]:
# Environment Related
action_space = 5   # number of discrete actions sampled from in policy_rollout
n_agent = 4        # number of agents an action is sampled for on every frame

env = gym.make("cap-v0") # initialize the environment
# Controller for the red team; it is handed to env.reset at the start of each episode.
policy_red = policy.zeros.PolicyGen(env.get_map, env.get_team_red)

red number :  4
blue number :  4


## Network Setting

In [None]:
tf.reset_default_graph() # Clear the Tensorflow graph.

# Build the policy network. The input shape and action count come from the
# hyperparameter / environment cells instead of being repeated as literals
# (the previous VISION_dX / VISION_dY names were never defined).
# The rollout and training cells refer to the network as `myAgent`.
myAgent = network(lr=LEARNING_RATE, in_size=in_size, action_size=action_space, grad_clip_norm=50) #Load the agent.
agent = myAgent  # keep the earlier name available as an alias

# Episode counter stored in the graph so that it is saved with the checkpoints.
global_step = tf.Variable(0, trainable=False, name='global_step') # global step
increment_global_step_op = tf.assign(global_step, global_step+1)

# Collect every summary op registered in the graph so far.
merged = tf.summary.merge_all()

## Session

In [None]:
# Launch the session with a cap on this process's share of GPU memory.
gpu_options = tf.GPUOptions(allow_growth=True,
                            per_process_gpu_memory_fraction=GPU_CAPACITY)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

# One independent moving-average tracker per recorded statistic.
ma_reward, ma_length, ma_captured = (MA(moving_average_step) for _ in range(3))

# Setup Save and Restore Network
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint in MODEL_PATH when one exists;
# otherwise start from freshly initialized variables.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
has_checkpoint = bool(ckpt) and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)
if not has_checkpoint:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
else:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)

Initialized Variables


In [None]:
def record(summary_):
    """Write the moving-average statistics and a network summary to the log.

    Both entries are tagged with the current value of `global_step`, which is
    fetched once so that the two records always share the same step.

    Args:
        summary_: summary to log alongside the statistics; the training loop
            passes the result of running the `merged` summary op.
    """
    with tf.device('/cpu:0'):
        step = sess.run(global_step)

        # Scalar statistics taken from the moving-average trackers.
        stats = tf.Summary()
        stats.value.add(tag='Records/mean_reward', simple_value=ma_reward())
        stats.value.add(tag='Records/mean_length', simple_value=ma_length())
        stats.value.add(tag='Records/mean_succeed', simple_value=ma_captured())
        writer.add_summary(stats, step)

        # Summary computed by the caller.
        writer.add_summary(summary_, step)

        writer.flush()

In [None]:
        # NOTE(review): this cell is an indented method body with no enclosing
        # `def`/class in view. It reads `self.*` and ends with `return`, so it
        # cannot run as a notebook cell as written. Presumably kept as a
        # reference for recurrent-state rollout batching — confirm, then wrap
        # it in its class or remove it.
        observations, actions, rewards = [], [], []
        # One list per recurrent layer, recording the state fed in at each step.
        init_states = tuple([] for _ in range(self.num_layers))
        observation = self.env.reset()
        # Zero state of shape (1, gru_unit_size) for every layer at episode start.
        init_state = tuple(
             [np.zeros((1, self.gru_unit_size)) for _ in range(self.num_layers)])
        for t in range(self.max_step):

            # The observation gets two leading axes before being passed to the
            # policy (presumably batch and time — TODO confirm).
            action, final_state = self.policy.sampleAction(
                                        observation[np.newaxis, np.newaxis, :],
                                        init_state)
            next_observation, reward, done, _ = self.env.step(action)

            # appending the experience
            observations.append(observation)
            actions.append(action)
            rewards.append(reward)
            for i in range(self.num_layers):
              init_states[i].append(init_state[i][0])

            # going to next state
            observation = next_observation
            init_state = final_state
            if done:
                break

        self.flush_summary(np.sum(rewards))
        returns = self.compute_monte_carlo_returns(rewards)
        # Standardize the returns; 1e-8 guards against a zero standard deviation.
        returns = (returns - np.mean(returns)) / (np.std(returns) + 1e-8)
        episode = dict(
                    observations = np.array(observations),
                    actions = np.array(actions),
                    returns = np.array(returns),
                    init_states = init_states,
                    )
        # Number of num_step-long chunks needed to hold the episode (ceiling division).
        episode_size = len(episode["returns"])
        if episode_size % self.num_step:
            batch_from_episode = (episode_size // self.num_step + 1)
        else:
            batch_from_episode = (episode_size // self.num_step)

        # Steps of zero padding appended to fill the last chunk, and the number
        # of real steps that last chunk contains.
        extra_length = batch_from_episode * self.num_step - episode_size
        last_batch_size = self.num_step - extra_length

        batched_episode = {}
        for key, value in episode.items():
            if key == "init_states":
                # Keep only the state recorded at the first step of each chunk.
                truncated_value = tuple(value[i][::self.num_step] for i in
                                        range(self.num_layers))
                batched_episode[key] = truncated_value
            else:
                # Zero-pad along the first axis, then split into rows of num_step steps.
                expanded_value = np.concatenate([value, np.zeros((extra_length,) +
                                                     value.shape[1:])])
                batched_episode[key] = expanded_value.reshape((-1, self.num_step) +
                                                         value.shape[1:])

        # Real (unpadded) length of every chunk: full chunks, then the last one.
        seq_len = [self.num_step] * (batch_from_episode - 1) + [last_batch_size]
        batched_episode["seq_len"] = np.array(seq_len)
        return batched_episode

In [None]:
def get_actions(states):
    """Placeholder for action selection: ignores `states` and returns None."""
    return None

def policy_rollout(PARTIAL=False):
    """Run a single episode with the current policy and collect its experience.

    Args:
        PARTIAL: when True, observations are encoded from the state returned
            by the environment; when False, from the environment's internal
            `_env` map using `vision_range`.

    Returns:
        A list `[frame, ep_history, r1, env.blue_win, obs]` holding the index
        of the last frame played, the stacked
        `[observation, action, discounted reward]` rows of all blue agents
        (an empty list when nothing was recorded), the last raw environment
        reward (-100 when the episode timed out), the environment's `blue_win`
        flag, and the observation fed to the network on the last frame.
    """
    def encode(raw_state):
        # Build the network input for the blue team from the chosen source.
        if PARTIAL:
            return one_hot_encoder(raw_state, env.get_team_blue)  # partial observation
        return one_hot_encoder(env._env, env.get_team_blue, vision_range)  # full observation

    # Temporary fix for episode reset
    s = env.reset(map_size=map_size, policy_red=policy_red)
    obs_next = encode(s)

    ep_history = []
    indv_history = [[] for _ in range(len(env.get_team_blue))]

    was_alive = [ag.isAlive for ag in env.get_team_blue]
    prev_reward = 0
    for frame in range(max_ep+1):
        obs = obs_next

        with tf.device('/cpu:0'):
            act_prob = sess.run(myAgent.output, feed_dict={myAgent.state_input:obs})
        # Divide by the row sum so each row is a valid probability vector
        # even when the network output does not sum to exactly one.
        act = [np.random.choice(action_space, p=act_prob[x]/sum(act_prob[x])) for x in range(n_agent)]

        s, r1, d, _ = env.step(act)

        # Per-frame reward: difference between this frame's and the previous
        # frame's environment reward.
        r = r1 - prev_reward

        # Reaching the frame limit without finishing counts as a failed episode.
        if frame == max_ep and not d:
            r = -100
            r1 = -100
            d = True

        r /= 100  # scale the reward down before it is stored

        obs_next = encode(s)

        # Push history for individual that 'was' alive previous frame
        for idx, alive in enumerate(was_alive):
            if alive:
                indv_history[idx].append([obs[idx], act[idx], r])

        # State Transition
        prev_reward = r1
        was_alive = [ag.isAlive for ag in env.get_team_blue]

        if d:
            break

    # Discount each agent's rewards separately, then pool all rows together.
    for history in indv_history:
        if len(history) == 0:
            continue
        # Rows mix an array with scalars, so an object array is requested
        # explicitly; recent NumPy versions reject ragged input otherwise.
        _history = np.array(history, dtype=object)
        _history[:, 2] = discount_rewards(_history[:, 2], gamma)
        ep_history.extend(_history)

    if len(ep_history) > 0:
        ep_history = np.stack(ep_history)

    return [frame, ep_history, r1, env.blue_win, obs]

## Training

In [None]:
def run_training(num_ep):
    """Train the policy for `num_ep` episodes, logging and checkpointing on the way.

    Every episode is rolled out with partial observations, added to the
    experience buffer, and a sampled batch is used to accumulate gradients.
    Every `update_frequency` episodes the accumulated gradient is applied and
    the buffer is flushed.

    Args:
        num_ep: number of episodes to run, or -1 to run until interrupted.

    Returns:
        0 when the requested number of episodes completed, 1 when training
        was stopped by a KeyboardInterrupt (a checkpoint is saved first).
    """
    ep = sess.run(global_step)

    exp_buffer = Experience_buffer(experience_shape=3)
    # Last batch fed to the network; stays None until a non-empty batch has
    # been sampled, so the summary step never runs on an undefined feed.
    feed_dict = None
    try:
        # An open-ended run gets a large nominal target for the progress bar.
        progbar_target = 9999999 if num_ep == -1 else num_ep
        progbar = tf.keras.utils.Progbar(progbar_target, width=5, interval=0.5)

        episodes_run = 0
        while num_ep == -1 or episodes_run < num_ep:
            episodes_run += 1
            ep += 1
            progbar.update(episodes_run) # update progress bar

            # Run episode
            frame, history, reward, did_won, _ = policy_rollout(True)

            # Add history
            exp_buffer.add(history)

            batch_history = exp_buffer.sample(batch_size) # Sample from experience replay
            if len(batch_history) > 0:
                feed_dict = {myAgent.reward_holder: batch_history[:,2],
                             myAgent.action_holder: batch_history[:,1],
                             myAgent.state_input: np.stack(batch_history[:,0])}
                with tf.device('/gpu:0'):
                    sess.run(myAgent.accumulate_gradient, feed_dict=feed_dict)

            # `ep` is at least 1 here, so only the modulus needs checking.
            if ep % update_frequency == 0:
                with tf.device('/gpu:0'):
                    sess.run(myAgent.update_batch)
                    sess.run(myAgent.clear_batch)
                exp_buffer.flush()

            # summarize and record
            ma_reward.append(reward)
            ma_length.append(frame)
            ma_captured.append(did_won)

            if ep % save_stat_frequency == 0 and feed_dict is not None:
                summary_ = sess.run(merged, feed_dict=feed_dict)
                record(summary_)

            # save weight
            if ep % save_network_frequency == 0:
                saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)

            sess.run(increment_global_step_op)

        return 0

    except KeyboardInterrupt:
        # Keep the progress made so far before handing control back.
        print('\n\nManually stopped the training (KeyboardInterrupt)')
        saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
        print("save: ", sess.run(global_step), 'episodes')

        return 1

## Run

In [None]:
# Train against a red team driven by policy.zeros.PolicyGen.
print('Training with fixed policy')
# Rebinds the module-level `policy_red` that policy_rollout passes to env.reset.
policy_red = policy.zeros.PolicyGen(env.get_map, env.get_team_red)
# Runs `total_episode` episodes; a KeyboardInterrupt is handled inside
# run_training, which saves a checkpoint before returning.
run_training(total_episode)
print('training with fixed red: Done')

Training with fixed policy
     8/150000 [.....] - ETA: 31:04:42