# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Recurrent Policy Gradient

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [x] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [ ] Gradient clipping
- [ ] Normalized Reward/Advantage
- [x] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [ ] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without rendering
    - A Saver that saves the model and weights to the ./model directory
    - A Writer that records the necessary training data to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- Research on '_bootstrap_' instead of end-reward
- Add global step
- Think about adding discount to advantage
- Normalize reward?
- Record method in network

In [1]:
# Start from a clean slate: delete the summary logs and saved checkpoints of any
# previous RPG_v1 run. NOTE(review): after this cell has run, the checkpoint
# restore in the setup cell below cannot find anything to load.
!rm -rf logs/RPG_v1/ model/RPG_v1

In [2]:
# Name of this training run; every output directory below is derived from it.
TRAIN_NAME = 'RPG_v1'

LOG_PATH = f'./logs/{TRAIN_NAME}'      # summary files written by the FileWriter
MODEL_PATH = f'./model/{TRAIN_NAME}'   # checkpoints written by the Saver
RENDER_PATH = f'./render/{TRAIN_NAME}'

In [3]:
import os
import configparser
from tqdm import tqdm

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.signal

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import one_hot_encoder as one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import discount_rewards, normalize
from utility.buffer import Trajectory, Trajectory_buffer

from network.rpg import RPG as Network

import imageio

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [4]:
# Importing global configuration
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed.  Fail fast with a clear message instead of a
# confusing "no option / no section" error on the first lookup below.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the working directory")

## Environment
action_space = config.getint('DEFAULT','ACTION_SPACE')
n_agent = config.getint('DEFAULT','NUM_AGENT')
map_size = config.getint('DEFAULT','MAP_SIZE')
vision_range = config.getint('DEFAULT','VISION_RANGE')

## Training
total_episodes = config.getint('TRAINING','TOTAL_EPISODES')
max_ep = config.getint('TRAINING','MAX_STEP')
critic_beta = config.getfloat('TRAINING', 'CRITIC_BETA')
entropy_beta = config.getfloat('TRAINING', 'ENTROPY_BETA')
gamma = config.getfloat('TRAINING', 'DISCOUNT_RATE')

decay_lr = config.getboolean('TRAINING','DECAYING_LR')
# The learning rates are hard-coded for this experiment; the config lookups
# they replace are kept in the trailing comments.
lr_a = 1e-2 # config.getfloat('TRAINING','LR_ACTOR')
lr_c = 2e-3 # config.getfloat('TRAINING','LR_CRITIC')

## Save/Summary
save_network_frequency = config.getint('TRAINING','SAVE_NETWORK_FREQ')
save_stat_frequency = config.getint('TRAINING','SAVE_STATISTICS_FREQ')
moving_average_step = config.getint('TRAINING','MOVING_AVERAGE_SIZE')

## GPU
gpu_capacity = config.getfloat('GPU_CONFIG','GPU_CAPACITY')
gpu_allowgrow = config.getboolean('GPU_CONFIG', 'GPU_ALLOWGROW')

In [None]:
# Local configuration parameters
# Number of trajectories to accumulate before the network is updated.
batch_size = 32

# Env Settings
n_channel = 6
# The agent's field of view is a square of side 2 * vision_range + 1.
vision_dx = vision_dy = 2 * vision_range + 1
# Flattened observation size; the leading None leaves the first dimension open.
in_size = [None, vision_dx * vision_dy * n_channel]

# Asynch Settings
global_scope = 'global'

## Environment Setting

In [None]:
# Make sure every output directory exists before anything is written to it:
#   MODEL_PATH  - checkpoints
#   LOG_PATH    - summary logs
#   RENDER_PATH - presumably episode playback gifs (not written in this section)
# exist_ok=True replaces the check-then-create pattern, which can raise if the
# directory appears between the existence check and the makedirs call.
for _output_dir in (MODEL_PATH, LOG_PATH, RENDER_PATH):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# One moving-average tracker per logged statistic, each built from
# moving_average_step.
global_rewards, global_ep_rewards, global_length, global_succeed = (
    MA(moving_average_step) for _ in range(4)
)

# Launch the session
gpu_options = tf.GPUOptions(
    allow_growth=gpu_allowgrow,
    per_process_gpu_memory_fraction=gpu_capacity,
)
session_config = tf.ConfigProto(
    log_device_placement=True,
    gpu_options=gpu_options,
)
sess = tf.Session(config=session_config)
#progbar = tf.keras.utils.Progbar(total_episodes,interval=1)

In [None]:
# Build the Capture-the-Flag environment and keep the unwrapped env object.
env = gym.make("cap-v0").unwrapped

# Initial reset: sets the map size and gives the red team the policy.zeros policy.
env.reset(map_size=map_size,
          policy_red=policy.zeros.PolicyGen(env.get_map, env.get_team_red))
# Policy network, created on the shared session with the learning rates set above.
network = Network(in_size=in_size,
                  action_size=action_space,
                  lr_policy=lr_a,
                  lr_baseline=lr_c,
                  tau=0.5,
                  sess=sess)

# The Saver is created after the network so the network's variables already exist;
# max_to_keep=3 keeps only the three most recent checkpoints.
saver = tf.train.Saver(max_to_keep=3)
# Summary writer for LOG_PATH; the session graph is passed in as well.
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint in MODEL_PATH if there is one, otherwise
# start from freshly initialised variables.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")

  result = entry_point.load(False)


Initialized Variables


In [None]:
def run():
    """Train for ``total_episodes + 1`` episodes, logging and checkpointing.

    Each iteration plays one episode through ``rollout``, feeds the episode
    statistics into the module-level moving averages, and then:

    * writes the moving averages to the summary writer whenever ``rollout``
      returned a graph summary, or every ``save_stat_frequency`` episodes
      (episode 0 excluded); the graph summary itself is written alongside
      when present;
    * saves a checkpoint every ``save_network_frequency`` episodes
      (episode 0 excluded).

    The trajectory count returned by ``rollout`` is passed back in as
    ``init_step`` on the next call.
    """
    # The moving-average trackers are only mutated through .append(), never
    # rebound, so no ``global`` declaration is needed.
    batch_count = 0
    for episode in tqdm(range(total_episodes + 1)):
        ep_r, r, length, batch_count, s, summary_ = rollout(init_step=batch_count,
                                                            episode=episode)

        global_ep_rewards.append(ep_r)
        global_rewards.append(r)
        global_length.append(length)
        global_succeed.append(s)

        # Identity comparison: ``!= None`` would dispatch to the summary
        # object's own __ne__, which is not what is meant here.
        if summary_ is not None or (episode % save_stat_frequency == 0 and episode != 0):
            summary = tf.Summary()
            summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
            summary.value.add(tag='Records/mean_length', simple_value=global_length())
            summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
            summary.value.add(tag='Records/mean_episode_reward', simple_value=global_ep_rewards())
            writer.add_summary(summary, episode)
            if summary_ is not None:
                writer.add_summary(summary_, episode)
            writer.flush()

        if episode % save_network_frequency == 0 and episode != 0:
            saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=episode)


def get_action(states, rnn_states):
    """Query the network once per agent.

    Args:
        states: Per-agent observations; ``states[i]`` belongs to agent ``i``.
        rnn_states: Per-agent recurrent states, indexed like ``states``.

    Returns:
        Three lists ``(actions, values, final_states)``, each holding one
        entry per agent in the order of ``states``.
    """
    per_agent = []
    for agent_idx, observation in enumerate(states):
        # Prepend two singleton axes (presumably batch and time - confirm
        # against network.feed_forward).
        net_input = observation[np.newaxis, np.newaxis, :]
        action, value, final_state = network.feed_forward(
            state=net_input,
            rnn_state=rnn_states[agent_idx],
        )
        per_agent.append((action, value, final_state))

    actions = [result[0] for result in per_agent]
    values = [result[1] for result in per_agent]
    final_states = [result[2] for result in per_agent]
    return actions, values, final_states

# Trajectories collected since the last network update.  rollout() is called
# once per episode, but an update only fires once `batch_size` trajectories
# have accumulated, so the pending ones must survive between calls.
_pending_experience = []


def rollout(init_step=0, episode=0, train=True):
    """Play one episode with the current policy and train once enough data is pending.

    Previously the collected experience lived in a local list that was rebuilt
    on every call, while the trajectory counter was carried across episodes via
    ``init_step``.  When the counter finally reached ``batch_size`` only the
    most recent episode was passed to the network and everything gathered in
    the earlier episodes was silently dropped.  The trajectories are now kept
    in a module-level buffer until the update happens.

    Args:
        init_step: Number of trajectories already pending from earlier
            episodes (the ``batch_count`` returned by the previous call).
            ``0`` means nothing is pending, and any stale leftovers (e.g. from
            an interrupted run) are discarded.
        episode: Episode index; currently unused.
        train: When False the episode is only played and ``None`` is returned.

    Returns:
        ``None`` if ``train`` is False, otherwise the tuple
        ``(ep_r, rc, step, batch_count, blue_win, graph_summary)``:

        * ``ep_r``: sum of the scaled per-step rewards of this episode,
        * ``rc``: last reward value reported by ``env.step`` (``-100`` when
          the episode hit the step limit),
        * ``step``: number of environment steps taken,
        * ``batch_count``: trajectories pending after this call (``0`` right
          after an update),
        * ``blue_win``: ``env.blue_win`` at the end of the episode,
        * ``graph_summary``: summary returned by ``network.feed_backward``,
          or ``None`` when no update ran.
    """
    batch_count = init_step

    env.reset()
    s0 = one_hot_encoder(env._env, env.get_team_blue, vision_range, flatten=True)

    ep_r = 0    # episodic reward (sum of the scaled per-step rewards)
    prev_r = 0  # env reward at the previous step, used to take the difference
    step = 0
    d = False

    # One trajectory buffer per agent; the columns are filled below as
    # (state, action, reward, value).
    trajs = [Trajectory(depth=4) for _ in range(n_agent)]

    # Fresh recurrent state for every agent.
    rnn_states = [sess.run(network.rnn_eval_init) for _ in range(n_agent)]

    # The action for the first step is computed before entering the loop; inside
    # the loop the action for step t+1 is always prepared at the end of step t.
    a1, v1, final_states = get_action(s0, rnn_states)
    is_alive = [ag.isAlive for ag in env.get_team_blue]

    while step <= max_ep and not d:
        a, v0 = a1, v1
        was_alive = is_alive
        rnn_states = final_states

        _, rc, d, _ = env.step(a)
        s1 = one_hot_encoder(env._env, env.get_team_blue, vision_range, flatten=True)

        is_alive = [ag.isAlive for ag in env.get_team_blue]
        # Per-step reward: change in the env reward (presumably cumulative -
        # confirm against gym_cap) minus a small constant step penalty.
        r = rc - prev_r - 0.01

        # Running out of steps is treated as a loss.
        if step == max_ep and not d:
            r = -100
            rc = -100
            d = True

        r /= 100.0
        ep_r += r

        if not d:
            a1, v1, final_states = get_action(s1, rnn_states)

        # Only agents that were alive when the action was taken record the step.
        for idx, alive in enumerate(was_alive):
            if alive:
                trajs[idx].append([s0[idx], a[idx], r, v0[idx]])

        # Iteration
        prev_r = rc
        step += 1
        s0 = s1

    if not train:
        return

    # init_step == 0 means the caller has nothing pending, so drop leftovers
    # to keep the buffer in sync with the counter.
    if init_step == 0:
        _pending_experience.clear()

    for traj in trajs:
        if len(traj) == 0:
            continue

        # NOTE(review): the discount factor is hard-coded to 0.98 rather than
        # the configured `gamma` - confirm which one is intended.
        discounted = discount_rewards(np.array(traj[2]), 0.98)
        #_rew = np.clip(_rew / reward_statistics.std, -10, 10)

        _pending_experience.append([
            np.stack(traj[0]),     # states
            np.array(traj[1]),     # actions
            np.array(discounted),  # discounted rewards
            np.array(traj[3]),     # value estimates recorded at action time
        ])
        batch_count += 1

    # Update the network once enough trajectories are pending.
    if batch_count >= batch_size:
        # Pass a copy so that clearing the buffer cannot affect anything the
        # network might keep a reference to.
        graph_summary, _ = network.feed_backward(list(_pending_experience))

        # Reset
        _pending_experience.clear()
        batch_count = 0
    else:
        graph_summary = None

    return ep_r, rc, step, batch_count, env.blue_win, graph_summary

## Run
### TODO
- The graph summary is often not defined (`rollout` returns `None` for it when no network update ran)

In [None]:
# Start training: plays total_episodes + 1 episodes with periodic logging and checkpointing.
run()

  1%|          | 854/100001 [46:18<98:33:36,  3.58s/it] 

## Test