# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic
- On Policy

### Sampling
- [x] Mini-batch to update 'average' gradient
- [x] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [x] Gradient clipping
- [ ] Normalized Reward/Advantage
- [ ] Target Network
- [ ] TRPO
- [x] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [ ] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without render
    - A Saver that saves the model and weights to the ./model directory
    - A Writer that records the necessary data to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- Research on '_bootstrap_' instead of end-reward
- Add global step
- Think about adding a discount to the advantage
- Normalize reward?
- Record method in network

In [1]:
# Start from a clean slate: recursively delete the log and model directories
# left behind by a previous PPO_LSTM_v2 run (no error if they do not exist).
# NOTE(review): the run name is hard-coded here; keep it in sync with TRAIN_NAME.
!rm -rf logs/PPO_LSTM_v2/ model/PPO_LSTM_v2

In [2]:
# Name of this training run; it selects the sub-directories used below.
TRAIN_NAME = 'PPO_LSTM_v2'
# Directory that receives the summary/event files of this run.
LOG_PATH = f'./logs/{TRAIN_NAME}'
# Directory that receives the saved checkpoints of this run.
MODEL_PATH = f'./model/{TRAIN_NAME}'

In [3]:
import os
import configparser
#from tqdm import tqdm

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.signal

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import one_hot_encoder as one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import discount_rewards, normalize
from utility.buffer import Trajectory, Trajectory_buffer

from network.ppo_lstm import PPO as Network

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [4]:
# Importing global configuration
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse. Without this check a missing file only shows up later
# as a confusing NoOptionError on the first lookup, so fail early instead.
if not config.read('config.ini'):
    raise FileNotFoundError("Unable to read 'config.ini' from the current working directory")

## Environment
action_space = config.getint('DEFAULT', 'ACTION_SPACE')
n_agent = 1  # hard-coded; DEFAULT/NUM_AGENT from config.ini is not used here
map_size = config.getint('DEFAULT', 'MAP_SIZE')
vision_range = config.getint('DEFAULT', 'VISION_RANGE')

## Training
total_episodes = config.getint('TRAINING', 'TOTAL_EPISODES')
max_ep = config.getint('TRAINING', 'MAX_STEP')  # step limit of a single episode
critic_beta = config.getfloat('TRAINING', 'CRITIC_BETA')
entropy_beta = config.getfloat('TRAINING', 'ENTROPY_BETA')
gamma = config.getfloat('TRAINING', 'DISCOUNT_RATE')

decay_lr = config.getboolean('TRAINING', 'DECAYING_LR')
lr_a = 1e-3  # hard-coded; TRAINING/LR_ACTOR from config.ini is not used here
lr_c = 2e-3  # hard-coded; TRAINING/LR_CRITIC from config.ini is not used here

## Save/Summary
save_network_frequency = config.getint('TRAINING', 'SAVE_NETWORK_FREQ')
save_stat_frequency = config.getint('TRAINING', 'SAVE_STATISTICS_FREQ')
moving_average_step = config.getint('TRAINING', 'MOVING_AVERAGE_SIZE')

## GPU
gpu_capacity = config.getfloat('GPU_CONFIG', 'GPU_CAPACITY')
gpu_allowgrow = config.getboolean('GPU_CONFIG', 'GPU_ALLOWGROW')

In [None]:
# Local configuration parameters
# Episode index after which rollout() encodes the observation returned by the
# environment instead of env._env (switch to partial observability).
po_transition = 100000
# Number of environment steps gathered before the network is updated.
batch_size = 4 * 2048

# Env Settings
n_channel = 6
# The encoded view is a square window of side 2 * vision_range + 1.
vision_dx = vision_dy = 2 * vision_range + 1
in_size = [None, vision_dx * vision_dy * n_channel]

# Asynch Settings
global_scope = 'global'

## Environment Setting

In [None]:
# Make sure the output directories exist before anything is written to them.
# exist_ok=True replaces the separate os.path.exists() test, which left a
# window between the check and the creation.

# Directory for the saved checkpoints
os.makedirs(MODEL_PATH, exist_ok=True)

# Directory for the summary (event) files
os.makedirs(LOG_PATH, exist_ok=True)

In [None]:
# Running statistics reported during training. Each tracker is an independent
# MovingAverage built with `moving_average_step` (presumably its window size).
global_rewards, global_ep_rewards, global_length, global_succeed = (
    MA(moving_average_step) for _ in range(4)
)

# Launch the session
gpu_options = tf.GPUOptions(
    allow_growth=gpu_allowgrow,
    per_process_gpu_memory_fraction=gpu_capacity,
)
session_config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
sess = tf.Session(config=session_config)

# Progress bar over the configured number of training episodes.
progbar = tf.keras.utils.Progbar(total_episodes, interval=1)

In [None]:
# Build the unwrapped environment; the red team is driven by policy.zeros.
env = gym.make("cap-v0").unwrapped
red_policy = policy.zeros.PolicyGen(env.get_map, env.get_team_red)
env.reset(map_size=map_size, policy_red=red_policy)

# PPO network attached to the shared TensorFlow session.
network = Network(in_size=in_size, action_size=action_space, scope='main', sess=sess)

# The saver keeps only the three most recent checkpoints; the writer stores
# the graph and the training summaries under LOG_PATH.
saver = tf.train.Saver(max_to_keep=3)
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint in MODEL_PATH when one is available,
# otherwise start from freshly initialised variables.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
can_restore = ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)
if not can_restore:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
else:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)



  result = entry_point.load(False)


Initialized Variables


In [None]:
# Trajectories gathered since the last network update. rollout() appends one
# [states, actions, returns, advantages] entry per recorded agent trajectory
# and resets the list after each call to network.feed_backward().
experience = []

def run():
    """Run the training loop over episodes ``0 .. total_episodes`` (inclusive).

    Every iteration plays one episode with ``rollout``, feeds the episode
    statistics into the module-level moving averages and advances the
    progress bar. Summaries are written whenever ``rollout`` returned a
    network summary or on every ``save_stat_frequency``-th episode; a
    checkpoint is saved on every ``save_network_frequency``-th episode.
    Episode 0 is skipped for both periodic actions.
    """
    # Number of environment steps gathered since the last network update;
    # rollout() hands back the updated counter (reset to 0 after an update).
    batch_count = 0
    for episode in range(total_episodes + 1):
        ep_r, r, length, batch_count, s, summary_ = rollout(init_step=batch_count, episode=episode)

        global_ep_rewards.append(ep_r)
        global_rewards.append(r)
        global_length.append(length)
        global_succeed.append(s)

        progbar.update(episode)

        # Identity test: `!= None` would dispatch to the summary object's own
        # comparison operator instead of checking for "no summary".
        if summary_ is not None or (episode % save_stat_frequency == 0 and episode != 0):
            summary = tf.Summary()
            summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
            summary.value.add(tag='Records/mean_length', simple_value=global_length())
            summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
            summary.value.add(tag='Records/mean_episode_reward', simple_value=global_ep_rewards())
            writer.add_summary(summary, episode)
            if summary_ is not None:
                writer.add_summary(summary_, episode)
            writer.flush()

        if episode % save_network_frequency == 0 and episode != 0:
            saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=episode)


def get_action(states, rnn_states):
    """Query the network once per agent and gather the results.

    Args:
        states: Per-agent observations. Each one gets a leading axis
            (``np.newaxis``) before it is passed to the network.
        rnn_states: Per-agent recurrent states, indexed in the same order
            as ``states``.

    Returns:
        Three lists with one entry per agent: the actions, the value
        estimates and the recurrent states returned by the network.
    """
    per_agent = []
    for agent_idx, observation in enumerate(states):
        action, value, final_state = network.feed_forward(
            state=observation[np.newaxis, :],
            rnn_state=rnn_states[agent_idx],
        )
        per_agent.append((action, value, final_state))

    if not per_agent:
        return [], [], []

    # Transpose [(a, v, s), ...] into ([a, ...], [v, ...], [s, ...]).
    actions, values, final_states = (list(column) for column in zip(*per_agent))
    return actions, values, final_states

def _encode_observation(raw_state, episode):
    """One-hot encode the blue team's view as flattened feature vectors.

    Once ``episode`` exceeds ``po_transition`` the observation returned by
    the environment is encoded; before that ``env._env`` is encoded instead.
    """
    source = raw_state if po_transition < episode else env._env
    return one_hot_encoder(source, env.get_team_blue, vision_range, flatten=True)


def rollout(init_step=0, episode=0):
    """Play one episode, record its trajectories and update the network when due.

    Args:
        init_step: Number of environment steps already gathered since the
            last network update.
        episode: Index of the current episode; selects which observation is
            encoded (see ``po_transition``).

    Returns:
        A tuple ``(ep_r, rc, step, batch_count, blue_win, graph_summary)``:
        the accumulated scaled reward of the episode, the last raw reward
        (``-100`` when the episode hit the step limit), the number of steps
        played, the updated step counter (``0`` right after an update),
        ``env.blue_win`` and the result of ``network.feed_backward`` (``None``
        when no update happened).
    """
    global experience
    # Initialize run
    batch_count = init_step

    s0 = _encode_observation(env.reset(), episode)

    # parameters
    ep_r = 0  # Episodic Reward
    prev_r = 0
    step = 0
    d = False

    # Trajectory Buffers
    trajs = [Trajectory(depth=6) for _ in range(n_agent)]

    # RNN Initializer
    rnn_states = [sess.run([network.action_eval_init_state, network.critic_eval_init_state])
                  for _ in range(n_agent)]

    # Bootstrap
    a1, v1, final_states = get_action(s0, rnn_states)
    is_alive = [ag.isAlive for ag in env.get_team_blue]
    buffer_d = []

    while step <= max_ep and not d:
        a, v0 = a1, v1
        was_alive = is_alive
        rnn_states = final_states

        s1, rc, d, _ = env.step(a)
        s1 = _encode_observation(s1, episode)

        is_alive = [ag.isAlive for ag in env.get_team_blue]
        # Step reward: change of the environment reward since the previous
        # step, minus a small constant per-step penalty.
        r = rc - prev_r - 0.01

        # Step limit reached without termination: penalise and end the episode.
        if step == max_ep and not d:
            r = -100
            rc = -100
            d = True

        r /= 100.0
        ep_r += r

        if d:
            # Terminal state: nothing left to bootstrap from.
            v1 = [0.0 for _ in range(n_agent)]
        else:
            a1, v1, final_states = get_action(s1, rnn_states)

        # push to buffer (only agents that were alive before this step)
        buffer_d.append(d)
        for idx, _ in enumerate(env.get_team_blue):
            if was_alive[idx]:
                # NOTE(review): rnn_states was advanced to final_states at the
                # top of this iteration, so the state stored here is the one
                # produced after processing s0 - confirm this is what
                # feed_backward expects.
                trajs[idx].append([s0[idx],
                                   a[idx],
                                   r,
                                   v0[idx],
                                   0,
                                   rnn_states[idx]
                                  ])

        # Iteration
        prev_r = rc
        batch_count += 1
        step += 1
        s0 = s1

        #env.render(mode='fast')

    # Turn each recorded trajectory into returns and advantages
    ds = np.array(buffer_d)
    for idx, traj in enumerate(trajs):
        if len(traj) == 0:
            continue

        _rew = np.array(traj[2])
        _val = np.append(traj[3], [v1[idx]])  # Bootstrap
        # One-step TD residuals; the next value is masked out on terminal steps.
        _td = _rew + gamma * _val[1:] * (1 - ds) - _val[:-1]
        # NOTE(review): 0.931 == 0.98 * 0.95, presumably gamma * lambda of a
        # GAE-style estimate - confirm it matches the configured DISCOUNT_RATE.
        _adv = discount_rewards(_td, 0.931, mask_array=ds)
        _ret = _adv + _val[:-1]
        traj[3] = _td.tolist()
        traj[4] = _adv.tolist()

        bs, ba, br, badv = traj[0], np.vstack(traj[1]), np.vstack(_ret), np.vstack(_adv)

        experience.append([bs, ba, br, badv])

    # Update ppo
    if batch_count >= batch_size:
        # Per batch normalisation of advantages. The statistics cover the
        # whole batch, so compute them once instead of once per trajectory.
        advs = np.concatenate([x[3] for x in experience])
        adv_mean = np.mean(advs)
        adv_std = np.maximum(np.std(advs), 1e-6)  # guard against division by ~0
        for x in experience:
            x[3] = (x[3] - adv_mean) / adv_std

        print(f'experience length: {len(experience)}')
        graph_summary = network.feed_backward(experience)
        batch_count, experience = 0, []
    else:
        graph_summary = None

    return ep_r, rc, step, batch_count, env.blue_win, graph_summary

## Run

In [None]:
# Start training; the call returns once every episode of the loop in run()
# has been played.
run()

    61/150000 [..............................] - ETA: 37:28:21experience length: 63
Train: 2036 batches trained, 0 episodes: 94.7599470615387 sec
   122/150000 [..............................] - ETA: 65:24:09experience length: 61
Train: 2008 batches trained, 0 episodes: 77.29691696166992 sec
   188/150000 [..............................] - ETA: 68:48:55experience length: 66
Train: 2023 batches trained, 0 episodes: 98.82200956344604 sec
   249/150000 [..............................] - ETA: 75:27:26experience length: 63
Train: 2020 batches trained, 0 episodes: 79.2430648803711 sec
   319/150000 [..............................] - ETA: 74:48:52experience length: 68
Train: 2012 batches trained, 0 episodes: 99.64769172668457 sec
   380/150000 [..............................] - ETA: 78:16:31experience length: 63
Train: 2019 batches trained, 0 episodes: 79.63556361198425 sec
   445/150000 [..............................] - ETA: 78:20:17experience length: 63
Train: 2042 batches trained, 0 episo