# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic
- On Policy

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [ ] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [x] Gradient clipping
- [ ] Normalized Reward/Advantage
- [ ] Target Network
- [ ] TRPO
- [x] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [x] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of policy driven network.
    - Training with/without render
    - Saver that save model and weights to ./model directory
    - Writer that will record some necessary datas to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- Research on '_bootstrap_' instead of end-reward
- Add global step
- Think about adding discont to advantage
- Normalize reward?
- Record method in network

In [1]:
!rm -rf logs/A3C_lstm_v3/ model/A3C_lstm_v3

In [2]:
TRAIN_NAME='A3C_lstm_v3'
LOG_PATH='./logs/'+TRAIN_NAME
MODEL_PATH='./model/' + TRAIN_NAME

In [3]:
import os
import configparser
#from tqdm import tqdm

import signal
import threading
import multiprocessing

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.signal

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import one_hot_encoder_v2 as one_hot_encoder
from utility.utils import MovingAverage as MA
#from utility.utils import discount_rewards
from utility.buffer import Trajectory, Trajectory_buffer

from network.ActorCritic_lstm import ActorCritic as Network

%load_ext autoreload
%autoreload 2

In [4]:
def discount_rewards(x, gamma):
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

## Hyperparameters

In [5]:
# Importing global configuration
config = configparser.ConfigParser()
config.read('config.ini')

## Environment
action_space = config.getint('DEFAULT','ACTION_SPACE')
n_agent = config.getint('DEFAULT','NUM_AGENT')
map_size = config.getint('DEFAULT','MAP_SIZE')
vision_range = config.getint('DEFAULT','VISION_RANGE')

## Training
total_episodes = config.getint('TRAINING','TOTAL_EPISODES')
max_ep = config.getint('TRAINING','MAX_STEP')
critic_beta = config.getfloat('TRAINING', 'CRITIC_BETA')
entropy_beta = config.getfloat('TRAINING', 'ENTROPY_BETA')
gamma = config.getfloat('TRAINING', 'DISCOUNT_RATE')

decay_lr = config.getboolean('TRAINING','DECAYING_LR')
lr_a = 5e-5 # config.getfloat('TRAINING','LR_ACTOR')
lr_c = 1e-4 # config.getfloat('TRAINING','LR_CRITIC')

## Save/Summary
save_network_frequency = config.getint('TRAINING','SAVE_NETWORK_FREQ')
save_stat_frequency = config.getint('TRAINING','SAVE_STATISTICS_FREQ')
moving_average_step = config.getint('TRAINING','MOVING_AVERAGE_SIZE')

## GPU
gpu_capacity = config.getfloat('GPU_CONFIG','GPU_CAPACITY')
gpu_allowgrow = config.getboolean('GPU_CONFIG', 'GPU_ALLOWGROW')

In [6]:
# Local configuration parameters
po_transition = 100000 # Partial observable
serial_length = 16

# Env Settings
n_channel = 11
vision_dx, vision_dy = 2*vision_range+1, 2*vision_range+1
in_size = [None,vision_dx,vision_dy,n_channel]
nenv = 8#(int) (multiprocessing.cpu_count())

# Asynch Settings
global_scope = 'global'

## Environment Setting

In [7]:
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
    
#Create a directory to save episode playback gifs to
if not os.path.exists(LOG_PATH):
    os.makedirs(LOG_PATH)

In [8]:
global_rewards = MA(moving_average_step)
global_ep_rewards = MA(moving_average_step)
global_length = MA(moving_average_step)
global_succeed = MA(moving_average_step)
global_episodes = 0

# Launch the session
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_capacity,
                            allow_growth=gpu_allowgrow)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
progbar = tf.keras.utils.Progbar(total_episodes,interval=1)

## Environment Unit

In [9]:
class Environment(threading.Thread):
    def __init__(self, name, global_network, sess, global_step, coord):
        super(Environment, self).__init__()
        # Initialize Environment worker
        self.env = gym.make("cap-v0").unwrapped
        self.name = name
        self.global_network = global_network
        self.sess = sess
        self.global_step = global_step
        self.coord = coord
        
        # Create AC Network for Worker
        self.local_network = Network(in_size=in_size,
                                     action_size=action_space,
                                     lr_actor=lr_a,
                                     lr_critic=lr_c,
                                     scope=self.name,
                                     #grad_clip_norm=10.0,
                                     global_step=global_step,
                                     entropy_beta=entropy_beta,
                                     sess=sess,
                                     global_network=global_ac,
                                     separate_train=True)
        

        
    def run(self, saver, writer):
        """Override Thread.run

        Note:
            Loop to run rollout
            Include summarizing and save
        """
        self.saver = saver
        self.writer = writer
        
        global global_rewards, global_ep_rewards, global_length, global_succeed, global_episodes
        total_step = 0
        while not self.coord.should_stop() and global_episodes < total_episodes:
            ep_r, r, l, s, aloss, closs, etrpy, summary_ = self.rollout(init_step=total_step)
            total_step += l
            
            global_ep_rewards.append(ep_r)
            global_rewards.append(r)
            global_length.append(l)
            global_succeed.append(s)
            
            global_episodes += 1
            progbar.update(global_episodes)
            self.sess.run(global_step_next)
            
            if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                summary = tf.Summary()
                summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
                summary.value.add(tag='Records/mean_length', simple_value=global_length())
                summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
                summary.value.add(tag='Records/mean_episode_reward', simple_value=global_ep_rewards())
                summary.value.add(tag='summary/Entropy', simple_value=etrpy)
                summary.value.add(tag='summary/actor_loss', simple_value=aloss)
                summary.value.add(tag='summary/critic_loss', simple_value=closs)
                self.writer.add_summary(summary,global_episodes)
                self.writer.add_summary(summary_,global_episodes)

                self.writer.flush()
                
            if global_episodes % save_network_frequency == 0 and global_episodes != 0:
                self.saver.save(self.sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)
                
    
    def get_action(self, states, rnn_states):
        """Run graph to get action

        Args:
            state (list): list of state for each agent
            rnn_states (list): list of rnn inputs for each agent

        Returns:
            action (list) : The action for each egent
            values (list) : The value for each action for each agent
            rnn_next (list) : List of next rnn state for each agent

        Note:
            If rnn_states=None, get action without rnn_states.
        """
        actions, values, final_states = self.local_network.feed_forward(
            state=np.expand_dims(states,axis=1),
            rnn_init_state=rnn_states,
            seq_len=[1]*len(states)
            )
            
        self.feed_dict = {self.local_network.state_input_: np.expand_dims(states, axis=1),
                          self.local_network.rnn_init_states_: self.local_network.get_lstm_initial(4),
                          self.local_network.seq_len_: [1]*n_agent}
        return actions, values, final_states
            
    def rollout(self, init_step=0):
        global global_episodes
        total_step = init_step
        with self.sess.as_default(), self.sess.graph.as_default():
            # Initialize run
            s0 = self.env.reset(map_size=map_size,
                                policy_red=policy.zeros.PolicyGen(self.env.get_map,
                                                                  self.env.get_team_red))
            if po_transition < global_episodes:
                s0 = one_hot_encoder(s0, self.env.get_team_blue, vision_range)
            else:
                s0 = one_hot_encoder(self.env._env, self.env.get_team_blue, vision_range)

            # parameters 
            ep_r = 0 # Episodic Reward
            prev_r = 0
            step = 0
            d = False
            
            # Trajectory Buffers
            trajs = [Trajectory(depth=6) for _ in range(n_agent)]

            # RNN Initializer
            self.rnn_states = self.local_network.get_lstm_initial(n_agent)

            # Bootstrap
            a1, v1, final_states = self.get_action(s0, self.rnn_states)
            is_alive = [ag.isAlive for ag in self.env.get_team_blue]
            
            while step <= max_ep and not d:
                a, v0 = a1, v1
                was_alive = is_alive
                self.rnn_states = np.asarray(final_states)

                s1, rc, d, _ = self.env.step(a)
                if po_transition < global_episodes:
                    s1 = one_hot_encoder(s1, self.env.get_team_blue, vision_range)
                else:
                    s1 = one_hot_encoder(self.env._env, self.env.get_team_blue, vision_range)
                is_alive = [ag.isAlive for ag in self.env.get_team_blue]
                r = rc - prev_r - 0.5
                
                if step == max_ep and d == False:
                    r = -100
                    rc = -100
                    d = True

                r /= 100.0
                ep_r += r

                if d:
                    v1 = [0.0 for _ in range(n_agent)]
                else:
                    a1, v1, final_states = self.get_action(s1, self.rnn_states)

                # push to buffer
                for idx, agent in enumerate(self.env.get_team_blue):
                    if was_alive[idx]:
                        trajs[idx].append([s0[idx],
                                           a[idx],
                                           r,
                                           v0[idx],
                                           0,
                                           self.rnn_states[:,idx,:]
                                          ])
                        
                if (total_step % serial_length == 0 and total_step != 0) or d:
                    aloss, closs, etrpy, summary_ = self.train(trajs, v1)
                    trajs = [Trajectory(depth=6) for _ in range(n_agent)]

                # Iteration
                prev_r = rc
                total_step += 1
                step += 1
                s0 = s1
                
                #self.env.render(mode='fast')
        return ep_r, rc, step, self.env.blue_win, aloss, closs, etrpy, summary_
    
    def train(self, trajectories, bootstrap=[0.0]*n_agent):
        replay_buffer = Trajectory_buffer(depth=6)
        alosses, closses, etrpys = [],[],[]
        for idx, traj in enumerate(trajectories):
            if len(traj) == 0:
                continue
            
            # Discount Reward
            _rew = np.array(traj[2])
            _val = np.append(traj[3],[bootstrap[idx]])  # Bootstrap
            _td  = _rew + gamma * _val[1:]
            _adv = _rew + gamma * _val[1:] - _val[:-1]
            _adv = discount_rewards(_adv, gamma)
            traj[3] = _td.tolist()
            traj[4] = _adv.tolist()

            states, actions, rewards, td_targets, advantages, rnn_initial_states = traj.sample()

            aloss, closs, etrpy = self.local_network.feed_backward(np.expand_dims(states, axis=0),
                                                                   np.expand_dims(actions.astype(int), axis=0),
                                                                   np.expand_dims(td_targets, axis=0),
                                                                   np.expand_dims(advantages, axis=0),
                                                                   np.expand_dims(rnn_initial_states[-1],axis=1),
                                                                   np.array([1]),
                                                                   retrace_lambda=0.202)
            alosses.append(aloss)
            closses.append(closs)
            etrpys.append(etrpy)
        summary_ = self.sess.run(merged_summary_op, self.feed_dict)

        self.local_network.pull_global()
        
        return np.mean(alosses), np.mean(closses), np.mean(etrpys), summary_

## Run

In [None]:
coord = tf.train.Coordinator()
# Global Network
global_step = tf.Variable(0, trainable=False, name='global_step')
global_step_next = tf.assign_add(global_step, 1)
global_ac = Network(in_size=in_size,
                    action_size=action_space,
                    scope=global_scope,
                    sess=sess,
                    global_step=global_step,
                    separate_train=True)

# Local workers
workers = []
# loop for each workers

for idx in range(nenv):#tqdm(range(nenv), ncols=65, desc="Process Initiate"):
    name = 'W_%i' % idx
    workers.append(Environment(name, global_ac, sess, global_step=global_step, coord=coord))
saver = tf.train.Saver(max_to_keep=3)
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

    
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
    

worker_threads = []
global_episodes = sess.run(global_step)

# Summarize
for var in tf.trainable_variables(scope=global_scope):
    tf.summary.histogram(var.name, var)
merged_summary_op = tf.summary.merge_all()

for worker in workers:
    job = lambda: worker.run(saver, writer)
    t = threading.Thread(target=job)
    t.start()
    worker_threads.append(t)
    time.sleep(10)
coord.join(worker_threads)


  result = entry_point.load(False)


Initialized Variables
INFO:tensorflow:Summary name global/actor/fully_connected/weights:0 is illegal; using global/actor/fully_connected/weights_0 instead.
INFO:tensorflow:Summary name global/actor/fully_connected/biases:0 is illegal; using global/actor/fully_connected/biases_0 instead.
INFO:tensorflow:Summary name global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/gates/kernel:0 is illegal; using global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/gates/kernel_0 instead.
INFO:tensorflow:Summary name global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/gates/bias:0 is illegal; using global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/gates/bias_0 instead.
INFO:tensorflow:Summary name global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/candidate/kernel:0 is illegal; using global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/candidate/kernel_0 instead.
INFO:tensorflow:Summary name global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/candidate/bias:0 is illegal; using global/actor/rnn/multi_rnn_cell/cell_0/gru_cell/ca