# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic
- On Policy

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [ ] Experience Replay for Random Sampling
- [ ] Importance Sampling
    
### Deterministic Policy Gradient
- [ ] DDPG
- [ ] MADDPG

### Stability and Reducing Variance
- [x] Gradient clipping
- [ ] Normalized Reward/Advantage
- [ ] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [x] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without render
    - A saver that saves the model and weights to the ./model directory
    - A writer that records the necessary data to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- Research on '_bootstrap_' instead of end-reward
- Add global step
- Think about adding discount to advantage
- Normalize reward?
- Record method in network

In [1]:
# Start from a clean slate: delete the logs and checkpoints of the 'A3C_lstm' run.
# NOTE: the paths are hard-coded here and must be kept in sync with TRAIN_NAME below.
# NOTE: because the checkpoint directory is removed, the "Load Model" branch of the
# Run cell can only find a checkpoint when this cell is skipped.
!rm -rf logs/A3C_lstm/ model/A3C_lstm

In [None]:
# Name of this training run; it selects the log and checkpoint sub-directories.
TRAIN_NAME = 'A3C_lstm'
# Directory the summary writer logs to.
LOG_PATH = f'./logs/{TRAIN_NAME}'
# Directory the saver writes checkpoints to.
MODEL_PATH = f'./model/{TRAIN_NAME}'

In [None]:
import os
import configparser

import signal
import threading
import multiprocessing

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import one_hot_encoder_v2 as one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import Experience_buffer, discount_rewards

from network.ActorCritic_lstm import ActorCritic as Network

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [None]:
# Importing global configuration
# Importing global configuration
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail here with a clear message instead of letting
# the first lookup below raise a confusing "no option" error.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory")

## Environment
action_space = config.getint('DEFAULT','ACTION_SPACE')
n_agent = config.getint('DEFAULT','NUM_AGENT')
map_size = config.getint('DEFAULT','MAP_SIZE')
vision_range = config.getint('DEFAULT','VISION_RANGE')

## Training
total_episodes = config.getint('TRAINING','TOTAL_EPISODES')
max_ep = config.getint('TRAINING','MAX_STEP')
critic_beta = config.getfloat('TRAINING', 'CRITIC_BETA')
entropy_beta = config.getfloat('TRAINING', 'ENTROPY_BETA')
gamma = config.getfloat('TRAINING', 'DISCOUNT_RATE')

decay_lr = config.getboolean('TRAINING','DECAYING_LR')
# The learning rates are hard-coded for this notebook and deliberately bypass the
# LR_ACTOR / LR_CRITIC entries of config.ini.
lr_a = 1e-4 # config.getfloat('TRAINING','LR_ACTOR')
lr_c = 5e-4 # config.getfloat('TRAINING','LR_CRITIC')

## Save/Summary
save_network_frequency = config.getint('TRAINING','SAVE_NETWORK_FREQ')
save_stat_frequency = config.getint('TRAINING','SAVE_STATISTICS_FREQ')
moving_average_step = config.getint('TRAINING','MOVING_AVERAGE_SIZE')

## GPU
gpu_capacity = config.getfloat('GPU_CONFIG','GPU_CAPACITY')
gpu_allowgrow = config.getboolean('GPU_CONFIG', 'GPU_ALLOWGROW')

In [None]:
# Local configuration parameters
update_frequency = 32

# Env Settings
vision_dx, vision_dy = 2*vision_range+1, 2*vision_range+1
in_size = [None,vision_dx,vision_dy,11]
nenv = 12#(int) (multiprocessing.cpu_count())

# Asynch Settings
global_scope = 'global'

## Environment Setting

In [None]:
# Make sure the output directories exist. exist_ok=True replaces the racy
# "check, then create" pattern and is a no-op when the directory is already there.
# Checkpoints are written here by the saver.
os.makedirs(MODEL_PATH, exist_ok=True)

# Summaries are written here by the summary writer.
os.makedirs(LOG_PATH, exist_ok=True)

## A3C Network Structure

![Network Structure](https://cdn-images-1.medium.com/max/1600/1*YtnGhtSAMnnHSL8PvS7t_w.png)

- The network is defined in network.ActorCritic_lstm (imported above as `Network`)

## Environments

![Environment Interaction Diagram](https://cdn-images-1.medium.com/max/1600/1*Hzql_1t0-wwDxiz0C97AcQ.png)

In [None]:
# Statistics shared by every worker thread: one independent moving-average
# tracker per logged quantity, all using the same window size.
global_rewards, global_ep_rewards, global_length, global_succeed = (
    MA(moving_average_step) for _ in range(4)
)
# Number of episodes finished so far, summed over all workers.
global_episodes = 0

# Launch the session
gpu_options = tf.GPUOptions(
    allow_growth=gpu_allowgrow,
    per_process_gpu_memory_fraction=gpu_capacity,
)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
# Progress bar advanced once per finished episode.
progbar = tf.keras.utils.Progbar(total_episodes, interval=1)

## Environment Unit

In [None]:
class Environment(threading.Thread):
    """A3C worker: one "cap-v0" environment plus one local actor-critic network.

    Each worker plays episodes with its local network, pushes updates to the
    shared global network and pulls the global parameters back after every
    update.

    Note:
        ``run`` takes ``(saver, writer)`` and is meant to be called directly
        from a wrapper thread; this signature is not compatible with
        ``Thread.start()``.
    """

    def __init__(self, name, global_network, sess, global_step, coord):
        """Create the environment and the worker-local network.

        Args:
            name (str): Worker name; also used as the variable scope of the local network.
            global_network: Shared network the local network is synchronised with.
            sess: TensorFlow session used for every graph evaluation.
            global_step: Global step variable handed to the local network.
            coord: Coordinator polled to decide when the worker must stop.
        """
        super(Environment, self).__init__()
        # Initialize Environment worker
        self.env = gym.make("cap-v0").unwrapped
        self.name = name
        self.global_network = global_network
        self.sess = sess
        self.global_step = global_step
        self.coord = coord

        # Create AC Network for Worker.
        # Use the network passed to the constructor, not a module-level global,
        # so the worker does not depend on a name defined in another cell.
        self.local_network = Network(in_size=in_size,
                                     action_size=action_space,
                                     decay_lr=decay_lr,
                                     lr_actor=lr_a,
                                     lr_critic=lr_c,
                                     scope=self.name,
                                     global_step=global_step,
                                     initial_step=0,
                                     entropy_beta=entropy_beta,
                                     sess=sess,
                                     global_network=global_network,
                                     lstm_network=True)

    def run(self, saver, writer):
        """Override Thread.run

        Args:
            saver: Saver used to checkpoint the model.
            writer: Summary writer used to record statistics.

        Note:
            Loop to run rollout
            Include summarizing and save
            The episode counter and the moving averages are module-level
            objects shared by all workers and are updated without locking.
        """
        self.saver = saver
        self.writer = writer

        global global_rewards, global_ep_rewards, global_length, global_succeed, global_episodes
        total_step = 0
        # Poll the coordinator this worker was constructed with.
        while not self.coord.should_stop() and global_episodes < total_episodes:
            ep_r, r, l, s, aloss, closs, etrpy, summary_ = self.rollout(init_step=total_step)
            total_step += l

            global_ep_rewards.append(ep_r)
            global_rewards.append(r)
            global_length.append(l)
            global_succeed.append(s)

            global_episodes += 1
            self.sess.run(global_step_next)
            progbar.update(global_episodes)

            if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                summary = tf.Summary()
                summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
                summary.value.add(tag='Records/mean_length', simple_value=global_length())
                summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
                summary.value.add(tag='Records/mean_episode_reward', simple_value=global_ep_rewards())
                summary.value.add(tag='summary/Entropy', simple_value=etrpy)
                summary.value.add(tag='summary/actor_loss', simple_value=aloss)
                summary.value.add(tag='summary/critic_loss', simple_value=closs)
                self.writer.add_summary(summary, global_episodes)
                self.writer.add_summary(summary_, global_episodes)

                self.writer.flush()

            if global_episodes % save_network_frequency == 0 and global_episodes != 0:
                self.saver.save(self.sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)

    def get_action(self, state, rnn_states=None):
        """Run graph to get action

        Args:
            state (list): list of state for each agent
            rnn_states (list): list of rnn inputs for each agent

        Returns:
            action (list) : The action for each agent
            values (list) : The value estimate for each agent
            rnn_next (list) : List of next rnn state for each agent

        Note:
            If rnn_states=None, get action without rnn_states.
        """
        a1, v1 = [], []
        rnn_next = []
        for idx in range(n_agent):
            # The network is evaluated one agent and one time step at a time.
            feed_dict = {self.local_network.state_input: state[idx:idx+1],
                         self.local_network.rnn_train_length: 1}
            # Identity test: `!= None` is evaluated element-wise on ndarrays
            # and cannot be used as a condition.
            if rnn_states is not None:
                feed_dict[self.local_network.rnn_state_in] = rnn_states[idx]
            a_, v_, rnn_state = self.local_network.run_network(feed_dict)
            a1.append(a_[0])
            v1.append(v_[0])
            rnn_next.append(rnn_state)

        return a1, v1, rnn_next

    def rnn_state_initialize(self, batch_size=1):
        """Return a zero-filled initial recurrent state.

        Args:
            batch_size (int): Number of rows of each state array.

        Returns:
            A pair of float32 zero arrays of shape ``[batch_size, rnn_steps]``
            when the local network is recurrent, otherwise ``None``.
        """
        if not self.local_network.lstm_network:
            return None
        shape = [batch_size, self.local_network.rnn_steps]
        return (np.zeros(shape, np.float32), np.zeros(shape, np.float32))

    def rollout(self, init_step=0):
        """Play one episode, training on the collected transitions along the way.

        Args:
            init_step (int): Value the running step counter starts from; the
                counter decides when a periodic update is due.

        Returns:
            tuple: (episode reward, last environment reward, number of steps,
            ``env.blue_win``, actor loss, critic loss, entropy, merged summary),
            where the three training statistics come from the last update.

        Note:
            The per-step reward is the change of the environment reward minus
            one, divided by 100. An episode still running at step ``max_ep`` is
            terminated with a reward of -100 (before scaling).
        """
        total_step = init_step
        with self.sess.as_default(), self.sess.graph.as_default():
            # Initialize run. The observation returned by reset() is not used;
            # the state is rebuilt from the environment by the one-hot encoder.
            self.env.reset(map_size=map_size,
                           policy_red=policy.zeros.PolicyGen(self.env.get_map, self.env.get_team_red))
            s1 = one_hot_encoder(self.env._env, self.env.get_team_blue, vision_range)

            # parameters
            ep_r = 0  # Episodic Reward
            prev_r = 0
            was_alive = [ag.isAlive for ag in self.env.get_team_blue]
            indv_history = [[] for _ in range(len(self.env.get_team_blue))]

            # RNN Initialize (If lstm is off, it will remain as list of None)
            rnn_states = [self.rnn_state_initialize() for _ in range(n_agent)]

            # Bootstrap
            a1, v1, rnn_states = self.get_action(s1, rnn_states)

            step, d = 0, False
            while step <= max_ep and not d:
                a, v0 = a1, v1
                s0 = s1

                _, rc, d, _ = self.env.step(a)
                s1 = one_hot_encoder(self.env._env,
                                     self.env.get_team_blue,
                                     vision_range)

                r = (rc - prev_r - 1)
                if step == max_ep and not d:
                    # Step limit reached: force termination with a penalty.
                    r = -100
                    rc = -100
                    d = True

                r /= 100.0
                ep_r += r

                if d:
                    # Terminal state: nothing to bootstrap from.
                    v1 = [0.0 for _ in range(len(self.env.get_team_blue))]
                else:
                    a1, v1, rnn_states = self.get_action(s1, rnn_states)

                # push to buffer (only for agents that were alive when they acted)
                for idx, agent in enumerate(self.env.get_team_blue):
                    if was_alive[idx]:
                        indv_history[idx].append([s0[idx],
                                                  a[idx],
                                                  r,
                                                  v0[idx]])

                if total_step % update_frequency == 0 or d:
                    # NOTE(review): the recurrent state is reset to zeros here
                    # and the reset state is also used for the rest of the
                    # episode -- confirm this is intended.
                    rnn_states = [self.rnn_state_initialize() for _ in range(n_agent)]
                    # Train through the worker's own session, not a module-level global.
                    aloss, closs, etrpy, feed_dict = self.train(indv_history, self.sess, v1, rnn_states, step)
                    indv_history = [[] for _ in range(len(self.env.get_team_blue))]

                # Iteration
                prev_r = rc
                was_alive = [ag.isAlive for ag in self.env.get_team_blue]
                total_step += 1
                step += 1

            summary_ = self.sess.run(merged_summary_op, feed_dict)
        return ep_r, rc, step, self.env.blue_win, aloss, closs, etrpy, summary_

    def train(self, indv_buffer, sess, bootstrap, rnn_states, train_length):
        """Update the global network from each agent's trajectory, then re-sync.

        Args:
            indv_buffer (list): Per-agent lists of ``[state, action, reward, value]`` rows.
            sess: Unused; kept for interface compatibility.
            bootstrap (list): Per-agent value appended after the last stored value.
            rnn_states (list): Per-agent initial recurrent state fed to the network.
            train_length (int): Value fed to ``rnn_train_length``.
                NOTE(review): the caller passes the episode step index, not
                the buffer length -- confirm against the network definition.

        Returns:
            tuple: (mean actor loss, mean critic loss, mean entropy, feed dict
            of the last agent that was trained).

        Note:
            Agents with an empty buffer are skipped. If every buffer is empty,
            ``feed_dict`` is never assigned and the return statement raises
            ``UnboundLocalError``.
        """
        alosses, closses, entropys = [], [], []
        for idx, buffer in enumerate(indv_buffer):
            if len(buffer) == 0:
                continue
            # Rows mix an observation array with scalars, so the table must be
            # an object array; being explicit avoids the ragged-array error
            # raised by recent NumPy versions.
            _history = np.array(buffer, dtype=object)
            observations = _history[:, 0]
            actions = _history[:, 1]
            rewards = _history[:, 2]
            values = _history[:, 3]

            value_ext = np.append(values, [bootstrap[idx]])
            # One-step TD target, and the TD error passed through discount_rewards.
            td_target = rewards + gamma * value_ext[1:]
            advantages = td_target - value_ext[:-1]
            advantages = discount_rewards(advantages, gamma)

            feed_dict = {
                self.local_network.state_input: np.stack(observations),
                self.local_network.action_: actions,
                self.local_network.td_target_: td_target,
                self.local_network.advantage_: advantages,
                self.local_network.rnn_state_in: rnn_states[idx],
                self.local_network.rnn_train_length: train_length
            }

            # Update Buffer
            aloss, closs, etrpy = self.local_network.update_global(feed_dict)
            alosses.append(aloss)
            closses.append(closs)
            entropys.append(etrpy)

        # get global parameters to local ActorCritic
        self.local_network.pull_global()

        return np.mean(alosses), np.mean(closses), np.mean(entropys), feed_dict
    

## Run

In [None]:
# Coordinator polled by the workers and used to join their threads.
coord = tf.train.Coordinator()
with tf.device("/cpu:0"):
    # Global Network
    global_step = tf.Variable(0, trainable=False, name='global_step')
    # Op run by a worker after every finished episode.
    global_step_next = tf.assign_add(global_step, 1)
    global_ac = Network(in_size=in_size,
                   action_size=action_space,
                   scope=global_scope,
                   sess=sess,
                   global_step=global_step,
                   lstm_network=True)

    # Local workers
    workers = []
    # loop for each workers

    for idx in range(nenv):
        name = 'W_%i' % idx
        workers.append(Environment(name, global_ac, sess, global_step=global_step, coord=coord))
        print(f'worker: {name} initiated')
    saver = tf.train.Saver(max_to_keep=3)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)


# Resume from the latest checkpoint when one exists, otherwise start fresh.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")


worker_threads = []
# Continue the episode count from the (possibly restored) global step.
global_episodes = sess.run(global_step)

# Summarize
for var in tf.trainable_variables(scope=global_scope):
    tf.summary.histogram(var.name, var)
merged_summary_op = tf.summary.merge_all()

for worker in workers:
    # Bind the worker's own bound method when the thread is created. A
    # `lambda: worker.run(...)` would look `worker` up only when the new thread
    # executes it, by which time the loop variable may already refer to a later
    # worker -- some workers would run twice and others never.
    t = threading.Thread(target=worker.run, args=(saver, writer))
    t.start()
    worker_threads.append(t)
coord.join(worker_threads)
