# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Actor-critic
- On-Policy

### Sampling
- [ ] Mini-batch to update 'average' gradient
- [ ] Experience Replay for Random Sampling
- [x] Importance Sampling
    
### Policy Gradient
- [x] MA-A3C

### Stability and Reducing Variance
- [x] Gradient clipping
- [ ] Normalized Reward/Advantage
- [x] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [ ] Synchronous Training (A2C)
- [x] Asynchronous Training (A3C)

### Applied Training Methods:
- [ ] Self-play
- [ ] Batch Policy

## Notes

- This notebook includes:
    - Building the structure of the policy-driven network.
    - Training with/without rendering.
    - A Saver that saves the model and weights to the ./model directory.
    - A Writer that records the necessary data to ./logs.

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb
- https://arxiv.org/pdf/1706.02275.pdf

## TODO:

- Try to add experience buffer

In [1]:
# Delete the log and checkpoint directories left over from a previous
# 'DDQN_t1' run so that training starts from scratch.
# NOTE: the run name is hard-coded here; keep it in sync with TRAIN_NAME.
!rm -rf logs/DDQN_t1/ model/DDQN_t1

In [2]:
# Name of this training run; used as the sub-directory for logs and checkpoints.
TRAIN_NAME = 'DDQN_t1'
LOG_PATH = f'./logs/{TRAIN_NAME}'     # summaries are written here
MODEL_PATH = f'./model/{TRAIN_NAME}'  # checkpoints are written here
# Fraction (0..1) of GPU memory this process may use; passed to
# tf.GPUOptions(per_process_gpu_memory_fraction=...).
GPU_CAPACITY = 0.25

In [3]:
import os

import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random
import math

# the modules that you can use to generate the policy. 
import policy.random
import policy.roomba
import policy.policy_RL
import policy.zeros

# Data Processing Module
from utility.dataModule import one_hot_encoder_v2 as one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import Experience_buffer, discount_rewards

from network.DQN import DQN as Network

%load_ext autoreload
%autoreload 2

## Hyperparameters

In [4]:
# --- Episode / replay schedule ---
total_episodes = 200_000   # training loop stops once this many episodes have run
max_ep = 150               # step index at which an unfinished episode is force-terminated
update_frequency = 20      # a training update is run every this many episodes
batch_size = 2000          # number of transitions requested from the buffer per update
replay_capacity = 5000     # buffer_size handed to the experience buffer

# --- Saving / logging (all counted in episodes) ---
save_network_frequency = 1200  # episodes between checkpoints
save_stat_frequency = 128      # episodes between summary writes
moving_average_step = 128      # argument given to each MovingAverage tracker

# --- Optimisation ---
decay_lr = False
lr_a = 1e-4    # learning rate of the Adam optimizer

gamma = 0.98   # discount factor
tau = 0.05     # forwarded to the network (presumably the target-update rate -- confirm in network.DQN)

# --- Environment ---
MAP_SIZE = 10
VISION_RANGE = 9  # determines the observation window, and therefore the network input size
VISION_dX = VISION_dY = 2 * VISION_RANGE + 1
in_size = [None, VISION_dX, VISION_dY, 11]

## Environment Setting

In [None]:
# Make sure the checkpoint directory (MODEL_PATH) and the summary-log
# directory (LOG_PATH) exist. exist_ok=True makes this a no-op when they are
# already present and avoids the check-then-create race.
for directory in (MODEL_PATH, LOG_PATH):
    os.makedirs(directory, exist_ok=True)

In [None]:
n_agent = 4        # number of agents the network controls (passed as num_agent)
action_space = 5   # number of discrete actions per agent (passed as action_size)

In [None]:
# Episode counter shared with the worker loop.
global_episodes = 0

# One independent MA tracker (constructed with moving_average_step) per
# logged statistic: final reward, shaped episode reward, length, success flag.
global_rewards, global_ep_rewards, global_length, global_succeed = (
    MA(moving_average_step) for _ in range(4)
)

# Session restricted to a GPU_CAPACITY fraction of GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_CAPACITY, allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

# Progress bar advanced once per finished episode.
progbar = tf.keras.utils.Progbar(total_episodes, interval=1)

## Worker

In [None]:
class Worker(object):
    """Runs epsilon-greedy episodes in a ``cap-v0`` environment and trains a network.

    The worker owns its own environment instance and a trainable ``Network``
    built under the scope ``name``. The ``target_network`` is handed to that
    network at construction time.
    """

    def __init__(self, name, target_network, sess, trainer, global_step=0):
        """Create the environment and the worker's network.

        Args:
            name: Scope name for the worker's network.
            target_network: Forwarded to ``Network`` as ``target_network``.
            sess: TensorFlow session used to run and train the network.
            trainer: Optimizer forwarded to ``Network``.
            global_step: Forwarded to ``Network``; defaults to 0.
        """
        # Initialize Environment worker
        self.env = gym.make("cap-v0").unwrapped
        self.name = name

        # Create the trainable network for this worker
        self.Network = Network(in_size=in_size,
                               action_size=action_space,
                               scope=name,
                               trainer=trainer,
                               num_agent=n_agent,
                               tau=tau,
                               gamma=gamma,
                               grad_clip_norm=0,
                               global_step=global_step,
                               sess=sess,
                               target_network=target_network)

        self.sess = sess

    def work(self, saver, writer):
        """Run episodes until ``global_episodes`` reaches ``total_episodes``.

        Each episode is played with an epsilon-greedy policy, its transitions
        are added to an experience buffer, and every ``update_frequency``
        episodes a batch is sampled, trained on, and the buffer is flushed.
        Statistics are written every ``save_stat_frequency`` episodes and a
        checkpoint every ``save_network_frequency`` episodes.

        Args:
            saver: Object whose ``save(sess, path, global_step=...)`` writes a
                checkpoint.
            writer: Summary writer providing ``add_summary`` and ``flush``.
        """
        # Only global_episodes is rebound here; the other module-level
        # trackers are mutated through their methods and need no declaration.
        global global_episodes
        local_ep = 0
        buffer = Experience_buffer(experience_shape=6,
                                   buffer_size=replay_capacity)
        epsilon = 1.0
        epsilon_gamma = 0.9999
        epsilon_final = 0.1
        # Most recent training loss. Stays None until the first update so the
        # summary code below never reads an unbound name (possible when
        # global_episodes was restored from a checkpoint).
        aloss = None
        with self.sess.as_default(), self.sess.graph.as_default():
            while global_episodes < total_episodes:
                local_ep += 1
                red_policy = policy.zeros.PolicyGen(self.env.get_map, self.env.get_team_red)
                self.env.reset(map_size=MAP_SIZE, policy_red=red_policy)
                # The observation is rebuilt from the environment's internal map.
                s1 = one_hot_encoder(self.env._env, self.env.get_team_blue, VISION_RANGE)
                a1 = self.Network.run_network(np.expand_dims(s1, axis=0))[0]

                ep_r = 0     # sum of shaped rewards over the episode
                prev_r = 0   # previous value of the environment reward

                episode_buffer = []

                for step in range(max_ep+1):
                    a, s0 = a1, s1

                    # Epsilon-greedy exploration: one random action per agent.
                    # NOTE(review): epsilon only decays on steps where a random
                    # action was taken -- confirm this is intended.
                    if random.random() < epsilon:
                        a = random.choices(range(action_space), k=n_agent)
                        epsilon = max(epsilon_final, epsilon*epsilon_gamma)
                    _, rc, d, _ = self.env.step(a)
                    s1 = one_hot_encoder(self.env._env, self.env.get_team_blue, VISION_RANGE)
                    is_alive = [ag.isAlive for ag in self.env.get_team_blue]

                    # Per-step reward: change in the environment reward minus
                    # a constant step penalty of 1.
                    r = (rc - prev_r-1)
                    if step == max_ep and not d:
                        # Step limit reached: force termination with a penalty.
                        r = -100
                        rc = -100
                        d = True

                    r /= 100.0
                    ep_r += r

                    if not d:
                        a1 = self.Network.run_network(np.expand_dims(s1, axis=0))[0]

                    # push to buffer
                    # NOTE(review): the stored mask is the alive status AFTER
                    # the step; confirm update_full does not expect the
                    # pre-step status instead.
                    episode_buffer.append([s0, a, r, s1, d, list(is_alive)])

                    # Iteration
                    prev_r = rc

                    if d:
                        buffer.add(episode_buffer)
                        if local_ep % update_frequency == 0 and local_ep > 0:
                            batch = buffer.sample(size=batch_size, shuffle=True)
                            aloss = self.train(batch)
                            buffer.flush()
                        break

                global_ep_rewards.append(ep_r)
                global_rewards.append(rc)
                global_length.append(step)
                global_succeed.append(self.env.blue_win)
                global_episodes += 1
                self.sess.run(global_step_next)
                progbar.update(global_episodes)
                if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                    summary = tf.Summary()
                    summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
                    summary.value.add(tag='Records/mean_length', simple_value=global_length())
                    summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
                    summary.value.add(tag='Records/mean_episode_reward', simple_value=global_ep_rewards())
                    if aloss is not None:
                        summary.value.add(tag='summary/loss', simple_value=aloss)
                    writer.add_summary(summary, global_episodes)
                    writer.flush()
                if global_episodes % save_network_frequency == 0:
                    saver.save(self.sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)

    def train(self, batch):
        """Unpack sampled transitions and run one network update.

        Args:
            batch: Sequence of transitions, each laid out as
                ``[state, action, reward, next_state, done, alive_mask]``.

        Returns:
            Whatever ``Network.update_full`` returns; ``work`` logs it as the loss.
        """
        # Transpose the list of transitions into one column per field.
        columns = list(zip(*batch))
        states0 = np.array(columns[0])
        actions = np.array(columns[1])
        # NOTE(review): rewards are discounted over the batch order, but the
        # batch was sampled with shuffle=True -- confirm this is intended.
        rewards = discount_rewards(columns[2], gamma)
        states1 = np.array(columns[3])
        dones = np.array(columns[4])
        masks = np.array(columns[5])
        loss = self.Network.update_full(states0, actions, rewards, states1, dones, masks)

        return loss
    

## Run

In [None]:
# Episode counter kept in the graph so that it is saved and restored with the
# checkpoint; global_step_next increments it by one.
with tf.name_scope('Global_Step'):
    global_step = tf.Variable(0, trainable=False, name='global_step')
    global_step_next = tf.assign_add(global_step, 1)
trainer = tf.train.AdamOptimizer(learning_rate=lr_a)

# Target network, handed to the worker's trainable network.
target_network = Network(in_size=in_size,
                         action_size=action_space,
                         scope='target',
                         num_agent=n_agent,
                         global_step=global_step)

name = 'primary'
worker = Worker(name=name, sess=sess, trainer=trainer, target_network=target_network)
print(f'{name} initiated')
saver = tf.train.Saver(max_to_keep=3)
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint when one exists, otherwise start fresh.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)
else:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")

# Sync the Python-side episode counter with the (possibly restored) graph
# counter BEFORE the initial save, so the checkpoint is labelled with the real
# step instead of 0.
global_episodes = int(sess.run(global_step))
saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_episodes)
worker.work(saver, writer)

primary initiated
Initialized Variables
   362/200000 [..............................] - ETA: 32:36:09