# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Implementation Details

- Simple Policy gradient with experience buffer.
- The implementation network is slightly different
    - Better code for mini-batch
    - include self-play for red
    - 19x19 vision
- Add LSTM network, serialize learning

### Sampling
- [x] Mini-batch to update 'average' gradient
- [x] Experience Replay for Random Sampling
- [ ] Importance Sampling

## Notes

- This notebook includes:
    - Building the structure of policy driven network.
    - Training with/without render
    - Saver that saves the model and weights to the ./model directory
    - Writer that records training statistics to ./logs

- This notebook does not include:
    - Simulation with RL policy
        - The simulation can be done using policy_RL.py
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/spro/practical-pytorch/blob/master/reinforce-gridworld/reinforce-gridworld.ipynb

## TODO:

- enemy with different policies (zero, patrol)
- stochastic interaction

In [1]:
!rm -rf logs/VANILLA_lstm/ model/VANILLA_lstm

In [2]:
# Run identifier; the log and checkpoint directories below are derived from it.
TRAIN_NAME = 'VANILLA_lstm'
LOG_PATH = f'./logs/{TRAIN_NAME}'
MODEL_PATH = f'./model/{TRAIN_NAME}'

# Fraction of GPU memory this process may use (given to
# per_process_gpu_memory_fraction when the session is created).
GPU_CAPACITY = 0.125

In [3]:
%load_ext autoreload
%autoreload 2

import os

from itertools import count

import tensorflow as tf
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import time
from datetime import datetime
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy.
import policy.policy_RL
import policy.random
import policy.zeros

from network.REINFORCE_lstm import REINFORCE as network

# Data Processing Module
from utility.dataModule import one_hot_encoder_v2 as one_hot_encoder
from utility.utils import MovingAverage as MA
from utility.utils import discount_rewards
from utility.buffer import Trajectory, Trajectory_buffer

## Hyperparameters

In [4]:
# --- Training schedule ---
total_episode = 120_000   # training stops once this many episodes were played
max_ep = 150              # frame cap per episode before the rollout forces termination
serial_length = 8         # number of steps per trimmed trajectory sequence
batch_update_size = 128   # capacity handed to the trajectory buffer
each_run = 8              # episodes played per rollout / network update

# --- Saving and logging cadence (counted in episodes) ---
save_network_frequency = 1024
save_stat_frequency = 128
moving_average_step = 128  # argument given to each MovingAverage tracker

# --- Learning and observation parameters ---
learning_rate = 1e-3
gamma = 0.98               # discount factor passed to discount_rewards
map_size = 20              # forwarded to env.reset
vision_range = 9           # forwarded to the one-hot encoder
vision_dX = vision_dY = 2 * vision_range + 1  # square window centred on the agent
n_channel = 11

# Network input shape: (batch, width, height, channels) with a free batch axis.
in_size = [None, vision_dX, vision_dY, n_channel]

## Environment Setting

In [None]:
# Directory the Saver writes model checkpoints to.
# exist_ok=True replaces the separate exists() check, so there is no window
# between "check" and "create" in which another process could create the path.
os.makedirs(MODEL_PATH, exist_ok=True)

# Directory the summary FileWriter writes its event logs to.
os.makedirs(LOG_PATH, exist_ok=True)

In [None]:
# Environment Related
action_space = 5  # action_size handed to the network
n_agent = 4       # number of per-agent trajectory / RNN-state slots used in the rollout

# Create the environment, then build the red-team policy from policy.zeros
# using the environment's map and red-team handles.
env = gym.make("cap-v0")
policy_red = policy.zeros.PolicyGen(env.get_map, env.get_team_red)

## Session

In [None]:
# Launch the session with a capped, on-demand GPU memory allocation.
gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=GPU_CAPACITY,
    allow_growth=True,
)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

# One moving-average tracker each for episode reward, episode length and
# whether blue won; all are constructed with the same moving_average_step.
ma_reward, ma_length, ma_captured = (MA(moving_average_step) for _ in range(3))

## Network Setting

In [None]:
#tf.reset_default_graph() # Clear the Tensorflow graph.
# Build the policy network inside the already-created session.
# NOTE(review): `network` was imported as the REINFORCE class and is rebound
# here to an instance. Re-running this cell without re-running the import cell
# would therefore call the instance instead of the class.
network = network(learning_rate=learning_rate,
                  in_size=in_size,
                  action_size=action_space,
                  sess=sess
                 )
# Non-trainable counter; each run of the increment op advances it by
# `each_run`, i.e. by the number of episodes played per rollout.
global_step = tf.Variable(0, trainable=False, name='global_step') # global step
increment_global_step_op = tf.assign(global_step, global_step+each_run)
# Merged summary op; in the visible cells it is only referenced from
# commented-out code.
merged = tf.summary.merge_all()

In [None]:
# Setup Save and Restore Network
saver = tf.train.Saver(tf.global_variables())
writer = tf.summary.FileWriter(LOG_PATH, sess.graph)

# Resume from the latest checkpoint under MODEL_PATH when one is present;
# otherwise start from freshly initialized variables.
ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
checkpoint_found = bool(ckpt) and tf.train.checkpoint_exists(ckpt.model_checkpoint_path)
if not checkpoint_found:
    sess.run(tf.global_variables_initializer())
    print("Initialized Variables")
else:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Load Model : ", ckpt.model_checkpoint_path)

Initialized Variables


In [None]:
def record(summary_=None):
    """Write the current moving-average statistics to the summary writer.

    The mean reward, mean episode length and mean success value are read from
    the module-level moving-average trackers and logged at the current value
    of `global_step`.

    Args:
        summary_: currently unused; kept so callers may pass a pre-computed
            summary once that path is enabled.
    """
    tracked = (
        ('Records/mean_reward', ma_reward),
        ('Records/mean_length', ma_length),
        ('Records/mean_succeed', ma_captured),
    )
    with tf.device('/cpu:0'):
        summary = tf.Summary()
        for tag, tracker in tracked:
            summary.value.add(tag=tag, simple_value=tracker())

        step = sess.run(global_step)
        writer.add_summary(summary, step)
        writer.flush()

## Setup Rollout Experience

In [None]:
# Trajectory Buffer Configuration
# Passed as `depth` to both Trajectory and Trajectory_buffer. The rollout
# appends [state, action, reward] per step, i.e. three entries.
tr_depth = 3

def get_each_actions(states, rnn_states):
    """Query the network once per agent and collect the results.

    Args:
        states: per-agent observations (expected to be one-hot encoded),
            iterated in lockstep with `rnn_states`.
        rnn_states: per-agent recurrent states fed to the network as the
            initial state for this step.

    Returns:
        (actions, final_states): two lists, one entry per agent, holding the
        chosen action and the recurrent state returned by the network.
    """
    actions, final_states = [], []
    for observation, rnn_state in zip(states, rnn_states):
        # Prepend two singleton axes (batch and time) in front of the
        # observation, e.g. (19,19,11) -> (1,1,19,19,11).
        batched = observation[np.newaxis, np.newaxis, :]
        chosen, next_rnn_state = network.get_action(
            states=batched,
            rnn_init_states=rnn_state,
        )
        actions.append(chosen)
        final_states.append(next_rnn_state)
    return actions, final_states

def policy_rollout(num_episode, PARTIAL=False):
    """Play episodes and collect fixed-length sequences for a network update.

    For every episode each blue agent gets its own trajectory of
    (state, action, reward) steps plus the recurrent state that was fed to the
    network at each step. After the episode, rewards are discounted and each
    trajectory is trimmed into chunks of `serial_length` steps.

    Also appends the final reward, episode length and `env.blue_win` of every
    episode to the module-level moving-average trackers.

    Args:
        num_episode: number of episodes to play.
        PARTIAL: when True, observations are encoded from the state returned by
            the environment; otherwise from `env._env` with `vision_range`.

    Returns:
        A 3-tuple of
        - `replay_buffer.sample()`,
        - an array of the collected initial recurrent states of GRU layer 0,
        - an array with one sequence length per collected chunk.
    """
    # Run episodes, return the results of MDP tuples
    replay_buffer = Trajectory_buffer(depth=tr_depth, capacity=batch_update_size)
    # One list per GRU layer, accumulating the initial state of every chunk.
    trim_init_states = tuple([] for _ in range(network.gru_num_layers))
    seq_lens = []
    for num_ep in range(num_episode):
        s = env.reset(map_size=map_size, policy_red=policy_red)
        if PARTIAL:
            obs_next = one_hot_encoder(s, env.get_team_blue) # partial observation
        else:
            obs_next = one_hot_encoder(env._env, env.get_team_blue, vision_range) # Full observation

        # init_states: per agent, per GRU layer, the history of recurrent states
        # that were fed to the network at each recorded step.
        init_states = [tuple([] for _ in range(network.gru_num_layers)) for _ in range(n_agent)]
        # init_state: per agent, the recurrent state to feed at the next step
        # (zeros at the start of the episode).
        init_state = [tuple(np.zeros((1,network.gru_unit_size)) for _ in range(network.gru_num_layers)) for _ in range(n_agent)]

        trajs = [Trajectory(depth=tr_depth, length_max=max_ep) for _ in range(n_agent)]

        is_alive = [ag.isAlive for ag in env.get_team_blue]
        prev_reward = 0
        frame = 0
        for frame in range(max_ep+1):
            obs = obs_next
            was_alive = is_alive

            acts, final_states = get_each_actions(obs, init_state)
            s,r1,d,_ = env.step(acts) #Get our reward

            is_alive = [ag.isAlive for ag in env.get_team_blue]
            # Per-step reward is the change in the environment's reward since
            # the previous step.
            r = r1-prev_reward
            # Episode hit the frame cap without finishing: force termination
            # with a fixed penalty.
            if frame == max_ep and d == False:
                r = -100
                r1 = -100
                d = True
            # Scale the step reward down by a factor of 100.
            r /= 100

            if PARTIAL:
                obs_next = one_hot_encoder(s, env.get_team_blue)
            else:
                obs_next = one_hot_encoder(env._env, env.get_team_blue, vision_range)

            # Push history for individual that 'was' alive previous frame
            for idx, agent in enumerate(env.get_team_blue):
                if was_alive[idx]:
                    trajs[idx].append([obs[idx], acts[idx], r]) # MDP Tuple : state, action, reward
                    # Store the recurrent state that produced this step's action,
                    # dropping the leading batch axis of size 1.
                    for i in range(network.gru_num_layers):
                        init_states[idx][i].append(init_state[idx][i][0])

            # State Transition
            prev_reward = r1
            init_state = final_states

            if d:
                break

        # summarize and record
        ma_reward.append(r1)
        ma_length.append(frame)
        ma_captured.append(env.blue_win)

        for idx, traj in enumerate(trajs):
            # Skip trajectories that are not longer than one chunk.
            # NOTE(review): a trajectory of exactly `serial_length` steps is
            # skipped as well — confirm that `<=` (rather than `<`) is intended.
            if len(traj)<=serial_length:
                continue
            else:
                traj_length = len(traj)
                batch_length = traj_length // serial_length

            # Discount Reward
            _rew = np.array(traj[2])
            traj[2] = discount_rewards(_rew, gamma).tolist()

            # Trim rnn states
            # Take every `serial_length`-th stored state; when the length is not
            # a multiple of `serial_length` the first one is dropped so that
            # `batch_length` states remain.
            # NOTE(review): whether these states line up with the chunks depends
            # on which steps `traj.trim` discards — TODO confirm against Trajectory.
            ss = 0 if traj_length % serial_length == 0 else 1
            rnn_init_state = init_states[idx]
            trim_init_state = tuple(rnn_init_state[i][::serial_length][ss:] for i in range(network.gru_num_layers))

            # Sequence length
            seq_len = [serial_length]*(batch_length)

            # Trim batch for each trajectory
            traj_list = traj.trim(serial_length)
            replay_buffer.extend(traj_list)
            # Append this trajectory's per-layer initial states to the
            # rollout-wide per-layer lists (list comprehension used for its
            # side effect only).
            [a.extend(b) for a,b in zip(trim_init_states, trim_init_state)]
            seq_lens.extend(seq_len)
    # Only the first GRU layer's initial states are returned.
    return replay_buffer.sample(), np.array(trim_init_states[0]), np.array(seq_lens)

## Training

In [None]:
# Main training loop: repeatedly roll out `each_run` episodes, update the
# network once, and periodically log statistics and checkpoint the model.
policy_red = policy.zeros.PolicyGen(env.get_map, env.get_team_red)
num_ep = total_episode
try:
    progbar = tf.keras.utils.Progbar(num_ep,width=5, interval=0.5)
    # Resume the episode counter from the (possibly restored) global step.
    ep_count = sess.run(global_step)
    while ep_count < num_ep:
        progbar.update(ep_count) # update progress bar

        # Run episode
        (states, actions, rewards), init_state, seq_len = policy_rollout(num_episode = each_run)
        network.update_network(states, rewards, actions, init_state, seq_len)

        # Advance both counters together, *before* logging and checkpointing,
        # so the step written to the summaries and stored in the checkpoint
        # matches the number of episodes actually trained on.
        ep_count += each_run
        sess.run(increment_global_step_op)

        if ep_count % save_stat_frequency == 0:
            record()

        # save weight
        if ep_count % save_network_frequency == 0:
            saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)

except KeyboardInterrupt:
    # Manual stop: keep whatever has been learned so far.
    saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
    print(f'save: {sess.run(global_step)}, episodes')
else:
    # Normal completion: total_episode need not be a multiple of
    # save_network_frequency, so save the final weights explicitly.
    progbar.update(min(ep_count, num_ep))
    saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
    print(f'save: {sess.run(global_step)}, episodes')

     0/120000 [.....] - ETA: 0s