# Capture the Flag (RL - Policy Gradient)

- Seung Hyun Kim
- skim449@illinois.edu

## Notes
- This notebook includes:
    - Building the structure of policy driven network.
    - Training with/without render
    - Saver that save model and weights to ./model directory
    - Writer that will record some necessary datas to ./logs
- This notebook does not include running the CtF game with the RL policy. Using the network will be separately scripted in policy/policy_RL1.py.
    - cap_test.py is changed appropriately.
    
## References :
- https://github.com/awjuliani/DeepRL-Agents/blob/master/Vanilla-Policy.ipynb (source)
- https://www.youtube.com/watch?v=PDbXPBwOavc
- https://github.com/lilianweng/deep-reinforcement-learning-gym/blob/master/playground/policies/actor_critic.py (source)
- https://github.com/MG2033/A2C

## TODO:
### Sampling
- [ ] Mini-batch to update 'average' gradient
- [x] Experience Replay for Random Sampling
    - [x] Importance Sampling
    
### Move onto Deterministic
- [ ] DDPG and MADDPG

### Stability and Reducing Variance
- [ ] Target Network
- [ ] TRPO
- [ ] PPO

### Multiprocessing
- [x] Multiprocessing for Synchronous Training (A2C)
    - Serialization
    - Using Update Coordinate
- [ ] Asynchronous Training (A3C)

In [1]:
!rm -rf logs/B2R2_SyncA2C/ model/B2R2_SyncA2C

In [1]:
TRAIN_NAME='B2R2_SyncA2C'
LOG_PATH='./logs/'+TRAIN_NAME
MODEL_PATH='./model/' + TRAIN_NAME
GPU_CAPACITY=0.125 # gpu capacity in percentage

In [2]:
import os

import tensorflow as tf
import tensorflow.contrib.slim as slim
import tensorflow.contrib.layers as layers
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
%matplotlib inline

import multiprocessing
import threading
from multiprocessing import Process, Pipe
from multiprocessing import Queue

import time
import gym
import gym_cap
import gym_cap.envs.const as CONST
import numpy as np
import random

# the modules that you can use to generate the policy.
import policy.patrol 
import policy.random
import policy.simple # custon written policy
import policy.policy_RL
import policy.zeros

# Data Processing Module
from DataModule import one_hot_encoder, VISION_dX, VISION_dY
from Utils import MovingAverage as MA
from Utils import Experience_buffer, discount_rewards

  return f(*args, **kwds)


In [4]:
#device_lib.list_local_devices()

## Hyperparameters

In [3]:
# Replay Variables
total_episodes = 5000000 #Set total number of episodes to train agent on.
max_ep = 150
update_frequency = 32
batch_size = 128
batch_update_cycle = 5
experience_size=20000

# Saving Related
save_network_frequency = 1024
save_stat_frequency = 128
moving_average_step = 128

# Training Variables
LEARNING_RATE_FIX = False
LEARNINGRATE_ACTOR  = 5e-4
LEARNINGRATE_CRITIC = 5e-3
LR_ACTOR_DECAY = 0.995
LR_CRITIC_DECAY = 0.995
LR_ACTOR_FINAL = 1e-5
LR_CRITIC_FINAL = 5e-5
gamma = 0.98
discount_factor = 0.98

# Env Settings
MAP_SIZE = 10
NENV = 2

## Environment Setting

In [4]:
if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)
    
#Create a directory to save episode playback gifs to
if not os.path.exists(LOG_PATH):
    os.makedirs(LOG_PATH)

In [5]:
env_tester = gym.make("cap-v0") # initialize the environment
policy_red = policy.random.PolicyGen(env_tester.get_map, env_tester.get_team_red)
action_space = 5
n_agent = len(env_tester.get_team_blue)
print('red number : ', len(env_tester.get_team_red))
print('blue number : ', len(env_tester.get_team_blue))

red number :  2
blue number :  2


## Policy Network

In [8]:
def update_graph(from_scope,to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)

    op_holder = []
    for from_var,to_var in zip(from_vars,to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder

In [6]:
class Agent():
    def __init__(self, in_size, action_size, grad_clip_norm, scope='main', trainable=False, logging=False, verbose=0):
        self.in_size = in_size
        self.action_size = action_size
        self.grad_clip_norm = grad_clip_norm
        self.scope = scope
        with tf.variable_scope(scope):
            # Learning Rate Variables
            self.learning_rate_actor = tf.placeholder(tf.float32, shape=None, name='learning_rate_actor')
            self.learning_rate_critic = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic')
            
            # Feed-Forward Network
            self.state_input = tf.placeholder(shape=in_size,dtype=tf.float32, name='state')
            
            # Actor stream
            with tf.variable_scope('actor'):
                layer = slim.conv2d(self.state_input, 16, [5,5],# activation_fn=tf.nn.relu,
                                    weights_initializer=layers.xavier_initializer_conv2d(),
                                    biases_initializer=tf.zeros_initializer(),
                                    padding='SAME')
                layer = slim.avg_pool2d(layer, [2,2])
                layer = slim.conv2d(layer, 16, [3,3],# activation_fn=tf.nn.relu,
                                    weights_initializer=layers.xavier_initializer_conv2d(),
                                    biases_initializer=tf.zeros_initializer(),
                                    padding='SAME')
                self.layer = slim.flatten(layer)
                #adv_stream, val_stream = tf.split(layer, num_or_size_splits=2)


                actor = layers.fully_connected(self.layer, 128,
                                            activation_fn=tf.nn.relu)
                self.actor = layers.fully_connected(actor, action_size,
                                            activation_fn=None)
            self.output = tf.nn.softmax(self.actor, name='action')
            self.output_argmax = tf.argmax(self.output, axis=1,output_type=tf.int32)
            self.actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope+'/actor')

            # Value stream
            with tf.variable_scope('critic'):
                layer2 = slim.conv2d(self.state_input, 16, [5,5], # activation_fn=tf.nn.relu,
                                    weights_initializer=layers.xavier_initializer_conv2d(),
                                    biases_initializer=tf.zeros_initializer(),
                                    padding='SAME')
                layer2 = slim.avg_pool2d(layer2, [2,2])
                layer2 = slim.conv2d(layer2, 16, [3,3],# activation_fn=tf.nn.relu,
                                    weights_initializer=layers.xavier_initializer_conv2d(),
                                    biases_initializer=tf.zeros_initializer(),
                                    padding='SAME')
                self.layer2 = slim.flatten(layer2)
                self.critic = layers.fully_connected(self.layer2, 1,
                                            activation_fn=None)
                self.critic = tf.reshape(self.critic, [-1])
            self.critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope+'/critic')

        self.local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)            
            
        if verbose:
            print('Actor Network')
            for var in self.actor_vars:
                print(var)
            print('Critic Network')
            for var in self.critic_vars:
                print(var)
            print('Full Local Network')
            for var in self.local_vars:
                print(var)
                
        if trainable:
            self._build_trainer()
            self._build_logger()
        elif logging:
            self._build_logger()
        
    def _build_trainer(self):
        # Placeholders
        with tf.name_scope('holders'):
            self.action_holder = tf.placeholder(shape=[None],dtype=tf.int32)
            self.action_OH = tf.one_hot(self.action_holder, self.action_size)
            self.reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
            self.target_value_holder = tf.placeholder(shape=[None], dtype=tf.float32, name='target')
            self.advantage_holder = tf.placeholder(shape=[None], dtype=tf.float32, name='adv')  
            self.behavior_policy = tf.placeholder(shape=[None,self.action_size], dtype=tf.float32, name='IS') 
        # Feed Backward
        # - compute the  loss, and use it to find gradient, and update the network
        # - May be need to add bootstrap value at the end of the value
        self.selected=tf.equal(self.action_holder, self.output_argmax)
        with tf.name_scope('critic_train'):
            self.loss_critic = tf.reduce_mean(tf.square(self.target_value_holder - self.critic))
            self.optimizer_critic = tf.train.AdamOptimizer(self.learning_rate_critic)
            self.grads_critic = self.optimizer_critic.compute_gradients(self.loss_critic, self.critic_vars)
            if self.grad_clip_norm:
                self.grads_critic = [(tf.clip_by_norm(grad, self.grad_clip_norm), var) for grad, var in self.grads_critic]
            self.update_critic = self.optimizer_critic.apply_gradients(self.grads_critic)

        with tf.name_scope('actor_train'):
            self.entropy = -tf.reduce_mean(self.output * tf.log(self.output), name='entropy')
            self.policy_outputs = tf.reduce_sum(self.output * self.action_OH, 1)
            self.behav_policy_output = tf.reduce_sum(self.behavior_policy * self.action_OH,1)
            #self.objective_function = tf.log(self.responsible_outputs)
            self.sampling_weight = self.policy_outputs / self.behav_policy_output
            self.objective_function = tf.clip_by_value(self.sampling_weight,0.7,1.2)# * tf.log(self.policy_outputs) #tf.clip_by_value(self.sampling_weight_holder,0.7,1.3)
            self.loss_actor = -tf.reduce_mean(self.objective_function*self.advantage_holder)
            self.optimizer_actor = tf.train.AdamOptimizer(self.learning_rate_actor)
            self.grads_actor = self.optimizer_actor.compute_gradients(self.loss_actor, self.actor_vars)
            if self.grad_clip_norm:
                self.grads_actor = [(tf.clip_by_norm(grad, self.grad_clip_norm), var) for grad, var in self.grads_actor]
            self.update_actor = self.optimizer_actor.apply_gradients(self.grads_actor)
            
        with tf.name_scope('update'):
            self.loss = 0.05*self.loss_critic + self.loss_actor - self.entropy * 0.01
            self.update_batch = tf.group(self.update_actor, self.update_critic)        
         
    def _build_logger(self):
        # Summary
        # Histogram output
        with tf.name_scope('debug_parameters'):
            tf.summary.histogram('output', self.output)
            tf.summary.histogram('actor', self.actor)
            tf.summary.histogram('critic', self.critic)        
            tf.summary.histogram('action', self.action_holder)
            tf.summary.histogram('IS_weight', self.sampling_weight)
            tf.summary.histogram('objective_function', self.objective_function)
        
        # Graph summary Loss
        with tf.name_scope('summary'):
            tf.summary.scalar(name='actor_loss', tensor=self.loss_actor)
            tf.summary.scalar(name='critic_loss', tensor=self.loss_critic)
            tf.summary.scalar(name='total_loss', tensor=self.loss)
            tf.summary.scalar(name='Entropy', tensor=self.entropy)
        
        with tf.name_scope('weights_bias'):
            # Histogram weights and bias
            for var in slim.get_model_variables():
                tf.summary.histogram(var.op.name, var)
                
        with tf.name_scope('gradients'):
            # Histogram Gradients
            for var, grad in zip(tf.trainable_variables(), self.grads_critic):
                tf.summary.histogram(var.op.name+'/grad_critic', grad[0])
            for var, grad in zip(tf.trainable_variables(), self.grads_actor):
                tf.summary.histogram(var.op.name+'/grad_actor', grad[0])
        
        with tf.name_scope('Learning_Rate'):
            # Learning Rate
            tf.summary.scalar(name='actor_lr', tensor=self.learning_rate_actor)
            tf.summary.scalar(name='critic_lr', tensor=self.learning_rate_critic)

In [7]:
def record(summary_):
    summary = tf.Summary()
    summary.value.add(tag='Records/mean_reward', simple_value=ma_reward())
    summary.value.add(tag='Records/mean_length', simple_value=ma_length())
    summary.value.add(tag='Records/mean_succeed', simple_value=ma_captured())
    writer.add_summary(summary, sess.run(global_step))

    #summary_str = sess.run(merged,feed_dict={myAgent.state_input:obs})
    writer.add_summary(summary_, sess.run(global_step))

    writer.flush()

## Synchronous Rollout

In [12]:
class Worker():
    def __init__(self,idx,state_size,action_size,env,map_size=MAP_SIZE, sess=None, coord=None):
        self.name = "worker_" + str(idx)
        self.id = idx
        self.map_size = map_size
        self.coord = coord
        
        # local environment
        self.env = env # initialize the environment
        self.policy_red = policy.random.PolicyGen(self.env.get_map, self.env.get_team_red)
        self.num_blue = len(self.env.get_team_blue)
        
    def policy_rollout(self, sess, explore=False, deterministic=False):
        # Run single episode, return the results
        s = self.env.reset(map_size=self.map_size, policy_red=self.policy_red)
        #obs_next = one_hot_encoder(s, self.env.get_team_blue) # partial observation
        obs_next = one_hot_encoder(self.env._env, self.env.get_team_blue)

        indv_history = [[] for _ in range(self.num_blue)]
        was_alive = [ag.isAlive for ag in self.env.get_team_blue]

        prev_reward=0
        total_reward = 0
        frame=0
        print('rollout began')        
        with sess.as_default(), sess.graph.as_default():
            for frame in range(max_ep+1):
                obs = obs_next

                with tf.device('/cpu:0'):
                    act_prob = sess.run(self.local_Agent.output, feed_dict={self.local_Agent.state_input:obs})
                act = [np.random.choice(action_space, p=act_prob[x]/sum(act_prob[x])) for x in range(self.num_blue)] # divide by sum : normalize
                behavior_policy = act_prob#np.log(act_chosen)

                s,r1,d,_ = self.env.step(act) #Get our reward for taking an action given a bandit.

                r = r1-prev_reward
                if frame == max_ep and d == False:
                    r = -100
                    r1 = -100
                total_reward += r

                if d:
                    value = np.array([0.0 for _ in range(self.num_blue)])
                else:
                    obs_next = one_hot_encoder(self.env._env, self.env.get_team_blue) # Full Observation
                    value = sess.run(self.local_Agent.critic, feed_dict={self.local_Agent.state_input:obs_next})

                # Push history for individual that 'was' alive previous frame
                # [state, action, reward(later:discount), value, advantage(later), behavior policy]
                for idx, agent in enumerate(self.env.get_team_blue):
                    if was_alive[idx]:
                        indv_history[idx].append([obs[idx],act[idx],r,0,behavior_policy[idx]])

                # State Transition
                prev_reward = r1
                was_alive = [ag.isAlive for ag in self.env.get_team_blue]

                if d:
                    break
        print('rollout end')
        # Policy rollout for all agents are done.
        # Calculate Advantage for individual histories
        ep_history = []        
        for idx, history in enumerate(indv_history):
            if len(history)==0:
                continue
            _history = np.array(history)
            values = np.array(_history[:,3])
            values_ext = np.asarray(values.tolist() + [0])
            advantages = _history[:,2] + gamma * values_ext[1:] - values_ext[:-1]
            advantages = discount_rewards(advantages,discount_factor)
            rewards = _history[:,2]
            _history[:,2] = discount_rewards(rewards,discount_factor)
            _history[:,3] = advantages
            ep_history.extend(_history)

        if len(ep_history) > 0:        
            ep_history = np.stack(ep_history)

        return [frame, ep_history, r1, self.env.blue_win, total_reward]
    
                
    def pipe_interaction(self, connection):
        with coord.stop_on_exception():
            while not coord.should_stop():
                pass
        
        sess = self.sess
        coord = self.coord
        while True: #not coord.should_stop():
            print('recieving....')
            cmd, arg = connection.recv()
            print('recieved: ', cmd)
            if cmd == 'rollout':
                fr, eph, r, bw, tr = self.policy_rollout(sess)
                connection.send((fr, eph, r, bw, tr))
            elif cmd == 'update':
                print('network update')
                sess.run(update_graph('main', self.name))
                print('netwokr finished update')
            elif cmd == 'close':
                connection.close()
                break
            elif cmd == 'debug':
                print('red number : ', len(self.env.get_team_red))
                print('blue number : ', len(self.env.get_team_blue))
                print(self.local_Agent.local_vars)
            else:
                raise NotImplementedError

In [13]:
class SerializedEnv:
    # Use multiprocess to serialize the synchronized run
    def __init__(self, sess, num_env=NENV):
        self.num_env = num_env
        self.envs = [gym.make("cap-v0") for _ in range(num_env)]
        self.connection, self.work_connection = zip(*[Pipe() for _ in range(self.num_env)])
        self.coord = tf.train.Coordinator()
        self.workers = [Worker(idx,state_size=[None,VISION_dX,VISION_dY,6],action_size=5,env=self.envs[idx],map_size=MAP_SIZE, sess=sess, coord=self.coord) for idx in range(self.num_env)]
        self.ps = []
        for con, worker in zip(self.work_connection, self.workers):
            print('loading worker')
            work = lambda: worker.pipe_interaction(con)
            p = Process(target = (work))
            #p = Process(target = worker.pipe_interaction, args=(con,self.coord))
            self.ps.append(p)
            print('worker created')
        
        #self.ps = [Process(target=worker, args=(conn_pipe, env)) for idx, conn_pipe, env in enumerate(zip(self.work_connection, self.envs))]
        for p in self.ps:
            p.daemon = True
            p.start()
            time.sleep(1)
        #self.coord.join(self.ps)
        #self.connection[0].send(('debug'))
        
    def __len__(self):
        return self.num_env
        
    def call_rollout(self):
        batch_history= []
        batch_length = []
        batch_finalR = []
        batch_totalR = []
        batch_blueWin= []
        for connection in self.connection:
            print('send rollout')
            connection.send(('rollout',None))
        for connection in self.connection:
            l, h, r, d, tr = connection.recv()             
            batch_history.extend(h)
            batch_length.append(l)
            batch_finalR.append(r)
            batch_totalR.append(tr)
            batch_blueWin.append(d)
            
        return batch_history, batch_length, batch_finalR, batch_blueWin
    
    def update_weight(self):
        for con in self.connection:
            con.send(('update',None))
            #time.sleep(0.05)

    def close(self):
        for con in self.connection:
            con.send(('close', None))
        [p.join() for p in self.ps]

## Training

In [14]:
# Clear the Tensorflow graph.
tf.reset_default_graph()

# Initiate saving modules
ma_reward = MA(moving_average_step)
ma_length = MA(moving_average_step)
ma_captured = MA(moving_average_step)

with tf.variable_scope('global_step'):
    global_step = tf.Variable(0, trainable=False, name='global_step') # global step
    increment_global_step_op = tf.assign(global_step, global_step+NENV)
merged = tf.summary.merge_all()

In [15]:
# Launch the session
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_CAPACITY, allow_growth=True)
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    global_Agent = Agent(in_size=[None,VISION_dX,VISION_dY,6],action_size=5, grad_clip_norm=50, trainable=True, verbose=1) #Load the agent.
    se = SerializedEnv(sess)
    
    # Setup Save and Restore Network
    saver = tf.train.Saver(global_Agent.local_vars, max_to_keep=3)
    #saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
    writer = tf.summary.FileWriter(LOG_PATH, sess.graph)
    
    ckpt = tf.train.get_checkpoint_state(MODEL_PATH)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Load Model : ", ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
        print("Initialized Variables")

    ep = sess.run(global_step)
    se.update_weight()
    exp_buffer = Experience_buffer(experience_shape=5, buffer_size=experience_size)
    batch = []
    try:
        progbar = tf.keras.utils.Progbar(total_episodes,width=5)
        while ep < total_episodes+1:
            # Run episode
            batch_length, batch_history, batch_reward, batch_win = se.call_rollout()
            
            ep += NENV
            
            #frame, history, reward, did_won, total_reward = policy_rollout(EXPLORE=ep < pre_train,DETERMINISTIC=False)

            # Add history
            exp_buffer.add(batch_history)
            
                
            if ep % update_frequency == 0 and ep != 0 and len(exp_buffer) > 0:
                for _ in range(batch_update_cycle):
                    if len(exp_buffer) <= 0:
                        break
                    batch = exp_buffer.pop(size=batch_size, shuffle=True)
                    feed_dict={global_Agent.learning_rate_actor :LEARNINGRATE_ACTOR,
                               global_Agent.learning_rate_critic:LEARNINGRATE_CRITIC,
                               global_Agent.reward_holder:batch[:,2],
                               global_Agent.behavior_policy:np.stack(batch[:,4]),
                               global_Agent.action_holder:batch[:,1],
                               global_Agent.state_input:np.stack(batch[:,0]),
                               global_Agent.target_value_holder:batch[:,2],
                               global_Agent.advantage_holder:batch[:,3]}
                    with tf.device('/gpu:0'):
                        sess.run(global_Agent.update_batch, feed_dict=feed_dict)
                # Update Workers Network
                se.update_weight(global_Agent.local_vars)

            # decay lr
            if not LEARNING_RATE_FIX:
                if ep > pre_train: LEARNINGRATE_ACTOR = max(LEARNINGRATE_ACTOR*LR_ACTOR_DECAY,LR_ACTOR_FINAL)
                LEARNINGRATE_CRITIC = max(LEARNINGRATE_CRITIC*LR_CRITIC_DECAY,LR_CRITIC_FINAL)
            
            # summarize and record
            ma_reward.extend(batch_reward)
            ma_length.extend(batch_length)
            ma_captured.extend(batch_win)   
            
            if ep % save_stat_frequency == 0 and ep != 0 and len(batch) > 0:
                summary_ = sess.run(merged, feed_dict=feed_dict)
                record(summary_)

            # save weight
            if ep % save_network_frequency == 0 and ep != 0:
                saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)

            # Proceed to next episode
            progbar.update(ep) # update progress bar
            sess.run(increment_global_step_op)

    except KeyboardInterrupt:
        print('\n\nManually stopped the training (KeyboardInterrupt)');
        #saver.save(sess, MODEL_PATH+'/ctf_policy.ckpt', global_step=global_step)
        print("save: ", sess.run(global_step), 'episodes')
        se.close()
        
    se.close()

Actor Network
<tf.Variable 'main/actor/Conv/weights:0' shape=(5, 5, 6, 16) dtype=float32_ref>
<tf.Variable 'main/actor/Conv/biases:0' shape=(16,) dtype=float32_ref>
<tf.Variable 'main/actor/Conv_1/weights:0' shape=(3, 3, 16, 16) dtype=float32_ref>
<tf.Variable 'main/actor/Conv_1/biases:0' shape=(16,) dtype=float32_ref>
<tf.Variable 'main/actor/fully_connected/weights:0' shape=(1600, 128) dtype=float32_ref>
<tf.Variable 'main/actor/fully_connected/biases:0' shape=(128,) dtype=float32_ref>
<tf.Variable 'main/actor/fully_connected_1/weights:0' shape=(128, 5) dtype=float32_ref>
<tf.Variable 'main/actor/fully_connected_1/biases:0' shape=(5,) dtype=float32_ref>
Critic Network
<tf.Variable 'main/critic/Conv/weights:0' shape=(5, 5, 6, 16) dtype=float32_ref>
<tf.Variable 'main/critic/Conv/biases:0' shape=(16,) dtype=float32_ref>
<tf.Variable 'main/critic/Conv_1/weights:0' shape=(3, 3, 16, 16) dtype=float32_ref>
<tf.Variable 'main/critic/Conv_1/biases:0' shape=(16,) dtype=float32_ref>
<tf.Variab

Process Process-1:
Process Process-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-13-1ba860f84f47>", line 12, in <lambda>
    work = lambda: worker.pipe_interaction(con)
  File "<ipython-input-13-1ba860f84f47>", line 12, in <lambda>
    work = lambda: worker.pipe_interaction(con)
  File "<ipython-input-12-b768afbc59bb>", line 93, in pipe_interaction
    cmd, arg = connection.recv()
  File "<ipython-input-12-b768afbc59bb>", line 93, in pipe_interaction
    cmd, arg = connection.recv()
  File "/usr/lib/python3.6/multiproc

Initialized Variables
send rollout
send rollout


Manually stopped the training (KeyboardInterrupt)
save:  0 episodes
