# A3C

In [1]:
import threading
import multiprocessing

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.contrib.slim as slim

import scipy.signal

%matplotlib inline

from vizdoom import *

from random import choice
from time import sleep, time

  from ._conv import register_converters as _register_converters


ImportError: DLL load failed: Le module spécifié est introuvable.

In [None]:
# HELPER.PY
"""
Preprocess part:
    We don't need to turn to gray thanks to vizdoom : game.set_screen_format(ScreenFormat.GRAY8)
"""
def preprocess_frame(frame, turn_to_gray):
    """Crop, optionally grayscale, resize to 84x84 and flatten a game frame.

    Args:
        frame: Screen buffer indexed as [row, column] or
            [row, column, channel]. The channel axis is assumed to be last
            and in RGB order -- TODO confirm against the ScreenFormat used.
        turn_to_gray: When truthy and the frame has a channel axis, the
            channels are collapsed to a single luminance plane.

    Returns:
        A 1-D float array with every pixel divided by 255. For a
        single-channel frame its length is 84 * 84 = 7056.
    """
    # Local import: the file only imports scipy.signal at the top.
    from scipy import ndimage

    # Crop 10 rows off the top/bottom and 30 columns off the left/right.
    state = np.asarray(frame)[10:-10, 30:-30]

    # Grayscale it (if needed). The conversion is applied to the cropped
    # state so that it actually affects the returned value.
    if turn_to_gray and state.ndim == 3:
        # Standard luminance weights for R, G, B.
        state = np.dot(state[..., :3], [0.299, 0.587, 0.114])

    # Resize the two spatial axes to 84x84 with linear interpolation;
    # any trailing channel axis keeps its size.
    height, width = state.shape[:2]
    zoom_factors = (84.0 / height, 84.0 / width) + (1.0,) * (state.ndim - 2)
    state = ndimage.zoom(state, zoom_factors, order=1, mode="nearest")

    # Flatten and normalize.
    return state.reshape(-1) / 255.0


"""
Configure the Doom environment
"""
def config_doom(action_size, game=None):
    """Configure and initialise a ViZDoom game for the basic scenario.

    Args:
        action_size: Number of discrete actions; one one-hot button vector
            is produced per action.
        game: Optional DoomGame instance to configure. When omitted a new
            DoomGame is created, so existing single-argument callers
            keep working.

    Returns:
        A tuple ``(actions, game)`` where ``actions`` is a list of
        ``action_size`` one-hot boolean lists and ``game`` is the
        initialised game instance.
    """
    if game is None:
        game = DoomGame()

    # Scenario and map.
    game.set_doom_scenario_path("basic.wad")
    game.set_doom_map("map01")

    # Rendering: small single-channel frames.
    game.set_screen_resolution(ScreenResolution.RES_160X120)
    game.set_screen_format(ScreenFormat.GRAY8)
    game.set_render_hud(False)
    game.set_render_crosshair(False)
    game.set_render_weapon(True)
    game.set_render_decals(False)
    game.set_render_particles(False)

    # Buttons the agent may press. Their order defines the meaning of each
    # position in the one-hot action vectors built below.
    game.add_available_button(Button.MOVE_LEFT)
    game.add_available_button(Button.MOVE_RIGHT)
    game.add_available_button(Button.ATTACK)

    # Game variables exposed alongside the screen buffer.
    game.add_available_game_variable(GameVariable.AMMO2)
    game.add_available_game_variable(GameVariable.POSITION_X)
    game.add_available_game_variable(GameVariable.POSITION_Y)

    # Episode settings.
    game.set_episode_timeout(300)
    game.set_episode_start_time(10)
    game.set_window_visible(False)
    game.set_sound_enabled(False)
    game.set_living_reward(-1)
    game.set_mode(Mode.PLAYER)
    game.init()

    # Row i of the identity matrix presses only button i.
    actions = np.identity(action_size, dtype=bool).tolist()

    return actions, game

# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    """Build the assign ops that copy trainable variables between scopes.

    Variables collected under ``from_scope`` are paired, in collection
    order, with those collected under ``to_scope``; each returned op
    assigns the source variable to its paired destination variable.

    Returns:
        A list of assign ops, one per variable pair.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_vars = tf.get_collection(collection_key, from_scope)
    target_vars = tf.get_collection(collection_key, to_scope)

    return [target.assign(source)
            for source, target in zip(source_vars, target_vars)]

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    """Return the discounted cumulative sums of ``x`` along axis 0.

    Element ``i`` of the result is ``x[i] + gamma * x[i+1] +
    gamma**2 * x[i+2] + ...``.
    """
    # Run the recurrence y[n] = x[n] + gamma * y[n-1] over the reversed
    # sequence, then flip the result back into the original order.
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter([1], [1, -gamma], backwards, axis=0)
    return accumulated[::-1]

#Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
    """Return an initializer whose weight columns each have L2 norm ``std``.

    The returned callable draws standard-normal float32 values of the
    requested shape, rescales every column (axis 0) to norm ``std`` and
    wraps the result in a TensorFlow constant. ``dtype`` and
    ``partition_info`` are accepted but not used.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        weights = np.random.randn(*shape).astype(np.float32)
        column_norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
        weights *= std / column_norms
        return tf.constant(weights)

    return _initializer

In [None]:
# Hyperparameters
# Model / checkpoint handling
model_path = './model'
load_model = False

# Environment dimensions
action_size = 3  # Agent can move Left, Right, or Fire
state_size = 84 * 84 * 1  # Observations are greyscale frames of 84 * 84 * 1 (= 7056)
max_episode_length = 300

# Discount rate for advantage estimation and reward discounting
gamma = 0.99

AC Network: All the tensorflow operations to create the NN

<img src="a3c.png" alt="A3C Model" />

It is really important <b>to initialise the weights and biases well</b>:
<ul>
<li> By using "Xavier initialization" which chooses the variance of the distribution (either uniformly or normally distributed) to be 1/Nin where Nin is the number of neurons that are input to a given neuron in this layer. </li>

<li> For a fully connected layer this is easy: the number of inputs to a neuron equals the number of neurons in the previous layer. </li>
<li> For a convolutional layer it is the filter area times the number of input channels: if the filter size of the current layer is [4,4] and there are 16 filters on the previous layer, then the number of inputs is 4×4×16. </li>
</ul>

In [None]:
# AC NETWORK
class AC_Network:
    """Actor-critic network: two conv layers, a dense layer, an LSTM and
    separate policy (actor) and value (critic) heads.

    For every scope other than ``'global'`` the constructor also builds the
    loss and the op that applies this network's clipped gradients to the
    variables of the ``'global'`` scope.
    """

    def __init__(self, state_size, action_size, scope, trainer):
        """Build the graph under variable scope ``scope``.

        Args:
            state_size: Length of one flattened observation. The input is
                reshaped to 84x84x1, so this must be 7056.
            action_size: Number of discrete actions (policy head width).
            scope: Variable scope name. ``'global'`` builds the forward
                pass only.
            trainer: Optimizer whose ``apply_gradients`` is used to update
                the global variables; unused when ``scope == 'global'``.
        """
        with tf.variable_scope(scope):
            # Flattened frames, reshaped back to images for the conv layers.
            self.inputs = tf.placeholder(tf.float32, [None, state_size])
            self.states = tf.reshape(self.inputs, [-1, 84, 84, 1])

            # CNN for spatial dependencies.
            # tf.contrib.layers functions are named via `scope=`; they do
            # not accept a `name=` keyword.
            self.conv1 = tf.contrib.layers.conv2d(self.states,
                                                 num_outputs = 16,
                                                 kernel_size = [8,8],
                                                 stride = [4,4],
                                                 padding = "VALID",
                                                 activation_fn = tf.nn.elu,
                                                 scope = "conv1")

            self.conv2 = tf.contrib.layers.conv2d(self.conv1,
                                                 num_outputs = 32,
                                                 kernel_size = [4,4],
                                                 stride = [2,2],
                                                 padding = "VALID",
                                                 activation_fn = tf.nn.elu,
                                                 scope = "conv2")

            # TODO: See if good idea to add a conv, maxpool

            # Flatten the network
            self.flatten = tf.contrib.layers.flatten(self.conv2)

            self.fc1 = tf.contrib.layers.fully_connected(self.flatten,
                                                        num_outputs = 256,
                                                        activation_fn = tf.nn.elu,
                                                        scope = "fc1")

            # LSTM for temporal dependencies
            lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(256,state_is_tuple=True)

            # Zero state used at the start of every episode.
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]

            # Placeholders through which the caller feeds the LSTM state.
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)

            # The fed rows are treated as the time steps of ONE sequence:
            # add a leading batch axis of size 1 and use the number of rows
            # as the sequence length.
            rnn_in = tf.expand_dims(self.fc1, [0])
            step_size = tf.shape(self.states)[:1]
            state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                    lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                    time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, 256])

            # POLICY OUTPUT (Actor): softmax over the actions.
            self.policy = tf.contrib.layers.fully_connected(inputs = rnn_out,
                                                           num_outputs = action_size,
                                                           activation_fn = tf.nn.softmax,
                                                           weights_initializer = tf.contrib.layers.xavier_initializer(),
                                                           biases_initializer = None)

            # VALUE OUTPUT (Critic): one linear unit, shape [steps, 1].
            self.value = tf.contrib.layers.fully_connected(inputs = rnn_out,
                                                           num_outputs = 1,
                                                           activation_fn = None,
                                                           weights_initializer = tf.contrib.layers.xavier_initializer(),
                                                           biases_initializer = None)

            # Only the worker network need operations for loss function and gradient updating.
            if scope != 'global':
                # Placeholders for the action, target value and advantages
                self.action = tf.placeholder(tf.int32, [None], name="action")
                # Alias: the worker's feed dict refers to this placeholder
                # as `actions`.
                self.actions = self.action

                # One-hot vector for each action taken.
                self.actions_onehot = tf.one_hot(self.action,
                                                 action_size,
                                                 dtype=tf.float32,
                                                 name="action_one_hot")

                # R: discounted return targets.
                self.target_value = tf.placeholder(tf.float32,
                                                   [None],
                                                   name="target_value")
                # Alias: the worker's feed dict refers to this placeholder
                # as `target_v`.
                self.target_v = self.target_value

                # A: advantage estimates.
                self.advantages = tf.placeholder(tf.float32,
                                                 [None],
                                                 name="advantages")

                ### LOSS PART
                # Clip the policy output away from 0 and 1 before taking
                # the log, so the log stays finite.
                self.log_policy = tf.log(tf.clip_by_value(self.policy, 0.000001, 0.999999))

                # log(pi(a|s)) for the action actually taken: the one-hot
                # mask zeroes every other action's log-probability.
                self.log_pi_for_action = tf.reduce_sum(self.log_policy * self.actions_onehot, axis = 1)

                # VALUE LOSS: mean squared difference between the
                # discounted return R and the estimate V(s). The value head
                # is [steps, 1] while the target is [steps]; flatten it so
                # the subtraction is element-wise instead of broadcasting
                # to [steps, steps].
                self.value_loss = tf.reduce_mean(
                    tf.square(self.target_value - tf.reshape(self.value, [-1])))

                # POLICY LOSS: we want gradient ascent on
                # log(pi(a|s)) * advantage, so the loss is its negative.
                self.policy_loss = -tf.reduce_mean(self.log_pi_for_action * self.advantages)

                # ENTROPY: sum of policy * -log(policy), taken over all
                # actions and all fed steps. It is smaller when the
                # distribution is concentrated on one action, so rewarding
                # it (subtracting it from the loss) encourages exploration.
                self.entropy = tf.reduce_sum(tf.multiply(self.policy, -self.log_policy))

                # Total loss to minimise.
                self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01

                # GRADIENTS
                # Gradients of the local loss w.r.t. the local variables,
                # clipped by global norm to avoid overly large updates.
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0)

                # Apply local gradients to global network. Pairing relies
                # on both scopes listing their variables in the same order.
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

Each worker then interacts with its own copy of the environment and collects experience. Each worker keeps a list of experience tuples (state, action, reward, new state, done, value) that grows with every interaction with the environment.

In [None]:
class Worker():
    """One A3C worker: owns a local AC_Network and a Doom environment,
    plays episodes and pushes gradients to the global network."""

    def __init__(self, game, name, state_size, action_size, trainer, model_path,global_episodes):
        """Create the worker's local network, sync ops and environment.

        Args:
            game: Accepted for interface compatibility; the environment
                used by the worker is the one returned by ``config_doom``.
            name: Worker index; the variable scope is ``"worker_<name>"``.
            state_size: Length of one flattened observation.
            action_size: Number of discrete actions.
            trainer: Optimizer passed on to the local AC_Network.
            model_path: Directory where checkpoints are written.
            global_episodes: Shared TensorFlow variable counting episodes.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        # Keep the global training
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)

        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []

        # Pause between environment steps, in seconds; read by work().
        # NOTE(review): 0.028 follows the reference A3C-Doom implementation
        # -- set to 0 to disable throttling.
        self.sleep_time = 0.028

        # Tensorboard
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        # Create a copy of the network and the tensorflow operators
        # to copy global parameters to local network
        self.local_AC = AC_Network(state_size, action_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)

        # Config doom
        self.actions, self.env = config_doom(action_size)

    # TRAINING
    def train(self, rollout, sess, gamma, bootstrap_value):
        """Run one gradient update from a rollout.

        Args:
            rollout: List of ``[state, action, reward, new_state, done,
                value]`` entries as appended by ``work``.
            sess: TensorFlow session.
            gamma: Discount rate.
            bootstrap_value: Value estimate of the state following the
                rollout (0.0 when the episode ended).

        Returns:
            Tuple ``(value_loss, policy_loss, entropy, grad_norm,
            var_norm)``; the first three are divided by the rollout length.
        """
        # Pull the columns out explicitly: building one array from the
        # mixed array/scalar rows would be ragged.
        observations = [step[0] for step in rollout]
        actions = np.array([step[1] for step in rollout], dtype=np.int32)
        rewards = np.array([step[2] for step in rollout], dtype=np.float64)
        values = np.array([step[5] for step in rollout], dtype=np.float64)

        # Discounted returns, bootstrapped from the value of the state that
        # follows the rollout.
        self.rewards_plus = np.append(rewards, bootstrap_value)
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]

        # Advantages: discounted sum of the one-step TD errors
        # r + gamma * V(s') - V(s).
        self.value_plus = np.append(values, bootstrap_value)
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_value:discounted_rewards,
            self.local_AC.inputs:np.vstack(observations),
            self.local_AC.action:actions,
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:self.batch_rnn_state[0],
            self.local_AC.state_in[1]:self.batch_rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.state_out,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n

    # PLAYING
    def work(self, max_episode_length, gamma, sess, coordinator, saver):
        """Play episodes until the coordinator requests a stop.

        Args:
            max_episode_length: Episode length limit used to skip the
                intermediate update on the step just before the limit.
            gamma: Discount rate.
            sess: TensorFlow session.
            coordinator: ``tf.train.Coordinator`` controlling shutdown.
            saver: ``tf.train.Saver`` used for periodic checkpoints.
        """
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        # Last training statistics; defaults are used in the summary if no
        # update has happened yet.
        v_l = p_l = e_l = g_n = v_n = 0.0
        print("Starting worker " + str(self.number))

        with sess.as_default(), sess.graph.as_default():
            while not coordinator.should_stop():
                # Start every episode from the current global parameters.
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_values = []
                episode_frames = []
                episode_reward = 0
                episode_step_count = 0
                done = False

                # Create a new episode
                self.env.new_episode()

                # Raw frame is kept for the gif; the preprocessed one is
                # what the network sees.
                state = self.env.get_state().screen_buffer
                episode_frames.append(state)
                state = preprocess_frame(state, False)

                # Reset the LSTM state at the start of the episode.
                rnn_state = self.local_AC.state_init
                # LSTM state fed to the next training update.
                self.batch_rnn_state = rnn_state

                # While the episode is not finished
                while not self.env.is_episode_finished():
                    # Take an action using probabilities from policy network output
                    action_distribution, value, rnn_state = sess.run(
                        [self.local_AC.policy, self.local_AC.value, self.local_AC.state_out],
                        feed_dict = {self.local_AC.inputs:[state],
                                     self.local_AC.state_in[0]:rnn_state[0],
                                     self.local_AC.state_in[1]:rnn_state[1]})
                    # Sample the action INDEX directly; sampling a
                    # probability and looking it up again picks the wrong
                    # action when two actions share the same probability.
                    probabilities = action_distribution[0]
                    action = np.random.choice(len(probabilities), p=probabilities)

                    reward = self.env.make_action(self.actions[action]) / 100.0

                    done = self.env.is_episode_finished()

                    if not done:
                        # Get the new state
                        new_state = self.env.get_state().screen_buffer
                        episode_frames.append(new_state)
                        new_state = preprocess_frame(new_state, False)
                    else:
                        # No frame after the final step; reuse the last one.
                        new_state = state

                    episode_buffer.append([state, action, reward, new_state, done, value[0,0]])
                    episode_values.append(value[0,0])

                    episode_reward += reward

                    state = new_state
                    total_steps += 1
                    episode_step_count += 1

                    # Specific to VizDoom. We sleep the game for a specific time.
                    if self.sleep_time > 0:
                        sleep(self.sleep_time)

                    # If the episode hasn't ended, but the experience buffer is full, then we
                    # make an update step using that experience rollout.
                    if len(episode_buffer) == 30 and not done and episode_step_count != max_episode_length - 1:
                        # Since we don't know what the true final return is, we "bootstrap" from our current
                        # value estimation.
                        v1 = sess.run(self.local_AC.value,
                            feed_dict={self.local_AC.inputs:[state],
                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})[0,0]
                        v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1)
                        episode_buffer = []
                        sess.run(self.update_local_ops)
                    if done:
                        break

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                # Guard against an episode that ended before any step.
                self.episode_mean_values.append(
                    np.mean(episode_values) if episode_values else 0.0)

                # Update the network using the episode buffer at the end of the episode.
                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 5 == 0 and episode_count != 0:
                    if self.name == 'worker_0' and episode_count % 25 == 0:
                        time_per_step = 0.05
                        images = np.array(episode_frames)
                        # NOTE(review): make_gif is not defined in the
                        # visible part of this file -- confirm it is
                        # provided elsewhere before relying on this branch.
                        make_gif(images,'./frames/image'+str(episode_count)+'.gif',
                            duration=len(images)*time_per_step,true_image=True,salience=False)
                    if episode_count % 250 == 0 and self.name == 'worker_0':
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    # Averages over the last five episodes.
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                # Only worker_0 advances the shared episode counter.
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1

In [None]:
import os  # needed for the directory checks below; not imported at the top of the file

tf.reset_default_graph()

# Create the directory the checkpoints are saved to
if not os.path.exists(model_path):
    os.makedirs(model_path)

#Create a directory to save episode playback gifs to
if not os.path.exists('./frames'):
    os.makedirs('./frames')

with tf.device("/cpu:0"):
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-4)

    # Generate global network (forward pass only, so no trainer is needed)
    master_network = AC_Network(state_size, action_size, 'global', None)

    # Set workers based on available CPU threads
    num_workers = multiprocessing.cpu_count()

    workers = []

    # Create worker classes. Arguments follow Worker.__init__:
    # (game, name, state_size, action_size, trainer, model_path, global_episodes)
    for i in range(num_workers):
        workers.append(Worker(DoomGame(),
                             i,
                             state_size,
                             action_size,
                             trainer,
                             model_path,
                             global_episodes))

    # Created after the workers so that it covers their variables too.
    saver = tf.train.Saver(max_to_keep=5)


with tf.Session() as sess:
    # This class implements a simple mechanism to coordinate the termination of a set of threads.
    coordinator = tf.train.Coordinator()
    if load_model:
        print("Loading Model")
        checkpoint = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess, checkpoint.model_checkpoint_path)

    else:
        # global_variables_initializer must be CALLED to obtain the init op.
        init = tf.global_variables_initializer()
        sess.run(init)

    # Asynchronous part
    # Start the "work" process for each worker in a separate thread.
    worker_threads = []
    for worker in workers:
        # Pass the bound method and its arguments directly: a bare lambda
        # would look up `worker` when it runs, not when it is created.
        # Arguments follow Worker.work:
        # (max_episode_length, gamma, sess, coordinator, saver)
        t = threading.Thread(target=worker.work,
                             args=(max_episode_length,
                                   gamma,
                                   sess,
                                   coordinator,
                                   saver))

        t.start()

        worker_threads.append(t)

    # Block until every worker thread has finished.
    coordinator.join(worker_threads)
        

A set of worker agents, each with its own network and environment, is created. Each worker runs in a separate thread, so there should be no more workers than there are threads on the CPU.