# 0) Instalando dependências

In [1]:
# Installing dependencies. Vizdoom can be downloaded from https://github.com/mwydmuch/ViZDoom
!pip install scikit-image
!pip install tqdm



You are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.




You are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# 1) Importando bibliotecas
Para esse exemplo, teremos de importar todas os módulos ubíquos ao DRL, como:
- TensorFlow
- Numpy
- Random

E além disso, para nosso ambiente Doom, temos de importar o módulo responsável por modelar o ambiente Doom, chamado de Vizdoom.

In [2]:
# Importing required modules
import tensorflow as tf
import numpy as np
from vizdoom import * # Doom environment module

import random
import time
from tqdm import *
from skimage import transform  # For frame preprocessing

from collections import deque
import matplotlib.pyplot as plt

import warnings  # Ignores all warning messages from skimage during training
warnings.filterwarnings("ignore")# 2) Criando o ambiente Doom

# 2) Criando o ambiente Doom

In [3]:
'''Creating our Doom environment'''
config_filename = "deadly_corridor.cfg"
scenario_filename = "deadly_corridor.wad"
def create_env():
    game = DoomGame()
    
    # Loads a configuration file that handles all the options (size of frame, possible actions etc.)
    game.load_config(config_filename)
    
    # Loads a scenario. We're using basic scenario, but others can be used.
    game.set_doom_scenario_path(scenario_filename)
    
    game.init()
    
    # One-hot encoded list of possible actions. We can:
    # turn left, turn right, move left, move right, and shoot.
    possible_actions = np.identity(7, dtype=int).tolist()
    
    return game, possible_actions


'''Performing random actions to test the environment.'''
def test_env():
    game = DoomGame()
    game.load_config(config_filename)
    game.set_doom_scenario_path(scenario_filename)
    game.init()
    
    possible_actions = np.identity(7, dtype=int).tolist()
    
    episodes = 10
    for i in range(episodes):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(possible_actions)
            print(action)
            reward = game.make_action(action)
            print("\tReward: ", reward)
            time.sleep(0.02)
        print("Result: ", game.get_total_reward())
        time.sleep(2)
    game.close()

In [4]:
game, possible_actions = create_env()

# 3) Pré processamento
O pré processamento é utilizado para diminuírmos a dimensão do nosso input, que nesse caso são so frames do jogo. Nessa fase, iremos portanto pegar o frame input, converter as cores de RGB para grayscale, visto que as cores **não adicionam informação importante** para nossa rede, e então cortaremos parte do frame para que o teto não seja representado, visto que ele também não adiciona uma informações necessárias à nossa rede. Tudo isso será feito utilizando o módulo *skimage* importado anteriormente.

In [5]:
"""Frame preprocessing: takes a frame, grayscales and then downscales it."""
def preprocess_frame(frame):
    # Grayscaling is can be done by Vizdoom if configured properly in config file, but we're doing it 'manually'.
    grayscaled_frame = np.mean(frame, 0)

    # Cropping the frame to remove the roof, since it contains no relevant info.
    cropped_frame = grayscaled_frame[15:-5, 20:-20]
    
    # Normalizing pixel color values.
    normalized_frame = cropped_frame/255.0
    
    # Resizing the frame to a squared size (84x84).
    preprocessed_frame = transform.resize(normalized_frame, [100, 120])
    return preprocessed_frame


## Agrupamento de frames

É necessário agrupar frames para **darmos à nossa rede uma noção de movimento**. Para isso, seguiremos os seguintes passos:

- Pré processamos o frame atual
- Empurramos esse frame a um *deque* que remove automaticamente o frame mais antigo
- Depois disso, construímos o estado agrupado, que consiste em um estado representado pelo grupo de frames.

O agrupamento funciona da seguinte forma:

- Para o primeiro frame, nós usamos 4 frames.
- A cada *timestep* adicionamos um novo frame ao *deque* e agrupamos eles para formarmos um novo frame agrupado.
- Seguimos com esses passos até o final do episódio.
- Ao final do episódio, repetimos o processo criando 4 novos frames, pois estamos em um novo episódio.

In [6]:
stack_size = 4 # Number of frames we're stacking.

# Initializing a stack of frames with empty (zero'd) frames.
stacked_frames = deque([np.zeros((100, 120), dtype=np.int) for i in range(stack_size)], maxlen=4)

# Stacked_frames = deque with stacked frames
# state = current frame
# is_new_episode = bool signaling if it's the start of an episode
def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocessing the current frame
    frame = preprocess_frame(state)
    if is_new_episode:
        # If we're starting a new episode, the stack of frames is reinitialized with empty frames
        stacked_frames = deque([np.zeros((100, 120), dtype=np.int) for i in range(stack_size)], maxlen=stack_size)
        
        # Then we fill the deque using the same frame, since we just started the episode and that's the only frame.
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        # Builds the stacked state from stacked frames. The frames have dimension (1, 84, 84) after preprocessing is done,
        # so the stacked state has dimension (4, 84, 84), because we're using axis=2 to stack them.
        stacked_state = np.stack(stacked_frames, axis=2)
    else:
        # If we're not beginning a new episode, the current frame is stacked and the oldest is automatically removed.
        stacked_frames.append(frame)
        
        # Building the stacked state from currently stacked frames.
        stacked_state = np.stack(stacked_frames, axis=2)
        
    return stacked_state, stacked_frames

# 4) Definindo os hiperparâmetros

Nessa parte nós definiremos os hiperparâmetros de nossa rede. Em um contexto real, os hiperparâmetros **não são definidos de uma vez logo quando construímos a rede, mas sim progressivamente durante o ciclo de desenvolvimento.**

- Primeiro definiremos os hiperparâmetros da rede neural quando implementarmos o modelo.
- Então, adicionaremos os hiperparâmetros de treinamento quando implementarmos o algoritmo de treinamento.

In [7]:
# >>>> NEURAL NETWORK HYPERPARAMETERS <<<<
# As we've seen before, the state's dimension = 4 stacked frames, so we have (84, 84, 4)-sized inputs.
state_size = [100, 120, 4]
action_size = game.get_available_buttons_size() # We have 7 possible actions
learning_rate = 0.00025 # The learning rate for our network. Tuning this value may yield better results.


# >>>> TRAINING HYPERPARAMETERS <<<<
total_episodes = 5000 # Total #episodes for training
max_steps = 5000 # Maximum possible steps in an episode, considering we don't reach a terminal state.
batch_size = 64

# >>>> FIXED Q TARGETS STRATEGY PARAMETERS <<<<
max_tau = 10000 # The maximum number of steps it takes without updating Q-Target

# Exploration parameters for epsilon greedy strategy for action picking.
explore_start = 1.0 # Exploration probability at the start of an episode.
explore_stop = 0.01 # Exploration probability at the end of an episode.
decay_rate = 0.00005 # Exponential decay rate for the exploration probability.

# Q-Learning parameters
gamma = 0.95 # Discount rate. Future rewards are multiplied by this value, so high values means future rewards are important.


# >>>> MEMORY HYPERPARAMETERS <<<<
pretrain_length = 10000 # Number of experience tuples stored in memory when it's first initialized.
memory_size = 10000 # Maximum number of experience tuples the memory can keep.


# If training is set to false, we'll just see the trained agent, he'll try to follow the optimal policy so far.
training = True
# Set to True if you want to see the episode to be rendered, False otherwise.
episode_render = False

# 5) Criando nossa Dueling Double Deep Q-Learning Network (DDDQNN)

Os **Q-Values** representam o quão bom é usar determinada ação *a* estando em um estado *s*. Ele pode ser decomposto como a soma de:

- **V(s)**: O valor de estar no estado *s*.
- **A(s, a)**: A vantagem de usar uma ação a dado um estado s.

Podemos portanto usar uma **DDDQNN** para **desacoplar a estimativa dos valores de V(s) e A(s, a) usando duas camadas separadas, uma para cada valor.**

A vantagem dessa abordagem é que podemos **usar V(s) para determinar o valor de um estado sem ter que aprender o valor de cada ação para esse estado**.  Isso é bastante útil, pois **há estados em que a ação que tomamos não influencia no resultado**, e se estivéssemos usando uma **DQN** normal, teríamos de calcular o valor de cada ação para esse estado, coisa que podemos evitar com a **DDDQNN**.

Nosso modelo de DDDQNN tem a seguinte forma:

- Input: 4 frames agrupados;
- 3 camadas convolucionais;
- Uma camada flatten;
- Duas camadas densas:
    - Uma para calcular V(s).
    - Outra para calcular os A(s, a) para cada ação.
- Uma camada de agregação que junta esses valores;
- Output: Q-Values para cada ação.


In [8]:
class DDDQNNet:
    def __init__(self, state_size, action_size, learning_rate, name):
        # Constructing DDQNet's parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.name = name
        
        # tf.variable_scope is used to know which network we're using (DQN or target net). It will be useful
        # to update our w- parameters (the strategy of fixed Q-Targets).
        with tf.variable_scope(self.name):
            
            # Creating the placeholders, which are initialized along with our network.
            
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name='inputs') # The inputs are stacks of frames.
            self.ISWeights_ = tf.placeholder(tf.float32, [None, 1], name='IS_Weights')  # Importance sampling weights.
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name='actions_')
            self.target_Q = tf.placeholder(tf.float32, [None], name='target') # targetQ(s, a) = R(s, a) + y*maxQhat(s', a')
            
            # Creating the actual layers of our network.
            
            '''
            First convolutional layer:
            CNN -> ELU activation
            '''
            
            self.conv1 = tf.layers.conv2d(inputs=self.inputs_,
                                          filters=32,
                                          kernel_size=[8, 8],
                                          strides=[4, 4],
                                          padding='VALID',
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name='conv1')
            
            self.conv1_out = tf.nn.elu(self.conv1, name='conv1_out')
            
            '''
            Second convolutional layer:
            CNN -> ELU activation
            '''
            
            self.conv2 = tf.layers.conv2d(inputs=self.inputs_,
                                          filters=64,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding='VALID',
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name='conv2')
            
            self.conv2_out = tf.nn.elu(self.conv1, name='conv2_out')
            
            
            '''
            Third convolutional layer:
            CNN -> ELU activation
            '''
            
            self.conv3 = tf.layers.conv2d(inputs=self.inputs_,
                                          filters=128,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding='VALID',
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name='conv3')
            
            self.conv3_out = tf.nn.elu(self.conv1, name='conv3_out')
            
            self.flatten = tf.layers.flatten(self.conv3_out)
            
            
            # Now we separate our network's stream in two layers - one for V(s) and the other for A(s, a).
            
            '''
            Stream that calculates V(s), a.k.a value layer:
            Flatten -> Dense
            '''
            
            self.value_fc = tf.layers.dense(inputs=self.flatten,
                                            units=512,
                                            activation=tf.nn.elu,
                                            kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                            name='value_fc')
            
            self.value = tf.layers.dense(inputs=self.value_fc,
                                         units=1,
                                         activation=None,
                                         kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                         name='value')
            
            '''
            Stream that calculates A(s, a), a.k.a advantage layer:
            Flatten -> Dense
            '''

            self.advantage_fc = tf.layers.dense(inputs=self.flatten,
                                                units=512,
                                                activation=tf.nn.elu,
                                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                                name='advantage_fc')
            
            self.advantage = tf.layers.dense(inputs=self.advantage_fc,
                                             units=1,
                                             activation=None,
                                             kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                             name='advantages')
            
            
            # After creating the advantage and value layers, we create the aggregate to put together all values.
            # To do so, it follows the formula: Q(s, a) = V(s) + (A(s, a) - 1 / |A| * sumA(s, a'))
            
            self.output = self.value + tf.subtract(self.advantage, tf.reduce_mean(self.advantage, axis=1, keep_dims=True))
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1) # Q is the prediction of our network
            
            # Defining loss/optimizer
            
            # We're modifying the loss because of Prioritized Experience Replay.
            self.absolute_errors = tf.abs(self.target_Q - self.Q)  # To update our Sumtree for PER
            self.loss = tf.reduce_mean(self.ISWeights_ * tf.squared_difference(self.target_Q, self.Q)) 
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)
            
            

In [9]:
# Resetting the graph
tf.reset_default_graph()

# Instantiating our newly created DDQNetwork
DQNetwork = DDDQNNet(state_size, action_size, learning_rate, name="DQNetwork")

# Instantiating the target network, used to update target values when requested
TargetNetwork = DDDQNNet(state_size, action_size, learning_rate, name="TargetNetwork")


Instructions for updating:
keep_dims is deprecated, use keepdims instead


# 6) Prioritized Experience Replay (PER)

The idea behind **PER** is that **some experiences are more important than others for our training** but they might occur less frequently, so instead of sampling experiences through an uniform distribution, we **assign priorities for our experiences in order to improve our sampling**.

To implement this technique, we can't use an array, because then sampling would be inefficient. So instead, we'll implement  a **sumtree**, which is a binary tree data type where **parent nodes are the sum of child nodes**.

To summarize, our implementation follows these steps:

1. First we construct our Sumtree, which is a Binary Tree whose leaves contains the **priorities** and a **data array** with indexed elements that points to the index of leaves.
![title](sumtree_explanation.png)
    - **def init**: Initializes our SumTree data object with all nodes = 0 and data array with all = 0.
    - **def add**: Adds our priority score to the Sumtree's leaf and experience (S, A, R, S', Done) to the data.
    - **def update**: We update the leaf's priority score and propagate it through tree.
    - **def get_leaf**: Returns the priority score, index and experience associated to a leaf.
    - **def total_priority**: Returns the root node value, which is the sum of all child nodes, or the total priority score of our replay buffer.


2. Then we create a Memory type object, which will contain our Sumtree and data.
    - **def init**: Generates the SumTree and data by instantiating the SumTree object.
    - **def store**: Stores a new experience in our SumTree. New experiences are initialized with max priority, and this priority is updated during training, when we calculate our TD error.
    - **def sample**:
        - To sample a minibatch of k elements, we first divide the range [0, total_priority] into k ranges.
        - Then we sample a value uniformly from each range.
        - We then search in the SumTree the experience whose priority score corresponds to the sampled values.
        - Finally, we calculate IS weights for each element of the minibatch.
    - **def update_batch**: Updates the priorities on the tree.


In [10]:
# First step of the PER implementation.

class Sumtree(object):
    data_pointer = 0
    
    """
    Initializes the tree with all nodes = 0 and data array with all = 0.
    """
    def __init__(self, capacity):
        self.capacity = capacity  # Total #leaf_nodes, which are final nodes that contains experience
        self.tree = np.zeros(2 * capacity - 1)
        
        self.data = np.zeros(capacity, dtype=object)
    
    """
    Adds priority and data to the SumTree. Priority is added to a leaf node, and experience is added to the data array.
    """
    def add(self, priority, data):
        tree_index = self.data_pointer + self.capacity - 1   # Looking at what index to put the experience in
        
        self.data[self.data_pointer] = data  # Updating the data frame
        self.update(tree_index, priority)  # Updating the leaf node
        
        self.data_pointer += 1  # Incrementing the data_pointer by 1.
        
        if self.data_pointer >= self.capacity:  # Resets the data_pointer if we're above capacity
            self.data_pointer = 0
        
    """
    Updates the leaf node's priority score, and propagates this change along the SumTree.
    """
    def update(self, tree_index, priority):
        change = priority - self.tree[tree_index]  # New priority score - former priority score
        self.tree[tree_index] = priority  # Update the leaf node's priority score
        
        # Propagating the change through the tree
        while tree_index != 0:
            """
            Here we want to access the line above
            THE NUMBERS IN THIS TREE ARE THE INDEXES NOT THE PRIORITY VALUES
            
                0
               / \
              1   2
             / \ / \
            3  4 5  [6] 
            
            If we are in leaf at index 6, we updated the priority score
            We need then to update index 2 node
            So tree_index = (tree_index - 1) // 2
            tree_index = (6-1)//2
            tree_index = 2 (because // round the result)
            """
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += change
            
    
    """
    Returns the index, priority score and experience associated to a leaf.
    """
    def get_leaf(self, v):
        parent_index = 0
        
        while True:
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1
            
            # Ends the search if the bottom of the tree was reached
            if left_child_index >= len(self.tree):
                leaf_index = parent_index
                break
            else:  # The search hasn't ended, so we search downwards for a higher priority node
                if v <= self.tree[left_child_index]:
                    parent_index = left_child_index
                else:
                    v -= self.tree[left_child_index]
                    parent_index = right_child_index
                    
        data_index = leaf_index - self.capacity + 1
        
        return leaf_index, self.tree[leaf_index], self.data[data_index]
                
    """
    Returns the root node of the tree, which corresponds to the total priority.
    """
    @property
    def total_priority(self):
        return self.tree[0]
            
        

In [11]:
# Second step of the PER implementation

class Memory(object):
    """
    Defining hyperparameters for our memory
    """
    PER_e = 0.01 # Fixed experience probability, so that experiences don't have 0 prob of being selected.
    PER_a = 0.6 # Controls the proportion between sampling only experiences with high priority and sampling randomly
    PER_b = 0.4 # Importance sampling's initial value that increases to 1 during training.
    PER_b_increment_per_sampling = 0.001
    
    absolute_error_upper = 1.0  # Clipped absolute error
    
    """
    Generates the SumTree and data by instantiating the SumTree object.
    """
    def __init__(self, capacity):
        # We don't use deque because it requires that indices are changed by 1 every timestep, causing performance issues.
        self.tree = Sumtree(capacity)
        
    
    """
    Stores a new experience in our SumTree, new experiences are initialized with max priority. This priority is
    updated when we use it to train our DDDQNN, which is the moment when we calculate our TD error.
    """
    def store(self, experience):
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])  # Finding the max priority for our tree
        
        # If max_priority is 0, we set it to absolute_error_upper, or the experience would never be selected.
        if max_priority == 0:
            max_priority = self.absolute_error_upper
            
        self.tree.add(max_priority, experience)  # Sets the max priority for the new experience p
        
    
    """
    1) To sample a minibatch of k elements, we first divide the range [0, total_priority] into k ranges.
    2) Then we sample a value uniformly from each range.
    3) We then search in the SumTree the experience whose priority score corresponds to the sampled values.
    4) Finally, we calculate IS weights for each element of the minibatch.
    """
    def sample(self, n):
        memory_batch = []  # This array will contain the minibatch of size n
        
        b_idx = np.empty((n, ), dtype=np.int32)
        b_ISWeights = np.empty((n, 1), dtype=np.float32)
        
        # 1) Dividing the priority range [0, max_priority] into n segments.
        priority_segment = self.tree.total_priority / n
        
        # Increments PER_b each time we sample a new batch. It can be incremented up to 1.
        self.PER_b = np.min([1.0, self.PER_b + self.PER_b_increment_per_sampling])
        
        # Calculating the maximum weight
        p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
        max_weight = (p_min * n) ** (-self.PER_b)
        
        for i in range(n):
            # 2) Sampling a value uniformly from each range.
            a, b = priority_segment * i, priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            
            # 3) Retrieving the experience that corresponds to the sampled values.
            index, priority, data = self.tree.get_leaf(value)
            # prob(j)
            sampling_probabilities = priority / self.tree.total_priority
            
            # 4) Calculating IS weights for the element, which is then added to the minibatch
            # IS = (1/N * 1/P(i))**b /max wi == (N*P(i))**-b  /max wi
            
            b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b) / max_weight
            b_idx[i] = index
            experience = [data]
            memory_batch.append(experience)
        
        return b_idx, memory_batch, b_ISWeights
    
    
    """
    Updates the priorities on the tree.
    """
    def batch_update(self, tree_idx, abs_errors):
        abs_errors += self.PER_e  # Convert to abs and avoid 0
        clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)
        
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
            
    

Dealing with the **empty memory problem**. We'll fill our Memory by taking random actions and storing the experiences.

In [12]:
memory = Memory(memory_size) # Instantiating the memory

game.new_episode()

for i in tqdm(range(pretrain_length)):
    # If it's the first step, we initialize the state with a frame and stack it into 4 frames to make our first input.
    if i == 0:
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    action = random.choice(possible_actions)  # Samples a random action
    reward = game.make_action(action)  # Executes the chosen action and gets the reward
    done = game.is_episode_finished()  # Checks if the game has reached a terminal state
    
    # If we reach a terminal state (win or the character is dead), we reset the environment to restart a new episode
    if done:
        next_state = np.zeros(state.shape)  # Setting the next_state to a zero state
        
        # Adding the current experience to the memory
        experience = state, action, reward, next_state, done
        memory.store(experience)
        
        game.new_episode()  # Starting a new episode
        state = game.get_state().screen_buffer  # Restarts the state
        state, stacked_frames = stack_frames(stacked_frames, state, True)  # Creates a new stack of frames
        
    else:
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        # Adding the current experience to the memory
        experience = state, action, reward, next_state, done
        memory.store(experience)
        
        state = next_state
        
        

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:51<00:00, 193.90it/s]


# 7) Configurando o TensorBoard

O *TensorBoard* é uma ferramenta de análise do *TensorFlow*, assistir https://www.youtube.com/watch?v=eBbEDRsCmv4 .

Para executar o *TensorBoard*, devemos utilizar o comando *tensorboard --logdir=/tensorboard/dddqn/1* no CMD.

In [13]:
writer = tf.summary.FileWriter("/tensorboard/dddqn/1")  # Setting up TensorBoard's writer

tf.summary.scalar("Loss", DQNetwork.loss)  # Recording losses

write_op = tf.summary.merge_all()

# 8) Treinando nosso agente

O algoritmo que utilizaremos para treinar nosso agente segue os seguintes passos:

- **Inicializa** os pesos para a DQN;
- **Inicializa** os para a rede que treinará nosso Q-Targets, w- <- w;
- **Inicializa** o ambiente Doom;
- **Inicializa** a taxa de decaimento usada pela seleção de ações epsilon greedy;

- **Para cada** episódio **em** max_episódios:
    - **Inicializa** um episódio;
    - **Setta** passos = 0
    - **Observa** o primeiro estado s_0
    
    - **Enquanto** passos < max_passos:
        - **Incrementa** a taxa de decaimento;
        - Com epsilon, **seleciona** um ação aleatória a_t, **caso contrário**, a_t = argmax_a Q(s_t, a), ou seja, **seleciona** a ação com maior Q atualmente;
        - **Executa** a ação a_t no simulador e **observa** a recompensa r_t+1 e o novo estado s_t+1;
        - **Armazena** a transição $;
        - **Amostra** um minibatch aleatório de D, chamado $$;
        - **Se** o episódio termina em +1, **setta** target Q_hat = r. **Caso contrário**, **setta** Q_hat = r + y.Q(s', argmax_a'Q(s', a', w), w-);
        - **Executa** o gradiente descendente com loss = (Q_hat - Q(s, a))^2;
        - A cada tau passos, **resetta** w- = w (passo da estratégia fixed Q-Targets).
        

In [14]:
"""
This function will execute the epsilon greedy action selection for our training.
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    # >>>> EPSILON GREEDY STRATEGY <<<<
    # Choosing action a from state s using epsilon greedy
    
    # First, we pick a random number
    exp_exp_tradeoff = np.random.rand()
    
    # np.exp calculates the exponential of input parameter.
    # Calculating the explore probability. Exploring is the same as picking a random action.
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if(explore_probability > exp_exp_tradeoff):
        # We take a random action
        action = random.choice(possible_actions)
        
    else:
        # We get the current best action from our Q-Network. This is exploitation, the opposite of exploration.
        
        # Estimating the Q-Values for state.
        Qs = sess.run(DQNetwork.output,
                      feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})
        
        # Getting the biggest Q-Value from our estimated Q-Values. This will yield us the best action.
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
        
    return action, explore_probability

In [15]:
# Utility function that copies one set of variables to another. This is used to update our fixed Q-Targets network's weights.
def update_target_graph():
    # Getting the variables from our DQNetwork and target network. We'll copy from DQNetwork to target network
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")
    
    op_holder = []
    
    # Updating the target network's parameters using DQNetwork's parameters
    for from_var, to_var in zip(from_vars, to_vars):
        op_holder.append(to_var.assign(from_var))
    return op_holder
    

In [16]:
saver = tf.train.Saver()  # Saves our trained model

if training == True:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())  # Initializing TensorFlow variables
        
        decay_step = 0  # Initializing decay rate for epsilon greedy
        tau = 0
        
        game.init()
        
        # Updating our target network's weights
        update_target = update_target_graph()
        sess.run(update_target)
        
        for episode in range(total_episodes):
            # >>>> SIMULATION PART <<<<
            step = 0
            episode_rewards = []
            
            game.new_episode()
            
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            
            while step <  max_steps:
                step += 1
                tau += 1
                decay_rate += 1
                
                # Selecting a random action using epsilon greedy
                action, explore_probability = predict_action(explore_start,
                                                             explore_stop,
                                                             decay_rate,
                                                             decay_step,
                                                             state,
                                                             possible_actions)
                
                reward = game.make_action(action)  # Executing the selected action and observing the reward
                done = game.is_episode_finished()  # Checking if the game has reached a terminal state
                episode_rewards.append(reward)
                
                # If the agent reached a terminal state, we set next_state to an empty state, observe the reward
                # and store the experience tuple in Memory.
                if done:
                    next_state = np.zeros((3, 240, 320), dtype=np.int)  # Setting next_state to empty
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    step = max_steps # Setting step to max steps so that the episode is ended
                    total_reward = np.sum(episode_rewards)
                    
                    print("Episode: {}".format(episode),
                          "Total reward: {}".format(total_reward),
                          "Training loss: {:.4f}".format(loss),
                          "Explore Prob.: {:.4f}".format(explore_probability))
                    
                    # Storing the experience in memory.
                    experience = state, action, reward, next_state, done
                    memory.store(experience)
                    
                else: # If the agent hasn't reached a terminal state, we get the next state and store the experience.
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    # Storing the experience in memory.
                    experience = state, action, reward, next_state, done
                    memory.store(experience)
                    
                    state = next_state
                
                # >>>> LEARNING PART <<<<
                
                # Getting a random minibatch from memory
                tree_idx, batch, ISWeights_mb = memory.sample(batch_size)
                
                states_mb = np.array([exp_tuple[0][0] for exp_tuple in batch], ndmin=3)
                actions_mb = np.array([exp_tuple[0][1] for exp_tuple in batch])
                rewards_mb = np.array([exp_tuple[0][2] for exp_tuple in batch])
                next_states_mb = np.array([exp_tuple[0][3] for exp_tuple in batch], ndmin=3)
                dones_mb = np.array([exp_tuple[0][4] for exp_tuple in batch])
                
                target_Qs_batch = []
                
                # DOUBLE DQN LOGIC
                # In Double DQN we use the DQNNetwork to select the a' action to take at the next state s',
                # the one with the highest Q-Value.
                # Then we use TargetNetwork to calculate the Q_Val of Q(s', a')
                
                
                # Getting Q-Values for next state
                q_next_state = sess.run(DQNetwork.output,
                                        feed_dict = {DQNetwork.inputs_: next_states_mb})
                
                # Calculating the Q-Target for all actions at that state
                q_target_next_state = sess.run(TargetNetwork.output,
                                               feed_dict = {TargetNetwork.inputs_: next_states_mb})
                
                # Sets QTarget = r if the state ends at +1, otherwise, sets Q_target = r + gamma * Qtarget(s',a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]
                    
                    action = np.argmax(q_next_state[i])  # Getting the best action from next state, a'
                    
                    # If the state is terminal, QTarget = r
                    if terminal:
                        target_Qs_batch.append(rewards_mb[i])
                        
                    # If the state is not terminal, we set QTarget = r + gamma * QTarget(s', a')
                    else:
                        target = rewards_mb[i] + gamma * q_target_next_state[i][action]
                        target_Qs_batch.append(target)
                        
                targets_mb = np.array([exp_tuple for exp_tuple in target_Qs_batch])
                
                _, loss, absolute_errors = sess.run([DQNetwork.optimizer, DQNetwork.loss, DQNetwork.absolute_errors],
                                                    feed_dict = {
                                                        DQNetwork.inputs_: states_mb,
                                                        DQNetwork.target_Q: targets_mb,
                                                        DQNetwork.actions_: actions_mb,
                                                        DQNetwork.ISWeights_: ISWeights_mb
                                                    })
                
                # Updating the priorities for PER experiences.
                memory.batch_update(tree_idx, absolute_errors)
                
                # Writing tf summaries to TensorBoard
                summary = sess.run(write_op,
                                   feed_dict = {
                                        DQNetwork.inputs_: states_mb,
                                        DQNetwork.target_Q: targets_mb,
                                        DQNetwork.actions_: actions_mb,
                                        DQNetwork.ISWeights_: ISWeights_mb
                                   })
                
                writer.add_summary(summary, episode)
                writer.flush()
                
                # If tau > max_tau, we update the weights of our target network
                if tau > max_tau:
                    update_target = update_target_graph()
                    sess.run(update_target)
                    tau = 0
                    print("Target network weights updated!")
            
            if episode % 5 == 0:
                save_path = saver.save(sess, "models/corridor_model.ckpt")
                print("Model saved!")

        

Episode: 0 Total reward: -91.64329528808594 Training loss: 0.4634 Explore Prob.: 1.0000
Model saved!
Episode: 1 Total reward: -108.10116577148438 Training loss: 1.0179 Explore Prob.: 1.0000
Episode: 2 Total reward: -92.45686340332031 Training loss: 1.0118 Explore Prob.: 1.0000
Episode: 3 Total reward: -112.77201843261719 Training loss: 0.6964 Explore Prob.: 1.0000
Episode: 4 Total reward: -111.99989318847656 Training loss: 19.2720 Explore Prob.: 1.0000
Episode: 5 Total reward: -109.731201171875 Training loss: 14.8977 Explore Prob.: 1.0000
Model saved!
Episode: 6 Total reward: -115.99786376953125 Training loss: 0.4588 Explore Prob.: 1.0000
Episode: 7 Total reward: -111.05291748046875 Training loss: 0.9718 Explore Prob.: 1.0000
Episode: 8 Total reward: -95.94459533691406 Training loss: 19.6915 Explore Prob.: 1.0000
Episode: 9 Total reward: -106.89639282226562 Training loss: 13.1753 Explore Prob.: 1.0000
Episode: 10 Total reward: -92.26089477539062 Training loss: 5.8538 Explore Prob.: 1.0

ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

# 9) Assistindo nosso agente jogar.

Agora que treinamos nosso agente, podemos assistí-lo jogar!

In [17]:
with tf.Session() as sess:
    game = DoomGame()
    
    game.load_config(config_filename)
    game.set_doom_scenario_path(scenario_filename)
    
    game.init()
    
    saver.restore(sess, "models/corridor_model.ckpt")
    game.init()
    
    for i in range(10):
        
        game.new_episode()
        
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
        while not game.is_episode_finished():
            # Choose an action using epsilon greedy
            
            exp_exp_tradeoff = np.random.rand() # Exploration/exploration tradeoff ratio
            
            explore_probability = 0.01
            
            if explore_probability > exp_exp_tradeoff:
                action = random.choice(possible_actions)  # Getting a random action
                
            else:
                # Getting predicted Q-Values from the network
                Qs = sess.run(DQNetwork.output,
                              feed_dict= {
                                  DQNetwork.inputs_: state.reshape((1, *state.shape))
                              })
                choice = np.argmax(Qs) # Picks the greatest Q-Value from the network
                action = possible_actions[int(choice)]
            
            game.make_action(action)
            done = game.is_episode_finished()
            
            if done:
                break
            
            else:
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state
                
        score = game.get_total_reward()
        print("Score: ", score)
        
    game.close()
                
        

INFO:tensorflow:Restoring parameters from models/corridor_model.ckpt
Score:  -115.81562805175781
Score:  -106.03758239746094
Score:  -115.81562805175781
Score:  -115.81562805175781
Score:  -115.9755859375
Score:  -115.81562805175781
Score:  -115.81562805175781
Score:  -115.98381042480469
Score:  -115.99192810058594
Score:  -115.81562805175781
