# 0) Instalando dependências

In [1]:
# Installs scikit-image, which this notebook imports as `skimage` to resize frames.
# ViZDoom itself is not installed by this cell; it can be downloaded from https://github.com/mwydmuch/ViZDoom
!pip install scikit-image



# 1) Importando bibliotecas
Para esse exemplo, teremos de importar todos os módulos ubíquos ao DRL, como:
- TensorFlow
- Numpy
- Random

E além disso, para nosso ambiente Doom, temos de importar o módulo responsável por modelar o ambiente Doom, chamado de Vizdoom.

In [2]:
# Importing required modules
import tensorflow as tf
import numpy as np
from vizdoom import * # Doom environment module

import os
import random
import time
from skimage import transform  # For frame preprocessing

from collections import deque
import matplotlib.pyplot as plt

import warnings  # Ignores all warning messages from skimage during training
warnings.filterwarnings("ignore")

# 2) Criando o ambiente Doom

In [3]:
def create_env():
    """Build and start the Doom game used throughout the notebook.

    Returns:
        tuple: ``(game, possible_actions)`` where ``game`` is the initialised
        ``DoomGame`` and ``possible_actions`` is a list of three one-hot
        button vectors in the order left, right, shoot.
    """
    doom = DoomGame()

    # The config file carries the game options (frame size, possible actions, ...).
    doom.load_config("basic.cfg")

    # Scenario file: the basic scenario is used here, others can be swapped in.
    doom.set_doom_scenario_path("basic.wad")

    doom.init()

    # One-hot rows: index 0 = left, 1 = right, 2 = shoot.
    n_buttons = 3
    action_table = [[1 if col == row else 0 for col in range(n_buttons)]
                    for row in range(n_buttons)]

    return doom, action_table


def test_env(episodes=10):
    """Smoke-test the environment by playing episodes with random actions.

    Every chosen action and its reward are printed, followed by the total
    reward at the end of each episode.

    Args:
        episodes: Number of episodes to play (defaults to 10).
    """
    # Reuse the shared setup instead of duplicating the config/scenario/init calls.
    game, possible_actions = create_env()

    try:
        for _ in range(episodes):
            game.new_episode()
            while not game.is_episode_finished():
                action = random.choice(possible_actions)
                print(action)
                reward = game.make_action(action)
                print("\tReward: ", reward)
                time.sleep(0.02)  # short pause between steps
            print("Result: ", game.get_total_reward())
            time.sleep(2)  # pause between episodes
    finally:
        # Always release the game instance, even if an action raises.
        game.close()
    

In [4]:
# Create the shared, already-initialised game and the list of one-hot actions used by the cells below.
game, possible_actions = create_env()

# 3) Pré processamento
O pré-processamento é utilizado para diminuirmos a dimensão do nosso input, que nesse caso são os frames do jogo. Nessa fase, iremos portanto pegar o frame de input, converter as cores de RGB para grayscale, visto que as cores **não adicionam informação importante** para nossa rede, e então cortaremos parte do frame para que o teto não seja representado, visto que ele também não adiciona informações necessárias à nossa rede. Por fim, normalizamos os valores dos pixels e redimensionamos o frame para 84x84. Para isso utilizaremos o *NumPy* e o módulo *skimage* importados anteriormente.

In [5]:
def preprocess_frame(frame):
    """Turn a raw frame into a normalised 84x84 grayscale image.

    The frame is averaged over its first axis, cropped, divided by 255 and
    finally resized to 84x84 with skimage.
    """
    # Vizdoom can deliver grayscale frames itself if the config file asks for
    # it; here it is done by hand by averaging over axis 0
    # (assumes the colour channels are on axis 0 -- TODO confirm against the config).
    gray = np.mean(frame, 0)

    # Drop 30 rows at the top (the roof, which carries no relevant info),
    # 10 rows at the bottom and 30 columns on each side.
    top, bottom, side = 30, -10, 30
    trimmed = gray[top:bottom, side:-side]

    # Scale pixel values by 1/255, then resize to a square 84x84 frame.
    return transform.resize(trimmed / 255.0, [84, 84])


## Agrupamento de frames

É necessário agrupar frames para **darmos à nossa rede uma noção de movimento**. Para isso, seguiremos os seguintes passos:

- Pré processamos o frame atual
- Empurramos esse frame a um *deque* que remove automaticamente o frame mais antigo
- Depois disso, construímos o estado agrupado, que consiste em um estado representado pelo grupo de frames.

O agrupamento funciona da seguinte forma:

- Para o primeiro frame de um episódio, repetimos esse mesmo frame 4 vezes para preencher o *deque*.
- A cada *timestep* adicionamos um novo frame ao *deque* e agrupamos eles para formarmos um novo frame agrupado.
- Seguimos com esses passos até o final do episódio.
- Ao final do episódio, repetimos o processo criando 4 novos frames, pois estamos em um novo episódio.

In [6]:
stack_size = 4 # Number of frames we're stacking.

# Initializing a stack of frames with empty (zero'd) 84x84 frames.
# - dtype=int replaces the np.int alias, which was removed in NumPy 1.24.
# - maxlen follows stack_size so the two can never drift apart.
stacked_frames = deque([np.zeros((84, 84), dtype=int) for i in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and fold it into the rolling stack of frames.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
        state: raw frame of the current step; it is passed to preprocess_frame.
        is_new_episode: True when this is the first frame of an episode.

    Returns:
        tuple: ``(stacked_state, stacked_frames)``. ``stacked_state`` is the
        frames stacked along axis 2, i.e. (84, 84, stack_size) for 84x84
        frames. ``stacked_frames`` is the updated deque: a new one when
        ``is_new_episode`` is True, otherwise the one passed in.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # Only one frame exists at the start of an episode, so the whole stack
        # is filled with that frame. Building the deque from stack_size copies
        # avoids both the removed np.int alias and a hard-coded count of appends.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # The deque's maxlen drops the oldest frame automatically.
        stacked_frames.append(frame)

    # axis=2 puts the frame index last: (height, width, stack_size).
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

# 4) Definindo os hiperparâmetros

Nessa parte nós definiremos os hiperparâmetros de nossa rede. Em um contexto real, os hiperparâmetros **não são definidos de uma vez logo quando construímos a rede, mas sim progressivamente durante o ciclo de desenvolvimento.**

- Primeiro definiremos os hiperparâmetros da rede neural quando implementarmos o modelo.
- Então, adicionaremos os hiperparâmetros de treinamento quando implementarmos o algoritmo de treinamento.

In [7]:
# >>>> NEURAL NETWORK HYPERPARAMETERS <<<<
# Each state is 4 stacked 84x84 frames, so the network input has shape (84, 84, 4).
state_size = [84, 84, 4]
action_size = game.get_available_buttons_size() # Number of buttons reported by the game; create_env() defines 3 actions: left, right, shoot.
learning_rate = 0.0002 # The learning rate for our network. Tuning this value may yield better results.


# >>>> TRAINING HYPERPARAMETERS <<<<
total_episodes = 500 # Total number of episodes for training.
max_steps = 100 # Maximum number of steps per episode when no terminal state is reached first.
batch_size = 64 # Number of experience tuples sampled from memory at each learning step.

# Exploration parameters for epsilon greedy strategy for action picking.
explore_start = 1.0 # Exploration probability at decay step 0, i.e. at the very start of training.
explore_stop = 0.01 # Floor that the exploration probability decays towards.
decay_rate = 0.0001 # Exponential decay rate for the exploration probability.

# Q-Learning parameters
gamma = 0.95 # Discount rate. Future rewards are multiplied by this value, so high values mean future rewards are important.


# >>>> MEMORY HYPERPARAMETERS <<<<
pretrain_length = batch_size # Number of experience tuples stored in memory when it's first initialized.
# NOTE(review): every tuple stores two stacked states, so a buffer filled to this limit would be very large -- confirm it fits in RAM.
memory_size = 1000000 # Maximum number of experience tuples the memory can keep.


# If training is set to False, the training cell is skipped and only the trained agent is shown.
training = True
# Set to True if you want to see the episode to be rendered, False otherwise.
# NOTE(review): this flag is not read anywhere in the visible notebook.
episode_render = False


# 5) Criando o modelo de nossa Rede Neural com Deep Q-Learning

Nosso modelo com Deep Q-Learning pode ser descrito da seguinte forma:

- Primeiro nós agrupamos 4 frames como input, processo já descrito anteriormente;
- Depois esse input passa por 3 camadas *convolucionais*;
- É "achatado" (flattened);
- Passa por mais duas camadas *densas*;
- E por fim, o modelo nos dá como saída os **Q-Values** correspondentes a cada tipo de ação.

In [8]:
class DQNetwork:
    """Convolutional Q-network (TensorFlow 1.x graph) for stacked game frames.

    Architecture: three conv -> batch-norm -> ELU blocks, a flatten layer, one
    512-unit ELU dense layer and a linear output layer with one unit per action.

    Args:
        state_size: Shape of a single state, e.g. [84, 84, 4].
        action_size: Number of possible actions; sizes the one-hot action
            placeholder and the output layer.
        learning_rate: Learning rate passed to the RMSProp optimizer.
        name: Variable scope under which the whole graph is created.

    Attributes read by the rest of the notebook:
        inputs_: placeholder for a batch of states, [None, *state_size].
        actions_: placeholder for one-hot actions, [None, action_size].
        target_Q: placeholder for the target Q-values, [None].
        output: predicted Q-values, one per action.
        Q: predicted Q-value of the action selected by ``actions_``.
        loss: mean squared error between ``target_Q`` and ``Q``.
        optimizer: RMSProp training op minimising ``loss``.
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # [None, *state_size] prepends the batch dimension, e.g. [None, 84, 84, 4].
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")

            # One-hot encoded actions; the width follows action_size instead of a hard-coded 3.
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions_")

            # Target fed by the training loop: r for terminal transitions,
            # otherwise r + gamma * max_a' Q(s', a').
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # NOTE(review): every batch-norm layer is built with training=True, so
            # batch statistics are also used when the network is only queried for
            # actions -- confirm this is intended.

            # The shape comments below assume an (84, 84, 4) input.
            # Block 1: 8x8 kernels, stride 4 -> (20, 20, 32).
            self.conv1, self.conv1_batchnorm, self.conv1_out = self._conv_block(
                self.inputs_, filters=32, kernel_size=[8, 8], strides=[4, 4], index=1)

            # Block 2: 4x4 kernels, stride 2 -> (9, 9, 64).
            self.conv2, self.conv2_batchnorm, self.conv2_out = self._conv_block(
                self.conv1_out, filters=64, kernel_size=[4, 4], strides=[2, 2], index=2)

            # Block 3: 4x4 kernels, stride 2 -> (3, 3, 128).
            self.conv3, self.conv3_batchnorm, self.conv3_out = self._conv_block(
                self.conv2_out, filters=128, kernel_size=[4, 4], strides=[2, 2], index=3)

            # Flatten to 3 * 3 * 128 = 1152 features.
            self.flatten = tf.layers.flatten(self.conv3_out)

            self.fc = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                      name="fc1")

            # Linear output layer: one Q-value per action.
            self.output = tf.layers.dense(inputs=self.fc,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          units=action_size,
                                          activation=None)

            # Multiplying by the one-hot action and summing over axis 1 keeps only
            # the Q-value of the action that was actually taken in each row.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)

            # Mean squared difference between the target and the predicted Q-value.
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

    def _conv_block(self, inputs, filters, kernel_size, strides, index):
        """Build one conv -> batch-norm -> ELU block and return its three tensors."""
        conv = tf.layers.conv2d(inputs=inputs,
                                filters=filters,
                                kernel_size=kernel_size,
                                strides=strides,
                                padding="VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                name="conv{}".format(index))

        batchnorm = tf.layers.batch_normalization(conv,
                                                  training=True,
                                                  epsilon=1e-5,
                                                  name="batch_norm{}".format(index))

        out = tf.nn.elu(batchnorm, name="conv{}_out".format(index))
        return conv, batchnorm, out
            
            
            

In [9]:
# Resetting the default graph, so that re-running this cell builds the network
# in a fresh graph instead of adding to the previous one.
tf.reset_default_graph()

# Instantiating the DQNetwork.
# The instance is bound to the same name as the class because the later cells
# refer to the network as `DQNetwork`. That rebinding hides the class, so when
# this cell is re-run the class is recovered from the existing instance instead
# of failing with "'DQNetwork' object is not callable".
_dqnetwork_cls = DQNetwork if isinstance(DQNetwork, type) else type(DQNetwork)
DQNetwork = _dqnetwork_cls(state_size, action_size, learning_rate)


# 6) Experience Replay

A implementação do Experience Replay consiste em um *buffer* com **tuplas de experiências pelas quais o agente passou**, esse *buffer* permitirá ao agente a seleção de *minibatches* de experiências passadas para que elas sejam usadas no treinamento, permitindo que o agente **não aprenda apenas as experiências atuais e de forma sequencial, mas também experiências passadas**.

Esse *buffer* é representado em nosso programa por um *deque* de tamanho máximo fixo, que é uma fila de dois lados que, quando cheia, **remove o seu elemento mais antigo quando um novo é adicionado.**

In [10]:
class Memory():
    """Fixed-capacity experience replay buffer backed by a deque."""

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most `max_size` experiences."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct experiences chosen uniformly at random.

        Raises:
            ValueError: propagated from np.random.choice when `batch_size`
                exceeds the number of stored experiences.
        """
        positions = np.arange(len(self.buffer))
        picked = np.random.choice(positions, size=batch_size, replace=False)

        batch = []
        for position in picked:
            batch.append(self.buffer[position])
        return batch
    
        

In [11]:
# Replay buffer that the training loop samples from.
memory = Memory(max_size=memory_size)

# Begin a fresh episode so that a first state is available.
game.new_episode()

# Warm-up: play `pretrain_length` random steps and store every transition as
# an experience tuple (state, action, reward, next_state, done).
for i in range(pretrain_length):

    if i == 0:
        # Very first step: build the initial stacked state from the first frame.
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    # Random action, its reward, and whether it ended the episode.
    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if not done:
        # Regular step: push the new frame onto the stack and move on.
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        memory.add((state, action, reward, next_state, done))

        state = next_state
        continue

    # Terminal step: there is no next frame, so an all-zero state stands in for it.
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state, done))

    # Restart and rebuild the stacked state from the new episode's first frame.
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True)
        
        

# 7) Configurando o TensorBoard

O *TensorBoard* é uma ferramenta de visualização e análise do *TensorFlow*; para uma introdução, assista a https://www.youtube.com/watch?v=eBbEDRsCmv4 .

Para executar o *TensorBoard*, devemos utilizar o comando *tensorboard --logdir=/tensorboard/dqn/1* no CMD.

In [12]:
# Setting up the TensorBoard writer; event files go to /tensorboard/dqn/1.
# NOTE(review): this is an absolute path at the filesystem root -- confirm the process is allowed to write there.
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

# Registering the network's loss as a scalar summary named "Loss".
tf.summary.scalar("Loss", DQNetwork.loss)

# Single op that evaluates all the summaries registered so far.
write_op = tf.summary.merge_all()

# 8) Treinando nosso agente

O algoritmo de treinamento que utilizaremos segue os seguintes passos:

- **Inicializa** os *pesos*;
- **Inicializa** o *ambiente*;
- **Inicializa** o *passo de decaimento* (*decay step*) usado para reduzir *epsilon* (epsilon é usado pela seleção de ação epsilon greedy);


- **Para cada** *episódio* **em** *max_episódios*:
    - **Cria** um novo episódio;
    - **Setta** *step* para 0;
    - **Observa** o primeiro estado S0;
    
    - **Enquanto** *step* < *max_step*:
        - **Incrementa** o *passo de decaimento* (*decay step*);
        - Com *prob* = *epsilon*, **seleciona** uma ação aleatória $a$; caso contrário, $a = \arg\max_a Q(s, a)$;
        - **Simula** com a ação $a$ e **observa** a recompensa $r_{t+1}$ e o novo estado $s_{t+1}$;
        - **Armazena** a transição $\langle s_t, a_t, r_{t+1}, s_{t+1} \rangle$ na memória $D$;
        - **Amostra** um *mini-batch* aleatório de $D$: $\langle s, a, r, s' \rangle$;
        - **Setta** $\hat{Q} = r$ **se** o episódio termina em $s_{t+1}$; caso contrário, **setta** $\hat{Q} = r + \gamma \max_{a'} Q(s', a')$;
        - **Executa** um *step* do gradiente descendente com *perda* $(\hat{Q} - Q(s, a))^2$.
        

In [13]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Pick an action with an epsilon-greedy policy whose epsilon decays exponentially.

    Args:
        explore_start: Exploration probability at decay step 0.
        explore_stop: Value the exploration probability decays towards.
        decay_rate: Exponential decay rate of the exploration probability.
        decay_step: Number of decay steps taken so far.
        state: Current stacked state; only used by the greedy branch.
        actions: List of possible actions to choose from.

    Returns:
        tuple: ``(action, explore_probability)``.

    Note:
        The greedy branch reads the module-level ``sess`` and ``DQNetwork``,
        so it must be called while the TensorFlow session is open.
    """
    # Random number compared against the exploration probability.
    exp_exp_tradeoff = np.random.rand()

    # Exponentially decayed exploration probability.
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Exploration: take a random action.
        action = random.choice(actions)
    else:
        # Exploitation: estimate the Q-values of `state` (with a batch
        # dimension of 1 prepended) and take the action with the largest one.
        Qs = sess.run(DQNetwork.output,
                      feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)

        # Index the `actions` argument rather than the global `possible_actions`,
        # so the function honours the list it was actually given.
        action = actions[int(choice)]

    return action, explore_probability

In [None]:
# As the name implies, saver is responsible for saving our model.
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # Initializing our Variables
        sess.run(tf.global_variables_initializer())

        # Step counter that drives the epsilon decay. It is incremented on every
        # step and never reset between episodes.
        decay_step = 0

        # Defined up front so the end-of-episode report cannot hit an unbound
        # name when an episode ends before the first learning step has run.
        loss = float("nan")

        # The checkpoint directory has to exist before saver.save is called.
        os.makedirs("models", exist_ok=True)

        # Initializing the game
        game.init()

        for episode in range(total_episodes):
            # >>>> SIMULATION PART <<<<
            ### In this part of the algorithm, we're only simulating the environment and recording rewards and memory states.

            # Reinitializing the episode's variables.
            step = 0
            episode_rewards = []
            game.new_episode() # Starting a new episode

            state = game.get_state().screen_buffer  # Getting the first frame
            state, stacked_frames = stack_frames(stacked_frames, state, True) # Getting a stack of frames to use as input

            while step < max_steps:
                step += 1
                decay_step += 1

                # Choosing an action using epsilon greedy strategy.
                action, explore_probability = predict_action(explore_start,
                                                             explore_stop,
                                                             decay_rate,
                                                             decay_step,
                                                             state,
                                                             possible_actions)

                reward = game.make_action(action)  # Executing the chosen action.
                done = game.is_episode_finished()
                episode_rewards.append(reward)

                # If reached terminal state
                if done:
                    # There is no frame after a terminal step, so a blank raw frame is
                    # pushed through the usual preprocessing and stacking instead.
                    # dtype=int replaces the np.int alias removed in NumPy 1.24.
                    next_state = np.zeros((3, 84, 84), dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Forces the while loop to end after this iteration.
                    step = max_steps
                    total_reward = np.sum(episode_rewards)

                    # `loss` is the value from the most recent learning step.
                    print("Episode: {}".format(episode),
                            "Total reward: {}".format(total_reward),
                            "Training loss: {:.4f}".format(loss),
                            "Explore prob: {:.4f}".format(explore_probability))

                    memory.add((state, action, reward, next_state, done))

                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                # >>>> LEARNING PART <<<<

                # Getting minibatches of experience tuples [(state, action, reward, next_state, done)] from the memory.
                memory_batch = memory.sample(batch_size)
                states_mb = np.array([xp_tuple[0] for xp_tuple in memory_batch], ndmin=3)
                actions_mb = np.array([xp_tuple[1] for xp_tuple in memory_batch])
                rewards_mb = np.array([xp_tuple[2] for xp_tuple in memory_batch])
                next_states_mb = np.array([xp_tuple[3] for xp_tuple in memory_batch], ndmin=3)
                dones_mb = np.array([xp_tuple[4] for xp_tuple in memory_batch])

                # Getting Q-Values for the next state.
                Qs_next_state = sess.run(DQNetwork.output,
                                         feed_dict = {DQNetwork.inputs_: next_states_mb})

                # Q_target = r for terminal transitions, otherwise r + gamma * max_a' Q(s', a').
                max_next_Qs = np.max(Qs_next_state, axis=1)
                targets_mb = np.where(dones_mb, rewards_mb, rewards_mb + gamma * max_next_Qs)

                # One run performs the gradient step and also evaluates the summaries,
                # so the logged loss is the same value that is printed and no second
                # forward pass is needed.
                loss, _, summary = sess.run([DQNetwork.loss, DQNetwork.optimizer, write_op],
                                            feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})

                # Writing TensorFlow summaries.
                # NOTE(review): the episode index is used as the global step, so all
                # the steps of one episode share a single x value in TensorBoard.
                writer.add_summary(summary, episode)
                writer.flush()

            # Checkpoint every 5 episodes; the evaluation cell restores this file.
            if episode % 5 == 0:
                save_path = saver.save(sess, "models/model.ckpt")
                print("Saved model!")
                      

Episode: 0 Total reward: 93.0 Training loss: 162.3175 Explore prob: 0.9992
Saved model!
Episode: 1 Total reward: 94.0 Training loss: 281.6517 Explore prob: 0.9985
Episode: 2 Total reward: 94.0 Training loss: 299.9031 Explore prob: 0.9978
Episode: 3 Total reward: 95.0 Training loss: 115.8068 Explore prob: 0.9972
Episode: 5 Total reward: 90.0 Training loss: 85.9537 Explore prob: 0.9863
Saved model!
Episode: 6 Total reward: -18.0 Training loss: 13.0123 Explore prob: 0.9772
Episode: 7 Total reward: 75.0 Training loss: 16.7627 Explore prob: 0.9752
Episode: 8 Total reward: 94.0 Training loss: 28.2834 Explore prob: 0.9745
Saved model!
Episode: 13 Total reward: 95.0 Training loss: 5.0644 Explore prob: 0.9361
Episode: 15 Total reward: 94.0 Training loss: 5.0752 Explore prob: 0.9263
Saved model!
Saved model!
Episode: 22 Total reward: 95.0 Training loss: 6.6279 Explore prob: 0.8724
Episode: 24 Total reward: 93.0 Training loss: 116.1769 Explore prob: 0.8631
Episode: 25 Total reward: 91.0 Training 

Episode: 146 Total reward: 20.0 Training loss: 4.2128 Explore prob: 0.5383
Episode: 147 Total reward: 95.0 Training loss: 6.3267 Explore prob: 0.5380
Episode: 148 Total reward: 94.0 Training loss: 7.3650 Explore prob: 0.5376
Episode: 149 Total reward: 95.0 Training loss: 11.3750 Explore prob: 0.5373
Episode: 150 Total reward: 94.0 Training loss: 4.0003 Explore prob: 0.5369
Saved model!
Episode: 151 Total reward: 95.0 Training loss: 11.4416 Explore prob: 0.5366
Episode: 152 Total reward: 95.0 Training loss: 7.9479 Explore prob: 0.5363
Episode: 154 Total reward: 73.0 Training loss: 6.5760 Explore prob: 0.5298
Episode: 155 Total reward: 94.0 Training loss: 7.1555 Explore prob: 0.5295
Saved model!
Episode: 157 Total reward: 95.0 Training loss: 8.7573 Explore prob: 0.5240
Episode: 158 Total reward: 95.0 Training loss: 6.1254 Explore prob: 0.5237
Episode: 159 Total reward: 76.0 Training loss: 9.6282 Explore prob: 0.5227
Episode: 160 Total reward: 8.0 Training loss: 17.2523 Explore prob: 0.51

# 9) Visualizando o nosso agente.

Agora que treinamos nosso agente, nós podemos testá-lo!

In [18]:
saver = tf.train.Saver()

# Number of evaluation episodes. It is also the divisor of the mean score
# printed at the end, so the two can no longer disagree.
test_episodes = 100

with tf.Session() as sess:
    # create_env() already initialises the game, so no extra game.init() is needed.
    game, possible_actions = create_env()
    total_score = 0

    try:
        saver.restore(sess, "models/model.ckpt") # Loading our saved trained model.

        for i in range(test_episodes):
            first_frame = True

            game.new_episode()
            while not game.is_episode_finished():
                # The first frame of an episode resets the stack; later frames are appended.
                frame = game.get_state().screen_buffer
                state, stacked_frames = stack_frames(stacked_frames, frame, first_frame)
                first_frame = False

                # Taking the biggest Q-Value from our network, which maps to the best action.
                Qs = sess.run(DQNetwork.output,
                              feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})
                action = possible_actions[int(np.argmax(Qs))]

                game.make_action(action) # Executing the action picked.

            # Read the total reward once, after the episode has finished.
            score = game.get_total_reward()
            print("Score: ", score)
            total_score += score

        # Mean score over all the evaluation episodes.
        print("TOTAL SCORE: ", total_score/test_episodes)
    finally:
        # Always release the game instance, even if restoring or playing fails.
        game.close()
    
    

INFO:tensorflow:Restoring parameters from models/model.ckpt
Score:  72.0
Score:  95.0
Score:  95.0
Score:  68.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  91.0
Score:  95.0
Score:  93.0
Score:  95.0
Score:  95.0
Score:  38.0
Score:  95.0
Score:  76.0
Score:  95.0
Score:  -385.0
Score:  -375.0
Score:  95.0
Score:  95.0
Score:  64.0
Score:  95.0
Score:  95.0
Score:  -375.0
Score:  95.0
Score:  66.0
Score:  95.0
Score:  -380.0
Score:  95.0
Score:  95.0
Score:  -390.0
Score:  95.0
Score:  73.0
Score:  95.0
Score:  93.0
Score:  95.0
Score:  95.0
Score:  -9.0
Score:  72.0
Score:  50.0
Score:  95.0
Score:  -390.0
Score:  95.0
Score:  -385.0
Score:  95.0
Score:  -410.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  23.0
Score:  95.0
Score:  -380.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  -365.0
Score:  95.0
Score:  -365.0
Score:  95.0
Score:  -5.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  95.0
Score:  -123.0
Score:  22.0
Score:  95.0
Score:  95.0
Score: