# 0) Installing dependencies.

In [1]:
# Vizdoom installation guide and download link can be found at: https://github.com/mwydmuch/ViZDoom
!pip install tensorflow-gpu
!pip install numpy
!pip install scikit-image
!pip install matplotlib



# 1) Importing the libraries

In [2]:
import os
import random
import time
import warnings
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from skimage import transform
from vizdoom import *

# Ignores messages that are printed during training by skimage.
warnings.filterwarnings('ignore')

# 2) Creating our Doom environment

### Our scenario

This time, the scenario we'll use in our environment consists of a stage with multiple MedKits spread around it and the agent constantly losing health. Through the rewards, the agent only knows that life is good and death is bad, so we need to **train our agent to pick the MedKits around the stage so he can survive**. MedKits also spawn periodically and are uniformly distributed around the stage.

### Implementation

To create our Doom environment we're using the [VizDoom library](https://github.com/mwydmuch/ViZDoom), which in our case will be configured as follows.
- Our VizDoom environment takes:
    - A *configuration file* that handles all the options, like size of the frame, possible actions etc.
    - A *scenario file* that generates the scenario we picked.
- We have 3 possible **actions**, namely:
    - Move left, encoded as [1, 0, 0]
    - Move right, encoded as [0, 0, 1]
    - Move forward, encoded as [0, 1, 0]
- The **reward** for living is **1**.
- The **reward** for dying is **-100**.



In [3]:
def create_env():
    """Create and start our Doom environment with VizDoom.

    Returns:
        A ``(game, possible_actions)`` tuple: the initialised ``DoomGame`` and
        the list of one-hot action vectors.
    """
    doom = DoomGame()
    doom.load_config('health_gathering.cfg')             # Configuration file
    doom.set_doom_scenario_path('health_gathering.wad')  # Scenario file
    doom.init()

    # One one-hot row per action:
    #   move_left:    [1, 0, 0]
    #   move_forward: [0, 1, 0]
    #   move_right:   [0, 0, 1]
    one_hot_actions = np.eye(3, dtype=int).tolist()

    return doom, one_hot_actions
    

In [4]:
# Start the game once; 'game' and 'possible_actions' are used by the cells below.
game, possible_actions = create_env()

# 3) Defining the preprocess function

In order to **reduce the size of our states and the complexity of processing them, which consequently reduces the time required for training**, we'll preprocess our frames with a function that follows these steps:

- **Grayscale** each frame, since **color doesn't add important information** and the frames we get from VizDoom come with RGB color channels.
- **Crop the screen** to remove the roof, since it also doesn't add important information.
- **Normalize** pixel values. We should always use normalized data to train our networks.
- **Resize** the preprocessed frames to a squared size (nxn).

In [5]:
def preprocess_frame(frame):
    """Grayscale, crop, normalize and resize the input frame.

    Args:
        frame: Raw frame; it is grayscaled by averaging over axis 0.

    Returns:
        The processed frame, resized to (84, 84).
    """
    # Average over axis 0, drop the first 80 rows (the roof) and scale by 1/255.
    scaled = np.mean(frame, axis=0)[80:, :] / 255.0

    # Resize to the square size fed to the network.
    return transform.resize(scaled, [84, 84])

### Stacking frames

We need to stack frames in order to **give our network a sense of motion**. To do so, we'll take the following steps:

- Preprocess the current frame.
- Push the preprocessed frame to a *deque*, which automatically removes the oldest frame.
- Then, we build our stacked state, which consists of a state represented by a stack of frames.

The stacking works as follows:

- For the first frame/state, we'll repeat it n times, n being the maximum size of our stack.
- For each timestep of our training, we'll add a new frame to the deque, so we can stack them in order to get our stacked state.
- We follow these steps until the end of the episode.
- For each episode, we repeat the process.

In [6]:
stack_size = 4  # The number of frames we'll stack together in our stacked state

# Initializing our stacked state with 'stack_size' blank (84, 84) frames.
# The builtin int is used as dtype: the np.int alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
stacked_frames = deque([np.zeros((84, 84), dtype=int) for _ in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and merge it into the rolling stack of frames.

    Args:
        stacked_frames: Deque holding the most recent preprocessed frames.
        state: Raw frame to preprocess and add to the stack.
        is_new_episode: When true, a new deque is built from 'stack_size'
            repetitions of this frame instead of appending to the given one.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple: the frames stacked along
        axis 2, and the deque they were taken from.
    """
    processed = preprocess_frame(state)

    if not is_new_episode:
        # The bounded deque drops its oldest frame as the new one is appended.
        stacked_frames.append(processed)
    else:
        # A new episode starts with the same frame repeated 'stack_size' times.
        stacked_frames = deque([processed] * stack_size, maxlen=stack_size)

    return np.stack(stacked_frames, axis=2), stacked_frames


### Discounting and normalizing rewards

Since we're in a **Monte Carlo situation**, we need to **discount the rewards at the end of the episode**. To do so, we'll implement a function that:

1. Takes a list with all the rewards for an episode.
2. Applies our previously defined gamma to discount them.
3. Normalizes the result.


In [7]:
def discount_and_normalize_rewards(episode_rewards, discount_rate=None):
    """Compute the discounted returns of one episode and standardize them.

    Args:
        episode_rewards: Sequence with the reward received at each step of the
            episode.
        discount_rate: Discount factor applied per step. When omitted, the
            notebook-level ``gamma`` hyperparameter is used.

    Returns:
        A float64 NumPy array with one entry per step: the discounted returns
        with their mean subtracted and, when their standard deviation is not
        zero, divided by it. An empty input gives an empty array.
    """
    if discount_rate is None:
        discount_rate = gamma  # Defined in the hyperparameters cell

    # 1) Always work in floating point: np.zeros_like() on integer rewards would
    #    give an integer array and silently truncate the discounted values.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)
    if rewards.size == 0:
        return discounted_episode_rewards

    # 2) Calculating the discounted rewards, walking backwards from the last step
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount_rate + rewards[i]
        discounted_episode_rewards[i] = cumulative

    # 3) Normalizing: subtract the mean and divide by the standard deviation.
    #    A one-step (or constant) episode has std == 0; dividing would yield NaN
    #    and poison the loss, so in that case only the mean is subtracted.
    discounted_episode_rewards -= np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    if std > 0:
        discounted_episode_rewards /= std

    return discounted_episode_rewards


# 4) Setting up our hyperparameters

In this guide, we'll set all our hyperparameters at once, that is, the model and training hyperparameters. But in a regular reinforcement learning workflow, we would do it progressively:

- First, we would begin by defining the NN hyperparameters when we implement the model.
- Then, we'd add the training parameters when implementing the training algorithm

In [8]:
### >>>> ENVIRONMENT PARAMETERS <<<<
# Shape of a single network input (without the batch dimension)
state_size = [84, 84, stack_size]  # We'll have a stack of 'stack_size' frames with (84, 84) size
action_size = game.get_available_buttons_size()  # The default #actions for this scenario is 3

# >>>> TRAINING PARAMETERS <<<<
# Learning rate handed to the network's RMSProp optimizer
learning_rate = 0.002
# Number of training iterations; each one gathers one batch and runs one update
num_epochs = 500

# Batch collection stops at the end of the first episode after which more than
# 'batch_size' timesteps have been gathered
batch_size = 5000  # The number of timesteps (1000 if cpu, 5000 if gpu)
gamma = 0.95  # Reward discount rate

# Set this to False to only watch the agent play, without training it
training = True


# 5) Creating our Policy Gradient Neural Network model

The model we'll implement has the following configuration:

- Input layer which receives a stack of 'state_size' preprocessed stacked frames
- Three Convolutional layers with ELU activation and batch normalization
- Flatten layer
- Dense layer with 512 units and ELU activation
- Dense layer with 'action_size' units and no activation (logits).
- Output: Softmax activation over logits, which returns a probability distribution over all possible actions

In [9]:
class PGNetwork:
    """Convolutional policy network trained with a policy-gradient loss.

    The whole graph is built in ``__init__``: three convolution ->
    batch-normalization -> ELU blocks, a flatten step, a 512-unit dense layer
    and a final dense layer that outputs one logit per action.

    Attributes:
        inputs_: Float placeholder for a batch of states, ``[None, *state_size]``.
        actions: Int placeholder for the one-hot actions taken,
            ``[None, action_size]``.
        discounted_episode_rewards: Float placeholder with one discounted
            reward per timestep.
        mean_reward_: Float placeholder used to write the mean reward to
            TensorBoard.
        action_distribution: Softmax over the logits.
        loss: Mean of the negative log-probabilities weighted by the
            discounted rewards.
        train_opt: RMSProp operation that minimizes ``loss``.
    """

    def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'):
        """Build the network graph inside the variable scope ``name``.

        Args:
            state_size: Shape of a single state, without the batch dimension.
            action_size: Number of actions, i.e. number of output logits.
            learning_rate: Learning rate given to the RMSProp optimizer.
            name: Variable scope that wraps the network.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Only the placeholders belong to the 'inputs' name scope. Every layer
            # gets its own sibling scope so the TensorBoard graph is grouped per
            # layer instead of having everything nested under 'inputs'.
            with tf.name_scope('inputs'):
                # Our network receives a stack of preprocessed frames.
                self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name='inputs_')
                self.actions = tf.placeholder(tf.int32, [None, action_size], name='actions')
                self.discounted_episode_rewards = tf.placeholder(tf.float32, [None, ], name='discounted_episode_rewards_')

                # This placeholder is used to write the mean reward values to TensorBoard
                self.mean_reward_ = tf.placeholder(tf.float32, name='mean_reward')

            # NOTE(review): training=True is hard-coded on every batch-norm layer
            # below, so they presumably also normalize with batch statistics when
            # actions are picked one state at a time - confirm this is intended.
            with tf.name_scope('conv1'):
                # First convolutional layer: CNN -> Batch Normalization -> ELU
                self.conv1 = tf.layers.conv2d(inputs=self.inputs_,
                                              filters=32,
                                              kernel_size=[8, 8],
                                              strides=[4, 4],
                                              padding='VALID',
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                              name='conv1')

                self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                                     training=True,
                                                                     epsilon=1e-5,
                                                                     name='batch_norm1')

                self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name='conv1_out')

            with tf.name_scope('conv2'):
                # Second convolutional layer: CNN -> Batch Normalization -> ELU
                self.conv2 = tf.layers.conv2d(inputs=self.conv1_out,
                                              filters=64,
                                              kernel_size=[4, 4],
                                              strides=[2, 2],
                                              padding='VALID',
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                              name='conv2')

                self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                                     training=True,
                                                                     epsilon=1e-5,
                                                                     name='batch_norm2')

                self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name='conv2_out')

            with tf.name_scope('conv3'):
                # Third convolutional layer: CNN -> Batch Normalization -> ELU
                self.conv3 = tf.layers.conv2d(inputs=self.conv2_out,
                                              filters=128,
                                              kernel_size=[4, 4],
                                              strides=[2, 2],
                                              padding='VALID',
                                              kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                              name='conv3')

                self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                                     training=True,
                                                                     epsilon=1e-5,
                                                                     name='batch_norm3')

                self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name='conv3_out')

            with tf.name_scope('flatten'):
                self.flatten = tf.layers.flatten(self.conv3_out)

            with tf.name_scope('fc1'):
                self.fc = tf.layers.dense(inputs=self.flatten,
                                          units=512,
                                          activation=tf.nn.elu,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          name='fc1')

            with tf.name_scope('logits'):
                # No activation here: the softmax is applied separately below.
                self.logits = tf.layers.dense(inputs=self.fc,
                                              kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                              units=self.action_size,
                                              activation=None)

            with tf.name_scope('softmax'):
                self.action_distribution = tf.nn.softmax(self.logits)

            with tf.name_scope('loss'):
                # With one-hot action labels, softmax cross-entropy is the negative
                # log-probability of the action that was taken. (The sparse variant,
                # tf.nn.sparse_softmax_cross_entropy_with_logits, would take integer
                # action indices instead of one-hot vectors.)
                self.neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.actions)
                # Weighting by the discounted rewards gives the policy-gradient loss.
                self.loss = tf.reduce_mean(self.neg_log_prob * self.discounted_episode_rewards)

            with tf.name_scope('train'):
                self.train_opt = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)
                    
                

In [10]:
# Resetting the default graph
tf.reset_default_graph()

# Instantiating our Policy Gradient Neural Network.
# The instance is stored under the same name as the class because the cells below
# refer to it as 'PGNetwork'. When this cell has already been run, the name holds
# the instance, so the class is recovered from it; otherwise re-running the cell
# would fail with "'PGNetwork' object is not callable".
pg_network_class = PGNetwork if isinstance(PGNetwork, type) else type(PGNetwork)
PGNetwork = pg_network_class(state_size, action_size, learning_rate)

# Initializing our TensorFlow session and the variables of the new graph
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# 6) Setting up TensorBoard

Before running this cell, make sure to run *tensorboard --logdir=/tensorboard/pg/1* on the command prompt.

In [11]:
# Setting up TensorBoard writer; the directory matches the --logdir given to the
# 'tensorboard' command mentioned above
writer = tf.summary.FileWriter('/tensorboard/pg/1')

# Subscribes the loss to TensorBoard
tf.summary.scalar('Loss', PGNetwork.loss)

# Subscribes the reward mean to TensorBoard
tf.summary.scalar('Reward_mean', PGNetwork.mean_reward_)

# Single op that evaluates every summary registered on the default graph
write_op = tf.summary.merge_all()

# 7) Training our agent.

Now we'll create batches. These batches contain **episodes**, and the number of episodes in each batch depends on the number of rewards we collect. For example, if our **episodes** have **only 10 rewards** each, a batch will hold about **batch_size / 10** episodes. So our function will do the following:

- Make a batch:
    - For each **step**:
        - **Choose** *action* *a*
        - **Perform** *action* *a*
        - **Store** *s*, *a*, *r*
        - **If** *done*:
            - **Calculate** the sum of all rewards
            - **Calculate** the discounted return $G_t$ using *gamma*

In [12]:
def make_batch(batch_size, stacked_frames):
    """Play whole episodes with the current policy until enough timesteps are gathered.

    Relies on the notebook-level ``game``, ``sess``, ``PGNetwork``,
    ``possible_actions`` and ``state_size``.

    Args:
        batch_size: Timestep threshold. Collection stops at the end of the first
            episode after which more than ``batch_size`` timesteps have been
            gathered, so a batch only contains complete episodes.
        stacked_frames: Frame deque handed to ``stack_frames``.

    Returns:
        A tuple ``(states, actions, rewards, discounted_rewards, episode_num)``:
        the stacked states and the one-hot actions of every timestep, the raw
        rewards and the per-episode discounted and normalized rewards (both
        flattened over all episodes), and the number of episodes played.
    """
    states, actions, rewards_of_episode, rewards_of_batch, discounted_rewards = [], [], [], [], []

    # Running count of the timesteps stored in finished episodes; it replaces
    # re-concatenating the whole batch after every episode just to measure it.
    timesteps_collected = 0
    episode_num = 1  # Keeps track of the number of episodes in our batch

    game.new_episode()
    state, stacked_frames = stack_frames(stacked_frames, game.get_state().screen_buffer, True)

    while True:
        # Runs the neural network with our current stack of frames to get the action probability distribution
        action_probability_distribution = sess.run(PGNetwork.action_distribution,
                                                   feed_dict={PGNetwork.inputs_: state.reshape(1, *state_size)})

        # Choosing an action using the probability distribution from our network
        action_index = np.random.choice(range(action_probability_distribution.shape[1]),
                                        p=action_probability_distribution.ravel())
        action = possible_actions[action_index]

        # Performs the chosen action
        reward = game.make_action(action)

        # Storing the results
        states.append(state)
        actions.append(action)
        rewards_of_episode.append(reward)

        if game.is_episode_finished():
            # No terminal "next state" is built here: it was never stored or fed
            # to the network, and the frame stack is rebuilt for the next episode.
            rewards_of_batch.append(rewards_of_episode)
            discounted_rewards.append(discount_and_normalize_rewards(rewards_of_episode))
            timesteps_collected += len(rewards_of_episode)

            # This condition is only checked when an episode is finished.
            if timesteps_collected > batch_size:
                break

            # Starting a new episode with fresh rewards and a fresh stack of frames
            rewards_of_episode = []
            episode_num += 1
            game.new_episode()
            state, stacked_frames = stack_frames(stacked_frames, game.get_state().screen_buffer, True)
        else:
            # The episode goes on: the newly observed frame joins the stack
            state, stacked_frames = stack_frames(stacked_frames, game.get_state().screen_buffer, False)

    return np.array(states), np.array(actions), np.concatenate(rewards_of_batch), np.concatenate(discounted_rewards), episode_num
    

### Initializing the Neural Network

- Initialize the weights
- Initialize the Doom environment
- Set max_reward = 0 to keep track of the maximum reward
- For each epoch in range(num_epochs):
    - Get batches
    - Optimize the network

In [13]:
all_rewards = []  # Keeps track of the total reward for each batch

total_rewards = 0
maximum_reward_recorded = 0
mean_reward_total = []  # Keeps track of the mean reward per episode for each batch
epoch = 1
average_reward = []

saver = tf.train.Saver()

if training:
    #saver.restore(sess, './models/model.ckpt')  # Loads a previously trained model

    # Saver.save() fails when the parent directory of the checkpoint does not exist
    os.makedirs('models', exist_ok=True)

    for epoch in range(1, num_epochs + 1):
        # Gathers training data from episodes ran in minibatches. mb = minibatch
        states_mb, actions_mb, rewards_of_batch, discounted_rewards_mb, num_episodes_mb = make_batch(batch_size, stacked_frames)

        # Calculating values for analytics
        total_reward_current_batch = np.sum(rewards_of_batch)
        all_rewards.append(total_reward_current_batch)

        mean_reward_current_batch = np.divide(total_reward_current_batch, num_episodes_mb)
        mean_reward_total.append(mean_reward_current_batch)

        # Average of the per-batch mean rewards over the entire training so far
        average_reward_all_training = np.mean(mean_reward_total)

        maximum_reward_recorded = np.amax(all_rewards)

        print('====================================')
        print('Epoch: ', epoch, '/', num_epochs)
        print('----------')
        print('Number of training episodes: {}'.format(num_episodes_mb))
        print('Total reward: {}'.format(total_reward_current_batch))
        print('Mean reward of current batch: {}'.format(mean_reward_current_batch))
        print('Average reward of all training: {}'.format(average_reward_all_training))
        print('Max reward of all batches so far: {}'.format(maximum_reward_recorded))

        # Network optimization through feedforward, gradient and backpropagation.
        # The TensorBoard summaries are fetched in the same run, which avoids a
        # second forward pass over the whole batch and makes the logged loss the
        # same value as the one printed below.
        feed_dict = {
            PGNetwork.inputs_: states_mb.reshape((len(states_mb), *state_size)),
            PGNetwork.actions: actions_mb,
            PGNetwork.discounted_episode_rewards: discounted_rewards_mb,
            PGNetwork.mean_reward_: mean_reward_current_batch}
        loss_, _, summary = sess.run([PGNetwork.loss, PGNetwork.train_opt, write_op],
                                     feed_dict=feed_dict)

        print('Training Loss: {}'.format(loss_))

        # Writing summaries to TensorBoard; the epoch is passed as the step so
        # each point is placed at its own position on the x axis
        writer.add_summary(summary, epoch)
        writer.flush()

        # Saving the trained model
        if epoch % 10 == 0:
            saver.save(sess, 'models/model.ckpt')
            print('Model saved!')
          

Epoch:  1 / 500
----------
Number of training episodes: 12
Total reward: 3984.0
Mean reward of current batch: 332.0
Average reward of all training: 332.0
Max reward of all batches so far: 3984.0
Training Loss: -0.01314134057611227
Epoch:  2 / 500
----------
Number of training episodes: 11
Total reward: 4052.0
Mean reward of current batch: 368.3636363636364
Average reward of all training: 350.1818181818182
Max reward of all batches so far: 4052.0
Training Loss: 0.007343720179051161
Epoch:  3 / 500
----------
Number of training episodes: 11
Total reward: 4052.0
Mean reward of current batch: 368.3636363636364
Average reward of all training: 356.24242424242425
Max reward of all batches so far: 4052.0
Training Loss: -0.0011698236921802163
Epoch:  4 / 500
----------
Number of training episodes: 11
Total reward: 4180.0
Mean reward of current batch: 380.0
Average reward of all training: 362.1818181818182
Max reward of all batches so far: 4180.0
Training Loss: 0.012000523507595062
Epoch:  5 / 5

ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

# 8) Watching our Agent play

Now that we've trained our agent and saved the model, we can watch it play using what it learned.

In [17]:
num_matches = 10

# This cell uses its own names for the session, the game and the frame stack. The
# notebook-level 'sess' and 'game' are used by the training cells; rebinding them
# here would leave those cells with a closed session and a closed game.
with tf.Session() as play_sess:
    # Loading the model
    saver.restore(play_sess, 'models/model.ckpt')

    # Same environment setup as for training
    play_game, play_actions = create_env()

    try:
        for i in range(num_matches):
            play_game.new_episode()
            state, play_frames = stack_frames(stacked_frames, play_game.get_state().screen_buffer, True)

            while not play_game.is_episode_finished():
                action_probability_distribution = play_sess.run(PGNetwork.action_distribution,
                                                                feed_dict={PGNetwork.inputs_: state.reshape(1, *state_size)})

                # Choosing an action from the network's probability distribution over actions
                action_index = np.random.choice(range(action_probability_distribution.shape[1]),
                                                p=action_probability_distribution.ravel())

                # Performing the chosen action
                play_game.make_action(play_actions[action_index])

                # There is no state to read once the episode has finished
                if not play_game.is_episode_finished():
                    state, play_frames = stack_frames(play_frames, play_game.get_state().screen_buffer, False)

            print('Score for episode #', i, ': ', play_game.get_total_reward())
    finally:
        # Closing the game even when an episode raises
        play_game.close()
            

INFO:tensorflow:Restoring parameters from models/model.ckpt
Score for episode # 0 :  284.0
Score for episode # 1 :  476.0
Score for episode # 2 :  284.0
Score for episode # 3 :  348.0
Score for episode # 4 :  444.0
Score for episode # 5 :  476.0
Score for episode # 6 :  380.0
Score for episode # 7 :  348.0
Score for episode # 8 :  348.0
Score for episode # 9 :  380.0
