In [1]:
import tensorflow as tf
import numpy as np

import random
import random
import time

from skimage import transform
from vizdoom import *

from collections import deque
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore')

In [2]:
from IPython.display import HTML
# Render an embedded YouTube player (iframe) as this cell's rich output.
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/gCJyVX98KJ4?showinfo=0" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>')

In [3]:
def create_environment():
    """Create and start a ViZDoom game for the basic scenario.

    The game is configured from "basic.cfg" and "basic.wad" (paths relative
    to the current working directory) and initialised before returning.

    Returns:
        tuple: ``(game, possible_actions)`` where ``game`` is the initialised
        ``DoomGame`` and ``possible_actions`` is a list of three one-hot
        button vectors in the order left, right, shoot.
    """
    game = DoomGame()

    # Configuration first, then the scenario file, then start the engine.
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()

    # One-hot encoded buttons: [left, right, shoot].
    possible_actions = [
        [1, 0, 0],  # left
        [0, 1, 0],  # right
        [0, 0, 1],  # shoot
    ]

    return game, possible_actions

def test_environment() :
    game = DoomGame
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()
    
    shoot = [0,0,1]
    left = [1,0,0]
    right = [0,1,0]
    
    actions = [shoot,left,right]
    
    episodes = 10
    
    for i in range(episodes) :
        game.new_episodes()
        while not game.is_episode_finished() :
            state = game.get_state()
            img = state.screen_bufer
            misc = state.game_variables
            action = random.choice(actions)
            print(action)
            reward = game.make_action(action)
            print("\t reward:",reward)
            time.sleep(0.02)
        print("Result:",game.get_total_reward())
        time.sleep(2)
        
    game.close()

In [4]:
# Create the game and the list of one-hot actions once; the cells below reuse both names.
game, possible_actions = create_environment()

## Define the preprocessing functions

In [5]:
def preprocess_frame(frame):
    """Scale a frame's pixel values by 1/255 and resize it to 128x128.

    Args:
        frame: 2-D (or higher) array-like screen buffer; it is sliced as
            ``frame[:, :]`` and divided by 255.0.
            # assumes a greyscale frame from the ViZDoom config - TODO confirm

    Returns:
        The result of ``transform.resize`` on the scaled frame with an
        output shape of [128, 128].
    """
    # Full slice: a placeholder for cropping, currently keeps every pixel.
    region = frame[:, :]

    # Scale pixel values by 1/255, then bring the frame to the network size.
    scaled = region / 255.0
    return transform.resize(scaled, [128, 128])

## Stacking Frames

Stacking frames is important because it **gives the network a sense of motion**: a single frame cannot show which way things are moving.

In [6]:
stack_size = 4 # We stack 4 frames

# Initialize the deque with one zero-image per slot.
# The builtin `int` is used as dtype because the `np.int` alias was removed in NumPy 1.24.
# maxlen follows stack_size so the two cannot drift apart.
stacked_frames = deque([np.zeros((128,128),dtype=int) for _ in range(stack_size)],maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and merge it into the rolling frame stack.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
            It is ignored (and replaced) when ``is_new_episode`` is true.
        state: Raw frame, passed to ``preprocess_frame``.
        is_new_episode: If true, start a fresh stack in which the new frame
            fills every slot; otherwise append the frame, letting the deque
            drop its oldest entry.

    Returns:
        tuple: ``(stacked_state, stacked_frames)`` where ``stacked_state`` is
        the frames stacked along a new last axis (``axis=2``) and
        ``stacked_frames`` is the deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # A new episode has no history yet, so every slot holds the first frame.
        # maxlen follows the module-level stack_size.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # Appending to a full deque automatically discards the oldest frame.
        stacked_frames.append(frame)

    # Frames become the last dimension: (height, width, stack_size).
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

## Setting up the hyperparameters

In [7]:
## Model Hyperparameters
state_size = [128,128,4] # Our input is a stack of 4 frames of 128x128 pixels, hence 128x128x4
action_size = game.get_available_buttons_size() # 3 possible actions
learning_rate = 0.0002 # alpha, also known as the learning rate

## Training Hyperparameters
total_episodes = 500 # Total episodes for training
max_steps = 100 # Max possible steps in an episode
batch_size =64 # Number of experiences sampled from memory per training step

## Exploration hyperparameters for epsilon greedy strategy
explore_start = 1.0 # exploration probability at start
explore_stop = 0.001 # minimum exploration probability
decay_rate = 0.0001 # exponential decay rate for exploration

## Q Learning hyperparameters
gamma = 0.95 # Discounting rate

## Memory hyperparameters
pretrain_length = batch_size # number of experiences stored in the Memory when initialized for the first time
memory_size = 100000 # Number of experiences the Memory can store

## Modify the following if you want to see the trained agent
training = True

## Turn this to True if you want to render the environment
episode_render = False

## Creating the Deep Q-learning Neural Network

Following is the Deep Q-learning model

* We take a stack of 4 frames as input
* It passes through 3 convolutional layers
* Then it is flattened
* Finally it passes through 2 fully connected (FC) layers
* It outputs a Q-value for each action

In [8]:
class DQNetwork:
    """Convolutional Deep Q-Network built with the TensorFlow 1.x graph API.

    The graph is: three (conv2d -> batch normalization -> ELU) stages, a
    flatten, a 512-unit ELU dense layer and a linear dense layer with one
    output per action.

    Attributes set on the instance:
        inputs_   placeholder for a batch of states, shape [None, *state_size]
        actions_  placeholder for one-hot actions, shape [None, action_size]
        target_Q  placeholder for the Q targets, shape [None]
        output    predicted Q-values for every action
        Q         predicted Q-value of the action selected by ``actions_``
        loss      sum of squared differences between ``target_Q`` and ``Q``
        optimizer RMSProp training op minimising ``loss``
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        """Build the network inside variable scope ``name``.

        Args:
            state_size: Shape of one state without the batch dimension,
                e.g. [128, 128, 4].
            action_size: Number of available actions; sizes the action
                placeholder and the output layer.
            learning_rate: Learning rate passed to the RMSProp optimizer.
            name: Variable scope under which the graph is created.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # *state_size unpacks the state shape, so with [128,128,4] the
            # placeholder shape is [None,128,128,4].
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name='inputs')

            # One-hot action vectors; the width follows action_size rather
            # than a hard-coded 3.
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name='actions_')

            # Remember that target_Q is R(s,a) + gamma * max(Q_hat(s',a'))
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # NOTE: every batch-normalization layer below is built with
            # training=True, so batch statistics are used whenever the graph
            # runs, including when it is only used for prediction.

            # First convnet: CNN -> BatchNormalization -> ELU
            self.conv1 = tf.layers.conv2d(inputs=self.inputs_,
                                          filters=32,
                                          kernel_size=[8, 8],
                                          strides=[4, 4],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv1")

            self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1,
                                                                 training=True,
                                                                 epsilon=1e-5,
                                                                 name="batch_norm1")

            self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name="conv1_out")
            # With a 128x128x4 input the output is [31,31,32]

            # Second convnet: CNN -> BatchNormalization -> ELU
            self.conv2 = tf.layers.conv2d(inputs=self.conv1_out,
                                          filters=64,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv2")

            self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2,
                                                                 training=True,
                                                                 epsilon=1e-5,
                                                                 name="batch_norm2")

            self.conv2_out = tf.nn.elu(self.conv2_batchnorm, name="conv2_out")
            # With a 128x128x4 input the output is [14,14,64]

            # Third convnet: CNN -> BatchNormalization -> ELU
            self.conv3 = tf.layers.conv2d(inputs=self.conv2_out,
                                          filters=128,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv3")

            # Normalise conv3 itself; feeding conv2 here would silently drop
            # the third convolution from the network.
            self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3,
                                                                 training=True,
                                                                 epsilon=1e-5,
                                                                 name='batch_norm3')

            self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name="conv3_out")
            # With a 128x128x4 input the output is [6,6,128]

            self.flatten = tf.layers.flatten(self.conv3_out)
            # With a 128x128x4 input the output is [4608]

            self.fc = tf.layers.dense(inputs=self.flatten,
                                      units=512,
                                      activation=tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      name="fc1")

            # One linear output (Q-value) per action.
            self.output = tf.layers.dense(inputs=self.fc,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units=action_size,
                                          activation=None,
                                          name="dense1")

            # Q is our predicted Q value: the one-hot action mask keeps only
            # the output of the action that was actually taken.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)

            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Q_target - Q)^2
            self.loss = tf.reduce_sum(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

In [9]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork
# NOTE: this rebinds the name `DQNetwork` from the class to the instance, so
# re-running this cell without first re-running the class definition cell
# calls the instance instead of the class and fails.
DQNetwork = DQNetwork(state_size,action_size,learning_rate)

## Experience Replay

Now we create the **experience replay method**

In [10]:
class Memory():
    """Bounded experience-replay buffer backed by a ``deque``.

    Once ``max_size`` experiences are stored, adding another one discards
    the oldest.
    """

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most ``max_size`` items."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences chosen at random.

        Sampling is without replacement, so ``np.random.choice`` raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored
        experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)

        picked = []
        for position in chosen:
            picked.append(self.buffer[position])
        return picked

Here we deal with the empty-memory problem: before training starts, we pre-populate the memory with experiences (state, action, reward, new_state, done) collected by taking random actions.

In [None]:
# Instantiate Memory
memory = Memory(max_size=memory_size)

# Start the first episode
game.new_episode()

# First we need a state: read the initial frame and start a fresh frame stack.
# The deque (not the stack_frames function) is passed as the first argument.
state = game.get_state().screen_buffer
state, stacked_frames = stack_frames(stacked_frames, state, True)

# Fill the memory with pretrain_length experiences gathered from random actions
for i in range(pretrain_length):
    # Random action
    action = random.choice(possible_actions)

    # Get the reward
    reward = game.make_action(action)

    # Look if the episode is finished
    done = game.is_episode_finished()

    if done:
        # The episode is over, so there is no next frame: use an all-zero
        # state with the same shape as the current stacked state.
        next_state = np.zeros(state.shape)

        # Add experience to memory
        memory.add((state, action, reward, next_state, done))

        # Start a new episode
        game.new_episode()

        # First we need a state
        state = game.get_state().screen_buffer

        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    else:
        # Get the next state
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        # Add experience to memory
        memory.add((state, action, reward, next_state, done))

        # Our state is now the next state
        state = next_state

## Setting up Tensorboard

To launch tensorboard type
```
tensorboard --logdir=/tensorboard/dqn/1
```

In [12]:
# Setup TensorBoard Writer
# Event files go to the absolute path /tensorboard/dqn/1, the same directory
# given to `tensorboard --logdir` above.
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

# losses
tf.summary.scalar("Loss",DQNetwork.loss)

# Single op that evaluates the summaries registered above; the training loop
# runs it and hands the result to `writer`.
write_op = tf.summary.merge_all()

## Training our Agent

Algorithm

* Initialize the weights
* Initialize the environment
* Initialize the decay step (used to reduce epsilon over time)

* For episode to max_episode do
    * Make new episode
    * Set step to 0
    * Observe the first state s0
    
    * While step < max_steps do
        * Increase decay_step
        * With epsilon select a random action a(t) otherwise select a(t) = argmax Q(s(t),a(t))
        * Execute action a(t) in simulator and observe reward r(t+1) and new state s(t+1)
        * Store transition (s(t), a(t), r(t+1), s(t+1)) in memory D
        * Sample random mini-batch from D: (s, a, r, s')
        * Set Q_hat = r if the episode ends at step t+1, otherwise set Q_hat = r + gamma*max(Q(s',a'))
        * Make a gradient descent step with loss (Q_hat - Q(s,a))^2
    * endwhile
* endfor

In [15]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    With probability epsilon a random action is taken (exploration);
    otherwise the action with the largest predicted Q-value is taken
    (exploitation), i.e. a(t) = argmax Q(s(t), a).

    Args:
        explore_start: Exploration probability at decay_step 0.
        explore_stop: Minimum exploration probability.
        decay_rate: Exponential decay rate of the exploration probability.
        decay_step: Number of steps taken so far.
        state: Current stacked state; a batch dimension of 1 is added before
            it is fed to the network.
        actions: List of available actions to choose from.

    Returns:
        tuple: ``(action, explore_probability)``.

    Note:
        The exploitation branch uses the module-level ``sess`` and
        ``DQNetwork`` objects.
    """
    # First we draw a random number in [0, 1)
    exp_exp_tradeoff = np.random.rand()

    # Epsilon decays exponentially from explore_start towards explore_stop
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Make a random action (exploration)
        action = random.choice(actions)
    else:
        # Get action from Q-network (exploitation):
        # estimate the Q-values of the current state
        Qs = sess.run(DQNetwork.output,
                      feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})

        # Take the biggest Q value (= the best action)
        choice = np.argmax(Qs)
        action = actions[int(choice)]

    return action, explore_probability

In [None]:
# Saver will help us to save our model
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Initialize the decay step (used to reduce epsilon over time)
        decay_step = 0

        # Defined up front so the end-of-episode report cannot hit an
        # undefined name if an episode ends before the first training step.
        loss = float("nan")

        # Init the game
        game.init()

        for episode in range(total_episodes):
            # Set step to 0
            step = 0

            # Initialize the rewards of the episode
            episode_rewards = []

            # Make a new episode and observe the first state
            game.new_episode()
            state = game.get_state().screen_buffer

            # Remember that stack_frames also calls our preprocess function
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1

                # Increase decay_step
                decay_step += 1

                # Predict the action to take and take it
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate,
                                                             decay_step, state, possible_actions)

                # Do the action
                reward = game.make_action(action)

                # Look if the episode is finished
                done = game.is_episode_finished()

                # Add the reward to total_reward
                episode_rewards.append(reward)

                if done:
                    # The episode ended, so there is no next frame: push a
                    # blank frame onto the stack (the deque, not the
                    # stack_frames function, is passed as first argument).
                    next_state = np.zeros(state.shape[:2], dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(explore_probability))

                    memory.add((state, action, reward, next_state, done))

                else:
                    # Get the next state
                    next_state = game.get_state().screen_buffer

                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # s(t+1) is now our current state
                    state = next_state

                # Learning part
                # Obtain a random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                # Get Q values for next_state
                Qs_next_state = sess.run(DQNetwork.output,
                                         feed_dict={DQNetwork.inputs_: next_states_mb})

                # Q_target = r if the episode ends at s+1,
                # otherwise Q_target = r + gamma * max(Q(s',a'))
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma * np.max(Qs_next_state, axis=1))

                train_feed = {DQNetwork.inputs_: states_mb,
                              DQNetwork.target_Q: targets_mb,
                              DQNetwork.actions_: actions_mb}

                # One gradient step per environment step, on the full batch
                # (the targets are complete before this runs).
                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer], feed_dict=train_feed)

                # Write TF Summaries
                summary = sess.run(write_op, feed_dict=train_feed)
                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes (once per episode, not once per step)
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

## Watching our Agent Playing

Now that we have trained our agent, we can begin testing it

In [None]:
with tf.Session() as sess:
    game, possible_actions = create_environment()

    # Sum of the scores of all played episodes
    totalScore = 0

    # Load the model
    saver.restore(sess, "./models/model.ckpt")
    game.init()

    for i in range(1):
        game.new_episode()

        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        while not game.is_episode_finished():
            # Estimate the Q-values of the current state
            Qs = sess.run(DQNetwork.output,
                          feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})

            # Take the biggest Q value (= the best action)
            choice = np.argmax(Qs)
            action = possible_actions[int(choice)]

            game.make_action(action)
            done = game.is_episode_finished()

            if done:
                break

            next_state = game.get_state().screen_buffer
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state

        # Report once the episode is over, so the final score is always
        # printed (a print inside the loop is skipped by the `break`).
        score = game.get_total_reward()
        print("Score: ", score)
        totalScore += score

    # Close the game only after every episode has been played
    game.close()