In [1]:
import tensorflow as tf      # Deep Learning library
import numpy as np           # Handle matrices
from vizdoom import *        # Doom Environment

import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from collections import deque# Ordered collection with ends
import matplotlib.pyplot as plt # Display graphs

import warnings
import os

In [29]:
# Collect the scenario names (the file stems of every *.cfg in scenarios/).
# os.listdir() returns entries in arbitrary, platform-dependent order, while the
# rest of the notebook picks a scenario by position (e.g. level=-4), so the
# names are sorted to keep those indices reproducible across machines.
levels = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir("scenarios/")
    if file.endswith(".cfg")
)

print(levels)

['basic', 'cig', 'deadly_corridor', 'deathmatch', 'defend_the_center', 'defend_the_line', 'health_gathering', 'health_gathering_supreme', 'learning', 'multi', 'multi_duel', 'my_way_home', 'oblige', 'predict_position', 'rocket_basic', 'simpler_basic', 'take_cover']


### Create Environment

In [30]:
"""
Here we create our environment
"""
def create_environment(level=0, launch_game=True):
    """Build a ViZDoom game for the scenario at position ``level`` in ``levels``.

    Args:
        level: Index into the module-level ``levels`` list.
        launch_game: When true, ``game.init()`` is called before returning.

    Returns:
        A ``(game, action_space, action_names)`` tuple: the ``DoomGame``
        object, a list of one-hot action vectors (one per available button)
        and the button names in the same order.
    """
    scenario_base = "scenarios/" + levels[level]

    game = DoomGame()
    # Configuration first, then the matching scenario file and resolution.
    game.load_config(scenario_base + ".cfg")
    game.set_doom_scenario_path(scenario_base + ".wad")
    game.set_screen_resolution(RES_640X480)

    if launch_game:
        game.init()

    # Buttons made available by the loaded configuration.
    controls = game.get_available_buttons()
    action_size = game.get_available_buttons_size()

    # splitext() keeps whatever follows the last "." in str(button) and the
    # slice drops that dot (presumably "Button.TURN_LEFT" -> "TURN_LEFT").
    action_names = [
        os.path.splitext(str(controls[index]))[1][1:]
        for index in range(action_size)
    ]

    # One one-hot row per button.
    action_space = np.eye(action_size).astype(int).tolist()

    return game, action_space, action_names
       
def test_environment(level=0, episodes=10, print_info=True):
    """Play random actions in a scenario and record every step.

    Args:
        level: Index into ``levels`` forwarded to ``create_environment``.
        episodes: Number of episodes to play. The value passed by the caller
            is now honoured; previously it was overwritten with 1.
        print_info: When true, print the chosen action and its reward on
            every step.

    Returns:
        A list of ``(episode_index, state, screen_buffer, game_variables,
        action)`` tuples, one per step taken.
    """
    game, actions, action_names = create_environment(level)
    caches = []

    # try/finally so the game is closed even if a step raises.
    try:
        for i in range(episodes):
            game.new_episode()
            while not game.is_episode_finished():
                state = game.get_state()
                img = state.screen_buffer
                misc = state.game_variables
                action = random.choice(actions)

                reward = game.make_action(action)

                if print_info:
                    # Actions are one-hot, so argmax recovers the button index.
                    print(str(action_names[np.argmax(action)]) + " : " + str(action))
                    print ("\treward:", reward)

                # Short pause between steps.
                time.sleep(0.02)
                caches.append((i, state, img, misc, action))
            print ("Result:", game.get_total_reward())
            time.sleep(2)
    finally:
        game.close()
    return caches

In [73]:
# Smoke-test the environment with random actions and keep the per-step records.
# The negative index counts from the end of `levels`; with the scenario listing
# shown in the output further up, -4 selects 'predict_position'.
caches = test_environment(level=-4)

TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
ATTACK : [0, 0, 1]
	reward: -0.001
ATTACK : [0, 0, 1]
	reward: -0.001
ATTACK : [0, 0, 1]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
ATTACK : [0, 0, 1]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
ATTACK : [0, 0, 1]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
ATTACK : [0, 0, 1]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
TURN_RIGHT : [0, 1, 0]
	reward: -0.001
TURN_LEFT : [1, 0, 0]
	reward: -0.001
ATTACK : [0, 0, 1]


### Helper Functions

#### Greyscale, crop, normalize

In [60]:
def rgb2gray(rgb):
    """Collapse the colour channels of ``rgb`` into one weighted-sum channel.

    Only the first three entries of the last axis are used; they are combined
    with the weights 0.2989, 0.5870 and 0.1140 respectively.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, channel_weights)

In [61]:
def preprocess_frame(frame):
    """Turn a channel-first screen buffer into a scaled 84x84 greyscale image.

    Steps, in order: move channels last, convert to greyscale, crop the
    borders, divide by 255 and resize to 84x84.
    """
    # (ch, h, w) --> (h, w, ch)
    channels_last = frame.transpose(1, 2, 0)

    # Greyscale, then drop 30 rows from the top, 10 from the bottom and
    # 30 columns from each side.
    cropped = rgb2gray(channels_last)[30:-10, 30:-30]

    # Scale pixel values by 1/255 before resizing.
    return transform.resize(cropped / 255.0, [84, 84])

#### Frame Stacking

In [35]:
stack_size = 4  # Number of consecutive frames that make up one state

# Rolling window of the most recent `stack_size` preprocessed frames,
# pre-filled with blank 84x84 frames.
# dtype=int replaces np.int, an alias of the builtin int that NumPy deprecated
# in 1.20 and removed in 1.24 (AttributeError). maxlen now follows stack_size
# instead of repeating the literal 4.
stacked_frames = deque([np.zeros((84,84), dtype=int) for i in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess ``state`` and push it onto the rolling frame stack.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
        state: Raw screen buffer; it is passed through ``preprocess_frame``.
        is_new_episode: When true, the stack is rebuilt from ``stack_size``
            copies of the new frame instead of appending to the old one.

    Returns:
        ``(stacked_state, stacked_frames)``: the frames stacked along a new
        last axis (h x w x stack), and the deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # Start a fresh window filled with the first frame of the episode.
        # The previous code first built a zero-filled deque with np.int
        # (removed in NumPy 1.24) and then overwrote every slot; building the
        # deque directly gives the same contents without the dead allocation.
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        # The deque's maxlen discards the oldest frame automatically.
        stacked_frames.append(frame)

    # Stack the frames: (h x w x stack)
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

#### Epsilon Greedy Strategy

In [36]:
def choose_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Pick an action with an exponentially decaying epsilon-greedy policy.

    Args:
        explore_start: Exploration probability at ``decay_step == 0``.
        explore_stop: Value the exploration probability decays towards.
        decay_rate: Exponential decay rate applied per decay step.
        decay_step: Number of decisions taken so far.
        state: Current stacked state; only read on the exploit branch, where
            it is fed to the network with a leading batch axis of 1.
        actions: List of candidate actions. The choice is made from this
            argument (previously the global ``possible_actions`` was read and
            the parameter was ignored).

    Returns:
        ``(action, epsilon_)``: the chosen entry of ``actions`` and the
        exploration probability used for this decision.

    Note:
        The exploit branch relies on the notebook globals ``sess`` and
        ``DQNetwork``.
    """
    # Uniform draw in [0, 1) compared against epsilon below.
    exp_exp_tradeoff = np.random.rand()

    # epsilon = stop + (start - stop) * exp(-rate * step)
    epsilon_ = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if epsilon_ > exp_exp_tradeoff:
        # Explore: uniformly random action.
        action = random.choice(actions)
    else:
        # Exploit: action with the largest predicted Q value.
        Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = actions[int(choice)]

    return action, epsilon_

### Model Hyperparameters

In [83]:
# ENVIRONMENT PARAMETERS
# Index into `levels`; negative values count from the end of the list.
level = -4
# Build the game without launching it, only to read the available button names.
game, _, action_names = create_environment(level, launch_game = False)
# One network output per available button.
action_size = len(action_names)
game.close()

# MODEL HYPERPARAMETERS
state_size = [84,84,4]      # Input is a stack of 4 frames of 84x84, stacked on the last axis (height, width, frames)
learning_rate =  0.0005      # Alpha (aka learning rate)

# TRAINING HYPERPARAMETERS
total_episodes = 501        # Total episodes for training
max_steps = 150              # Step cap per episode; in this notebook only the "Train Back-up" loop enforces it
batch_size = 64             # Number of experiences sampled from memory per training step

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.98               # Discounting rate

# MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 16000            # Number of experiences the Memory can keep

# MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = True

### Deep Q Network

In [75]:
class DQNetwork:
    """Convolutional Q-network: three conv/batch-norm/ReLU stages and two dense layers.

    Attributes set by the constructor:
        inputs_:     float32 placeholder, shape [None, *state_size].
        actions_:    float32 placeholder, shape [None, action_size]; one-hot
                     action taken, used to select the matching Q value.
        target_Q:    float32 placeholder, shape [None]; the regression target
                     R(s,a) + gamma * max Qhat(s', a').
        is_training: boolean scalar placeholder that defaults to True, so
                     callers that do not feed it keep the batch-statistics
                     behaviour this network always had. Feed False to
                     normalise with the moving averages instead.
        output:      Q values, shape [None, action_size].
        Q:           Q value of the action selected by ``actions_``.
        loss:        mean squared error between ``target_Q`` and ``Q``.
        optimizer:   RMSProp training op; it also runs the batch-norm
                     moving-average updates.
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        # Remember how many update ops already exist so that only the ones
        # created by this network are attached to its training op.
        preexisting_update_ops = len(tf.get_collection(tf.GraphKeys.UPDATE_OPS))

        with tf.variable_scope(name):
            # *state_size unpacks the list, i.e. the shape is [None, 84, 84, 4]
            # for the state_size used in this notebook.
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions_")

            # Remember that target_Q is the R(s,a) + ymax Qhat(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # Batch-norm mode switch; defaults to the previous hard-coded True.
            self.is_training = tf.placeholder_with_default(True, shape=(), name="is_training")

            ###################### CONV NET start #################################
            # Input is 84x84x4
            self.conv1, self.conv1_batchnorm, self.conv1_out = self._conv_block(
                self.inputs_, filters=32, kernel_size=[8,8], strides=[4,4], index=1)
            ## --> [20, 20, 32]

            self.conv2, self.conv2_batchnorm, self.conv2_out = self._conv_block(
                self.conv1_out, filters=64, kernel_size=[4,4], strides=[2,2], index=2)
            ## --> [9, 9, 64]

            self.conv3, self.conv3_batchnorm, self.conv3_out = self._conv_block(
                self.conv2_out, filters=128, kernel_size=[4,4], strides=[2,2], index=3)
            ## --> [3, 3, 128]

            self.flatten = tf.layers.flatten(self.conv3_out)
            ## --> [1152]

            self.fc = tf.layers.dense(inputs = self.flatten,
                                      units = 512,
                                      activation = tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      name="fc1")

            self.output = tf.layers.dense(inputs = self.fc,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units = action_size,
                                          activation=None)
            ###################### CONV NET end ###################################

            # Q is our predicted Q value: the one-hot mask keeps only the
            # output of the action that was actually taken.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)

            # mean(Qtarget - Q)^2
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            # tf.layers.batch_normalization registers its moving-average
            # updates in UPDATE_OPS; they only run if the training op depends
            # on them, which was missing before.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)[preexisting_update_ops:]
            with tf.control_dependencies(update_ops):
                self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

    def _conv_block(self, inputs, filters, kernel_size, strides, index):
        """Build conv{index} -> batch_norm{index} -> ReLU (conv{index}_out).

        Layer names match the ones used before the three stages were
        factored out, so existing checkpoints keep the same variable names.

        Returns:
            ``(conv, batchnorm, activation)`` tensors.
        """
        conv = tf.layers.conv2d(inputs = inputs,
                                filters = filters,
                                kernel_size = kernel_size,
                                strides = strides,
                                padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                name = "conv{}".format(index))

        batchnorm = tf.layers.batch_normalization(conv,
                                                  training = self.is_training,
                                                  epsilon = 1e-5,
                                                  name = 'batch_norm{}'.format(index))

        activation = tf.nn.relu(batchnorm, name="conv{}_out".format(index))
        return conv, batchnorm, activation

In [76]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork.
# The instance is bound to the same name as the class, so when this cell is
# executed a second time `DQNetwork` is already an instance and calling it
# raises TypeError. Recover the class from the instance in that case so the
# cell can be re-run without re-executing the class definition first.
_dqn_class = DQNetwork if isinstance(DQNetwork, type) else type(DQNetwork)
DQNetwork = _dqn_class(state_size, action_size, learning_rate)





### Memory, Experience Replay, and TensorBoard

In [81]:
class Memory():
    """Bounded experience-replay buffer backed by a ``deque``.

    Once ``max_size`` experiences are stored, adding another one discards the
    oldest entry.
    """

    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)

    def add(self, experience):
        """Append one experience to the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences, drawn at random.

        Raises ``ValueError`` (from ``np.random.choice``) when ``batch_size``
        exceeds the number of stored experiences.
        """
        candidate_positions = np.arange(len(self.buffer))
        picked = np.random.choice(candidate_positions,
                                  size = batch_size,
                                  replace = False)

        experiences = []
        for position in picked:
            experiences.append(self.buffer[position])
        return experiences

In [82]:
# Pre-fill the replay memory with `pretrain_length` random-action transitions
# so that the first training step has something to sample from.

# Instantiate memory
memory = Memory(max_size = memory_size)

# Create and launch the environment (launch_game defaults to True)
game, possible_actions, action_names = create_environment(level)
game.new_episode()

for i in range(pretrain_length):
    # On the very first step there is no current state yet
    if i == 0:
        # Read the first screen buffer and start a fresh frame stack from it
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    # Random action
    action = random.choice(possible_actions)

    # Apply it and collect the reward
    reward = game.make_action(action)

    # Look if the episode is finished
    done = game.is_episode_finished()

    # If the episode ended on this step
    if done:
        # There is no next observation; store an all-zero state of the same shape
        next_state = np.zeros(state.shape)

        # Add experience to memory
        memory.add((state, action, reward, next_state, done))

        # Start a new episode
        game.new_episode()

        # Read the first screen buffer of the new episode
        state = game.get_state().screen_buffer

        # Start a fresh frame stack from it
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    else:
        # Get the next state and push it onto the existing frame stack
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

        # Add experience to memory
        memory.add((state, action, reward, next_state, done))

        # Our state is now the next_state
        state = next_state

# The training cell calls game.init() on this same object again later.
game.close()

In [84]:
# Setup TensorBoard Writer
# NOTE(review): "/tensorboard/dqn/1" is an absolute path at the filesystem
# root - confirm this is intended rather than a path relative to the notebook.
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

## Losses
# Register the network's loss tensor as a scalar summary tagged "Loss".
tf.summary.scalar("Loss", DQNetwork.loss)

# Single op that evaluates every summary registered in the default graph so far.
write_op = tf.summary.merge_all()

### Model Train

In [85]:
# Saver will help us to save our model
saver = tf.train.Saver()

# try/finally keeps the original "always close the game" behaviour and also
# closes it when training is interrupted by an exception.
try:
    if training == True:
        # saver.save() does not create a missing parent directory.
        os.makedirs("./models", exist_ok=True)

        with tf.Session() as sess:

            # Number of decisions taken so far; drives the epsilon decay and
            # is used as the TensorBoard step.
            decay_step = 0

            # Fallbacks for the episode report in case an episode ends before
            # a single step was taken.
            loss = float("nan")
            epsilon_ = explore_start

            sess.run(tf.global_variables_initializer())
            game.init()

            for episode in range(total_episodes):

                episode_rewards = []

                # Observe the first state and start a fresh frame stack.
                game.new_episode()
                state = game.get_state().screen_buffer
                state, stacked_frames = stack_frames(stacked_frames, state, True)

                while not game.is_episode_finished():

                    decay_step += 1

                    # Choose and apply the action for the CURRENT state.
                    action, epsilon_ = choose_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)
                    reward = game.make_action(action)
                    done = game.is_episode_finished()
                    episode_rewards.append(reward)

                    if done:
                        # No observation after the last step; same all-zero
                        # terminal state as the pretraining cell uses.
                        next_state = np.zeros(state.shape)
                    else:
                        next_state = game.get_state().screen_buffer
                        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Store the transition that was actually experienced.
                    # Previously the tuple was written before acting, so it
                    # paired the new frames with the action/reward of the
                    # previous step and a `done` flag that was never updated
                    # inside this loop.
                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                    ### LEARNING PART
                    # Obtain random mini-batch from memory
                    batch = memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    # Get Q values for next_state
                    Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})

                    # Q_target = r if the episode ends at s+1,
                    # otherwise Q_target = r + gamma*maxQ(s', a')
                    targets_mb = np.where(dones_mb,
                                          rewards_mb,
                                          rewards_mb + gamma * np.max(Qs_next_state, axis=1))

                    # One run for the update, the loss value and its summary
                    # (the summary used to cost a second forward pass).
                    loss, _, summary = sess.run([DQNetwork.loss, DQNetwork.optimizer, write_op],
                                                feed_dict={DQNetwork.inputs_: states_mb,
                                                           DQNetwork.target_Q: targets_mb,
                                                           DQNetwork.actions_: actions_mb})

                    # Write TF Summaries. decay_step increases on every
                    # update, so points no longer share one step per episode.
                    writer.add_summary(summary, decay_step)
                    writer.flush()

                # Get the total reward of the episode
                total_reward = np.sum(episode_rewards)

                print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(epsilon_))

                # Save model every 25 episodes
                if episode % 25 == 0:
                    save_path = saver.save(sess, "./models/model_level_" + str(level) + "_episode_" + str(episode) + ".ckpt")
                    print("Model Saved")
finally:
    game.close()

Episode: 0 Total reward: 0.951 Training loss: 3.9937 Explore P: 0.9952
Model Saved
Episode: 1 Total reward: -0.060000000000000005 Training loss: 14.6103 Explore P: 0.9893
Episode: 2 Total reward: -0.059000000000000004 Training loss: 4.4238 Explore P: 0.9835
Episode: 3 Total reward: 0.962 Training loss: 13.6550 Explore P: 0.9798
Episode: 4 Total reward: -0.057 Training loss: 6.8514 Explore P: 0.9743
Episode: 5 Total reward: -0.061000000000000006 Training loss: 3.8420 Explore P: 0.9684
Episode: 6 Total reward: -0.056 Training loss: 3.5324 Explore P: 0.9631
Episode: 7 Total reward: 0.952 Training loss: 2.3169 Explore P: 0.9585
Episode: 8 Total reward: -0.057 Training loss: 1.3511 Explore P: 0.9531
Episode: 9 Total reward: -0.060000000000000005 Training loss: 0.7125 Explore P: 0.9475
Episode: 10 Total reward: -0.057 Training loss: 1.5491 Explore P: 0.9422
Episode: 11 Total reward: -0.058 Training loss: 0.4836 Explore P: 0.9368
Episode: 12 Total reward: -0.058 Training loss: 0.7889 Explore 

Episode: 104 Total reward: 0.956 Training loss: 0.0310 Explore P: 0.5542
Episode: 105 Total reward: -0.057 Training loss: 0.0352 Explore P: 0.5511
Episode: 106 Total reward: -0.057 Training loss: 0.0407 Explore P: 0.5480
Episode: 107 Total reward: -0.060000000000000005 Training loss: 0.0557 Explore P: 0.5448
Episode: 108 Total reward: -0.057 Training loss: 0.0233 Explore P: 0.5418
Episode: 109 Total reward: -0.059000000000000004 Training loss: 0.0426 Explore P: 0.5386
Episode: 110 Total reward: -0.061000000000000006 Training loss: 0.0446 Explore P: 0.5354
Episode: 111 Total reward: -0.060000000000000005 Training loss: 0.0356 Explore P: 0.5323
Episode: 112 Total reward: -0.062000000000000006 Training loss: 0.0446 Explore P: 0.5291
Episode: 113 Total reward: -0.059000000000000004 Training loss: 0.0167 Explore P: 0.5260
Episode: 114 Total reward: -0.057 Training loss: 0.0730 Explore P: 0.5231
Episode: 115 Total reward: -0.059000000000000004 Training loss: 0.0371 Explore P: 0.5200
Episode:

Episode: 208 Total reward: -0.065 Training loss: 0.0324 Explore P: 0.3082
Episode: 209 Total reward: -0.056 Training loss: 0.0259 Explore P: 0.3066
Episode: 210 Total reward: 0.957 Training loss: 0.1450 Explore P: 0.3053
Episode: 211 Total reward: -0.056 Training loss: 0.0462 Explore P: 0.3037
Episode: 212 Total reward: -0.059000000000000004 Training loss: 0.0920 Explore P: 0.3019
Episode: 213 Total reward: -0.060000000000000005 Training loss: 0.1588 Explore P: 0.3002
Episode: 214 Total reward: -0.056 Training loss: 0.0852 Explore P: 0.2986
Episode: 215 Total reward: -0.056 Training loss: 0.0303 Explore P: 0.2969
Episode: 216 Total reward: -0.059000000000000004 Training loss: 0.0602 Explore P: 0.2953
Episode: 217 Total reward: -0.063 Training loss: 0.0637 Explore P: 0.2935
Episode: 218 Total reward: -0.057 Training loss: 0.0838 Explore P: 0.2919
Episode: 219 Total reward: 0.954 Training loss: 0.1155 Explore P: 0.2906
Episode: 220 Total reward: -0.058 Training loss: 0.0453 Explore P: 0.

Episode: 314 Total reward: -0.057 Training loss: 0.0228 Explore P: 0.1734
Episode: 315 Total reward: 0.949 Training loss: 0.0149 Explore P: 0.1725
Episode: 316 Total reward: -0.056 Training loss: 0.0106 Explore P: 0.1716
Episode: 317 Total reward: 0.948 Training loss: 0.0662 Explore P: 0.1708
Episode: 318 Total reward: -0.056 Training loss: 0.1454 Explore P: 0.1699
Episode: 319 Total reward: -0.057 Training loss: 0.0792 Explore P: 0.1690
Episode: 320 Total reward: -0.059000000000000004 Training loss: 0.0760 Explore P: 0.1680
Episode: 321 Total reward: -0.056 Training loss: 0.0248 Explore P: 0.1672
Episode: 322 Total reward: -0.057 Training loss: 0.0434 Explore P: 0.1663
Episode: 323 Total reward: -0.058 Training loss: 0.0957 Explore P: 0.1654
Episode: 324 Total reward: -0.061000000000000006 Training loss: 0.1759 Explore P: 0.1644
Episode: 325 Total reward: -0.064 Training loss: 0.0339 Explore P: 0.1634
Model Saved
Episode: 326 Total reward: 0.958 Training loss: 0.0742 Explore P: 0.1628

Episode: 421 Total reward: -0.057 Training loss: 0.1655 Explore P: 0.0990
Episode: 422 Total reward: 0.959 Training loss: 0.0351 Explore P: 0.0986
Episode: 423 Total reward: -0.059000000000000004 Training loss: 0.0451 Explore P: 0.0981
Episode: 424 Total reward: -0.063 Training loss: 0.1148 Explore P: 0.0975
Episode: 425 Total reward: -0.060000000000000005 Training loss: 0.0881 Explore P: 0.0970
Model Saved
Episode: 426 Total reward: -0.059000000000000004 Training loss: 0.0498 Explore P: 0.0965
Episode: 427 Total reward: -0.065 Training loss: 0.0815 Explore P: 0.0959
Episode: 428 Total reward: -0.059000000000000004 Training loss: 0.0362 Explore P: 0.0954
Episode: 429 Total reward: -0.059000000000000004 Training loss: 0.1232 Explore P: 0.0949
Episode: 430 Total reward: -0.057 Training loss: 0.0490 Explore P: 0.0945
Episode: 431 Total reward: -0.056 Training loss: 0.1222 Explore P: 0.0940
Episode: 432 Total reward: -0.056 Training loss: 0.1169 Explore P: 0.0935


ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

### Test

In [89]:
# Watch the trained agent: restore a checkpoint and play greedily for 50 episodes.
with tf.Session() as sess:

    # create_environment() already launches the game (launch_game defaults to
    # True), so the extra game.init() that used to follow is not needed.
    game, possible_actions, action_names = create_environment(level = level)

    totalScore = 0

    # try/finally so the game is closed even if restoring or a step raises.
    try:
        # Load the model
        saver.restore(sess, "./models/model_level_" + str(level) + "_episode_375.ckpt")

        for i in range(50):

            print('Episode {}'.format(i))

            game.new_episode()

            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while not game.is_episode_finished():
                # Take the biggest Q value (= the best action) for the
                # CURRENT observation.
                Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
                choice = np.argmax(Qs)
                action = possible_actions[int(choice)]

                game.make_action(action)

                # Observe the result of the action. Previously the frame was
                # re-read before acting, so the first frame was stacked twice
                # and every decision was based on the observation from one
                # step earlier.
                if not game.is_episode_finished():
                    next_state = game.get_state().screen_buffer
                    state, stacked_frames = stack_frames(stacked_frames, next_state, False)

            score = game.get_total_reward()
            totalScore += score
            print("Score: ", score)
    finally:
        game.close()

INFO:tensorflow:Restoring parameters from ./models/model_level_-4_episode_375.ckpt
Episode 0
Score:  -0.3000000000000002
Episode 1


ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.

### Train Back-up

In [None]:
# Back-up variant of the training loop: same learning step, but every episode
# is capped at `max_steps` steps.

# Saver will help us to save our model
saver = tf.train.Saver()

# try/finally keeps the original "always close the game" behaviour and also
# closes it when training is interrupted by an exception.
try:
    if training == True:
        # saver.save() does not create a missing parent directory.
        os.makedirs("./models", exist_ok=True)

        with tf.Session() as sess:
            # Initialize the variables
            sess.run(tf.global_variables_initializer())

            # Initialize the decay step (used to reduce epsilon)
            decay_step = 0

            # Fallbacks for the episode report; `loss` used to be read before
            # assignment if the very first step ended the episode.
            loss = float("nan")
            epsilon_ = explore_start

            # Init the game
            game.init()

            for episode in range(total_episodes):
                # Set step to 0
                step = 0

                # Initialize the rewards of the episode
                episode_rewards = []

                # Make a new episode and observe the first state
                game.new_episode()
                state = game.get_state().screen_buffer

                # Remember that stack frame function also call our preprocess function.
                state, stacked_frames = stack_frames(stacked_frames, state, True)

                while step < max_steps:
                    step += 1

                    # Increase decay_step
                    decay_step +=1

                    # Predict the action to take and take it
                    action, epsilon_ = choose_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)

                    # Do the action
                    reward = game.make_action(action)

                    # Look if the episode is finished
                    done = game.is_episode_finished()

                    # Add the reward to total reward
                    episode_rewards.append(reward)

                    if done:
                        # The episode ended, so there is no next observation.
                        # Use the same all-zero terminal state as the
                        # pretraining cell; the old code pushed a zero image
                        # built with np.int (removed in NumPy 1.24) through
                        # the preprocessing pipeline instead.
                        next_state = np.zeros(state.shape)

                        # Set step = max_steps to end the episode
                        step = max_steps
                    else:
                        # Get the next state and stack its frame
                        next_state = game.get_state().screen_buffer
                        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Add experience to memory
                    memory.add((state, action, reward, next_state, done))

                    # st+1 is now our current state (irrelevant after a
                    # terminal step: the next episode resets it)
                    state = next_state

                    ### LEARNING PART
                    # Obtain random mini-batch from memory
                    batch = memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    # Get Q values for next_state
                    Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})

                    # Q_target = r if the episode ends at s+1,
                    # otherwise Q_target = r + gamma*maxQ(s', a')
                    targets_mb = np.where(dones_mb,
                                          rewards_mb,
                                          rewards_mb + gamma * np.max(Qs_next_state, axis=1))

                    # One run for the update, the loss value and its summary.
                    loss, _, summary = sess.run([DQNetwork.loss, DQNetwork.optimizer, write_op],
                                                feed_dict={DQNetwork.inputs_: states_mb,
                                                           DQNetwork.target_Q: targets_mb,
                                                           DQNetwork.actions_: actions_mb})

                    # Write TF Summaries. decay_step increases on every
                    # update, so points no longer share one step per episode.
                    writer.add_summary(summary, decay_step)
                    writer.flush()

                # Report every episode, including the ones cut off by
                # max_steps (the report used to be printed only on `done`).
                total_reward = np.sum(episode_rewards)

                print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(epsilon_))

                # Save model every 25 episodes
                if episode % 25 == 0:
                    save_path = saver.save(sess, "./models/model_level_" + str(level) + "_episode_" + str(episode) + ".ckpt")
                    print("Model Saved")
finally:
    game.close()