# DEEP Q-NETWORK ALGORITHM

---

In this notebook, we will implement the Deep Q-Networks algorithm in Keras.

<img width="900px" src="./assets/dqn_intro.png">

<br>

# 1. Import the Packages

---

Below, we import all the libraries that we are going to use in this Jupyter notebook.

In [1]:
# Import the libraries
import matplotlib.pyplot as plt
import random
import gym
import numpy as np
from collections import deque

%matplotlib inline

In [2]:
# Import libraries in Keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from keras.optimizers import Adam

Using TensorFlow backend.


<br>

# 2. Deep Q-Network (DQN)

---

In this section, we will build the Deep Q-Network (DQN) algorithm in Keras. Later on we will add the extensions and observe the difference in the agent's performance.

In [3]:
class DQN_Agent:
    """
    Deep Q-Network (DQN) Agent.

    Holds an online Q-network (`model`), a target Q-network (`target_model`)
    that is synced from the online one on demand, a bounded replay memory and
    the epsilon-greedy exploration state.
    """

    # Init function
    def __init__(self, state_size, action_size):
        """
        Init function.

        ARGUMENTS
        ========================
            - state_size: Size of the state space (used as the input shape
              of the first convolutional layer)
            - action_size: Size of action space (number of network outputs)
        """
        # Initialize state size object
        self.state_size = state_size

        # Initialize action size object
        self.action_size = action_size

        # Initialize memory object (oldest experiences are dropped first
        # once 5000 entries are stored)
        self.memory = deque(maxlen=5000)

        # Initialize discount rate object
        self.gamma = 0.9

        # Initialize exploration rate object
        self.epsilon = 1.0

        # Initialize minimal exploration rate (epsilon-greedy) object
        self.epsilon_min = 0.1

        # Initialize decay rate for epsilon object
        self.epsilon_decay = 0.995

        # Initialize number of steps until updating the target network object
        self.update_rate = 1000

        # Construct DQN models: both networks share the same architecture and
        # the target network starts as an exact copy of the online network
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())
        self.model.summary()

    # Model architecture
    def _build_model(self):
        """
        Building a neural networks architecture.

        RETURNS
        ========================
            - A compiled Keras Sequential model with three convolutional
              layers, one hidden dense layer and a linear output layer with
              one unit per action.
        """
        # Initialize the model
        model = Sequential()

        # Conv layer 1
        model.add(Conv2D(32, (8, 8), strides=4, padding='same', input_shape=self.state_size))
        model.add(Activation('relu'))

        # Conv layer 2
        model.add(Conv2D(64, (4, 4), strides=2, padding='same'))
        model.add(Activation('relu'))

        # Conv layer 3
        model.add(Conv2D(64, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))

        # Flatten the neurons
        model.add(Flatten())

        # FCL layer 1
        model.add(Dense(512, activation='relu'))

        # Output layer (linear: the outputs are unbounded Q-value estimates)
        model.add(Dense(self.action_size, activation='linear'))

        # Compile the model
        model.compile(loss='mse', optimizer=Adam())

        return model

    # Function for storing the experience in the replay memory
    def remember(self, state, action, reward, next_state, done):
        """
        Store the experience in the replay memory.

        ARGUMENTS
        ========================
            - state: Current state
            - action: Action
            - reward: Reward
            - next_state: Next state
            - done: Indicates if it's a terminal state or not.
        """
        # Append the (S, A, R, S', done) into the memory
        self.memory.append((state, action, reward, next_state, done))

    # Function for choosing action based on a epsilon-greedy policy
    def act(self, state):
        """
        Choose action based on a epsilon-greedy policy.

        ARGUMENTS
        ========================
            - state: Current state (passed to the online network as-is; only
              the first row of the prediction is used)

        RETURNS
        ========================
            - Index of the action to take.
        """
        # Random action (exploration) with probability epsilon
        if np.random.rand() <= self.epsilon:
            action_to_take = random.randrange(self.action_size)

        # Greedy action (exploitation): highest predicted Q-value
        else:
            act_values = self.model.predict(state)
            action_to_take = np.argmax(act_values[0])

        return action_to_take

    # Function for randomly selecting experiences (in the replay memory) and training on them
    def replay(self, batch_size):
        """
        Randomly selecting experiences (in the replay memory) and training on them.

        Each sampled experience is fitted individually (one `fit` call per
        experience). After the batch, epsilon is decayed once and clamped so
        that it never falls below `epsilon_min`.

        ARGUMENTS
        ========================
            - batch_size: Batch size (must not exceed the number of stored
              experiences, otherwise `random.sample` raises ValueError)
        """
        # Get a random batch
        minibatch = random.sample(self.memory, batch_size)

        # Iterate through the (S, A, R, S', done)
        for state, action, reward, next_state, done in minibatch:

            # If terminal state: there is no future return to bootstrap from
            if done:

                # Assign reward to our target
                target = reward

            # If not terminal state: bootstrap from the target network
            else:

                # Calculate the target
                target = (reward + self.gamma * np.amax(self.target_model.predict(next_state)))

            ### Construct the target vector

            # 1. Output the Q-value predictions
            target_f = self.model.predict(state)

            # 2. Update the action values with the target (the other actions
            #    keep their predicted values and so contribute zero error)
            target_f[0][action] = target

            # 3. Use vectors in the objective computation
            self.model.fit(state, target_f, epochs=1, verbose=0)

        # Decay epsilon (if it's larger than epsilon_min), clamping the result
        # so the last decay step cannot undershoot epsilon_min
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    # Function for updating the target model parameters
    def update_target_model(self):
        """
        Update the target model parameters to the current model parameters
        """
        # Update the target model
        self.target_model.set_weights(self.model.get_weights())

    # Load a saved model
    def load(self, name):
        """
        Load the saved model

        ARGUMENTS
        ========================
            - name: Path of the weights file to load
        """
        self.model.load_weights(name)

        # Keep the target network consistent with the restored weights;
        # otherwise it would keep its previous weights until the next sync
        self.update_target_model()

    # Save parameters of a trained model
    def save(self, name):
        """
        Save the parameters of a trained model

        ARGUMENTS
        ========================
            - name: Path of the weights file to write
        """
        self.model.save_weights(name)

In [4]:
# Function for preprocessing the frame
def process_frame(frame):
    """
    Preprocessing the frame.

    Crops and downsamples the frame, converts it to greyscale, and rescales
    the pixel values to the range [-1, 1).

    ARGUMENTS
    ========================
        - frame: One frame of the video to preprocess. Must be a 3-D array
          (rows, columns, color channels) large enough that the crop below
          yields 88 x 80 pixels (e.g. a 210 x 160 x 3 frame); otherwise the
          final reshape raises ValueError.

    RETURNS
    ========================
        - Float array of shape (1, 88, 80, 1).
    """
    # Greyscale value (channel mean) of the RGB color (210, 164, 74).
    # NOTE(review): the name suggests this is the Ms. Pac-Man sprite color;
    # confirm whether it is still wanted for other games.
    mspacman_color = np.array([210, 164, 74]).mean()

    # Crop and downsize: every second row from 1 to 175 and every second column
    img = frame[1:176:2, ::2]

    # Convert to greyscale (mean over the color channels; returns a new
    # float array, so the caller's frame is not modified below)
    img = img.mean(axis=2)

    # Set pixels matching the reference color to 0
    img[img == mspacman_color] = 0

    # Normalize from -1 to 1: 0 maps to -1 and 255 maps to 127/128
    img = (img - 128) / 128

    return np.expand_dims(img.reshape(88, 80, 1), axis=0)

In [5]:
# Function for blending the images
def blend_images(images, blend):
    """
    Blend images.

    Returns the element-wise average of the most recent `blend` images. When
    fewer than `blend` images are available, all of them are averaged.

    ARGUMENTS
    ========================
        - images: Sequence of images, oldest first. Each image must be
          broadcastable to shape (1, 88, 80, 1).
        - blend: Maximum number of images to average (must be >= 1).

    RETURNS
    ========================
        - float64 array of shape (1, 88, 80, 1).

    RAISES
    ========================
        - ValueError: if `blend` is smaller than 1 or `images` is empty.
    """
    if blend < 1:
        raise ValueError("blend must be at least 1")

    # Only the last `blend` images take part, so the divisor below always
    # matches the number of images that were actually summed
    recent = list(images)[-blend:]

    if not recent:
        raise ValueError("images must contain at least one image")

    avg_image = np.zeros((1, 88, 80, 1), np.float64)

    for image in recent:
        avg_image += image

    return avg_image / len(recent)

In [14]:
# Initialize the environment: create the Gym environment registered under
# the id 'SpaceInvaders-v0'
env = gym.make('SpaceInvaders-v0')

In [15]:
### Hyperparameters

# --- Problem dimensions ---

# Shape of one preprocessed frame (matches the output of process_frame
# without its leading batch axis)
state_size = (88, 80, 1)

# Number of discrete actions offered by the environment
action_size = env.action_space.n

# --- Training schedule ---

# How many episodes to play
episodes = 500

# How many experiences are sampled per replay call
batch_size = 8

# How many no-op steps (action 0) are taken at the start of every episode
skip_start = 90

# How many recent frames are averaged into one network input
blend = 4

# --- Running bookkeeping ---

# Total number of steps taken across all episodes
total_time = 0

# Sum of game scores, used to compute the average reward over time
all_rewards = 0

# Terminal-state flag
done = False

In [16]:
# Initialize the DQN agent (the constructor builds the online and target
# networks and prints the summary of the online network)
agent = DQN_Agent(state_size, action_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 22, 20, 32)        2080      
_________________________________________________________________
activation_13 (Activation)   (None, 22, 20, 32)        0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 11, 10, 64)        32832     
_________________________________________________________________
activation_14 (Activation)   (None, 11, 10, 64)        0         
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 11, 10, 64)        36928     
_________________________________________________________________
activation_15 (Activation)   (None, 11, 10, 64)        0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 7040)              0         
__________

In [None]:
# Iterate through episodes
for e in range(episodes):

    # Initialize the total (shaped) reward: game reward minus 1 per step
    total_reward = 0

    # Initialize the (raw) game score
    game_score = 0

    # Get the state + preprocess the state
    state = process_frame(env.reset())

    # Rolling window of the most recent `blend` preprocessed frames
    images = deque(maxlen=blend)

    # Append the state to images
    images.append(state)

    # Skip the start of each game by repeating action 0
    for skip in range(skip_start):
        env.step(0)

    # Blend the frame window into the first network input of the episode.
    # Inside the loop the state is carried over from next_state, which is the
    # blend of the very same window, so it does not need to be recomputed.
    state = blend_images(images, blend)

    # Iterate through timesteps
    for time in range(20000):

        # Render the environment
        env.render()

        # Add one to total_time
        total_time += 1

        # Every update_rate timesteps
        if total_time % agent.update_rate == 0:

            # Update the target network parameters
            agent.update_target_model()

        # Take action
        action = agent.act(state)

        # One timestep of the environment's dynamics
        next_state, reward, done, _ = env.step(action)

        # Average the last `blend` frames
        next_state = process_frame(next_state)
        images.append(next_state)
        next_state = blend_images(images, blend)

        # Add the raw reward into game_score
        game_score += reward

        # Punish behavior which does not accumulate reward. This must happen
        # before the experience is stored, otherwise the penalty never
        # reaches the replay memory and the agent cannot learn from it.
        reward -= 1

        # Store sequence (with the shaped reward) in replay memory
        agent.remember(state, action, reward, next_state, done)

        # Update the state
        state = next_state

        # Append the shaped reward into total_reward
        total_reward += reward

        # If terminal state
        if done:

            # Add game_score into all_rewards
            all_rewards += game_score

            # Print episode, reward, average reward, time
            print("episode: {}/{}, game score: {}, reward: {}, avg reward: {}, time: {}, total time: {}"
                  .format(e+1, episodes, game_score, total_reward, all_rewards/(e+1), time, total_time))

            # Break
            break

        # If length of memory gets larger than batch_size
        if len(agent.memory) > batch_size:

            # Randomly select experiences (in the replay memory) and train on them.
            agent.replay(batch_size)