<a href="https://colab.research.google.com/github/psygrammer/fast_and_slow/blob/master/drl/notebooks/dqn/tf_ch09_Deep_Q_Network_and_Its_Variants_sol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 9. Deep Q Network and Its Variants (sol)

* Fast And Slow / Deep RL - tf2 [1]
* 김무성

#### 실습 repo

In [None]:
!git clone https://github.com/psygrammer/fast_and_slow

Cloning into 'fast_and_slow'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 59 (delta 13), reused 43 (delta 10), pack-reused 0[K
Unpacking objects: 100% (59/59), done.


In [None]:
ls

[0m[01;34mfast_and_slow[0m/  [01;34msample_data[0m/


In [None]:
cd /content/fast_and_slow/drl/notebooks/dqn

/content/fast_and_slow/drl/notebooks/dqn


In [None]:
ls

tf_ch09_Deep_Q_Network_and_Its_Variants.ipynb  [0m[01;34mvideo[0m/


#### Install dependencies

In [None]:
#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!pip install pyglet==1.3.2 > /dev/null 2>&1
!apt-get install -y xvfb x11-utils python-opengl ffmpeg > /dev/null 2>&1

#### Imports and Helper functions

In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [None]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f1465cccdd0>

In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video(video_dir='video'):
  """Embed the first recorded .mp4 found in `video_dir` into the notebook output.

  The file is read, base64-encoded and rendered inline as an HTML <video>
  element. If the directory holds no .mp4 file, a message is printed instead.

  Args:
    video_dir: directory searched for '*.mp4' files. Defaults to 'video',
      the directory wrap_env() records into.
  """
  mp4list = glob.glob(f'{video_dir}/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only access is enough, and the context manager closes the handle
  # (the earlier 'r+b' open was never closed).
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a gym Monitor that records into './video'.

  The Monitor is created with force=True.
  """
  return Monitor(env, './video', force=True)

-------------------

# Playing Atari games using DQN



In [None]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [None]:
# Let's implement the DQN to play the Ms Pacman game. 
# First, let's import the necessary libraries:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

In [None]:
# Now, let's create the Ms Pacman game environment using Gym:
env = gym.make("MsPacman-v0")

In [None]:
env = wrap_env(env) # monitoring

In [None]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [None]:
# Set the state size:
state_size = (88, 80, 1)

In [None]:
# Get the number of actions:
action_size = env.action_space.n

## Preprocess the game screen

Now, let's define a function called preprocess_state which takes the game state (image of the game screen) as an input and returns the preprocessed game state (image of the game screen):

In [None]:
# Greyscale value (mean of the three channels) of the RGB colour (210, 164, 74).
# preprocess_state() sets every pixel whose greyscale value equals this number to 0.
# NOTE(review): presumably this is the Ms. Pacman sprite colour - confirm.
color = np.array([210, 164, 74]).mean()
color

149.33333333333334

In [None]:
def preprocess_state(state, contrast_color=None):
    """Turn a raw RGB game screen into a normalized single-channel network input.

    Steps: keep every second row of rows 1..175 and every second column,
    average the colour channels to greyscale, zero out pixels that match
    `contrast_color`, scale to the zero-centred range [-1, 1), and add a
    leading batch axis and a trailing channel axis.

    Args:
        state: array indexed as (row, column, channel), e.g. the
            (210, 160, 3) uint8 screen reported by env.observation_space.
        contrast_color: greyscale value to be replaced by 0. When None, the
            notebook-level constant `color` is used.

    Returns:
        Float array of shape (1, rows, cols, 1); (1, 88, 80, 1) for a
        (210, 160, 3) input.
    """
    if contrast_color is None:
        # Fall back to the module-level constant defined in an earlier cell.
        contrast_color = color

    # Crop and downsample: rows 1, 3, ..., 175 and every second column.
    image = state[1:176:2, ::2]

    # Convert to greyscale; mean() returns a new float array, so the caller's
    # frame is never modified.
    image = image.mean(axis=2)

    # Improve contrast by blanking the chosen colour.
    image[image == contrast_color] = 0

    # Normalize 0..255 to [-1, 1). The previous formula subtracted an extra 1,
    # which shifted every value into [-2, 0).
    image = (image - 128) / 128

    # Add the batch and channel axes expected by the convolutional network.
    return image[np.newaxis, :, :, np.newaxis]


## Defining the DQN class

In [None]:
class DQN:
    """Deep Q-Network agent with a replay buffer and a separate target network.

    The agent owns two identically built Keras models: `main_network`, which
    is trained, and `target_network`, which supplies the bootstrap targets and
    is refreshed from the main network via update_target_network().
    """

    def __init__(self, state_size, action_size, gamma=0.9, epsilon=0.8,
                 update_rate=1000, buffer_size=5000):
        """Create the agent and build both networks.

        Args:
            state_size: input shape of one state, passed to the first
                convolutional layer as `input_shape`.
            action_size: number of discrete actions (width of the output layer).
            gamma: discount factor applied to the bootstrapped next-state value.
            epsilon: probability of picking a random action in epsilon_greedy().
            update_rate: number of steps between target-network refreshes; the
                class only stores it, the training loop decides when to sync.
            buffer_size: maximum number of transitions kept in the replay
                buffer; older transitions are dropped first.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Replay buffer of (state, action, reward, next_state, done) tuples.
        self.replay_buffer = deque(maxlen=buffer_size)

        self.gamma = gamma
        self.epsilon = epsilon
        self.update_rate = update_rate

        # Build the trained network and the target network, then start them
        # from identical weights.
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.target_network.set_weights(self.main_network.get_weights())

    def build_network(self):
        """Build and compile the convolutional Q-network.

        Three convolutional layers (32, 64 and 64 filters) with ReLU
        activations, a 512-unit dense layer and a linear output with one unit
        per action. Compiled with MSE loss and a default Adam optimizer.

        Returns:
            The compiled Keras Sequential model.
        """
        model = Sequential()

        # First convolutional layer.
        model.add(Conv2D(32, (8, 8), strides=4, padding='same', input_shape=self.state_size))
        model.add(Activation('relu'))

        # Second convolutional layer.
        model.add(Conv2D(64, (4, 4), strides=2, padding='same'))
        model.add(Activation('relu'))

        # Third convolutional layer.
        model.add(Conv2D(64, (3, 3), strides=1, padding='same'))
        model.add(Activation('relu'))

        # Flatten the feature maps and feed them to the fully connected head.
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(loss='mse', optimizer=Adam())
        return model

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    # Backward-compatible alias: existing callers use the misspelled name.
    store_transistion = store_transition

    def epsilon_greedy(self, state):
        """Select an action with the epsilon-greedy policy.

        With probability `self.epsilon` a uniformly random action index is
        returned; otherwise the action with the highest Q value predicted by
        the main network for `state`.

        Args:
            state: a batch containing a single state (leading axis of size 1).

        Returns:
            The selected action index.
        """
        if random.uniform(0, 1) < self.epsilon:
            return np.random.randint(self.action_size)

        # verbose=0 keeps newer Keras versions from printing a progress bar
        # for every single-state prediction.
        Q_values = self.main_network.predict(state, verbose=0)
        return np.argmax(Q_values[0])

    def train(self, batch_size):
        """Run one minibatch update of the main network from replayed transitions.

        `batch_size` transitions are sampled without replacement. For each one
        the target is `reward` when the transition is terminal, and otherwise
        `reward + gamma * max_a Q_target(next_state, a)`. Only the Q value of
        the action actually taken is replaced by its target, so the other
        outputs contribute zero error.

        All sampled transitions are processed together: one target-network
        prediction, one main-network prediction and one fit call per minibatch
        instead of one of each per transition.

        Args:
            batch_size: number of transitions to sample.

        Raises:
            ValueError: from random.sample when the buffer holds fewer than
                `batch_size` transitions.
        """
        minibatch = random.sample(self.replay_buffer, batch_size)

        # Each stored state already carries a leading batch axis of size 1, so
        # concatenating along axis 0 produces the batch.
        states = np.concatenate([transition[0] for transition in minibatch], axis=0)
        actions = np.array([transition[1] for transition in minibatch])
        rewards = np.array([transition[2] for transition in minibatch], dtype=float)
        next_states = np.concatenate([transition[3] for transition in minibatch], axis=0)
        dones = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Bootstrapped targets from the target network; terminal transitions
        # use the plain reward.
        next_Q = self.target_network.predict(next_states, verbose=0)
        target_Q = np.where(dones, rewards, rewards + self.gamma * np.amax(next_Q, axis=1))

        # Start from the main network's own predictions and overwrite only the
        # entries of the actions that were taken.
        Q_values = self.main_network.predict(states, verbose=0)
        Q_values[np.arange(len(minibatch)), actions] = target_Q

        # One gradient step over the whole minibatch.
        self.main_network.fit(states, Q_values, batch_size=len(minibatch), epochs=1, verbose=0)

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.set_weights(self.main_network.get_weights())

## Training the DQN

In [None]:
# Now, let's train the network. 
# First, let's set the number of episodes we want to train the network:
num_episodes = 500

In [None]:
# Define the number of time steps
num_timesteps = 20000

In [None]:
# Define the batch size:
batch_size = 8

In [None]:
# Set the number of past game screens we want to consider:
num_screens = 4

In [None]:
ls

tf_ch09_Deep_Q_Network_and_Its_Variants.ipynb  [0m[01;34mvideo[0m/


In [None]:
ls video

In [None]:
# Instantiate the DQN class
dqn = DQN(state_size, action_size)

In [None]:
done = False
time_step = 0

# Run the requested number of training episodes.
for i in range(num_episodes):
    # Cumulative reward collected during this episode.
    Return = 0

    # Start a new game and preprocess its first screen.
    state = preprocess_state(env.reset())

    # Step through the episode, capped at num_timesteps steps.
    for t in range(num_timesteps):

        # Draw the current frame.
        env.render()

        # Count environment steps across all episodes.
        time_step += 1

        # Sync the target network with the main network every update_rate steps.
        if not time_step % dqn.update_rate:
            dqn.update_target_network()

        # Pick an action epsilon-greedily and apply it to the environment.
        action = dqn.epsilon_greedy(state)
        next_state, reward, done, _ = env.step(action)

        # Bring the new screen into the network's input format.
        next_state = preprocess_state(next_state)

        # Accumulate the episode return.
        Return += reward

        # Remember the transition, then advance to the next state.
        dqn.store_transistion(state, action, reward, next_state, done)
        state = next_state

        # Report the return and leave the step loop once the episode ends.
        if done:
            print('Episode: ',i, ',' 'Return', Return)
            break

        # Train once the replay buffer holds more transitions than one batch.
        if batch_size < len(dqn.replay_buffer):
            dqn.train(batch_size)

KeyboardInterrupt: ignored

In [None]:
env.close()

In [None]:
%ls video

In [None]:
show_video()