# Deep Q-Network

# Import

In [1]:
import gym
import random
import numpy as np
from collections import deque
from time import sleep
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from IPython.display import clear_output

# Environment

In [2]:
# Build the slippery FrozenLake environment. `.env` takes the inner environment
# object held by whatever gym.make returns.
# NOTE(review): presumably this drops gym's episode step-limit wrapper — confirm.
env = gym.make('FrozenLake-v0', is_slippery=True).env

In [3]:
# Print the environment's built-in documentation (grid layout and tile legend).
help(env)

Help on FrozenLakeEnv in module gym.envs.toy_text.frozen_lake object:

class FrozenLakeEnv(gym.envs.toy_text.discrete.DiscreteEnv)
 |  FrozenLakeEnv(desc=None, map_name='4x4', is_slippery=True)
 |  
 |  Winter is here. You and your friends were tossing around a frisbee at the park
 |  when you made a wild throw that left the frisbee out in the middle of the lake.
 |  The water is mostly frozen, but there are a few holes where the ice has melted.
 |  If you step into one of those holes, you'll fall into the freezing water.
 |  At this time, there's an international frisbee shortage, so it's absolutely imperative that
 |  you navigate across the lake and retrieve the disc.
 |  However, the ice is slippery, so you won't always move in the direction you intend.
 |  The surface is described using a grid like the following
 |  
 |      SFFF
 |      FHFH
 |      FFFH
 |      HFFG
 |  
 |  S : starting point, safe
 |  F : frozen surface, safe
 |  H : hole, fall to your doom
 |  G : goal, where

In [4]:
# Inspect the action space; its size sets the width of the network's output layer.
env.action_space

Discrete(4)

In [5]:
# Inspect the observation space; its size sets the vocabulary of the embedding layer.
env.observation_space

Discrete(16)

# Model

In [6]:
class DQN(Sequential):
    """Q-value network: embedded discrete state -> dense stack -> one value per action.

    The layer sizes are read from the module-level ``env``: the embedding
    vocabulary is ``env.observation_space.n`` and the output width is
    ``env.action_space.n``.

    Parameters
    ----------
    n_embedded : int
        Width of the state embedding vector.
    n_nodes : int
        Number of units in each hidden dense layer.
    n_hidden : int
        Number of hidden dense layers.
    """

    def __init__(self, n_embedded=5, n_nodes=32, n_hidden=2):
        super().__init__()
        state_count = env.observation_space.n
        action_count = env.action_space.n

        # Layers are created in network order so automatic layer naming is stable.
        stack = [
            Embedding(state_count, n_embedded, input_length=1),
            # Flatten the embedding output down to a plain feature vector.
            Reshape((n_embedded,)),
        ]
        stack.extend(Dense(n_nodes, activation='relu') for _ in range(n_hidden))
        # Linear head: raw Q-value estimates, one per action.
        stack.append(Dense(action_count, activation='linear'))

        for layer in stack:
            self.add(layer)

        self.compile(loss='mse', optimizer='adam')

    def clone_from(self, another):
        """Copy the weights of ``another`` into this network and return ``self``."""
        source_weights = another.get_weights()
        self.set_weights(source_weights)
        return self

In [7]:
# Instantiate the network (default architecture) that is later handed to the agent as its policy.
policy = DQN()

# Replay Memory

In [8]:
class Memory:
    """Bounded replay buffer of ``(state, action, reward, next_state, done)`` tuples.

    Once ``maxlen`` transitions are stored, appending a new one discards the
    oldest.
    """

    def __init__(self, maxlen):
        self._memory = deque(maxlen=maxlen)

    def remember(self, state, action, reward, next_state, done):
        """Store a single transition."""
        transition = (state, action, reward, next_state, done)
        self._memory.append(transition)

    def get_batch(self, batch_size):
        """Draw up to ``batch_size`` distinct transitions uniformly at random.

        Returns
        -------
        tuple
            ``(states, actions, rewards, next_states, dones)``. ``states`` and
            ``next_states`` are stacked with ``np.stack``; the other three are
            object-dtype arrays, row-aligned with the states.
        """
        n_draw = min(len(self._memory), batch_size)
        drawn = random.sample(self._memory, n_draw)
        # One row per transition -> transpose to one row per field.
        columns = np.array(drawn, dtype=object).T
        states, actions, rewards, next_states, dones = columns
        stacked_states = np.stack(states)
        stacked_next_states = np.stack(next_states)
        return stacked_states, actions, rewards, stacked_next_states, dones

# Agent

In [9]:
class Agent:
    """DQN agent: owns the environment, a replay memory, a policy network and a target network.

    Parameters
    ----------
    env
        Environment whose ``action_space.sample()`` provides random actions.
    policy
        Network called on a batch of states to obtain per-action values.
        The target network is created as a fresh ``DQN`` carrying a copy of
        this network's weights.
    """

    def __init__(self, env, policy):
        self._env = env
        self._memory = Memory(100_000)
        self._policy = policy
        self._target = DQN().clone_from(self._policy)

    @property
    def policy(self):
        """The policy network being trained."""
        return self._policy

    def choose_action(self, state, *, epilson=0.5):
        """Pick an action for ``state``.

        NOTE: despite its name, ``epilson`` is the probability of acting
        *greedily*: a random action is taken only when a uniform draw exceeds
        it. ``epilson=1`` is therefore fully greedy and ``epilson=0`` is
        (almost surely) fully random. The keyword spelling is kept because
        callers pass it by name.

        Returns
        -------
        The sampled action, or the argmax of the policy network's output for
        ``state``.
        """
        if np.random.random() > epilson:
            return self._env.action_space.sample()
        # Use this instance's own network. The previous code read the
        # module-level `agent`, which fails before that global exists and picks
        # the wrong network for any other Agent instance.
        return np.argmax(self._policy(tf.constant([state])))

In [10]:
class Agent(Agent):
    def play(self, *, n_steps=500, render=False):
        """Play one greedy episode of at most ``n_steps`` steps.

        Parameters
        ----------
        n_steps : int
            Maximum number of environment steps before the episode is cut off.
        render : bool
            When true, redraw the environment after every step, print a
            summary at the end and close the environment.

        Returns
        -------
        The accumulated reward when ``render`` is false; ``None`` when
        ``render`` is true (the summary is printed instead).
        """
        # Act on the agent's own environment rather than the notebook-level
        # global `env`, so an Agent built around another environment steps
        # the right one.
        env = self._env
        state = env.reset()
        rewards = 0
        i_steps = 0  # stays 0 if n_steps < 1, so the summary below is always defined
        for i_steps in range(1, n_steps+1):
            # epilson=1 makes choose_action always take the greedy branch.
            action = self.choose_action(state, epilson=1)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            if render:
                clear_output(wait=True)
                env.render()
                sleep(0.05)
            if done:
                break
            state = next_state
        if render:
            print(f'Steps taken: {i_steps}, rewards earned: {rewards}')
            env.close()
        else:
            return rewards

In [11]:
class Agent(Agent):
    def train(self, *, batch_size=1024, gamma=0.99):
        """Fit the policy network on one batch sampled from the replay memory.

        For each sampled transition, the policy network's current prediction
        for the taken action is replaced by the bootstrapped target
        ``reward`` (terminal) or ``reward + gamma * max(target_net(next_state))``
        (non-terminal). Predictions for the other actions are left untouched,
        so they contribute zero error.

        Parameters
        ----------
        batch_size : int
            Maximum number of transitions to sample.
        gamma : float
            Discount factor applied to the next state's best target value.
        """
        batch = self._memory.get_batch(batch_size)
        states, actions, rewards, next_states, dones = batch

        next_q = self._target(next_states).numpy()
        q_targets = self._policy(states).numpy()

        for idx in range(len(q_targets)):
            if dones[idx]:
                # Terminal transition: nothing to bootstrap from.
                target = rewards[idx]
            else:
                target = rewards[idx] + gamma*np.max(next_q[idx])
            q_targets[idx][actions[idx]] = target

        X = tf.constant(states)
        y = tf.constant(q_targets)
        # Full-batch gradient steps on the sampled transitions.
        self._policy.fit(X, y, epochs=50, batch_size=len(X), verbose=0)

In [12]:
class Agent(Agent):
    def run(self, *, n_eps=3000, n_steps=500):
        """Training loop: collect one episode, train, and periodically evaluate.

        Per episode the agent plays up to ``n_steps`` steps, stores every
        transition in the replay memory and then calls ``train()`` once.
        Every 10 episodes a greedy evaluation episode is scored, every 20
        episodes the target network is synced with the policy network, and
        every 100 episodes the mean of the most recent evaluation scores
        (at most 50) is printed.

        Parameters
        ----------
        n_eps : int
            Number of training episodes.
        n_steps : int
            Maximum number of steps per training episode.
        """
        # Act on the agent's own environment rather than the notebook-level
        # global `env`.
        env = self._env
        scores = deque(maxlen=50)
        for i_eps in range(1, n_eps+1):
            state = env.reset()
            for _ in range(n_steps):
                # The greedy probability grows linearly from 1/n_eps to 1,
                # so early episodes are mostly random exploration.
                action = self.choose_action(state, epilson=i_eps/n_eps)
                next_state, reward, done, _info = env.step(action)
                self._memory.remember(state, action, reward, next_state, done)
                state = next_state
                if done: break
            self.train()
            if i_eps%10==0:
                scores.append(self.play())
            if i_eps%20==0:
                self._target.clone_from(self._policy)
            if i_eps%2==0:
                # Lightweight progress bar: one mark per two episodes.
                print('#', end='')
            if i_eps%100==0:
                mean_score = sum(scores)/len(scores)
                print(f' | Episode {i_eps:>4d} | rewards: {mean_score:.1f}')

In [13]:
# Wire the agent to the environment and the policy network created earlier.
agent = Agent(env, policy)

In [14]:
# Train with the defaults of Agent.run (n_eps=3000, n_steps=500); progress is printed every 100 episodes.
agent.run()

################################################## | Episode  100 | rewards: 0.0
################################################## | Episode  200 | rewards: 0.1
################################################## | Episode  300 | rewards: 0.2
################################################## | Episode  400 | rewards: 0.3
################################################## | Episode  500 | rewards: 0.4
################################################## | Episode  600 | rewards: 0.6
################################################## | Episode  700 | rewards: 0.6
################################################## | Episode  800 | rewards: 0.7
################################################## | Episode  900 | rewards: 0.7
################################################## | Episode 1000 | rewards: 0.7
################################################## | Episode 1100 | rewards: 0.6
################################################## | Episode 1200 | rewards: 0.6
############################

# Evaluation

In [19]:
# Play one greedy episode with rendering; prints the number of steps taken and the reward earned.
agent.play(render=True)

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Steps taken: 35, rewards earned: 1.0


# Comment

* Compared to Q-Learning, DQN achieves a much better average reward.
* I guess this is because I made use of a Replay Memory, which gives the network more chances to learn the randomness of the slippery ice.
* Also, DQN may have access to a richer state space (the randomness of the actions produces extra variability) and may therefore be able to learn some of that randomness.