# Deep Q-Learning

# Import

In [1]:
import gym
import random
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Reshape
from tensorflow.keras.optimizers import Adam

# Environment

In [None]:
# Build the LunarLander-v2 environment and keep the object stored on the
# `.env` attribute of what `gym.make` returns.
# NOTE(review): presumably this unwraps gym's outer wrapper (e.g. the episode
# time limit) so that episode length is bounded only by the `n_steps` loops
# below — confirm against the installed gym version.
env = gym.make('LunarLander-v2').env

In [3]:
# Exploratory: print the environment class documentation (output below).
help(env)

Help on LunarLander in module gym.envs.box2d.lunar_lander object:

class LunarLander(gym.core.Env, gym.utils.ezpickle.EzPickle)
 |  The main OpenAI Gym class. It encapsulates an environment with
 |  arbitrary behind-the-scenes dynamics. An environment can be
 |  partially or fully observed.
 |  
 |  The main API methods that users of this class need to know are:
 |  
 |      step
 |      reset
 |      render
 |      close
 |      seed
 |  
 |  And set the following attributes:
 |  
 |      action_space: The Space object corresponding to valid actions
 |      observation_space: The Space object corresponding to valid observations
 |      reward_range: A tuple corresponding to the min and max possible rewards
 |  
 |  Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range.
 |  
 |  The methods are accessed publicly as "step", "reset", etc.. The
 |  non-underscored versions are wrapper methods to which we may add
 |  functionality over time.
 |

In [4]:
# Observation space of the environment; its `.shape` is used as the network's input shape.
env.observation_space

Box(8,)

In [5]:
# Action space of the environment; its `.n` is used as the number of network outputs.
env.action_space

Discrete(4)

# Model

In [6]:
class DQN(Sequential):
    """Fully connected Q-network sized from the notebook-level ``env``.

    The stack is: one ReLU ``Dense`` layer that declares the input shape,
    ``n_hidden`` further ReLU ``Dense`` layers, and a linear ``Dense`` output
    with one unit per discrete action. The model is compiled with MSE loss
    and the ``'adam'`` optimizer.

    Parameters
    ----------
    n_nodes : int
        Width of every ReLU layer.
    n_hidden : int
        Number of ReLU layers added after the input-declaring layer (so the
        network has ``n_hidden + 1`` ReLU layers in total).
    """

    def __init__(self, n_nodes=32, n_hidden=2):
        super().__init__()
        # Both sizes are read from the module-level `env`, which must exist
        # before a DQN is instantiated.
        obs_shape = env.observation_space.shape
        n_actions = env.action_space.n

        # Layers are created in the same order they are stacked.
        stack = [Dense(n_nodes, activation='relu', input_shape=obs_shape)]
        stack.extend(Dense(n_nodes, activation='relu') for _ in range(n_hidden))
        stack.append(Dense(n_actions, activation='linear'))
        for layer in stack:
            self.add(layer)

        self.compile(loss='mse', optimizer='adam')

    def clone_from(self, another):
        """Copy the weights of ``another`` into this model and return ``self``."""
        source_weights = another.get_weights()
        self.set_weights(source_weights)
        return self

In [7]:
# Online Q-network, built with DQN's default arguments (n_nodes=32, n_hidden=2).
policy = DQN()

# Replay Memory

In [8]:
class Memory:
    """Bounded replay buffer of ``(state, action, reward, next_state, done)`` tuples.

    Backed by a ``deque(maxlen=maxlen)``, so once the buffer is full each new
    transition evicts the oldest one.
    """

    def __init__(self, maxlen):
        self._memory = deque(maxlen=maxlen)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self._memory.append(transition)

    def get_batch(self, batch_size):
        """Draw up to ``batch_size`` distinct transitions uniformly at random.

        Returns
        -------
        tuple
            ``(states, actions, rewards, next_states, dones)``. ``states`` and
            ``next_states`` are stacked with ``np.stack``; the other three are
            1-D object arrays, one entry per sampled transition.
        """
        # Never ask for more samples than are stored.
        n_draw = min(len(self._memory), batch_size)
        drawn = random.sample(self._memory, n_draw)

        # An object array keeps each field as-is; transposing turns the
        # (n_draw, 5) table into five per-field columns.
        table = np.array(drawn, dtype=object)
        states, actions, rewards, next_states, dones = table.T

        states = np.stack(states)
        next_states = np.stack(next_states)
        return states, actions, rewards, next_states, dones

# Agent

In [9]:
class Agent:
    """DQN agent holding the environment, replay memory, online and target networks.

    Parameters
    ----------
    env
        Environment whose ``action_space.sample()`` supplies random actions.
    policy
        Online Q-network; it is called on a batch of states to get Q-values.
    """

    def __init__(self, env, policy):
        self._env = env
        self._memory = Memory(100_000)  # replay buffer capped at 100k transitions
        self._policy = policy
        # Separate network initialised with the policy's current weights.
        # NOTE(review): DQN() is built with its default arguments, so this
        # assumes `policy` has the same architecture — confirm if a custom
        # DQN is ever passed in.
        self._target = DQN().clone_from(self._policy)

    @property
    def policy(self):
        """The online Q-network given to the constructor."""
        return self._policy

    def _choose_action(self, state, *, epilson=0.5):
        """Pick an action for ``state``, either at random or greedily.

        ``epilson`` (spelling kept because callers pass it by keyword) acts as
        the probability of the *greedy* choice: a uniform draw from ``[0, 1)``
        that exceeds it selects a random action. Hence ``epilson=1`` is always
        greedy and values near 0 are almost always random.

        Returns the sampled action, or the ``np.argmax`` index over the
        policy's Q-values for ``state``.
        """
        if np.random.random() > epilson:
            return self._env.action_space.sample()
        # Query this instance's own network. The previous code reached for the
        # notebook-global `agent`, which fails before that global exists and
        # silently uses the wrong network for any other Agent instance.
        q_values = self._policy(tf.constant([state]))
        return np.argmax(q_values)

In [10]:
class Agent(Agent):
    def play(self, *, n_steps=1000, render=False):
        """Roll out one episode with ``epilson=1`` and no learning.

        Passing ``epilson=1`` to ``_choose_action`` means its random branch
        (taken when a draw from ``[0, 1)`` exceeds ``epilson``) never fires,
        so every action is the policy's argmax.

        Parameters
        ----------
        n_steps : int
            Maximum number of environment steps before the rollout is cut off.
        render : bool
            If true, render every step, print a summary and close the
            environment's viewer at the end.

        Returns
        -------
        The summed reward when ``render`` is false; ``None`` when ``render``
        is true (the summary is printed instead).
        """
        # Act on the environment this agent was constructed with rather than
        # whatever notebook-global `env` happens to be bound.
        env = self._env
        state = env.reset()
        rewards = 0
        i_steps = 0  # stays 0 if n_steps is 0, so the summary below cannot fail
        for i_steps in range(1, n_steps+1):
            action = self._choose_action(state, epilson=1)
            next_state, reward, done, _ = env.step(action)
            if render:
                env.render()
            rewards += reward
            state = next_state
            if done:
                break
        if render:
            print(f'Steps taken: {i_steps}, rewards earned: {rewards}')
            env.close()
        else:
            return rewards

In [11]:
class Agent(Agent):
    def _train(self, *, batch_size=1024, gamma=0.99):
        """Fit the policy network on one batch sampled from replay memory.

        For each sampled transition, the Q-value of the action taken is
        replaced by ``reward`` if the transition ended the episode, otherwise
        by ``reward + gamma * max(target network Q-values of next_state)``.
        The Q-values of the other actions keep the policy's current
        predictions. The policy is then fitted for 10 epochs with the whole
        batch as a single mini-batch.

        Parameters
        ----------
        batch_size : int
            Upper bound on the number of transitions requested from memory.
        gamma : float
            Discount applied to the bootstrapped next-state value.
        """
        states, actions, rewards, next_states, dones = self._memory.get_batch(batch_size)

        # Next-state values come from the target network; the regression
        # labels start from the policy network's own predictions.
        bootstrap_q = self._target(next_states).numpy()
        labels = self._policy(states).numpy()

        for idx in range(len(labels)):
            if dones[idx]:
                new_value = rewards[idx]
            else:
                new_value = rewards[idx] + gamma*np.max(bootstrap_q[idx])
            labels[idx][actions[idx]] = new_value

        inputs = tf.constant(states)
        outputs = tf.constant(labels)
        self._policy.fit(inputs, outputs, epochs=10, batch_size=len(inputs), verbose=0)

In [12]:
class Agent(Agent):
    def run(self, *, n_eps=1000, n_steps=1000, target_score=195, min_eps=1000):
        """Train for up to ``n_eps`` episodes, printing progress as it goes.

        Per episode: collect transitions into replay memory for at most
        ``n_steps`` steps, then call ``_train`` once. Every 10th episode an
        evaluation rollout (``play``) is scored, every 50th the target
        network is re-synced with the policy, and every 100th the mean of the
        last (up to 50) evaluation scores is printed.

        The value passed as ``epilson`` is ``i_eps / n_eps``; it is computed
        from the episode counter of *this* call, so it starts near 0 again
        every time ``run`` is invoked.

        Parameters
        ----------
        n_eps : int
            Number of training episodes.
        n_steps : int
            Maximum environment steps per training episode.
        target_score : float
            Mean evaluation score at which training may stop early.
        min_eps : int
            Early stopping is only considered once ``i_eps > min_eps``. With
            the defaults (``n_eps=1000``, ``min_eps=1000``) that never
            happens, i.e. early stopping is effectively disabled unless
            ``n_eps`` is raised or ``min_eps`` lowered.
        """
        # Train on the environment this agent owns, not the notebook-global `env`.
        env = self._env
        scores = deque(maxlen=50)
        for i_eps in range(1, n_eps+1):
            state = env.reset()
            for _ in range(n_steps):
                action = self._choose_action(state, epilson=i_eps/n_eps)
                next_state, reward, done, _info = env.step(action)
                self._memory.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    break
            self._train()
            if i_eps%10==0:
                scores.append(self.play())
            if i_eps%50==0:
                self._target.clone_from(self._policy)
            if i_eps%2==0:
                print('#', end='')
            if i_eps%100==0:
                # `scores` is non-empty here: every multiple of 100 is also a
                # multiple of 10, so at least one evaluation has been recorded.
                mean_score = sum(scores)/len(scores)
                print(f' | Episode {i_eps:>4d} | rewards: {mean_score:.1f}')
                if i_eps>min_eps and mean_score>=target_score:
                    print(f'\nMean score of {mean_score:.1f} has reached the target.')
                    break

In [13]:
# Bind the agent to the environment and the online network created above.
agent = Agent(env, policy)

In [14]:
# Interactive training: each pass calls agent.run() with its defaults
# (n_eps=1000). Any answer other than "n"/"N" — including just pressing
# Enter — continues with another pass.
while input('Continue Training? ([Y]/n)').upper()!='N':
    agent.run()

Continue Training? ([Y]/n) 


################################################## | Episode  100 | rewards: -296.3
################################################## | Episode  200 | rewards: -327.2
################################################## | Episode  300 | rewards: -271.7
################################################## | Episode  400 | rewards: -247.1
################################################## | Episode  500 | rewards: -234.2
################################################## | Episode  600 | rewards: -194.1
################################################## | Episode  700 | rewards: -151.7
################################################## | Episode  800 | rewards: -130.5
################################################## | Episode  900 | rewards: -103.8
################################################## | Episode 1000 | rewards: -82.6


Continue Training? ([Y]/n) y


################################################## | Episode  100 | rewards: -222.5
################################################## | Episode  200 | rewards: -188.3
################################################## | Episode  300 | rewards: -140.0
################################################## | Episode  400 | rewards: -121.7
################################################## | Episode  500 | rewards: -96.2
################################################## | Episode  600 | rewards: -74.1
################################################## | Episode  700 | rewards: -47.3
################################################## | Episode  800 | rewards: -38.4
################################################## | Episode  900 | rewards: 1.2
################################################## | Episode 1000 | rewards: 29.2


Continue Training? ([Y]/n) y


################################################## | Episode  100 | rewards: 77.9
################################################## | Episode  200 | rewards: 59.1
################################################## | Episode  300 | rewards: 77.2
################################################## | Episode  400 | rewards: 89.6
################################################## | Episode  500 | rewards: 84.0
################################################## | Episode  600 | rewards: 92.8
################################################## | Episode  700 | rewards: 118.0
################################################## | Episode  800 | rewards: 137.0
################################################## | Episode  900 | rewards: 138.2
################################################## | Episode 1000 | rewards: 155.0


Continue Training? ([Y]/n) y


################################################## | Episode  100 | rewards: 20.7
################################################## | Episode  200 | rewards: 81.9
################################################## | Episode  300 | rewards: 68.6
################################################## | Episode  400 | rewards: 54.9
################################################## | Episode  500 | rewards: 51.5
################################################## | Episode  600 | rewards: 58.2
################################################## | Episode  700 | rewards: 20.2
################################################## | Episode  800 | rewards: 22.5
################################################## | Episode  900 | rewards: 37.8
################################################## | Episode 1000 | rewards: 72.6


Continue Training? ([Y]/n) y


################################################## | Episode  100 | rewards: 213.9
################################################## | Episode  200 | rewards: 193.5
################################################## | Episode  300 | rewards: 184.4
################################################## | Episode  400 | rewards: 184.0
################################################## | Episode  500 | rewards: 171.7
################################################## | Episode  600 | rewards: 153.4
################################################## | Episode  700 | rewards: 127.6
################################################## | Episode  800 | rewards: 128.1
################################################## | Episode  900 | rewards: 122.4
################################################## | Episode 1000 | rewards: 123.2


Continue Training? ([Y]/n) n


# Evaluation

In [17]:
# Render one evaluation episode; with render=True, play() prints the step
# count and total reward instead of returning the reward.
agent.play(render=True)

Steps taken: 396, rewards earned: 210.57972816412695


In [16]:
# Save the trained policy network unless the user explicitly answers "n".
# The prompt advertises Y as the default, so an empty answer (just Enter) has
# to count as yes; the previous `== 'Y'` test silently skipped the save in
# that case. This matches the "Continue Training?" prompt's `!= 'N'` rule.
if input('Save model? ([Y]/n)').strip().upper()!='N':
    agent.policy.save('lunarlander_v2@DQN#keras.h5')

Save model? ([Y]/n) y


# Comment

* This environment is relatively difficult, and the agent needs a long time to explore it.
* Stepping through every episode (up to `n_steps` steps, 1000 by default in this notebook) takes time, but this is necessary: the landing only takes place at a late stage of the episode, and it is what earns the largest reward.