# Deep Q-Network

# Import

In [1]:
import gym
import random
import numpy as np
from collections import deque
from time import sleep
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Environment

In [2]:
# Build CartPole-v1 and keep the object behind `.env` rather than the one gym.make returns.
# NOTE(review): presumably this drops gym's step-limit wrapper so that episode length is
# bounded only by the n_steps loops in Agent.play / Agent.run — confirm.
env = gym.make('CartPole-v1').env

In [3]:
# Print the environment's built-in documentation (observation and action layout).
help(env)

Help on CartPoleEnv in module gym.envs.classic_control.cartpole object:

class CartPoleEnv(gym.core.Env)
 |  Description:
 |      A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity.
 |  
 |  Source:
 |      This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson
 |  
 |  Observation: 
 |      Type: Box(4)
 |      Num     Observation                 Min         Max
 |      0       Cart Position             -4.8            4.8
 |      1       Cart Velocity             -Inf            Inf
 |      2       Pole Angle                 -24 deg        24 deg
 |      3       Pole Velocity At Tip      -Inf            Inf
 |      
 |  Actions:
 |      Type: Discrete(2)
 |      Num     Action
 |      0       Push cart to the left
 |      1       Push cart to the right
 |      
 |  

In [4]:
# Observation space; its shape is used as the network's input shape in DQN.
env.observation_space

Box(4,)

In [5]:
# Action space; its size `n` is used as the number of network outputs in DQN.
env.action_space

Discrete(2)

# Model

In [6]:
class DQN(Sequential):
    """Fully connected Q-network: one observation in, one Q-value per action out.

    The stack is ``1 + n_hidden`` ReLU Dense layers of ``n_nodes`` units each,
    followed by a linear Dense output layer. The model is compiled with MSE
    loss and the Adam optimizer.

    Parameters
    ----------
    n_nodes : int
        Units in every ReLU layer.
    n_hidden : int
        Number of ReLU layers added after the first (input-facing) one.
    input_shape : tuple, optional
        Shape of a single observation. Defaults to the shape of the
        notebook-level ``env.observation_space``.
    n_actions : int, optional
        Number of outputs. Defaults to the notebook-level ``env.action_space.n``.
    """

    def __init__(self, n_nodes=32, n_hidden=2, *, input_shape=None, n_actions=None):
        super().__init__()
        # Fall back to the notebook-level environment so DQN() keeps working as before;
        # the explicit arguments allow reuse with another environment.
        if input_shape is None:
            input_shape = env.observation_space.shape
        if n_actions is None:
            n_actions = env.action_space.n
        # input layer
        self.add(Dense(n_nodes, activation='relu', input_shape=input_shape))
        # hidden layers
        for _ in range(n_hidden):
            self.add(Dense(n_nodes, activation='relu'))
        # output layer: linear, because Q-values are unbounded regression targets
        self.add(Dense(n_actions, activation='linear'))
        # compile
        self.compile(loss='mse', optimizer='adam')

    def clone_from(self, another):
        """Copy the weights of ``another`` into this network and return ``self``.

        Both networks must have the same architecture, since the weights are
        transferred with ``get_weights`` / ``set_weights``.
        """
        self.set_weights(another.get_weights())
        return self

In [7]:
# Online (policy) network with the default layer sizes; it is handed to the Agent below.
policy = DQN()

# Replay Memory

In [8]:
class Memory:
    """Bounded replay buffer of ``(state, action, reward, next_state, done)`` transitions."""

    def __init__(self, maxlen):
        # A deque with maxlen silently discards the oldest transition once it is full.
        self._memory = deque(maxlen=maxlen)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self._memory)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self._memory.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        """Sample up to ``batch_size`` distinct transitions uniformly at random.

        Returns
        -------
        states, actions, rewards, next_states, dones
            ``states`` and ``next_states`` are stacked into arrays whose first
            axis is the sample axis; ``actions``, ``rewards`` and ``dones`` are
            1-D object arrays with one entry per sampled transition.

        Raises
        ------
        ValueError
            If the buffer is empty or ``batch_size`` is not positive.
        """
        n_samples = min(len(self._memory), batch_size)
        if n_samples <= 0:
            # Without this guard the tuple unpacking below fails with an opaque
            # "not enough values to unpack" error.
            raise ValueError(
                'cannot sample a batch: replay memory is empty or batch_size is not positive'
            )
        samples = random.sample(self._memory, n_samples)
        # Object array of shape (n_samples, 5); transposing yields one row per field.
        batch = np.array(samples, dtype=object).transpose()
        states, actions, rewards, next_states, dones = batch
        states, next_states = np.stack(states), np.stack(next_states)
        return states, actions, rewards, next_states, dones

# Agent

In [9]:
class Agent:
    """DQN agent holding an environment, a replay memory, a policy network and a target network."""

    def __init__(self, env, policy):
        self._env = env
        self._memory = Memory(100_000)
        self._policy = policy
        # The target network starts as a weight-for-weight copy of the policy network.
        self._target = DQN().clone_from(self._policy)

    @property
    def policy(self):
        """The online (policy) network."""
        return self._policy

    def choose_action(self, state, *, epilson=0.5):
        """Pick an action for ``state``.

        ``epilson`` is the probability of acting greedily: a uniform draw in
        [0, 1) above ``epilson`` yields a random action from the action space,
        otherwise the action with the highest predicted Q-value is returned.
        ``epilson=1`` is therefore fully greedy.
        """
        if np.random.random() > epilson:
            return self._env.action_space.sample()
        # Query this agent's own policy network (not a notebook-level `agent`),
        # with a leading batch axis of size 1.
        observation = np.expand_dims(np.asarray(state, dtype=np.float32), axis=0)
        return np.argmax(self._policy(observation))

In [10]:
# Extends the Agent class defined in the previous cell with an evaluation roll-out.
class Agent(Agent):
    def play(self, *, n_steps=500, render=False):
        """Run one greedy episode of at most ``n_steps`` steps.

        Actions are chosen with ``epilson=1`` (always greedy). Nothing is
        written to the replay memory.

        Returns
        -------
        The accumulated reward when ``render`` is False. When ``render`` is
        True the episode is rendered, a summary line is printed, the
        environment is closed and ``None`` is returned.
        """
        env = self._env  # use the agent's own environment, not the notebook global
        state = env.reset()
        rewards = 0
        i_steps = 0  # keeps the summary line valid even when n_steps <= 0
        for i_steps in range(1, n_steps+1):
            action = self.choose_action(state, epilson=1)
            state, reward, done, info = env.step(action)
            if render:
                env.render()
            rewards += reward
            if done:
                break
        if render:
            print(f'Steps taken: {i_steps}, rewards earned: {rewards}')
            env.close()
        else:
            return rewards

In [11]:
# Extends the Agent class defined in the previous cells with the learning step.
class Agent(Agent):
    def train(self, *, batch_size=1024, gamma=0.90, epochs=50):
        """Fit the policy network on one batch sampled from the replay memory.

        For every sampled transition the target of the action that was taken is
        ``reward`` if the transition ended the episode, otherwise
        ``reward + gamma * max(target_network(next_state))``. Targets of the
        other actions are the policy network's own current predictions, so they
        contribute no error.

        Parameters
        ----------
        batch_size : int
            Maximum number of transitions to sample.
        gamma : float
            Discount factor applied to the bootstrapped next-state value.
        epochs : int
            Number of full-batch passes of ``fit`` over the sampled batch.
        """
        states, actions, rewards, next_states, dones = self._memory.get_batch(batch_size)
        # The memory returns object arrays; give them concrete dtypes for vectorised maths.
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        next_states_predicts = self._target(next_states).numpy()
        q_targets = self._policy(states).numpy()

        # Bootstrapped value of the next state, dropped for terminal transitions.
        bootstrap = np.where(dones, 0.0, gamma * next_states_predicts.max(axis=1))
        # Overwrite only the entry of the action actually taken in each row.
        q_targets[np.arange(len(q_targets)), actions] = rewards + bootstrap

        X, y = tf.constant(states), tf.constant(q_targets)
        # batch_size=len(X): each epoch is a single full-batch gradient step.
        self._policy.fit(X, y, epochs=epochs, batch_size=len(X), verbose=0)

In [12]:
# Extends the Agent class defined in the previous cells with the training loop.
class Agent(Agent):
    def run(self, *, n_eps=3000, n_steps=500):
        """Train for up to ``n_eps`` episodes of at most ``n_steps`` steps each.

        Per episode: collect transitions into the replay memory, then call
        ``train()`` once. The probability of acting greedily grows linearly
        from ``1/n_eps`` to 1 over the run.

        Schedule:
        - every 10 episodes: one greedy evaluation episode via ``play()``; the
          last 50 evaluation scores are kept;
        - every 20 episodes: the target network is synced with the policy network;
        - every 2 episodes: a ``#`` progress mark is printed;
        - every 100 episodes: the mean evaluation score is printed, and training
          stops early if more than 1000 episodes have run and the mean is >= 475.
        """
        env = self._env  # use the agent's own environment, not the notebook global
        scores = deque(maxlen=50)
        for i_eps in range(1, n_eps+1):
            state = env.reset()
            for _ in range(n_steps):
                action = self.choose_action(state, epilson=i_eps/n_eps)
                next_state, reward, done, info = env.step(action)
                self._memory.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    break
            self.train()
            if i_eps%10==0:
                scores.append(self.play())
            if i_eps%20==0:
                self._target.clone_from(self._policy)
            if i_eps%2==0:
                print('#', end='')
            if i_eps%100==0:
                # scores is non-empty here: at least ten evaluations have been recorded.
                mean_score = sum(scores)/len(scores)
                print(f' | Episode {i_eps:>4d} | rewards: {mean_score:.1f}')
                if i_eps>1000 and mean_score>=475:
                    print(f'\nMean score of {mean_score:.1f} has reached the target.')
                    break

In [13]:
# Bind the environment and the policy network into an agent; the agent creates its own
# replay memory and a target network copied from the policy.
agent = Agent(env, policy)

In [14]:
# Train with the defaults (up to 3000 episodes of at most 500 steps). A progress line with
# the mean greedy evaluation score is printed every 100 episodes.
agent.run()

################################################## | Episode  100 | rewards: 25.0
################################################## | Episode  200 | rewards: 68.9
################################################## | Episode  300 | rewards: 92.0
################################################## | Episode  400 | rewards: 88.2
################################################## | Episode  500 | rewards: 88.5
################################################## | Episode  600 | rewards: 110.5
################################################## | Episode  700 | rewards: 114.5
################################################## | Episode  800 | rewards: 103.7
################################################## | Episode  900 | rewards: 107.4
################################################## | Episode 1000 | rewards: 123.0
################################################## | Episode 1100 | rewards: 145.2
################################################## | Episode 1200 | rewards: 140.9
#########

# Evaluation

In [15]:
# Roll out one greedy episode with rendering; prints the step count and the total reward.
agent.play(render=True)

Steps taken: 216, rewards earned: 216.0


In [16]:
# Ask before writing the trained policy network to disk.
# The prompt advertises "Y" as the default, so an empty answer also saves.
answer = input('Save model? ([Y]/n)').strip().upper()
if answer in ('', 'Y', 'YES'):
    agent.policy.save('cartpole_v1@DQN#keras.h5')

Save model? ([Y]/n) y


# Comment

* Compared to CartPole-v0, CartPole-v1 allows a larger number of steps before termination and therefore has a larger state space.
* Training takes longer, is more difficult, and is more prone to variance.
* Therefore, early stopping is implemented to prevent overfitting: after episode 1000, training stops as soon as the mean evaluation score reaches 475.
* The longer the time frame, the more variability the system has. My experience is that there is no guarantee the system will converge. Using a larger replay buffer size may help.