In [1]:
import gym
from gym import spaces
import numpy as np

In [2]:
class Car2DEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 2
    }
     
    def __init__(self):
        self.xth = 0
        self.target_x = 0
        self.target_y = 0
        self.L = 10
        self.action_space = spaces.Discrete(5) # # 0:stay, 1:up, 2:down，3:left，4:right
        self.observation_space = spaces.Box(np.array([-self.L, -self.L]), np.array([self.L, self.L]))
        self.state = None
    
    def step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        x, y = self.state
        if action == 0:
            x = x
            y = y
        if action == 1:
            x = x
            y = y + 1
        if action == 2:
            x = x
            y = y - 1
        if action == 3:
            x = x - 1
            y = y
        if action == 4:
            x = x + 1
            y = y
        self.state = np.array([x, y])
        self.counts += 1
        
        done = (np.abs(x)+np.abs(y) <= 1) or (np.abs(x)+np.abs(y) >= 2*self.L+1)
        done = bool(done)
        
        if not done:
            reward = -0.1
        else:
            if np.abs(x)+np.abs(y) <= 1:
                reward = 10
            else:
                reward = -50
            
        return self.state, reward, done, {}
    
    def reset(self):
        self.state = np.ceil(np.random.rand(2)*2*self.L)-self.L
        self.counts = 0
        return self.state
        
    def render(self, mode='human'):
        return None
        
    def close(self):
        return None

In [3]:
env = Car2DEnv()

In [4]:
nb_actions = env.action_space.n

In [5]:
nb_actions

5

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.
  return f(*args, **kwds)


In [7]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                48        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [8]:
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [9]:
dqn.fit(env, nb_steps=1000, visualize=False, verbose=2)

Training for 1000 steps ...




  84/1000: episode: 1, duration: 0.467s, episode steps: 84, steps per second: 180, episode reward: -58.300, mean reward: -0.694 [-50.000, -0.100], mean action: 1.631 [0.000, 4.000], mean observation: -6.619 [-16.000, -2.000], loss: 0.089281, mean_absolute_error: 0.529797, mean_q: 1.022866
 188/1000: episode: 2, duration: 0.206s, episode steps: 104, steps per second: 505, episode reward: -60.300, mean reward: -0.580 [-50.000, -0.100], mean action: 1.471 [0.000, 4.000], mean observation: -1.611 [-13.000, 10.000], loss: 8.040032, mean_absolute_error: 0.989454, mean_q: 1.360462
 214/1000: episode: 3, duration: 0.054s, episode steps: 26, steps per second: 481, episode reward: -52.500, mean reward: -2.019 [-50.000, -0.100], mean action: 1.346 [0.000, 4.000], mean observation: 8.038 [4.000, 12.000], loss: 9.882359, mean_absolute_error: 1.315702, mean_q: 1.625490
 324/1000: episode: 4, duration: 0.216s, episode steps: 110, steps per second: 510, episode reward: -60.900, mean reward: -0.554 [-5

<keras.callbacks.History at 0xb291468d0>

In [10]:
type(dqn)

rl.agents.dqn.DQNAgent

In [11]:
obs, done = env.reset(), False
print(obs)

[2. 4.]


In [12]:
episode_reward = 0
while not done:
    obs, reward, done, _ = env.step(dqn.forward(obs))
    print("state:",obs)
    episode_reward += reward
print([episode_reward, env.counts])

state: [2. 3.]
state: [3. 3.]
state: [3. 3.]
state: [3. 3.]
state: [3. 2.]
state: [3. 1.]
state: [3. 1.]
state: [2. 1.]
state: [2. 1.]
state: [2. 0.]
state: [2. 1.]
state: [1. 1.]
state: [1. 0.]
[8.8, 13]
