In [1]:
#reference: https://zhuanlan.zhihu.com/p/33553076
import gym
from gym import spaces
import numpy as np

In [13]:
class Car2DEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 2
    }
     
    def __init__(self):
        self.xth = 0
        self.target_x = 0
        self.target_y = 0
        self.L = 10
        self.action_space = spaces.Discrete(5) # 0:stay, 1:up, 2:down，3:left，4:right
        self.observation_space = spaces.Box(np.array([-self.L, -self.L]), np.array([self.L, self.L]))
        self.state = None
    
    def step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        x, y = self.state
        if action == 0:
            x = x
            y = y
        if action == 1:
            x = x
            y = y + 1
        if action == 2:
            x = x
            y = y - 1
        if action == 3:
            x = x - 1
            y = y
        if action == 4:
            x = x + 1
            y = y
        self.state = np.array([x, y])
        self.counts += 1
        
        done = (np.abs(x)+np.abs(y) <= 1) or (np.abs(x)+np.abs(y) >= 2*self.L+1)
        done = bool(done)
        
        if not done:
            reward = -0.1
        else:
            if np.abs(x)+np.abs(y) <= 1:
                reward = 10
            else:
                reward = -50
            
        return self.state, reward, done, {}
    
    def reset(self):
        self.state = np.ceil(np.random.rand(2)*2*self.L)-self.L
        self.counts = 0
        return self.state
        
    def render(self, mode='human'):
        return None
        
    def close(self):
        return None

In [14]:
from baselines import deepq

In [15]:
env = Car2DEnv()

In [16]:
model = deepq.models.mlp([32, 16], layer_norm=True)
act = deepq.learn(
    env,
    q_func=model,
    lr=0.01,
    max_timesteps=10000,
    print_freq=1,
    checkpoint_freq=1000
)

print('Finish!')

--------------------------------------
| % time spent exploring  | 85       |
| episodes                | 2        |
| mean 100 episode reward | -64.7    |
| steps                   | 147      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 77       |
| episodes                | 3        |
| mean 100 episode reward | -61.3    |
| steps                   | 228      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 73       |
| episodes                | 4        |
| mean 100 episode reward | -59      |
| steps                   | 271      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 67       |
| episodes                | 5        |
| mean 100 episode reward | -58.1    |
| steps                   | 327      |
--------------------------------------
--------------------------------------
| % time spent exploring 

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


--------------------------------------
| % time spent exploring  | 40       |
| episodes                | 11       |
| mean 100 episode reward | -32      |
| steps                   | 606      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 38       |
| episodes                | 12       |
| mean 100 episode reward | -28.3    |
| steps                   | 628      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 34       |
| episodes                | 13       |
| mean 100 episode reward | -30.5    |
| steps                   | 670      |
--------------------------------------
--------------------------------------
| % time spent exploring  | 33       |
| episodes                | 14       |
| mean 100 episode reward | -27.4    |
| steps                   | 677      |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 47       |
| mean 100 episode reward | -36.8    |
| steps                   | 3575     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 48       |
| mean 100 episode reward | -35.9    |
| steps                   | 3606     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 49       |
| mean 100 episode reward | -36.2    |
| steps                   | 3634     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 50       |
| mean 100 episode reward | -35.3    |
| steps                   | 3652     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 83       |
| mean 100 episode reward | -21.4    |
| steps                   | 4226     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 84       |
| mean 100 episode reward | -21      |
| steps                   | 4229     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 85       |
| mean 100 episode reward | -20.7    |
| steps                   | 4239     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 86       |
| mean 100 episode reward | -20.3    |
| steps                   | 4250     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 119      |
| mean 100 episode reward | -8.7     |
| steps                   | 4637     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 120      |
| mean 100 episode reward | -8.1     |
| steps                   | 4674     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 121      |
| mean 100 episode reward | -7.5     |
| steps                   | 4679     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 122      |
| mean 100 episode reward | -6.9     |
| steps                   | 4695     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 154      |
| mean 100 episode reward | 5.6      |
| steps                   | 5275     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 155      |
| mean 100 episode reward | 5.6      |
| steps                   | 5283     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 156      |
| mean 100 episode reward | 5        |
| steps                   | 5328     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 157      |
| mean 100 episode reward | 5        |
| steps                   | 5343     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 190      |
| mean 100 episode reward | 6.7      |
| steps                   | 5849     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 191      |
| mean 100 episode reward | 6.6      |
| steps                   | 5995     |
--------------------------------------
Saving model due to mean reward increase: -0.6 -> 6.6
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 192      |
| mean 100 episode reward | 6.5      |
| steps                   | 6051     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 193      |
| mean 100 episode reward | 6.5      |
| steps                   | 6056     |
--------------------------------------
----------

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 225      |
| mean 100 episode reward | 6        |
| steps                   | 6449     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 226      |
| mean 100 episode reward | 6.1      |
| steps                   | 6465     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 227      |
| mean 100 episode reward | 6        |
| steps                   | 6483     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 228      |
| mean 100 episode reward | 6        |
| steps                   | 6501     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 261      |
| mean 100 episode reward | 5.5      |
| steps                   | 7089     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 262      |
| mean 100 episode reward | 5.5      |
| steps                   | 7100     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 263      |
| mean 100 episode reward | 5.5      |
| steps                   | 7117     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 264      |
| mean 100 episode reward | 5.5      |
| steps                   | 7124     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 297      |
| mean 100 episode reward | 5.3      |
| steps                   | 7862     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 298      |
| mean 100 episode reward | 5.4      |
| steps                   | 7873     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 299      |
| mean 100 episode reward | 5.3      |
| steps                   | 7891     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 300      |
| mean 100 episode reward | 5.3      |
| steps                   | 7904     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 333      |
| mean 100 episode reward | 5.8      |
| steps                   | 8457     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 334      |
| mean 100 episode reward | 6.4      |
| steps                   | 8463     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 335      |
| mean 100 episode reward | 6.4      |
| steps                   | 8489     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 336      |
| mean 100 episode reward | 6.4      |
| steps                   | 8493     |
--------------------------------------
--------------------------------------
| % time spent exploring 

--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 368      |
| mean 100 episode reward | 6.5      |
| steps                   | 9551     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 369      |
| mean 100 episode reward | 6.5      |
| steps                   | 9609     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 370      |
| mean 100 episode reward | 6.4      |
| steps                   | 9718     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 371      |
| mean 100 episode reward | 6.3      |
| steps                   | 9776     |
--------------------------------------
--------------------------------------
| % time spent exploring 

[2020-05-31 18:26:07,071] Restoring parameters from /var/folders/sz/3k8my8vn6jg6mjj1mhcrjqs80000gn/T/tmp5s0yd722/model


Finish!


In [38]:
obs, done = env.reset(), False
print(obs)

[4. 9.]


In [51]:
episode_reward = 0
while not done:
    obs, reward, done, _ = env.step(act(obs[None])[0])
    print("state:",obs)
    episode_reward += reward
print([episode_reward, env.counts])

state: [4. 8.]
state: [4. 7.]
state: [4. 6.]
state: [3. 6.]
state: [3. 5.]
state: [3. 4.]
state: [3. 3.]
state: [3. 2.]
state: [3. 1.]
state: [3. 0.]
state: [ 3. -1.]
state: [ 2. -1.]
state: [ 1. -1.]
state: [ 0. -1.]
[8.7, 14]
