In [1]:
import warnings
# Silence every FutureWarning for the rest of the session. The filter is
# installed before the imports in the following cells, so it is already active
# when those libraries are loaded.
# NOTE(review): presumably aimed at the deprecation chatter from the
# TensorFlow/NumPy stack behind stable-baselines — confirm, and consider
# narrowing the filter with `module=` so unrelated warnings stay visible.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from environment import NYCEnv

from stable_baselines import ACKTR
from stable_baselines.common.cmd_util import make_vec_env

In [3]:
import pickle
import numpy as np
import pandas as pd

In [4]:
# Load the two pickled lookup tables used by `optimal_reposition`, one per
# shift. They are indexed by an observation tuple and each entry is fed to
# `np.argmax`.
# SECURITY: `pickle.load` executes arbitrary code embedded in the file — only
# load these from a trusted source.
with open('data/emp_Q_shA.pkl', 'rb') as shift_a_file:
    emp_Q_shA = pickle.load(shift_a_file)

with open('data/emp_Q_shB.pkl', 'rb') as shift_b_file:
    emp_Q_shB = pickle.load(shift_b_file)

In [5]:
def optimal_reposition(obs, shifts='A', q_tables=None):
    """Pick the repositioning action for ``obs`` from a per-shift lookup table.

    Parameters
    ----------
    obs : sequence
        Current observation. It is converted to a tuple and used as the
        lookup key; its first element is the fallback action.
    shifts : str, default 'A'
        Which table to consult. With the default tables, 'A' selects
        ``emp_Q_shA`` and 'B' selects ``emp_Q_shB``.
    q_tables : dict, optional
        Mapping ``shift -> {observation tuple -> per-action values}``.
        When omitted, the notebook-level ``emp_Q_shA`` / ``emp_Q_shB``
        tables are used, which reproduces the original behaviour.

    Returns
    -------
    int-like
        ``np.argmax`` of the stored values when the observation is found in
        the selected table; otherwise ``obs[0]`` (the action the notebook's
        "no repositioning" baseline uses).
    """
    if q_tables is None:
        # Resolved at call time so the function follows the notebook globals.
        q_tables = {'A': emp_Q_shA, 'B': emp_Q_shB}

    state = tuple(obs)
    # An unknown shift behaves like an empty table, i.e. falls back to obs[0].
    q_values = q_tables.get(shifts, {}).get(state)
    if q_values is None:
        return state[0]
    return np.argmax(q_values)

In [6]:
# Single environment instance used by the environment check and the baseline
# evaluation loops in the following cells.
# NOTE(review): delta_t=15 is presumably the length of one time step in
# minutes — confirm against NYCEnv.
env = NYCEnv(delta_t=15)

In [7]:
# NOTE(review): this import would normally live with the other imports at the
# top of the notebook.
from stable_baselines.common.env_checker import check_env
# Validate that the custom environment follows the Gym API that
# stable-baselines relies on; warn=True also prints non-fatal recommendations.
check_env(env, warn=True)

In [8]:
# Baseline: act greedily on the shift-A lookup table and collect the return of
# each episode.
# NOTE(review): no RNG seed is set in the visible cells, so the summary
# statistics below will differ between runs.
epochs = 100
n_steps = 100
rewards = np.zeros(epochs)

for epoch in range(epochs):
    obs = env.reset()
    for step in range(n_steps):
        # The opening action of every episode is obs[0] (same action as the
        # "no repositioning" baseline); afterwards the lookup table decides.
        action = obs[0] if step == 0 else optimal_reposition(obs, shifts='A')
        obs, reward, done, info = env.step(action)
        # The outcome of *every* step is checked, including the first one, so
        # an episode that ends immediately is never stepped again.
        if reward == env.TERMINATE_PENALTY:
            # Record the return without the penalty.
            # NOTE(review): assumes env.total_rewards already includes this
            # step's penalty — confirm in NYCEnv.
            rewards[epoch] = env.total_rewards - reward
            break
        if done:
            rewards[epoch] = env.total_rewards
            break
    else:
        # Step budget used up without the episode ending.
        rewards[epoch] = env.total_rewards
pd.DataFrame(rewards).describe()

Unnamed: 0,0
count,100.0
mean,103.171706
std,108.164884
min,-2.078565
25%,0.0
50%,84.351039
75%,190.311163
max,334.270996


In [9]:
# no repositioning
# Baseline: always submit obs[0] as the action and collect the return of each
# episode.
# NOTE(review): no RNG seed is set in the visible cells, so the summary
# statistics below will differ between runs.
epochs = 100
n_steps = 100
rewards = np.zeros(epochs)

for epoch in range(epochs):
    obs = env.reset()
    for step in range(n_steps):
        obs, reward, done, info = env.step(obs[0])
        # The outcome of *every* step is checked, including the first one, so
        # an episode that ends immediately is never stepped again.
        if reward == env.TERMINATE_PENALTY:
            # Record the return without the penalty.
            # NOTE(review): assumes env.total_rewards already includes this
            # step's penalty — confirm in NYCEnv.
            rewards[epoch] = env.total_rewards - reward
            break
        if done:
            rewards[epoch] = env.total_rewards
            break
    else:
        # Step budget used up without the episode ending.
        rewards[epoch] = env.total_rewards
pd.DataFrame(rewards).describe()

Unnamed: 0,0
count,100.0
mean,153.73161
std,159.635944
min,-4.417404
25%,0.0
50%,114.877018
75%,318.827105
max,452.98454


In [62]:
# Wrap the environment in a single-worker VecEnv for training.
# The factory builds a fresh NYCEnv instead of closing over the `env` name:
# this cell rebinds `env`, so a factory returning `env` would wrap the VecEnv
# in itself as soon as the cell is executed a second time.
env = make_vec_env(lambda: NYCEnv(delta_t=15), n_envs=1)

In [63]:
# Build an ACKTR agent with the default MLP policy and train it for 50k
# environment steps; `learn` hands back the trained model.
# NOTE(review): the recorded training log reports ep_len_mean ~ 1 and
# ep_reward_mean ~ -1e4, i.e. episodes end after a single step — check the
# environment's termination/penalty logic before relying on this model.
model = ACKTR(policy='MlpPolicy', env=env, verbose=1)
model = model.learn(total_timesteps=50000)

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
----------------------------------
| ep_len_mean        | 1.05      |
| ep_reward_mean     | -1e+04    |
| explained_variance | -0.00338  |
| fps                | 23        |
| nupdates           | 1         |
| policy_entropy     | 5.58      |
| policy_loss        | -5.57e+04 |
| total_timesteps    | 20        |
| value_loss         | 9.99e+07  |
----------------------------------
----------------------------------
| ep_len_mean        | 1.01      |
| ep_reward_mean     | -1e+04    |
| explained_variance | nan       |
| fps                | 313       |
| nupdates           | 100       |
| policy_entropy     | 4.56      |
| policy_loss        | -4.25e+04 |
| total_timesteps    | 2000      |
| value_loss         | 9.93e+07  |
------

In [64]:
# Test the trained agent: roll out the deterministic policy for at most
# `n_steps` steps and print every transition.
obs = env.reset()
print("obs=", obs)
print('==========')
n_steps = 20
for step in range(n_steps):
    action, _state = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    # The VecEnv resets automatically once an episode is done, so after a
    # terminal step this shows the state of the next episode.
    env.render(mode='console')
    print('==========')
    # `done` is an array with one entry per wrapped environment.
    if np.any(done):
        # `done` only means the episode ended; it also fires when the episode
        # ends with a large negative reward, so do not claim a goal was reached.
        print("Episode finished.", "reward=", reward)
        break

obs= [[41 26]]
Step 1
Action:  [107]
obs= [[144  26]] reward= [-10000.] done= [ True]
Current taxi zone: 144, time: 26, reward: 0.00
Goal reached! reward= [-10000.]
