Importing open ai gym and the package

In [1]:
import gym
import numpy as np
import airline
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.policies import ActorCriticPolicy
import torch as th
from GruActorCriticPolicy import CustomActorCriticPolicy as GruACP
from LongActorCriticPolicy import CustomActorCriticPolicy as LACP

Define the function for evaluating the model on a given environment for a specified number of episodes. 

In [14]:
def evaluate(model, env, numiters):
    rewards = []
    for j in range(numiters):
        obs = env.reset()
        tot_reward = 0
        for i in range(env.tau):
            action, states = model.predict(obs)
            _, reward, _, _ = env.step(action)
            tot_reward += reward
        rewards.append(tot_reward)
    return np.mean(rewards), np.std(rewards), np.amax(rewards), np.min(rewards)

# Model is the random policy
def policy(env):
    return env.action_space.sample()
    

Setting up the configuration for the environment

In [None]:
A = np.asarray([[1, 1, 0,0,0,0], [ 0,0, 1, 1, 1, 1], [ 0,0, 0,0, 1, 1] ])
tau = 23
P = np.ones((tau, A.shape[1]))/3
c = [5, 5, 5]
f = range(10, 16)
CONFIG = {'A': A, 'f': f, 'P': P, 'starting_state': c , 'tau': tau}

In [5]:
m = 6
l = 3
A = np.identity(m)
for i in range(l):
    for j in range(l):
        if i != j:
            demand_col = np.zeros((m, 1))
            demand_col[2 * i + 1] = 1.0
            demand_col[2 * j] = 1.0
            A=  np.append(A, demand_col, axis = 1)
A = np.append(A, A, axis = 1)
tau = 20
P = np.array([0.01327884, 0.02244177, 0.07923761, 0.0297121,  0.02654582, 0.08408091, 0.09591975, 0.00671065, 0.08147508, 0.00977341, 0.02966204, 0.121162, 0.00442628, 0.00748059, 0.02641254, 0.00990403, 0.00884861, 0.02802697, 0.03197325, 0.00223688, 0.02715836, 0.0032578,  0.00988735, 0.04038733])
P = np.array([P]*tau)
c = [2]*6
f = np.array([33, 28, 36, 34, 17, 20, 39, 24, 31, 19, 30, 48, 165, 140, 180, 170, 85, 100,195, 120, 155, 95, 150, 240])
CONFIG = {'A': A, 'f': f, 'P': P, 'starting_state': c , 'tau': tau}

Making an instance of the environment

In [6]:
np.random.seed(1)
env = gym.make('Airline-v0', config=CONFIG)
check_env(env)

In [7]:
#Baseline MlpPolicy
model = A2C("MlpPolicy", env, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
#OneSharedLayer
policy_kwargs = dict(net_arch=[64, dict(vf=[64, 64], pi=[64, 64])  ])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

In [None]:
#WideMlp
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[1024, 1024, 1024 ])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

In [None]:
#DeepMlp
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch = [32, 32, 64, 64, 128, 256, 128, 64, 64, 32, 32])
model = A2C(LongActorCriticPolicy, env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

In [None]:
#Gru
model = A2C(GruACP, env, n_steps = 5, verbose=1)

In [8]:
model.learn(total_timesteps =200000)

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 20       |
|    ep_rew_mean        | 162      |
| time/                 |          |
|    fps                | 914      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -16      |
|    explained_variance | 5.96e-08 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.329   |
|    value_loss         | 0.000579 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 20       |
|    ep_rew_mean        | 176      |
| time/                 |          |
|    fps                | 906      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x7ff66c94f2e0>

In [None]:
model.save("a2cDeep")

In [None]:
obs = env.reset()

In [None]:
model.predict(obs)

Testing the variables

In [None]:
print(env.state)
print(env.action_space)
print(env.observation_space) # openAI calles states observations :/

Testing the step function

In [None]:
env.state = [1,0, 0]
print(env.step(np.asarray([1,1, 1, 1, 1, 1]), 5))
print('***')
env.state = [3, 3, 3]
print(env.step([1,1, 1, 1, 1, 1], 5))

test sample uniformly from action space at random

In [None]:
env.action_space.sample()

Simulating randomized policy

In [None]:
e = 50
rewards = []
for j in range(e):
    obs = env.reset()
    tot_reward = 0
    for i in range(tau):
        action, states_ = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        tot_reward += reward
    rewards.append(tot_reward)

Test Evaluate Function

In [15]:
print('Here is the mean and standard deviation of the estimate ' + str(evaluate(model, env, 1000)))

Here is the mean and standard deviation of the estimate (232.119, 129.36994565585934, 682, 40)
