Importing open ai gym and the package

In [31]:
import gym
import numpy as np
import airline
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.policies import ActorCriticPolicy
import torch as th

Define the function for evaluating the model on a given environment for a specified number of episodes. 

In [32]:
def evaluate(model, env, numiters):
    rewards = []
    for j in range(numiters):
        obs = env.reset()
        tot_reward = 0
        for i in range(env.tau):
            action, states = model.predict(obs)
            _, reward, _, _ = env.step(action)
            tot_reward += reward
        rewards.append(tot_reward)
    return np.mean(rewards), np.std(rewards)

# Model is the random policy
def policy(env):
    return env.action_space.sample()
    

Setting up the configuration for the environment

In [33]:
A = np.asarray([[1, 1, 0,0,0,0], [ 0,0, 1, 1, 1, 1], [ 0,0, 0,0, 1, 1] ])
tau = 23
P = np.ones((tau, A.shape[1]))/3
c = [5, 5, 5]
f = range(10, 16)
CONFIG = {'A': A, 'f': f, 'P': P, 'starting_state': c , 'tau': tau}

Making an instance of the environment

In [34]:
env = gym.make('Airline-v0', config=CONFIG)

In [35]:
check_env(env)

In [None]:
#Baseline MlpPolicy
model = A2C("MlpPolicy", env, n_steps = 5, verbose=1)

In [23]:
#OneSharedLayer
policy_kwargs = dict(net_arch=[64, dict(vf=[64, 64], pi=[64, 64])  ])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

array([5, 5, 5])

In [28]:
#WideMlp
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[1024, 1024, 1024 ])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [36]:
#DeepMlp
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch = [32, 32, 64, 64, 128, 256, 128, 64, 64, 32, 32])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [37]:
model.learn(total_timesteps =250000)

------------------------------------
| time/                 |          |
|    fps                | 300      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.08e-09 |
|    value_loss         | 6.66e-16 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 302      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 4.34e-07 |
|    value_loss         | 6.52e-15 |
-

-------------------------------------
| time/                 |           |
|    fps                | 304       |
|    iterations         | 1700      |
|    time_elapsed       | 27        |
|    total_timesteps    | 8500      |
| train/                |           |
|    entropy_loss       | -4.15     |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 1699      |
|    policy_loss        | -0.000123 |
|    value_loss         | 1.09e-09  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 304       |
|    iterations         | 1800      |
|    time_elapsed       | 29        |
|    total_timesteps    | 9000      |
| train/                |           |
|    entropy_loss       | -4.15     |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 1799      |
|    policy_loss        | -0.000182 |
|    value_l

------------------------------------
| time/                 |          |
|    fps                | 298      |
|    iterations         | 3300     |
|    time_elapsed       | 55       |
|    total_timesteps    | 16500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 3299     |
|    policy_loss        | 3.83e-05 |
|    value_loss         | 1.05e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 297      |
|    iterations         | 3400     |
|    time_elapsed       | 57       |
|    total_timesteps    | 17000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 3399     |
|    policy_loss        | 3.22e-05 |
|    value_loss         | 7.28e-11 |
-

------------------------------------
| time/                 |          |
|    fps                | 297      |
|    iterations         | 4900     |
|    time_elapsed       | 82       |
|    total_timesteps    | 24500    |
| train/                |          |
|    entropy_loss       | -4.15    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 4899     |
|    policy_loss        | 6.37e-05 |
|    value_loss         | 2.91e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 297      |
|    iterations         | 5000     |
|    time_elapsed       | 84       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -4.15    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 4999     |
|    policy_loss        | 0.00021  |
|    value_loss         | 3.13e-09 |
-

------------------------------------
| time/                 |          |
|    fps                | 292      |
|    iterations         | 6500     |
|    time_elapsed       | 111      |
|    total_timesteps    | 32500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 6499     |
|    policy_loss        | 4.14e-05 |
|    value_loss         | 1.2e-10  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 291      |
|    iterations         | 6600     |
|    time_elapsed       | 113      |
|    total_timesteps    | 33000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 6599     |
|    policy_loss        | 7.27e-05 |
|    value_loss         | 3.78e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 285      |
|    iterations         | 8100     |
|    time_elapsed       | 141      |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | 2.19e-05 |
|    value_loss         | 3.31e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 284      |
|    iterations         | 8200     |
|    time_elapsed       | 143      |
|    total_timesteps    | 41000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 8199     |
|    policy_loss        | 6.69e-05 |
|    value_loss         | 3.14e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 276      |
|    iterations         | 9700     |
|    time_elapsed       | 175      |
|    total_timesteps    | 48500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 9699     |
|    policy_loss        | 4.72e-05 |
|    value_loss         | 1.56e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 276      |
|    iterations         | 9800     |
|    time_elapsed       | 177      |
|    total_timesteps    | 49000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 9799     |
|    policy_loss        | 4.01e-05 |
|    value_loss         | 1.13e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 272      |
|    iterations         | 11300    |
|    time_elapsed       | 207      |
|    total_timesteps    | 56500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 11299    |
|    policy_loss        | 7.15e-05 |
|    value_loss         | 3.65e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 272      |
|    iterations         | 11400    |
|    time_elapsed       | 209      |
|    total_timesteps    | 57000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 11399    |
|    policy_loss        | 9.85e-05 |
|    value_loss         | 6.81e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 270      |
|    iterations         | 12900    |
|    time_elapsed       | 238      |
|    total_timesteps    | 64500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 12899    |
|    policy_loss        | 7.1e-05  |
|    value_loss         | 3.55e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 270      |
|    iterations         | 13000    |
|    time_elapsed       | 240      |
|    total_timesteps    | 65000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 12999    |
|    policy_loss        | 6.74e-06 |
|    value_loss         | 3.22e-12 |
-

------------------------------------
| time/                 |          |
|    fps                | 269      |
|    iterations         | 14500    |
|    time_elapsed       | 268      |
|    total_timesteps    | 72500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 14499    |
|    policy_loss        | 5.78e-05 |
|    value_loss         | 2.38e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 269      |
|    iterations         | 14600    |
|    time_elapsed       | 270      |
|    total_timesteps    | 73000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 14599    |
|    policy_loss        | 4.24e-05 |
|    value_loss         | 1.27e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 267      |
|    iterations         | 16100    |
|    time_elapsed       | 300      |
|    total_timesteps    | 80500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 16099    |
|    policy_loss        | 6.22e-05 |
|    value_loss         | 2.67e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 267      |
|    iterations         | 16200    |
|    time_elapsed       | 302      |
|    total_timesteps    | 81000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 16199    |
|    policy_loss        | 4.58e-05 |
|    value_loss         | 1.47e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 267      |
|    iterations         | 17700    |
|    time_elapsed       | 331      |
|    total_timesteps    | 88500    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 17699    |
|    policy_loss        | 4.37e-05 |
|    value_loss         | 1.35e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 267      |
|    iterations         | 17800    |
|    time_elapsed       | 333      |
|    total_timesteps    | 89000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 17799    |
|    policy_loss        | 5.83e-05 |
|    value_loss         | 2.42e-10 |
-

-------------------------------------
| time/                 |           |
|    fps                | 266       |
|    iterations         | 19300     |
|    time_elapsed       | 362       |
|    total_timesteps    | 96500     |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 19299     |
|    policy_loss        | 3.23e-05  |
|    value_loss         | 7.4e-11   |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 265      |
|    iterations         | 19400    |
|    time_elapsed       | 365      |
|    total_timesteps    | 97000    |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 19399    |
|    policy_loss        | 4.11e-05 |
|    value_loss         

-------------------------------------
| time/                 |           |
|    fps                | 262       |
|    iterations         | 20900     |
|    time_elapsed       | 398       |
|    total_timesteps    | 104500    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 20899     |
|    policy_loss        | 4.61e-05  |
|    value_loss         | 1.51e-10  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 262      |
|    iterations         | 21000    |
|    time_elapsed       | 400      |
|    total_timesteps    | 105000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 20999    |
|    policy_loss        | 4.58e-05 |
|    value_loss         

------------------------------------
| time/                 |          |
|    fps                | 261      |
|    iterations         | 22500    |
|    time_elapsed       | 430      |
|    total_timesteps    | 112500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 22499    |
|    policy_loss        | 3.25e-05 |
|    value_loss         | 7.55e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 261      |
|    iterations         | 22600    |
|    time_elapsed       | 432      |
|    total_timesteps    | 113000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 1.79e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 22599    |
|    policy_loss        | 4.14e-05 |
|    value_loss         | 1.19e-10 |
-

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 24100    |
|    time_elapsed       | 462      |
|    total_timesteps    | 120500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 24099    |
|    policy_loss        | 5.12e-05 |
|    value_loss         | 1.81e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 24200    |
|    time_elapsed       | 464      |
|    total_timesteps    | 121000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 24199    |
|    policy_loss        | 4.9e-05  |
|    value_loss         | 1.7e-10  |
-

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 25700    |
|    time_elapsed       | 492      |
|    total_timesteps    | 128500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 25699    |
|    policy_loss        | 4.26e-05 |
|    value_loss         | 1.27e-10 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 25800    |
|    time_elapsed       | 494      |
|    total_timesteps    | 129000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 25799    |
|    policy_loss        | 3.59e-05 |
|    value_loss         | 9.29e-11 |
-

------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 27300    |
|    time_elapsed       | 525      |
|    total_timesteps    | 136500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 27299    |
|    policy_loss        | 2.75e-05 |
|    value_loss         | 5.38e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 27400    |
|    time_elapsed       | 526      |
|    total_timesteps    | 137000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 27399    |
|    policy_loss        | 4.01e-05 |
|    value_loss         | 1.1e-10  |
-

-------------------------------------
| time/                 |           |
|    fps                | 260       |
|    iterations         | 28900     |
|    time_elapsed       | 555       |
|    total_timesteps    | 144500    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 28899     |
|    policy_loss        | 3.6e-05   |
|    value_loss         | 9.12e-11  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 29000    |
|    time_elapsed       | 557      |
|    total_timesteps    | 145000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 28999    |
|    policy_loss        | 3.77e-05 |
|    value_loss         

-------------------------------------
| time/                 |           |
|    fps                | 259       |
|    iterations         | 30500     |
|    time_elapsed       | 588       |
|    total_timesteps    | 152500    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 30499     |
|    policy_loss        | 1.49e-05  |
|    value_loss         | 1.54e-11  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 30600    |
|    time_elapsed       | 590      |
|    total_timesteps    | 153000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 30599    |
|    policy_loss        | 3.08e-05 |
|    value_loss         

------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 32100    |
|    time_elapsed       | 618      |
|    total_timesteps    | 160500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 32099    |
|    policy_loss        | 2.23e-05 |
|    value_loss         | 3.48e-11 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 259       |
|    iterations         | 32200     |
|    time_elapsed       | 620       |
|    total_timesteps    | 161000    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 32199     |
|    policy_loss        | 2.48e-05  |
|    value_loss         | 

-------------------------------------
| time/                 |           |
|    fps                | 259       |
|    iterations         | 33700     |
|    time_elapsed       | 649       |
|    total_timesteps    | 168500    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 33699     |
|    policy_loss        | 2.01e-05  |
|    value_loss         | 2.88e-11  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 33800    |
|    time_elapsed       | 651      |
|    total_timesteps    | 169000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 33799    |
|    policy_loss        | 2.75e-05 |
|    value_loss         

------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 35300    |
|    time_elapsed       | 680      |
|    total_timesteps    | 176500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 35299    |
|    policy_loss        | 1.76e-05 |
|    value_loss         | 2.17e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 35400    |
|    time_elapsed       | 682      |
|    total_timesteps    | 177000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 35399    |
|    policy_loss        | 1.53e-05 |
|    value_loss         | 1.66e-11 |
-

------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 36900    |
|    time_elapsed       | 710      |
|    total_timesteps    | 184500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 36899    |
|    policy_loss        | 2.25e-05 |
|    value_loss         | 3.54e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 37000    |
|    time_elapsed       | 712      |
|    total_timesteps    | 185000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 5.96e-08 |
|    learning_rate      | 0.0007   |
|    n_updates          | 36999    |
|    policy_loss        | 2.14e-05 |
|    value_loss         | 3.23e-11 |
-

------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 38500    |
|    time_elapsed       | 740      |
|    total_timesteps    | 192500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 38499    |
|    policy_loss        | 3.21e-05 |
|    value_loss         | 7.28e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 38600    |
|    time_elapsed       | 742      |
|    total_timesteps    | 193000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 38599    |
|    policy_loss        | 2.19e-05 |
|    value_loss         | 3.46e-11 |
-

------------------------------------
| time/                 |          |
|    fps                | 259      |
|    iterations         | 40100    |
|    time_elapsed       | 771      |
|    total_timesteps    | 200500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 40099    |
|    policy_loss        | 2.22e-05 |
|    value_loss         | 3.49e-11 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 259       |
|    iterations         | 40200     |
|    time_elapsed       | 773       |
|    total_timesteps    | 201000    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 40199     |
|    policy_loss        | 1.35e-05  |
|    value_loss         | 

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 41700    |
|    time_elapsed       | 801      |
|    total_timesteps    | 208500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 41699    |
|    policy_loss        | 1.66e-05 |
|    value_loss         | 1.98e-11 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 260       |
|    iterations         | 41800     |
|    time_elapsed       | 803       |
|    total_timesteps    | 209000    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 41799     |
|    policy_loss        | 4.28e-05  |
|    value_loss         | 

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 43300    |
|    time_elapsed       | 831      |
|    total_timesteps    | 216500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 43299    |
|    policy_loss        | 1.95e-05 |
|    value_loss         | 2.73e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 43400    |
|    time_elapsed       | 833      |
|    total_timesteps    | 217000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 43399    |
|    policy_loss        | 1.84e-05 |
|    value_loss         | 2.37e-11 |
-

-------------------------------------
| time/                 |           |
|    fps                | 260       |
|    iterations         | 44900     |
|    time_elapsed       | 862       |
|    total_timesteps    | 224500    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 44899     |
|    policy_loss        | 1.88e-05  |
|    value_loss         | 2.6e-11   |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 45000    |
|    time_elapsed       | 863      |
|    total_timesteps    | 225000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 44999    |
|    policy_loss        | 1.76e-05 |
|    value_loss         

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 46500    |
|    time_elapsed       | 892      |
|    total_timesteps    | 232500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 46499    |
|    policy_loss        | 2.4e-05  |
|    value_loss         | 4.13e-11 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 260       |
|    iterations         | 46600     |
|    time_elapsed       | 893       |
|    total_timesteps    | 233000    |
| train/                |           |
|    entropy_loss       | -4.16     |
|    explained_variance | -1.19e-07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 46599     |
|    policy_loss        | 2.02e-05  |
|    value_loss         | 

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 48100    |
|    time_elapsed       | 923      |
|    total_timesteps    | 240500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 1.19e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 48099    |
|    policy_loss        | 1.7e-05  |
|    value_loss         | 2.03e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 48200    |
|    time_elapsed       | 925      |
|    total_timesteps    | 241000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 48199    |
|    policy_loss        | 2.89e-05 |
|    value_loss         | 5.89e-11 |
-

------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 49700    |
|    time_elapsed       | 953      |
|    total_timesteps    | 248500   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 49699    |
|    policy_loss        | 2.06e-05 |
|    value_loss         | 3.03e-11 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 260      |
|    iterations         | 49800    |
|    time_elapsed       | 955      |
|    total_timesteps    | 249000   |
| train/                |          |
|    entropy_loss       | -4.16    |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 49799    |
|    policy_loss        | 1.75e-05 |
|    value_loss         | 2.07e-11 |
-

<stable_baselines3.a2c.a2c.A2C at 0x7fc3ca804b38>

In [38]:
model.save("a2cDeep")

In [15]:
obs = env.reset()

In [None]:
model.predict(obs)

Testing the variables

In [None]:
print(env.state)
print(env.action_space)
print(env.observation_space) # openAI calles states observations :/

Testing the step function

In [None]:
env.state = [1,0, 0]
print(env.step(np.asarray([1,1, 1, 1, 1, 1]), 5))
print('***')
env.state = [3, 3, 3]
print(env.step([1,1, 1, 1, 1, 1], 5))

test sample uniformly from action space at random

In [None]:
env.action_space.sample()

Simulating randomized policy

In [None]:
e = 50
rewards = []
for j in range(e):
    obs = env.reset()
    tot_reward = 0
    for i in range(tau):
        action, states_ = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        tot_reward += reward
    rewards.append(tot_reward)

Test Evaluate Function

In [39]:
print('Here is the mean and standard deviation of the estimate ' + str(evaluate(model, env, 50)))

Here is the mean and standard deviation of the estimate (105.38, 9.094811707781531)
