Importing open ai gym and the package

In [3]:
import gym
import numpy as np
import airline
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.policies import ActorCriticPolicy
import torch as th
from GruActorCriticPolicy import CustomActorCriticPolicy as GruACP
from LongActorCriticPolicy import CustomActorCriticPolicy as LACP

Define the function for evaluating the model on a given environment for a specified number of episodes. 

In [4]:
def evaluate(model, env, numiters):
    rewards = []
    for j in range(numiters):
        obs = env.reset()
        tot_reward = 0
        for i in range(env.tau):
            action, states = model.predict(obs)
            _, reward, _, _ = env.step(action)
            tot_reward += reward
        rewards.append(tot_reward)
    return np.mean(rewards), np.std(rewards)

# Model is the random policy
def policy(env):
    return env.action_space.sample()
    

Setting up the configuration for the environment

In [5]:
A = np.asarray([[1, 1, 0,0,0,0], [ 0,0, 1, 1, 1, 1], [ 0,0, 0,0, 1, 1] ])
tau = 23
P = np.ones((tau, A.shape[1]))/3
c = [5, 5, 5]
f = range(10, 16)
CONFIG = {'A': A, 'f': f, 'P': P, 'starting_state': c , 'tau': tau}

Making an instance of the environment

In [18]:
np.random.seed(1)
env = gym.make('Airline-v0', config=CONFIG)
check_env(env)

In [22]:
#Baseline MlpPolicy
model = A2C("MlpPolicy", env, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [23]:
#OneSharedLayer
policy_kwargs = dict(net_arch=[64, dict(vf=[64, 64], pi=[64, 64])  ])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

array([5, 5, 5])

In [28]:
#WideMlp
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[1024, 1024, 1024 ])
model = A2C("MlpPolicy", env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [36]:
#DeepMlp
policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch = [32, 32, 64, 64, 128, 256, 128, 64, 64, 32, 32])
model = A2C(LongActorCriticPolicy, env,  policy_kwargs = policy_kwargs, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [19]:
#Gru
model = A2C(GruACP, env, n_steps = 5, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [20]:
model.learn(total_timesteps =200000)

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 16       |
| time/                 |          |
|    fps                | 620      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -4.15    |
|    explained_variance | 0.00283  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 46.5     |
|    value_loss         | 152      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 12.5     |
| time/                 |          |
|    fps                | 619      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 30.7     |
| time/                 |          |
|    fps                | 615      |
|    iterations         | 1500     |
|    time_elapsed       | 12       |
|    total_timesteps    | 7500     |
| train/                |          |
|    entropy_loss       | -3.64    |
|    explained_variance | 0.0246   |
|    learning_rate      | 0.0007   |
|    n_updates          | 1499     |
|    policy_loss        | 14.4     |
|    value_loss         | 37.9     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 31.4     |
| time/                 |          |
|    fps                | 614      |
|    iterations         | 1600     |
|    time_elapsed       | 13       |
|    total_timesteps    | 8000     |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 36       |
| time/                 |          |
|    fps                | 612      |
|    iterations         | 2800     |
|    time_elapsed       | 22       |
|    total_timesteps    | 14000    |
| train/                |          |
|    entropy_loss       | -3.15    |
|    explained_variance | 0.0473   |
|    learning_rate      | 0.0007   |
|    n_updates          | 2799     |
|    policy_loss        | 25.6     |
|    value_loss         | 126      |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 36.3      |
| time/                 |           |
|    fps                | 612       |
|    iterations         | 2900      |
|    time_elapsed       | 23        |
|    total_timesteps    | 14500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 55.3     |
| time/                 |          |
|    fps                | 610      |
|    iterations         | 4100     |
|    time_elapsed       | 33       |
|    total_timesteps    | 20500    |
| train/                |          |
|    entropy_loss       | -1.75    |
|    explained_variance | -0.00836 |
|    learning_rate      | 0.0007   |
|    n_updates          | 4099     |
|    policy_loss        | 2.2      |
|    value_loss         | 22       |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 56.9     |
| time/                 |          |
|    fps                | 610      |
|    iterations         | 4200     |
|    time_elapsed       | 34       |
|    total_timesteps    | 21000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.6     |
| time/                 |          |
|    fps                | 606      |
|    iterations         | 5500     |
|    time_elapsed       | 45       |
|    total_timesteps    | 27500    |
| train/                |          |
|    entropy_loss       | -0.462   |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 5499     |
|    policy_loss        | -0.999   |
|    value_loss         | 0.123    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.3     |
| time/                 |          |
|    fps                | 606      |
|    iterations         | 5600     |
|    time_elapsed       | 46       |
|    total_timesteps    | 28000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 57.7     |
| time/                 |          |
|    fps                | 604      |
|    iterations         | 6800     |
|    time_elapsed       | 56       |
|    total_timesteps    | 34000    |
| train/                |          |
|    entropy_loss       | -0.55    |
|    explained_variance | 0.79     |
|    learning_rate      | 0.0007   |
|    n_updates          | 6799     |
|    policy_loss        | 0.0225   |
|    value_loss         | 28.1     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.5     |
| time/                 |          |
|    fps                | 604      |
|    iterations         | 6900     |
|    time_elapsed       | 57       |
|    total_timesteps    | 34500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.3     |
| time/                 |          |
|    fps                | 598      |
|    iterations         | 8100     |
|    time_elapsed       | 67       |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -0.0232  |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | 6.4e-06  |
|    value_loss         | 6.48e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.2     |
| time/                 |          |
|    fps                | 597      |
|    iterations         | 8200     |
|    time_elapsed       | 68       |
|    total_timesteps    | 41000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.4     |
| time/                 |          |
|    fps                | 596      |
|    iterations         | 9400     |
|    time_elapsed       | 78       |
|    total_timesteps    | 47000    |
| train/                |          |
|    entropy_loss       | -0.0131  |
|    explained_variance | 5.58e-05 |
|    learning_rate      | 0.0007   |
|    n_updates          | 9399     |
|    policy_loss        | 0.0163   |
|    value_loss         | 98       |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.4      |
| time/                 |           |
|    fps                | 596       |
|    iterations         | 9500      |
|    time_elapsed       | 79        |
|    total_timesteps    | 47500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 59.3     |
| time/                 |          |
|    fps                | 595      |
|    iterations         | 10700    |
|    time_elapsed       | 89       |
|    total_timesteps    | 53500    |
| train/                |          |
|    entropy_loss       | -0.0117  |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 10699    |
|    policy_loss        | 8.01e-07 |
|    value_loss         | 2.9e-07  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.5      |
| time/                 |           |
|    fps                | 595       |
|    iterations         | 10800     |
|    time_elapsed       | 90        |
|    total_timesteps    | 54000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.7      |
| time/                 |           |
|    fps                | 594       |
|    iterations         | 12000     |
|    time_elapsed       | 100       |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -0.0124   |
|    explained_variance | 1         |
|    learning_rate      | 0.0007    |
|    n_updates          | 11999     |
|    policy_loss        | -6.81e-07 |
|    value_loss         | 0.00161   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.6      |
| time/                 |           |
|    fps                | 594       |
|    iterations         | 12100     |
|    time_elapsed       | 101       |
|    total_timesteps    | 60500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.5      |
| time/                 |           |
|    fps                | 593       |
|    iterations         | 13300     |
|    time_elapsed       | 111       |
|    total_timesteps    | 66500     |
| train/                |           |
|    entropy_loss       | -0.000727 |
|    explained_variance | 1         |
|    learning_rate      | 0.0007    |
|    n_updates          | 13299     |
|    policy_loss        | -8.29e-06 |
|    value_loss         | 0.101     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.2      |
| time/                 |           |
|    fps                | 593       |
|    iterations         | 13400     |
|    time_elapsed       | 112       |
|    total_timesteps    | 67000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 57.7      |
| time/                 |           |
|    fps                | 593       |
|    iterations         | 14600     |
|    time_elapsed       | 122       |
|    total_timesteps    | 73000     |
| train/                |           |
|    entropy_loss       | -0.000203 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 14599     |
|    policy_loss        | 1.82e-05  |
|    value_loss         | 1.16      |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.7      |
| time/                 |           |
|    fps                | 593       |
|    iterations         | 14700     |
|    time_elapsed       | 123       |
|    total_timesteps    | 73500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59        |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 15900     |
|    time_elapsed       | 134       |
|    total_timesteps    | 79500     |
| train/                |           |
|    entropy_loss       | -2.12e-05 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 15899     |
|    policy_loss        | -1.73e-06 |
|    value_loss         | 1.46      |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 59       |
| time/                 |          |
|    fps                | 592      |
|    iterations         | 16000    |
|    time_elapsed       | 134      |
|    total_timesteps    | 80000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.3      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 17200     |
|    time_elapsed       | 145       |
|    total_timesteps    | 86000     |
| train/                |           |
|    entropy_loss       | -5.24e-05 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 17199     |
|    policy_loss        | 1.04e-09  |
|    value_loss         | 6.19e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.4      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 17300     |
|    time_elapsed       | 145       |
|    total_timesteps    | 86500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.6      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 18500     |
|    time_elapsed       | 156       |
|    total_timesteps    | 92500     |
| train/                |           |
|    entropy_loss       | -3.39e-05 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 18499     |
|    policy_loss        | -2.75e-06 |
|    value_loss         | 1.28      |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.6     |
| time/                 |          |
|    fps                | 592      |
|    iterations         | 18600    |
|    time_elapsed       | 156      |
|    total_timesteps    | 93000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.6      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 19800     |
|    time_elapsed       | 167       |
|    total_timesteps    | 99000     |
| train/                |           |
|    entropy_loss       | -1.09e-05 |
|    explained_variance | 0.839     |
|    learning_rate      | 0.0007    |
|    n_updates          | 19799     |
|    policy_loss        | -8.07e-06 |
|    value_loss         | 102       |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.1      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 19900     |
|    time_elapsed       | 167       |
|    total_timesteps    | 99500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.2      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 21100     |
|    time_elapsed       | 178       |
|    total_timesteps    | 105500    |
| train/                |           |
|    entropy_loss       | -2.36e-06 |
|    explained_variance | 0.998     |
|    learning_rate      | 0.0007    |
|    n_updates          | 21099     |
|    policy_loss        | -6.87e-08 |
|    value_loss         | 0.108     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.8      |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 21200     |
|    time_elapsed       | 178       |
|    total_timesteps    | 106000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59        |
| time/                 |           |
|    fps                | 592       |
|    iterations         | 22400     |
|    time_elapsed       | 189       |
|    total_timesteps    | 112000    |
| train/                |           |
|    entropy_loss       | -0.0016   |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 22399     |
|    policy_loss        | -8.64e-05 |
|    value_loss         | 1.4       |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.4      |
| time/                 |           |
|    fps                | 591       |
|    iterations         | 22500     |
|    time_elapsed       | 190       |
|    total_timesteps    | 112500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.6      |
| time/                 |           |
|    fps                | 591       |
|    iterations         | 23700     |
|    time_elapsed       | 200       |
|    total_timesteps    | 118500    |
| train/                |           |
|    entropy_loss       | -7.89e-06 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 23699     |
|    policy_loss        | 1.68e-09  |
|    value_loss         | 1.38e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.7      |
| time/                 |           |
|    fps                | 591       |
|    iterations         | 23800     |
|    time_elapsed       | 201       |
|    total_timesteps    | 119000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.6     |
| time/                 |          |
|    fps                | 591      |
|    iterations         | 25000    |
|    time_elapsed       | 211      |
|    total_timesteps    | 125000   |
| train/                |          |
|    entropy_loss       | -0.00253 |
|    explained_variance | 0.995    |
|    learning_rate      | 0.0007   |
|    n_updates          | 24999    |
|    policy_loss        | 1.22e-06 |
|    value_loss         | 0.14     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 57.8      |
| time/                 |           |
|    fps                | 591       |
|    iterations         | 25100     |
|    time_elapsed       | 212       |
|    total_timesteps    | 125500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.3     |
| time/                 |          |
|    fps                | 591      |
|    iterations         | 26300    |
|    time_elapsed       | 222      |
|    total_timesteps    | 131500   |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0.978    |
|    learning_rate      | 0.0007   |
|    n_updates          | 26299    |
|    policy_loss        | -0       |
|    value_loss         | 16.8     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.2      |
| time/                 |           |
|    fps                | 591       |
|    iterations         | 26400     |
|    time_elapsed       | 223       |
|    total_timesteps    | 132000    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 59.2     |
| time/                 |          |
|    fps                | 591      |
|    iterations         | 27600    |
|    time_elapsed       | 233      |
|    total_timesteps    | 138000   |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 27599    |
|    policy_loss        | -0       |
|    value_loss         | 0.562    |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.2      |
| time/                 |           |
|    fps                | 591       |
|    iterations         | 27700     |
|    time_elapsed       | 234       |
|    total_timesteps    | 138500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.9     |
| time/                 |          |
|    fps                | 587      |
|    iterations         | 28900    |
|    time_elapsed       | 245      |
|    total_timesteps    | 144500   |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0.999    |
|    learning_rate      | 0.0007   |
|    n_updates          | 28899    |
|    policy_loss        | -0       |
|    value_loss         | 0.289    |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.8      |
| time/                 |           |
|    fps                | 587       |
|    iterations         | 29000     |
|    time_elapsed       | 246       |
|    total_timesteps    | 145000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.9      |
| time/                 |           |
|    fps                | 584       |
|    iterations         | 30200     |
|    time_elapsed       | 258       |
|    total_timesteps    | 151000    |
| train/                |           |
|    entropy_loss       | -1.98e-06 |
|    explained_variance | -0.667    |
|    learning_rate      | 0.0007    |
|    n_updates          | 30199     |
|    policy_loss        | -3.22e-09 |
|    value_loss         | 0.00203   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.5      |
| time/                 |           |
|    fps                | 584       |
|    iterations         | 30300     |
|    time_elapsed       | 259       |
|    total_timesteps    | 151500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.8     |
| time/                 |          |
|    fps                | 579      |
|    iterations         | 31500    |
|    time_elapsed       | 271      |
|    total_timesteps    | 157500   |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0.999    |
|    learning_rate      | 0.0007   |
|    n_updates          | 31499    |
|    policy_loss        | -0       |
|    value_loss         | 0.427    |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.8      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 31600     |
|    time_elapsed       | 272       |
|    total_timesteps    | 158000    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.1     |
| time/                 |          |
|    fps                | 577      |
|    iterations         | 32800    |
|    time_elapsed       | 283      |
|    total_timesteps    | 164000   |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0.989    |
|    learning_rate      | 0.0007   |
|    n_updates          | 32799    |
|    policy_loss        | -0       |
|    value_loss         | 0.92     |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.3      |
| time/                 |           |
|    fps                | 577       |
|    iterations         | 32900     |
|    time_elapsed       | 284       |
|    total_timesteps    | 164500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 59.4      |
| time/                 |           |
|    fps                | 577       |
|    iterations         | 34100     |
|    time_elapsed       | 295       |
|    total_timesteps    | 170500    |
| train/                |           |
|    entropy_loss       | -0.000708 |
|    explained_variance | 0         |
|    learning_rate      | 0.0007    |
|    n_updates          | 34099     |
|    policy_loss        | -1.06e-07 |
|    value_loss         | 1.16e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 59.5     |
| time/                 |          |
|    fps                | 577      |
|    iterations         | 34200    |
|    time_elapsed       | 295      |
|    total_timesteps    | 171000   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.2      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 35400     |
|    time_elapsed       | 306       |
|    total_timesteps    | 177000    |
| train/                |           |
|    entropy_loss       | -7.63e-07 |
|    explained_variance | 0.983     |
|    learning_rate      | 0.0007    |
|    n_updates          | 35399     |
|    policy_loss        | -0        |
|    value_loss         | 0.352     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.1      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 35500     |
|    time_elapsed       | 307       |
|    total_timesteps    | 177500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.7      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 36700     |
|    time_elapsed       | 317       |
|    total_timesteps    | 183500    |
| train/                |           |
|    entropy_loss       | -1.98e-06 |
|    explained_variance | 1         |
|    learning_rate      | 0.0007    |
|    n_updates          | 36699     |
|    policy_loss        | -2.53e-09 |
|    value_loss         | 0.122     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.7      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 36800     |
|    time_elapsed       | 318       |
|    total_timesteps    | 184000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.2      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 38000     |
|    time_elapsed       | 328       |
|    total_timesteps    | 190000    |
| train/                |           |
|    entropy_loss       | -1.98e-06 |
|    explained_variance | 1         |
|    learning_rate      | 0.0007    |
|    n_updates          | 37999     |
|    policy_loss        | -8.67e-11 |
|    value_loss         | 0.006     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 23        |
|    ep_rew_mean        | 58.2      |
| time/                 |           |
|    fps                | 578       |
|    iterations         | 38100     |
|    time_elapsed       | 329       |
|    total_timesteps    | 190500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.6     |
| time/                 |          |
|    fps                | 579      |
|    iterations         | 39300    |
|    time_elapsed       | 339      |
|    total_timesteps    | 196500   |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | 0.969    |
|    learning_rate      | 0.0007   |
|    n_updates          | 39299    |
|    policy_loss        | -0       |
|    value_loss         | 1.27     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 23       |
|    ep_rew_mean        | 58.3     |
| time/                 |          |
|    fps                | 579      |
|    iterations         | 39400    |
|    time_elapsed       | 340      |
|    total_timesteps    | 197000   |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x7f23e5311748>

In [38]:
model.save("a2cDeep")

In [15]:
obs = env.reset()

In [None]:
model.predict(obs)

Testing the variables

In [None]:
print(env.state)
print(env.action_space)
print(env.observation_space) # openAI calles states observations :/

Testing the step function

In [None]:
env.state = [1,0, 0]
print(env.step(np.asarray([1,1, 1, 1, 1, 1]), 5))
print('***')
env.state = [3, 3, 3]
print(env.step([1,1, 1, 1, 1, 1], 5))

test sample uniformly from action space at random

In [None]:
env.action_space.sample()

Simulating randomized policy

In [None]:
e = 50
rewards = []
for j in range(e):
    obs = env.reset()
    tot_reward = 0
    for i in range(tau):
        action, states_ = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        tot_reward += reward
    rewards.append(tot_reward)

Test Evaluate Function

In [17]:
print('Here is the mean and standard deviation of the estimate ' + str(evaluate(model, env, 50)))

Here is the mean and standard deviation of the estimate (62.92, 5.428959384633486)
