In [1]:
import numpy as np
import gym
from stable_baselines3 import PPO
import math
import itertools
import datetime
import ray
import ray.rllib as rllib
from ray import tune
from ray.tune.registry import register_env

In [2]:
# Problem instance: N cities with a random, asymmetric travel-cost matrix M.
N = 13
rand = np.random.RandomState(1)  # fixed seed so the instance is reproducible
# Costs are drawn from Normal(mean=5, std=1.5) and kept to two decimals.
M = rand.normal(loc=5, scale=1.5, size=(N, N)).round(2)
# Staying in the same city costs nothing.
np.fill_diagonal(M, 0)
M

array([[0.  , 4.08, 4.21, 3.39, 6.3 , 1.55, 7.62, 3.86, 5.48, 4.63, 7.19,
        1.91, 4.52],
       [4.42, 0.  , 3.35, 4.74, 3.68, 5.06, 5.87, 3.35, 6.72, 6.35, 5.75,
        6.35, 3.97],
       [4.82, 3.6 , 0.  , 5.8 , 3.96, 4.4 , 3.97, 3.73, 3.99, 4.98, 3.32,
        5.35, 7.49],
       [6.11, 4.71, 3.67, 0.  , 7.54, 5.08, 4.04, 5.29, 8.15, 5.18, 5.93,
        5.45, 4.47],
       [3.29, 4.48, 4.69, 5.88, 0.  , 6.4 , 5.43, 6.33, 3.87, 6.88, 5.77,
        4.55, 5.73],
       [4.89, 6.7 , 7.28, 8.28, 2.91, 0.  , 4.24, 5.24, 6.31, 5.47, 1.97,
        4.54, 6.24],
       [5.35, 6.14, 4.67, 4.7 , 5.28, 5.62, 0.  , 5.18, 3.99, 5.57, 5.18,
        6.69, 6.8 ],
       [5.28, 4.44, 4.04, 5.64, 5.12, 4.48, 5.07, 0.  , 6.05, 4.33, 6.84,
        5.61, 5.89],
       [3.36, 5.25, 6.11, 3.57, 4.6 , 5.05, 2.94, 5.47, 0.  , 3.71, 5.53,
        3.03, 4.94],
       [2.58, 6.68, 5.61, 4.96, 3.84, 6.91, 7.95, 2.21, 6.85, 0.  , 5.51,
        3.2 , 6.3 ],
       [4.73, 4.09, 3.15, 5.83, 6.19, 4.06, 5.78, 

In [3]:
# Number of orderings of the N-1 cities other than city 0, i.e. how many
# tours an exhaustive search with a fixed start city has to enumerate.
math.factorial(N-1)

479001600

In [4]:
class MyEnv(gym.Env):
    """Travelling-salesman environment over the module-level cost matrix ``M``.

    The tour always starts in city 0. Action ``a`` (0 .. N-2) means
    "travel to city ``a + 1``". The observation is a dict with

    * ``'visited'``: binary vector of length N-1, entry ``a`` is 1 once
      city ``a + 1`` has been visited;
    * ``'last'``: index (0 .. N-1) of the city the agent is currently in.

    Rewards are negative travel costs taken from ``M``; choosing an already
    visited city yields a flat penalty of -10 and leaves the state unchanged.
    """

    def __init__(self, env_config):
        """Create the action/observation spaces.

        Args:
            env_config: configuration object handed in by the caller
                (RLlib passes its ``env_config`` here); not used.
        """
        super().__init__()
        self.n = N-1  # number of cities the agent can choose (all but city 0)
        self.action_space = gym.spaces.Discrete(self.n)
        self.observation_space = gym.spaces.Dict({
            'visited': gym.spaces.MultiBinary(self.n),
            'last': gym.spaces.Discrete(N)})

    def reset(self):
        """Start a new tour in city 0 with no other city visited.

        Returns:
            The initial observation dict. It is the environment's own state
            object and is updated in place by later ``step`` calls.
        """
        # int8 matches the dtype of the declared MultiBinary space.
        self.state = {'visited': np.zeros(self.n, dtype=np.int8), 'last': 0}
        return self.state

    def step(self, action):
        """Travel to city ``action + 1``.

        Args:
            action: index in ``0 .. N-2`` selecting city ``action + 1``.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty
            dict. ``done`` becomes True once every city has been visited; the
            reward of that final step also includes the cost of returning to
            city 0.
        """
        if self.state['visited'][action] == 1:
            # Revisit: penalise and stay where we are.
            self.reward = -10
        else:
            self.state['visited'][action] = 1
            self.reward = - M[self.state['last'], action + 1]
            self.state['last'] = action + 1
        if np.all(self.state['visited'] == 1):
            # Tour complete: close the loop back to the start city.
            self.reward += - M[action + 1, 0]
            self.done = True
        else:
            self.done = False

        return self.state, self.reward, self.done, {}

In [5]:
# Start from RLlib's PPO defaults and override only what this notebook needs:
# CPU-only, a single rollout worker, the PyTorch backend, and an empty
# environment configuration.
config = rllib.agents.ppo.DEFAULT_CONFIG.copy()
config.update({
    "num_gpus": 0,
    "num_workers": 1,
    "framework": "torch",
    "env_config": {},
})

In [6]:
# Stand-alone environment instance used for SB3 training and for the
# roll-outs below. MyEnv expects an *env_config*, so pass that sub-dict
# rather than the whole trainer configuration.
env = MyEnv(config["env_config"])

In [7]:
# Train a Stable-Baselines3 PPO agent on the TSP environment.
# "MultiInputPolicy" is the SB3 policy for Dict observation spaces.
# The timestamps printed before and after give the wall-clock training time.
print(datetime.datetime.now())
# A fixed seed makes the training run repeatable across kernel restarts.
model = PPO("MultiInputPolicy", env, verbose=1, seed=1)
model.learn(total_timesteps=100000)
print(datetime.datetime.now())

2022-03-13 21:15:08.100306
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 39.7     |
|    ep_rew_mean     | -343     |
| time/              |          |
|    fps             | 2162     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 38.7        |
|    ep_rew_mean          | -333        |
| time/                   |             |
|    fps                  | 1569        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004341918 |
|    clip_fraction        | 0.00718     |
|    clip_range           | 0.2    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.5        |
|    ep_rew_mean          | -241        |
| time/                   |             |
|    fps                  | 1258        |
|    iterations           | 11          |
|    time_elapsed         | 17          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.006707578 |
|    clip_fraction        | 0.0377      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.45       |
|    explained_variance   | -0.000133   |
|    learning_rate        | 0.0003      |
|    loss                 | 1.91e+03    |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0154     |
|    value_loss           | 4.66e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.9  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 16.8        |
|    ep_rew_mean          | -113        |
| time/                   |             |
|    fps                  | 1163        |
|    iterations           | 21          |
|    time_elapsed         | 36          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.012888955 |
|    clip_fraction        | 0.0926      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.18       |
|    explained_variance   | -3.1e-06    |
|    learning_rate        | 0.0003      |
|    loss                 | 567         |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0199     |
|    value_loss           | 1.09e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 15.1  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 12.8        |
|    ep_rew_mean          | -69.6       |
| time/                   |             |
|    fps                  | 1052        |
|    iterations           | 31          |
|    time_elapsed         | 60          |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.017273413 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.72       |
|    explained_variance   | -7.15e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 198         |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.0205     |
|    value_loss           | 369         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 12.6  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 12.3        |
|    ep_rew_mean          | -58         |
| time/                   |             |
|    fps                  | 1040        |
|    iterations           | 41          |
|    time_elapsed         | 80          |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.012654748 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.33       |
|    explained_variance   | -3.58e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 113         |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0166     |
|    value_loss           | 252         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 12.3  

In [8]:
# Greedy (deterministic) roll-out of the trained SB3 policy.
# `actions` collects the tour as city indices starting from city 0 and `g`
# accumulates its travel cost from M (penalties are not included).
obs = env.reset()
g = 0
actions = [0]
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    city = int(action) + 1
    # Check the flag *before* stepping: env.step() updates the observation
    # dict in place, so afterwards the chosen city would always look visited.
    revisit = obs['visited'][city - 1] == 1
    obs, reward, done, info = env.step(action)
    if not revisit:
        # Only moves the environment actually performed belong to the tour;
        # a rejected revisit neither moves the agent nor incurs travel cost.
        g += M[actions[-1], city]
        actions.append(city)
    if done:
        obs = env.reset()
        g += M[actions[-1], 0]  # closing leg back to the start city
        print(f"{actions}, g = {g}")
        break
else:
    # A deterministic policy that picks a visited city repeats that choice
    # forever; report it instead of ending silently.
    print(f"No complete tour within 1000 steps; partial route {actions}, g = {g}")
env.close()

[0, 11, 8, 6, 3, 2, 1, 12, 5, 10, 7, 9, 4], g = 42.35


In [10]:
# Exhaustive baseline: evaluate every tour that starts in city 0 and keep the
# cheapest one in (x_min, g_min). Progress is printed every 10 million tours.
x_min, g_min = [], 100000000
print(datetime.datetime.now())
for count, rest in enumerate(itertools.permutations(range(1, N))):
    tour = (0, *rest)
    # Cost of the legs tour[k] -> tour[k+1], plus the closing leg back to the start.
    cost = M[tour[:-1], tour[1:]].sum() + M[tour[-1], tour[0]]
    if cost < g_min:
        g_min, x_min = cost, tour
    if count % 10000000 == 0:
        print(count, x_min, g_min)
print(datetime.datetime.now())
print('Optimal solution:')
print(x_min, g_min)

2022-03-13 21:18:52.689905
0 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) 66.36
10000000 (0, 1, 2, 10, 7, 9, 11, 8, 6, 3, 12, 5, 4) 44.720000000000006
20000000 (0, 1, 2, 10, 7, 9, 11, 8, 6, 3, 12, 5, 4) 44.720000000000006
30000000 (0, 1, 7, 9, 11, 8, 6, 3, 12, 5, 10, 2, 4) 44.29
40000000 (0, 1, 12, 5, 10, 7, 9, 11, 8, 6, 3, 2, 4) 44.24
50000000 (0, 1, 12, 5, 10, 7, 9, 11, 8, 6, 3, 2, 4) 44.24
60000000 (0, 1, 12, 5, 10, 7, 9, 11, 8, 6, 3, 2, 4) 44.24
70000000 (0, 1, 12, 5, 10, 7, 9, 11, 8, 6, 3, 2, 4) 44.24
80000000 (0, 1, 12, 5, 10, 7, 9, 11, 8, 6, 3, 2, 4) 44.24
90000000 (0, 3, 2, 1, 12, 5, 10, 7, 9, 11, 8, 6, 4) 43.769999999999996
100000000 (0, 3, 2, 1, 12, 5, 10, 7, 9, 11, 8, 6, 4) 43.769999999999996
110000000 (0, 3, 2, 1, 12, 5, 10, 7, 9, 11, 8, 6, 4) 43.769999999999996
120000000 (0, 3, 12, 5, 10, 7, 9, 11, 8, 6, 2, 1, 4) 43.67
130000000 (0, 3, 12, 5, 10, 7, 9, 11, 8, 6, 2, 1, 4) 43.67
140000000 (0, 3, 12, 5, 10, 7, 9, 11, 8, 6, 2, 1, 4) 43.67
150000000 (0, 3, 12, 5, 10, 7, 9, 11, 8,

In [11]:
# Shut down Ray first, then initialise it again, so the cell always ends
# with a freshly started runtime (presumably to make the cell safe to
# re-run in a live kernel - confirm).
ray.shutdown()
# Last expression of the cell: the value returned by ray.init() is displayed.
ray.init()

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:58271',
 'object_store_address': '/tmp/ray/session_2022-03-13_22-55-40_868274_25918/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-03-13_22-55-40_868274_25918/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-03-13_22-55-40_868274_25918',
 'metrics_export_port': 63337,
 'gcs_address': '127.0.0.1:61628',
 'node_id': 'eb5a4aa09f81895294a4836a4fbb293687e2ca732a65f10daa9e65a7'}

In [12]:
# Build an RLlib PPO trainer from `config`. The MyEnv class itself (not an
# instance) is passed as the environment.
agent = rllib.agents.ppo.PPOTrainer(config=config, env=MyEnv)

2022-03-13 22:55:49,537	INFO ppo.py:249 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-03-13 22:55:49,541	INFO trainer.py:790 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


In [13]:
# Train the RLlib PPO agent for 31 iterations. Every 10th iteration
# (0, 10, 20, 30) the episode statistics are printed and a checkpoint is saved.
# The timestamps printed before and after give the wall-clock training time.
print(datetime.datetime.now())
for i in range(31):
    # Perform one iteration of training the policy with PPO
    result = agent.train()
    if i % 10 != 0:
        continue

    print('i: ', i)
    for label, key in (
        ('mean episode length:', 'episode_len_mean'),
        ('max episode reward:', 'episode_reward_max'),
        ('mean episode reward:', 'episode_reward_mean'),
        ('min episode reward:', 'episode_reward_min'),
        ('total episodes:', 'episodes_total'),
    ):
        print(label, result[key])
    print()

    checkpoint = agent.save()
print(datetime.datetime.now())

2022-03-13 22:55:59.366087




i:  0
mean episode length: 37.556603773584904
max episode reward: -119.48
mean episode reward: -320.91037735849056
min episode reward: -913.9899999999999
total episodes: 106

i:  10
mean episode length: 13.23841059602649
max episode reward: -50.17
mean episode reward: -75.25076158940398
min episode reward: -120.49
total episodes: 2187

i:  20
mean episode length: 12.102719033232628
max episode reward: -43.88
mean episode reward: -55.2219335347432
min episode reward: -84.53999999999999
total episodes: 5438

i:  30
mean episode length: 12.024024024024024
max episode reward: -42.94
mean episode reward: -49.662912912912915
min episode reward: -71.28
total episodes: 8758

2022-03-13 23:01:26.211702


In [14]:
# Greedy (explore=False) roll-out of the trained RLlib policy.
# `actions` collects the tour as city indices starting from city 0 and `g`
# accumulates its travel cost from M (penalties are not included).
obs = env.reset()
g = 0
actions = [0]
for i in range(1000):
    action = agent.compute_single_action(obs, explore = False)
    city = int(action) + 1
    # Check the flag *before* stepping: env.step() updates the observation
    # dict in place, so afterwards the chosen city would always look visited.
    revisit = obs['visited'][city - 1] == 1
    obs, reward, done, info = env.step(action)
    if not revisit:
        # Only moves the environment actually performed belong to the tour;
        # a rejected revisit neither moves the agent nor incurs travel cost.
        g += M[actions[-1], city]
        actions.append(city)
    if done:
        obs = env.reset()
        g += M[actions[-1], 0]  # closing leg back to the start city
        print(f"{actions}, g = {g}")
        break
else:
    # A deterministic policy that picks a visited city repeats that choice
    # forever; report it instead of ending silently.
    print(f"No complete tour within 1000 steps; partial route {actions}, g = {g}")
env.close()

[0, 11, 8, 9, 7, 1, 12, 3, 2, 6, 5, 10, 4], g = 45.61999999999999


In [15]:
def env_creator(env_config):
    """Factory for the environment registry: build a MyEnv from ``env_config``."""
    return MyEnv(env_config)  # return an env instance

# Register the factory under the name "my_env" so the environment can be
# referred to by that string.
register_env("my_env", env_creator)

In [16]:
# Run PPO through Ray Tune on the registered "my_env" environment:
# CPU-only, one rollout worker, PyTorch backend. Tune tracks
# episode_reward_mean (higher is better), stops after 50 training
# iterations and writes a checkpoint at the end.
tune_ppo_config = {
    "env": "my_env",
    "num_gpus": 0,
    "num_workers": 1,
    "framework": "torch"
}
tune_stop_criteria = {"training_iteration": 50}

experiment = tune.run(
    rllib.agents.ppo.PPOTrainer,
    config=tune_ppo_config,
    metric="episode_reward_mean",
    mode="max",
    stop=tune_stop_criteria,
    checkpoint_at_end=True,
)

Trial name,status,loc
PPOTrainer_my_env_5d5b7_00000,PENDING,


[2m[36m(PPOTrainer pid=27234)[0m 2022-03-13 23:01:35,673	INFO ppo.py:249 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=27234)[0m 2022-03-13 23:01:35,674	INFO trainer.py:790 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234




Trial name,status,loc
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234


Trial name,status,loc
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2022-03-13_23-01-53
  done: false
  episode_len_mean: 39.20792079207921
  episode_media: {}
  episode_reward_max: -100.96
  episode_reward_mean: -337.74465346534646
  episode_reward_min: -1700.02
  episodes_this_iter: 101
  episodes_total: 101
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.473783258725238
          entropy_coeff: 0.0
          kl: 0.01123467804347083
          policy_loss: -0.023224839895352803
          total_loss: 44720.46773143481
          vf_explained_var: -0.0041159298471225205
          vf_loss: 44720.48877373152
        model: {}
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,1,9.68022,4000,-337.745,-100.96,-1700.02,39.2079


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,1,9.68022,4000,-337.745,-100.96,-1700.02,39.2079


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2022-03-13_23-02-03
  done: false
  episode_len_mean: 35.333333333333336
  episode_media: {}
  episode_reward_max: -101.91
  episode_reward_mean: -299.0651754385965
  episode_reward_min: -995.43
  episodes_this_iter: 114
  episodes_total: 215
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.4575165846014535
          entropy_coeff: 0.0
          kl: 0.01038814501263589
          policy_loss: -0.02066200280271631
          total_loss: 33776.62444661458
          vf_explained_var: -0.0012789304538439678
          vf_loss: 33776.64321656586
        model: {}
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 80

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,2,19.2999,8000,-299.065,-101.91,-995.43,35.3333


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,2,19.2999,8000,-299.065,-101.91,-995.43,35.3333


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2022-03-13_23-02-13
  done: false
  episode_len_mean: 29.666666666666668
  episode_media: {}
  episode_reward_max: -76.69
  episode_reward_mean: -241.7860740740741
  episode_reward_min: -655.48
  episodes_this_iter: 135
  episodes_total: 350
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.421945246060689
          entropy_coeff: 0.0
          kl: 0.01138331572427411
          policy_loss: -0.02734935613829763
          total_loss: 19043.648623361896
          vf_explained_var: -0.0006561173546698785
          vf_loss: 19043.673664314516
        model: {}
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,3,29.0961,12000,-241.786,-76.69,-655.48,29.6667


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,3,29.0961,12000,-241.786,-76.69,-655.48,29.6667


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2022-03-13_23-02-23
  done: false
  episode_len_mean: 26.364238410596027
  episode_media: {}
  episode_reward_max: -82.46
  episode_reward_mean: -209.80125827814572
  episode_reward_min: -591.19
  episodes_this_iter: 151
  episodes_total: 501
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.376065675673946
          entropy_coeff: 0.0
          kl: 0.015144394500673895
          policy_loss: -0.03900025590593296
          total_loss: 11900.910328198504
          vf_explained_var: -0.00029737757098290227
          vf_loss: 11900.94629405032
        model: {}
    num_agent_steps_sampled: 16000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,4,38.9931,16000,-209.801,-82.46,-591.19,26.3642


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2022-03-13_23-02-32
  done: false
  episode_len_mean: 22.801136363636363
  episode_media: {}
  episode_reward_max: -59.00000000000001
  episode_reward_mean: -173.60875000000001
  episode_reward_min: -478.21
  episodes_this_iter: 176
  episodes_total: 677
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.3217802168220603
          entropy_coeff: 0.0
          kl: 0.013573479785148477
          policy_loss: -0.03619775630794065
          total_loss: 6829.972571194557
          vf_explained_var: -0.00021996716017364174
          vf_loss: 6830.006030273437
        model: {}
    num_agent_steps_sampled: 20000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,5,48.8158,20000,-173.609,-59,-478.21,22.8011


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,5,48.8158,20000,-173.609,-59,-478.21,22.8011


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2022-03-13_23-02-42
  done: false
  episode_len_mean: 20.05
  episode_media: {}
  episode_reward_max: -70.78
  episode_reward_mean: -146.13475
  episode_reward_min: -300.96999999999997
  episodes_this_iter: 200
  episodes_total: 877
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.2441067154689502
          entropy_coeff: 0.0
          kl: 0.017559759364855827
          policy_loss: -0.0410482964879002
          total_loss: 3206.272947906166
          vf_explained_var: -0.00016638886544012254
          vf_loss: 3206.310487874349
        model: {}
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
   

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,6,58.4923,24000,-146.135,-70.78,-300.97,20.05


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,6,58.4923,24000,-146.135,-70.78,-300.97,20.05


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2022-03-13_23-02-52
  done: false
  episode_len_mean: 18.02252252252252
  episode_media: {}
  episode_reward_max: -57.870000000000005
  episode_reward_mean: -126.11139639639637
  episode_reward_min: -265.12
  episodes_this_iter: 222
  episodes_total: 1099
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.1530354625435284
          entropy_coeff: 0.0
          kl: 0.018731278704798543
          policy_loss: -0.04072366703842436
          total_loss: 1972.8577575027302
          vf_explained_var: -0.00012339917562341178
          vf_loss: 1972.8947300408477
        model: {}
    num_agent_steps_sampled: 28000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,7,68.2512,28000,-126.111,-57.87,-265.12,18.0225


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,7,68.2512,28000,-126.111,-57.87,-265.12,18.0225


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2022-03-13_23-03-02
  done: false
  episode_len_mean: 16.235772357723576
  episode_media: {}
  episode_reward_max: -60.120000000000005
  episode_reward_mean: -107.84243902439023
  episode_reward_min: -305.96
  episodes_this_iter: 246
  episodes_total: 1345
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.0412959730753335
          entropy_coeff: 0.0
          kl: 0.020663141158241086
          policy_loss: -0.05078721504047593
          total_loss: 1226.6832612068422
          vf_explained_var: -0.0001136738766906082
          vf_loss: 1226.7299151020666
        model: {}
    num_agent_steps_sampled: 32000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,8,78.0023,32000,-107.842,-60.12,-305.96,16.2358


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,8,78.0023,32000,-107.842,-60.12,-305.96,16.2358


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2022-03-13_23-03-11
  done: false
  episode_len_mean: 14.955223880597014
  episode_media: {}
  episode_reward_max: -57.18
  episode_reward_mean: -94.65059701492538
  episode_reward_min: -172.29
  episodes_this_iter: 268
  episodes_total: 1613
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.928566546978489
          entropy_coeff: 0.0
          kl: 0.019997760137424932
          policy_loss: -0.047838858632190576
          total_loss: 883.734471999958
          vf_explained_var: -8.970819493775726e-05
          vf_loss: 883.776314749769
        model: {}
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_ste

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,9,87.5826,36000,-94.6506,-57.18,-172.29,14.9552


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2022-03-13_23-03-21
  done: false
  episode_len_mean: 13.789655172413793
  episode_media: {}
  episode_reward_max: -51.4
  episode_reward_mean: -83.374
  episode_reward_min: -160.37
  episodes_this_iter: 290
  episodes_total: 1903
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.821189559403286
          entropy_coeff: 0.0
          kl: 0.017743029979165336
          policy_loss: -0.041728347601238835
          total_loss: 653.7998037358766
          vf_explained_var: -8.027745831397271e-05
          vf_loss: 653.836210451844
        model: {}
    num_agent_steps_sampled: 40000
    num_agent_steps_trained: 40000
    num_steps_sampled:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,10,97.4741,40000,-83.374,-51.4,-160.37,13.7897


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,10,97.4741,40000,-83.374,-51.4,-160.37,13.7897


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2022-03-13_23-03-31
  done: false
  episode_len_mean: 13.144736842105264
  episode_media: {}
  episode_reward_max: -48.98
  episode_reward_mean: -76.52506578947369
  episode_reward_min: -124.47
  episodes_this_iter: 304
  episodes_total: 2207
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.7340873970780322
          entropy_coeff: 0.0
          kl: 0.017445286695185337
          policy_loss: -0.03864499500311751
          total_loss: 524.9016754478537
          vf_explained_var: -6.103246442733273e-05
          vf_loss: 524.9350869783791
        model: {}
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44000
    num_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,11,107.499,44000,-76.5251,-48.98,-124.47,13.1447


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,11,107.499,44000,-76.5251,-48.98,-124.47,13.1447


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2022-03-13_23-03-41
  done: false
  episode_len_mean: 12.695238095238095
  episode_media: {}
  episode_reward_max: -53.400000000000006
  episode_reward_mean: -70.80165079365078
  episode_reward_min: -120.66
  episodes_this_iter: 315
  episodes_total: 2522
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.6587720803035202
          entropy_coeff: 0.0
          kl: 0.020088550077023797
          policy_loss: -0.036568030066067174
          total_loss: 392.0347828198505
          vf_explained_var: -4.483051197503203e-05
          vf_loss: 392.06532323693716
        model: {}
    num_agent_steps_sampled: 48000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,12,117.187,48000,-70.8017,-53.4,-120.66,12.6952


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,12,117.187,48000,-70.8017,-53.4,-120.66,12.6952


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 52000
  custom_metrics: {}
  date: 2022-03-13_23-03-51
  done: false
  episode_len_mean: 12.473520249221183
  episode_media: {}
  episode_reward_max: -55.97
  episode_reward_mean: -68.95246105919003
  episode_reward_min: -94.36
  episodes_this_iter: 321
  episodes_total: 2843
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.6067681871434694
          entropy_coeff: 0.0
          kl: 0.021446823838414758
          policy_loss: -0.04012413527566178
          total_loss: 338.22564638199344
          vf_explained_var: -2.7573621401222804e-05
          vf_loss: 338.2561201198127
        model: {}
    num_agent_steps_sampled: 52000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,13,127.145,52000,-68.9525,-55.97,-94.36,12.4735


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,13,127.145,52000,-68.9525,-55.97,-94.36,12.4735


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 56000
  custom_metrics: {}
  date: 2022-03-13_23-04-01
  done: false
  episode_len_mean: 12.371517027863778
  episode_media: {}
  episode_reward_max: -52.21
  episode_reward_mean: -66.51405572755418
  episode_reward_min: -95.99000000000001
  episodes_this_iter: 323
  episodes_total: 3166
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.575259304302995
          entropy_coeff: 0.0
          kl: 0.010659352703518135
          policy_loss: -0.028806219916910895
          total_loss: 314.3327485443443
          vf_explained_var: -2.0756260041267642e-05
          vf_loss: 314.3543594852571
        model: {}
    num_agent_steps_sampled: 56000
    num_agent_ste

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,14,137.129,56000,-66.5141,-52.21,-95.99,12.3715


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,14,137.129,56000,-66.5141,-52.21,-95.99,12.3715


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 60000
  custom_metrics: {}
  date: 2022-03-13_23-04-11
  done: false
  episode_len_mean: 12.316923076923077
  episode_media: {}
  episode_reward_max: -49.74
  episode_reward_mean: -64.97073846153846
  episode_reward_min: -90.72999999999999
  episodes_this_iter: 325
  episodes_total: 3491
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.5391396311021621
          entropy_coeff: 0.0
          kl: 0.009691252726393883
          policy_loss: -0.027085736766457556
          total_loss: 301.097660023679
          vf_explained_var: -1.2592730983611077e-05
          vf_loss: 301.1182035056494
        model: {}
    num_agent_steps_sampled: 60000
    num_agent_ste

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,15,146.784,60000,-64.9707,-49.74,-90.73,12.3169


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 64000
  custom_metrics: {}
  date: 2022-03-13_23-04-20
  done: false
  episode_len_mean: 12.232415902140673
  episode_media: {}
  episode_reward_max: -50.59
  episode_reward_mean: -63.56553516819572
  episode_reward_min: -92.16
  episodes_this_iter: 327
  episodes_total: 3818
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.5134287213766446
          entropy_coeff: 0.0
          kl: 0.0104098087803851
          policy_loss: -0.02885140883686241
          total_loss: 287.230866758285
          vf_explained_var: -7.908190450360699e-06
          vf_loss: 287.25269309423305
        model: {}
    num_agent_steps_sampled: 64000
    num_agent_steps_trained: 640

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,16,156.566,64000,-63.5655,-50.59,-92.16,12.2324


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,16,156.566,64000,-63.5655,-50.59,-92.16,12.2324


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 68000
  custom_metrics: {}
  date: 2022-03-13_23-04-30
  done: false
  episode_len_mean: 12.241590214067278
  episode_media: {}
  episode_reward_max: -48.55
  episode_reward_mean: -63.41587155963303
  episode_reward_min: -92.58000000000001
  episodes_this_iter: 327
  episodes_total: 4145
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.4857685595430354
          entropy_coeff: 0.0
          kl: 0.014829379887335653
          policy_loss: -0.033187251833958493
          total_loss: 282.9528103407993
          vf_explained_var: -5.183809547014134e-06
          vf_loss: 282.97598678424794
        model: {}
    num_agent_steps_sampled: 68000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,17,166.566,68000,-63.4159,-48.55,-92.58,12.2416


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,17,166.566,68000,-63.4159,-48.55,-92.58,12.2416


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 72000
  custom_metrics: {}
  date: 2022-03-13_23-04-40
  done: false
  episode_len_mean: 12.151975683890578
  episode_media: {}
  episode_reward_max: -51.11999999999999
  episode_reward_mean: -62.28927051671732
  episode_reward_min: -85.62
  episodes_this_iter: 329
  episodes_total: 4474
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.4557920725114883
          entropy_coeff: 0.0
          kl: 0.012752548787007493
          policy_loss: -0.03215042478104513
          total_loss: 271.2777299614363
          vf_explained_var: -3.1453306956957747e-06
          vf_loss: 271.3012713196457
        model: {}
    num_agent_steps_sampled: 72000
    num_agent_ste

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,18,176.228,72000,-62.2893,-51.12,-85.62,12.152


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,18,176.228,72000,-62.2893,-51.12,-85.62,12.152


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 76000
  custom_metrics: {}
  date: 2022-03-13_23-04-50
  done: false
  episode_len_mean: 12.136778115501519
  episode_media: {}
  episode_reward_max: -46.59
  episode_reward_mean: -61.299908814589664
  episode_reward_min: -86.49000000000001
  episodes_this_iter: 329
  episodes_total: 4803
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.4223536860558295
          entropy_coeff: 0.0
          kl: 0.010166581778629244
          policy_loss: -0.030621681654305066
          total_loss: 261.7104318270119
          vf_explained_var: -2.0151497215353035e-06
          vf_loss: 261.73419063116916
        model: {}
    num_agent_steps_sampled: 76000
    num_agent_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,19,186.167,76000,-61.2999,-46.59,-86.49,12.1368


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,19,186.167,76000,-61.2999,-46.59,-86.49,12.1368


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 80000
  custom_metrics: {}
  date: 2022-03-13_23-05-00
  done: false
  episode_len_mean: 12.139393939393939
  episode_media: {}
  episode_reward_max: -44.99
  episode_reward_mean: -60.96872727272727
  episode_reward_min: -86.82
  episodes_this_iter: 330
  episodes_total: 5133
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.3971477664927001
          entropy_coeff: 0.0
          kl: 0.009234670168019982
          policy_loss: -0.02723864465200853
          total_loss: 259.8475347211284
          vf_explained_var: -1.2938694287371892e-06
          vf_loss: 259.86854010141025
        model: {}
    num_agent_steps_sampled: 80000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,20,196.116,80000,-60.9687,-44.99,-86.82,12.1394


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 84000
  custom_metrics: {}
  date: 2022-03-13_23-05-10
  done: false
  episode_len_mean: 12.127272727272727
  episode_media: {}
  episode_reward_max: -46.47
  episode_reward_mean: -60.34863636363636
  episode_reward_min: -94.5
  episodes_this_iter: 330
  episodes_total: 5463
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.3596879684796896
          entropy_coeff: 0.0
          kl: 0.010473048501850131
          policy_loss: -0.028973452574373174
          total_loss: 251.29998410132623
          vf_explained_var: -8.484368683189474e-07
          vf_loss: 251.32188783050864
        model: {}
    num_agent_steps_sampled: 84000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,21,205.812,84000,-60.3486,-46.47,-94.5,12.1273


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,21,205.812,84000,-60.3486,-46.47,-94.5,12.1273


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 88000
  custom_metrics: {}
  date: 2022-03-13_23-05-20
  done: false
  episode_len_mean: 12.106060606060606
  episode_media: {}
  episode_reward_max: -44.56
  episode_reward_mean: -59.139787878787885
  episode_reward_min: -76.18
  episodes_this_iter: 330
  episodes_total: 5793
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.3237874909113811
          entropy_coeff: 0.0
          kl: 0.011270650075807888
          policy_loss: -0.030727513523782373
          total_loss: 236.86514792493594
          vf_explained_var: -5.631036655877226e-07
          vf_loss: 236.88826676235405
        model: {}
    num_agent_steps_sampled: 88000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,22,215.788,88000,-59.1398,-44.56,-76.18,12.1061


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,22,215.788,88000,-59.1398,-44.56,-76.18,12.1061


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 92000
  custom_metrics: {}
  date: 2022-03-13_23-05-29
  done: false
  episode_len_mean: 12.075528700906345
  episode_media: {}
  episode_reward_max: -44.31
  episode_reward_mean: -57.4860422960725
  episode_reward_min: -77.63
  episodes_this_iter: 331
  episodes_total: 6124
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.267694112690546
          entropy_coeff: 0.0
          kl: 0.009318825094540625
          policy_loss: -0.02949205435842516
          total_loss: 230.009647738549
          vf_explained_var: -3.5942241709719423e-07
          vf_loss: 230.03284997427335
        model: {}
    num_agent_steps_sampled: 92000
    num_agent_steps_trained: 92

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,23,225.439,92000,-57.486,-44.31,-77.63,12.0755


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,23,225.439,92000,-57.486,-44.31,-77.63,12.0755


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 96000
  custom_metrics: {}
  date: 2022-03-13_23-05-39
  done: false
  episode_len_mean: 12.066265060240964
  episode_media: {}
  episode_reward_max: -45.61
  episode_reward_mean: -56.87072289156627
  episode_reward_min: -76.5
  episodes_this_iter: 332
  episodes_total: 6456
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.2423620395762947
          entropy_coeff: 0.0
          kl: 0.009060710947274734
          policy_loss: -0.025519386234302676
          total_loss: 219.47662061465684
          vf_explained_var: -2.3136856735393566e-07
          vf_loss: 219.49602396975283
        model: {}
    num_agent_steps_sampled: 96000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,24,235.192,96000,-56.8707,-45.61,-76.5,12.0663


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,24,235.192,96000,-56.8707,-45.61,-76.5,12.0663


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 100000
  custom_metrics: {}
  date: 2022-03-13_23-05-49
  done: false
  episode_len_mean: 12.033132530120483
  episode_media: {}
  episode_reward_max: -45.68
  episode_reward_mean: -55.86596385542169
  episode_reward_min: -70.94000000000001
  episodes_this_iter: 332
  episodes_total: 6788
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.2063295341307116
          entropy_coeff: 0.0
          kl: 0.009893187833719126
          policy_loss: -0.03126999000487949
          total_loss: 212.58195111674647
          vf_explained_var: -1.3984659666656166e-07
          vf_loss: 212.6065433789325
        model: {}
    num_agent_steps_sampled: 100000
    num_agent_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,25,245.311,100000,-55.866,-45.68,-70.94,12.0331


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,25,245.311,100000,-55.866,-45.68,-70.94,12.0331


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 104000
  custom_metrics: {}
  date: 2022-03-13_23-05-59
  done: false
  episode_len_mean: 12.081570996978853
  episode_media: {}
  episode_reward_max: -44.23
  episode_reward_mean: -55.7708761329305
  episode_reward_min: -81.60000000000001
  episodes_this_iter: 331
  episodes_total: 7119
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.175454928413514
          entropy_coeff: 0.0
          kl: 0.008667914315098583
          policy_loss: -0.02991784865997972
          total_loss: 211.52603991108555
          vf_explained_var: -9.177833475092406e-08
          vf_loss: 211.55010791081253
        model: {}
    num_agent_steps_sampled: 104000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,26,255.248,104000,-55.7709,-44.23,-81.6,12.0816


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 108000
  custom_metrics: {}
  date: 2022-03-13_23-06-09
  done: false
  episode_len_mean: 12.072289156626505
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -54.62626506024097
  episode_reward_min: -73.47
  episodes_this_iter: 332
  episodes_total: 7451
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.1403400236560453
          entropy_coeff: 0.0
          kl: 0.009046411026105566
          policy_loss: -0.023695112712761406
          total_loss: 208.17249393258044
          vf_explained_var: -5.345190725018901e-08
          vf_loss: 208.19008255825247
        model: {}
    num_agent_steps_sampled: 108000
    num_agent_steps_train

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,27,264.984,108000,-54.6263,-43.15,-73.47,12.0723


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,27,264.984,108000,-54.6263,-43.15,-73.47,12.0723


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 112000
  custom_metrics: {}
  date: 2022-03-13_23-06-19
  done: false
  episode_len_mean: 12.051204819277109
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -54.51243975903615
  episode_reward_min: -73.49000000000001
  episodes_this_iter: 332
  episodes_total: 7783
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.1127895989725667
          entropy_coeff: 0.0
          kl: 0.010246423396002648
          policy_loss: -0.029001582806970003
          total_loss: 202.36200640278477
          vf_explained_var: -4.7042805661437335e-08
          vf_loss: 202.38409151466945
        model: {}
    num_agent_steps_sampled: 112000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,28,274.962,112000,-54.5124,-43.15,-73.49,12.0512


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,28,274.962,112000,-54.5124,-43.15,-73.49,12.0512


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 116000
  custom_metrics: {}
  date: 2022-03-13_23-06-29
  done: false
  episode_len_mean: 12.051359516616314
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -53.02202416918429
  episode_reward_min: -76.19
  episodes_this_iter: 331
  episodes_total: 8114
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.0454060183417413
          entropy_coeff: 0.0
          kl: 0.009647022556628639
          policy_loss: -0.031149647900614367
          total_loss: 194.39151614609585
          vf_explained_var: -1.384365943170363e-08
          vf_loss: 194.41615423387097
        model: {}
    num_agent_steps_sampled: 116000
    num_agent_steps_train

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,29,285.02,116000,-53.022,-43.15,-76.19,12.0514


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,29,285.02,116000,-53.022,-43.15,-76.19,12.0514


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 120000
  custom_metrics: {}
  date: 2022-03-13_23-06-39
  done: false
  episode_len_mean: 12.03903903903904
  episode_media: {}
  episode_reward_max: -43.11
  episode_reward_mean: -52.52927927927928
  episode_reward_min: -80.52
  episodes_this_iter: 333
  episodes_total: 8447
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 1.0139335982902076
          entropy_coeff: 0.0
          kl: 0.0079971848371568
          policy_loss: -0.022940527175062446
          total_loss: 187.74896455170006
          vf_explained_var: -1.1856837939190608e-08
          vf_loss: 187.76650681034212
        model: {}
    num_agent_steps_sampled: 120000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,30,294.885,120000,-52.5293,-43.11,-80.52,12.039


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,30,294.885,120000,-52.5293,-43.11,-80.52,12.039


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 124000
  custom_metrics: {}
  date: 2022-03-13_23-06-49
  done: false
  episode_len_mean: 12.051204819277109
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -51.17707831325301
  episode_reward_min: -68.02
  episodes_this_iter: 332
  episodes_total: 8779
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.9537220103125419
          entropy_coeff: 0.0
          kl: 0.009586700461236101
          policy_loss: -0.022055233675005136
          total_loss: 183.90809258901945
          vf_explained_var: -3.3327328261508738e-09
          vf_loss: 183.92367739113428
        model: {}
    num_agent_steps_sampled: 124000
    num_agent_steps_trai

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,31,304.899,124000,-51.1771,-43.15,-68.02,12.0512


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,31,304.899,124000,-51.1771,-43.15,-68.02,12.0512


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 128000
  custom_metrics: {}
  date: 2022-03-13_23-06-59
  done: false
  episode_len_mean: 12.051204819277109
  episode_media: {}
  episode_reward_max: -42.690000000000005
  episode_reward_mean: -52.13210843373494
  episode_reward_min: -67.52999999999999
  episodes_this_iter: 332
  episodes_total: 9111
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.96981913569153
          entropy_coeff: 0.0
          kl: 0.008778698827590935
          policy_loss: -0.02426915149534902
          total_loss: 185.83262796709616
          vf_explained_var: -1.1408200827977991e-08
          vf_loss: 185.85097224738007
        model: {}
    num_agent_steps_sampled: 128000
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,32,314.948,128000,-52.1321,-42.69,-67.53,12.0512


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,32,314.948,128000,-52.1321,-42.69,-67.53,12.0512


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 132000
  custom_metrics: {}
  date: 2022-03-13_23-07-09
  done: false
  episode_len_mean: 12.057401812688822
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -51.57435045317221
  episode_reward_min: -68.65
  episodes_this_iter: 331
  episodes_total: 9442
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.9375901611902381
          entropy_coeff: 0.0
          kl: 0.008376266016157439
          policy_loss: -0.025926853583685013
          total_loss: 182.34688791254516
          vf_explained_var: 3.845460953251008e-10
          vf_loss: 182.36716146161478
        model: {}
    num_agent_steps_sampled: 132000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,33,324.832,132000,-51.5744,-43.15,-68.65,12.0574


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 136000
  custom_metrics: {}
  date: 2022-03-13_23-07-19
  done: false
  episode_len_mean: 12.045180722891565
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -51.72120481927711
  episode_reward_min: -78.29999999999998
  episodes_this_iter: 332
  episodes_total: 9774
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.9228860231496955
          entropy_coeff: 0.0
          kl: 0.0068155711841681585
          policy_loss: -0.024429641729120606
          total_loss: 185.7365695420132
          vf_explained_var: -3.5250058738134242e-09
          vf_loss: 185.75639750162762
        model: {}
    num_agent_steps_sampled: 136000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,34,334.56,136000,-51.7212,-43.15,-78.3,12.0452


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,34,334.56,136000,-51.7212,-43.15,-78.3,12.0452


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 140000
  custom_metrics: {}
  date: 2022-03-13_23-07-29
  done: false
  episode_len_mean: 12.042042042042041
  episode_media: {}
  episode_reward_max: -42.98
  episode_reward_mean: -50.49711711711712
  episode_reward_min: -72.99
  episodes_this_iter: 333
  episodes_total: 10107
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8685204782152689
          entropy_coeff: 0.0
          kl: 0.007471573912306512
          policy_loss: -0.02243694888367768
          total_loss: 178.7418521183793
          vf_explained_var: -1.922730476625504e-09
          vf_loss: 178.75924598939957
        model: {}
    num_agent_steps_sampled: 140000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,35,344.472,140000,-50.4971,-42.98,-72.99,12.042


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,35,344.472,140000,-50.4971,-42.98,-72.99,12.042


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 144000
  custom_metrics: {}
  date: 2022-03-13_23-07-39
  done: false
  episode_len_mean: 12.057401812688822
  episode_media: {}
  episode_reward_max: -43.11
  episode_reward_mean: -50.547794561933536
  episode_reward_min: -66.34
  episodes_this_iter: 331
  episodes_total: 10438
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8649282684890173
          entropy_coeff: 0.0
          kl: 0.008494573613446132
          policy_loss: -0.026450188600167793
          total_loss: 176.28805066180485
          vf_explained_var: -4.61455314390121e-09
          vf_loss: 176.30876766738072
        model: {}
    num_agent_steps_sampled: 144000
    num_agent_steps_trai

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,36,354.676,144000,-50.5478,-43.11,-66.34,12.0574


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,36,354.676,144000,-50.5478,-43.11,-66.34,12.0574


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 148000
  custom_metrics: {}
  date: 2022-03-13_23-07-49
  done: false
  episode_len_mean: 12.03003003003003
  episode_media: {}
  episode_reward_max: -43.15
  episode_reward_mean: -49.41393393393393
  episode_reward_min: -63.95
  episodes_this_iter: 333
  episodes_total: 10771
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8156871045789411
          entropy_coeff: 0.0
          kl: 0.00702929640446314
          policy_loss: -0.019195786274729235
          total_loss: 168.619070746309
          vf_explained_var: -6.409101588751681e-10
          vf_loss: 168.6335218368038
        model: {}
    num_agent_steps_sampled: 148000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,37,364.977,148000,-49.4139,-43.15,-63.95,12.03


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,37,364.977,148000,-49.4139,-43.15,-63.95,12.03


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 152000
  custom_metrics: {}
  date: 2022-03-13_23-07-59
  done: false
  episode_len_mean: 12.05421686746988
  episode_media: {}
  episode_reward_max: -42.06999999999999
  episode_reward_mean: -48.913674698795184
  episode_reward_min: -68.16000000000001
  episodes_this_iter: 332
  episodes_total: 11103
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8008583520689319
          entropy_coeff: 0.0
          kl: 0.008113112802013331
          policy_loss: -0.020998168784764504
          total_loss: 167.6403279786469
          vf_explained_var: -2.4995496196131554e-09
          vf_loss: 167.65584969469296
        model: {}
    num_agent_steps_sampled: 152000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,38,375.193,152000,-48.9137,-42.07,-68.16,12.0542


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,38,375.193,152000,-48.9137,-42.07,-68.16,12.0542


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 156000
  custom_metrics: {}
  date: 2022-03-13_23-08-10
  done: false
  episode_len_mean: 12.015015015015015
  episode_media: {}
  episode_reward_max: -42.98
  episode_reward_mean: -48.50582582582583
  episode_reward_min: -67.49000000000001
  episodes_this_iter: 333
  episodes_total: 11436
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8012809389380998
          entropy_coeff: 0.0
          kl: 0.008072730945294416
          policy_loss: -0.019257390680372393
          total_loss: 164.36555725425802
          vf_explained_var: -5.768191429876512e-10
          vf_loss: 164.37936583488218
        model: {}
    num_agent_steps_sampled: 156000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,39,385.521,156000,-48.5058,-42.98,-67.49,12.015


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,39,385.521,156000,-48.5058,-42.98,-67.49,12.015


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 160000
  custom_metrics: {}
  date: 2022-03-13_23-08-20
  done: false
  episode_len_mean: 12.02710843373494
  episode_media: {}
  episode_reward_max: -43.11
  episode_reward_mean: -49.224216867469885
  episode_reward_min: -79.15
  episodes_this_iter: 332
  episodes_total: 11768
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8059066932688477
          entropy_coeff: 0.0
          kl: 0.008578147256245108
          policy_loss: -0.025407996890886176
          total_loss: 171.56978169102823
          vf_explained_var: -1.4100023495253696e-09
          vf_loss: 171.5894000145697
        model: {}
    num_agent_steps_sampled: 160000
    num_agent_steps_trai

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,40,395.813,160000,-49.2242,-43.11,-79.15,12.0271


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,40,395.813,160000,-49.2242,-43.11,-79.15,12.0271


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 164000
  custom_metrics: {}
  date: 2022-03-13_23-08-31
  done: false
  episode_len_mean: 12.03003003003003
  episode_media: {}
  episode_reward_max: -42.690000000000005
  episode_reward_mean: -49.404084084084076
  episode_reward_min: -71.02000000000001
  episodes_this_iter: 333
  episodes_total: 12101
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.7734253932711899
          entropy_coeff: 0.0
          kl: 0.006783952282701753
          policy_loss: -0.019813320869880337
          total_loss: 168.31232757568358
          vf_explained_var: -2.0509125084005376e-09
          vf_loss: 168.3275617866106
        model: {}
    num_agent_steps_sampled: 164000

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,41,406.273,164000,-49.4041,-42.69,-71.02,12.03


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,41,406.273,164000,-49.4041,-42.69,-71.02,12.03


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 168000
  custom_metrics: {}
  date: 2022-03-13_23-08-41
  done: false
  episode_len_mean: 12.015015015015015
  episode_media: {}
  episode_reward_max: -42.12
  episode_reward_mean: -48.88039039039039
  episode_reward_min: -63.589999999999996
  episodes_this_iter: 333
  episodes_total: 12434
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.7609769935249
          entropy_coeff: 0.0
          kl: 0.006729501719440518
          policy_loss: -0.020982649761141947
          total_loss: 160.75002265848138
          vf_explained_var: -4.61455314390121e-09
          vf_loss: 160.76646318333124
        model: {}
    num_agent_steps_sampled: 168000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,42,416.562,168000,-48.8804,-42.12,-63.59,12.015


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,42,416.562,168000,-48.8804,-42.12,-63.59,12.015


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 172000
  custom_metrics: {}
  date: 2022-03-13_23-08-52
  done: false
  episode_len_mean: 12.021084337349398
  episode_media: {}
  episode_reward_max: -42.98
  episode_reward_mean: -48.28728915662651
  episode_reward_min: -64.14
  episodes_this_iter: 332
  episodes_total: 12766
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.7299574163011325
          entropy_coeff: 0.0
          kl: 0.008001464159079636
          policy_loss: -0.02436288960543411
          total_loss: 160.8165732270928
          vf_explained_var: -3.268641810263357e-09
          vf_loss: 160.83553465156146
        model: {}
    num_agent_steps_sampled: 172000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,43,427.202,172000,-48.2873,-42.98,-64.14,12.0211


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,43,427.202,172000,-48.2873,-42.98,-64.14,12.0211


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 176000
  custom_metrics: {}
  date: 2022-03-13_23-09-02
  done: false
  episode_len_mean: 12.03003003003003
  episode_media: {}
  episode_reward_max: -42.97
  episode_reward_mean: -47.949249249249256
  episode_reward_min: -68.44
  episodes_this_iter: 333
  episodes_total: 13099
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6806788139445807
          entropy_coeff: 0.0
          kl: 0.007646539586066302
          policy_loss: -0.02045685448884083
          total_loss: 159.3055680962019
          vf_explained_var: -6.40910158875168e-11
          vf_loss: 159.32086286647345
        model: {}
    num_agent_steps_sampled: 176000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,44,437.839,176000,-47.9492,-42.97,-68.44,12.03


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,44,437.839,176000,-47.9492,-42.97,-68.44,12.03


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 180000
  custom_metrics: {}
  date: 2022-03-13_23-09-13
  done: false
  episode_len_mean: 12.015015015015015
  episode_media: {}
  episode_reward_max: -42.690000000000005
  episode_reward_mean: -47.16393393393393
  episode_reward_min: -60.88
  episodes_this_iter: 333
  episodes_total: 13432
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6639380885708717
          entropy_coeff: 0.0
          kl: 0.004590130307837574
          policy_loss: -0.01361508025657586
          total_loss: 155.16102068501135
          vf_explained_var: 2.1790945401755713e-09
          vf_loss: 155.17153781357632
        model: {}
    num_agent_steps_sampled: 180000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,45,448.428,180000,-47.1639,-42.69,-60.88,12.015


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,45,448.428,180000,-47.1639,-42.69,-60.88,12.015


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 184000
  custom_metrics: {}
  date: 2022-03-13_23-09-23
  done: false
  episode_len_mean: 12.012012012012011
  episode_media: {}
  episode_reward_max: -42.690000000000005
  episode_reward_mean: -47.19399399399399
  episode_reward_min: -65.72
  episodes_this_iter: 333
  episodes_total: 13765
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6399588528499809
          entropy_coeff: 0.0
          kl: 0.009962439644944877
          policy_loss: -0.02175139768289462
          total_loss: 154.22729625086632
          vf_explained_var: 3.0763687626008064e-09
          vf_loss: 154.24568474882392
        model: {}
    num_agent_steps_sampled: 184000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,46,458.919,184000,-47.194,-42.69,-65.72,12.012


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,46,458.919,184000,-47.194,-42.69,-65.72,12.012


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 188000
  custom_metrics: {}
  date: 2022-03-13_23-09-34
  done: false
  episode_len_mean: 12.018018018018019
  episode_media: {}
  episode_reward_max: -42.06999999999999
  episode_reward_mean: -47.24780780780781
  episode_reward_min: -61.36
  episodes_this_iter: 333
  episodes_total: 14098
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6569933616345929
          entropy_coeff: 0.0
          kl: 0.023839802537804048
          policy_loss: -0.03030281941636756
          total_loss: 155.41644213276524
          vf_explained_var: -1.60227539718792e-09
          vf_loss: 155.43869929159843
        model: {}
    num_agent_steps_sampled: 188000
    num_agent_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,47,469.581,188000,-47.2478,-42.07,-61.36,12.018


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,47,469.581,188000,-47.2478,-42.07,-61.36,12.018


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 192000
  custom_metrics: {}
  date: 2022-03-13_23-09-45
  done: false
  episode_len_mean: 12.018072289156626
  episode_media: {}
  episode_reward_max: -42.06999999999999
  episode_reward_mean: -47.60659638554217
  episode_reward_min: -63.93
  episodes_this_iter: 332
  episodes_total: 14430
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6391934690936919
          entropy_coeff: 0.0
          kl: 0.0065446188596110086
          policy_loss: -0.02021409445241975
          total_loss: 151.09862157349946
          vf_explained_var: 6.152737525201613e-09
          vf_loss: 151.1155224051527
        model: {}
    num_agent_steps_sampled: 192000
    num_agent_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,48,480.085,192000,-47.6066,-42.07,-63.93,12.0181


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,48,480.085,192000,-47.6066,-42.07,-63.93,12.0181


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 196000
  custom_metrics: {}
  date: 2022-03-13_23-09-55
  done: false
  episode_len_mean: 12.018018018018019
  episode_media: {}
  episode_reward_max: -42.06999999999999
  episode_reward_mean: -47.176276276276276
  episode_reward_min: -63.95000000000001
  episodes_this_iter: 333
  episodes_total: 14763
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6174364643712198
          entropy_coeff: 0.0
          kl: 0.021032697003465765
          policy_loss: -0.037842283073452214
          total_loss: 146.00206705729167
          vf_explained_var: -4.3581890803511426e-09
          vf_loss: 146.02926182900705
        model: {}
    num_agent_steps_sampled: 19600

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,49,490.447,196000,-47.1763,-42.07,-63.95,12.018


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,RUNNING,127.0.0.1:27234,49,490.447,196000,-47.1763,-42.07,-63.95,12.018


Result for PPOTrainer_my_env_5d5b7_00000:
  agent_timesteps_total: 200000
  custom_metrics: {}
  date: 2022-03-13_23-10-06
  done: true
  episode_len_mean: 12.03003003003003
  episode_media: {}
  episode_reward_max: -42.690000000000005
  episode_reward_mean: -50.06555555555556
  episode_reward_min: -73.18
  episodes_this_iter: 333
  episodes_total: 15096
  experiment_id: 3957756f55bc4d9a8b08a7cd6eb29e57
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.7593750000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.7507079626283338
          entropy_coeff: 0.0
          kl: 0.010414600522083716
          policy_loss: -0.02518911186645749
          total_loss: 167.40628174812562
          vf_explained_var: -3.0763687626008064e-09
          vf_loss: 167.423561457152
        model: {}
    num_agent_steps_sampled: 200000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_5d5b7_00000,TERMINATED,127.0.0.1:27234,50,501.084,200000,-50.0656,-42.69,-73.18,12.03


[2m[36m(RolloutWorker pid=27232)[0m 2022-03-13 23:10:06,751	ERROR worker.py:432 -- SystemExit was raised from the worker.
[2m[36m(RolloutWorker pid=27232)[0m Traceback (most recent call last):
[2m[36m(RolloutWorker pid=27232)[0m   File "python/ray/_raylet.pyx", line 629, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=27232)[0m   File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=27232)[0m   File "python/ray/_raylet.pyx", line 640, in ray._raylet.execute_task
[2m[36m(RolloutWorker pid=27232)[0m   File "python/ray/_raylet.pyx", line 589, in ray._raylet.execute_task.function_executor
[2m[36m(RolloutWorker pid=27232)[0m   File "/Users/vladimirsudakov/.pyenv/versions/3.9.10/lib/python3.9/site-packages/ray/_private/function_manager.py", line 639, in actor_method_executor
[2m[36m(RolloutWorker pid=27232)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(RolloutWorker pid=27232)[0m   File "/Users/vladimirsud

In [17]:
# Restore the agent from the checkpoint the experiment reports as its best,
# so the evaluation below uses trained weights.
best_checkpoint = experiment.best_checkpoint
agent.restore(best_checkpoint)

2022-03-13 23:10:06,905	INFO trainable.py:472 -- Restored on 127.0.0.1 from checkpoint: /Users/vladimirsudakov/ray_results/PPOTrainer_2022-03-13_23-01-26/PPOTrainer_my_env_5d5b7_00000_0_2022-03-13_23-01-26/checkpoint_000050/checkpoint-50
2022-03-13 23:10:06,906	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 50, '_timesteps_total': 200000, '_time_total': 501.08385014533997, '_episodes_total': 15096}


In [18]:
# Roll out a single greedy episode (explore=False) with the restored agent and
# print the route it produces together with its total length `g`.
#
# The route is tracked in distance-matrix indices: it starts at city 0, and each
# env action `a` is recorded as city `a + 1` (the action space has N-1 entries).
#
# NOTE(review): every chosen action is counted as an actual move. If the
# environment rejects repeat visits without moving, `g` and `actions` would
# over-count in that case — confirm against MyEnv.step.
MAX_EVAL_STEPS = 1000  # hard cap so a policy that never finishes cannot spin forever

obs = env.reset()
g = 0
actions = [0]  # route starts at city 0
for _step in range(MAX_EVAL_STEPS):
    action = agent.compute_single_action(obs, explore=False)
    obs, reward, done, info = env.step(action)
    # Add the leg from the previously recorded city to the newly chosen one.
    g += M[actions[-1], action + 1]
    actions.append(action + 1)
    if done:
        obs = env.reset()
        g += M[actions[-1], 0]  # close the tour by returning to city 0
        print(f"{actions}, g = {g}")
        break
else:
    # The step budget ran out without the env signalling `done`. Previously the
    # cell finished silently here, leaving a partial route that looked valid.
    print(
        f"Episode did not finish within {MAX_EVAL_STEPS} steps; "
        f"partial route: {actions}, g = {g}"
    )
env.close()

[0, 5, 10, 7, 1, 12, 3, 2, 4, 11, 8, 6, 9], g = 43.15
