In [49]:
import numpy as np
import gym
from stable_baselines3 import PPO
import math
import itertools
import datetime
import ray
import ray.rllib as rllib
from ray import tune
from ray.tune.registry import register_env

In [2]:
# Number of cities in the tour (city 0 is the depot).
N = 10
# Fixed seed so the distance matrix is identical on every run.
rand = np.random.RandomState(1)
# Asymmetric pairwise distances drawn from Normal(5, 1.5), kept to 2 decimals.
M = rand.normal(5, 1.5, size=(N, N)).round(2)
# A city has zero distance to itself.
np.fill_diagonal(M, 0)
M

array([[0.  , 4.08, 4.21, 3.39, 6.3 , 1.55, 7.62, 3.86, 5.48, 4.63],
       [7.19, 0.  , 4.52, 4.42, 6.7 , 3.35, 4.74, 3.68, 5.06, 5.87],
       [3.35, 6.72, 0.  , 5.75, 6.35, 3.97, 4.82, 3.6 , 4.6 , 5.8 ],
       [3.96, 4.4 , 3.97, 0.  , 3.99, 4.98, 3.32, 5.35, 7.49, 6.11],
       [4.71, 3.67, 3.88, 7.54, 0.  , 4.04, 5.29, 8.15, 5.18, 5.93],
       [5.45, 4.47, 3.29, 4.48, 4.69, 0.  , 6.26, 6.4 , 5.43, 6.33],
       [3.87, 6.88, 5.77, 4.55, 5.73, 4.89, 0.  , 7.28, 8.28, 2.91],
       [2.83, 4.24, 5.24, 6.31, 5.47, 1.97, 4.54, 0.  , 5.35, 6.14],
       [4.67, 4.7 , 5.28, 5.62, 5.3 , 5.18, 3.99, 5.57, 0.  , 6.69],
       [6.8 , 5.28, 4.44, 4.04, 5.64, 5.12, 4.48, 5.07, 4.07, 0.  ]])

In [3]:
# Size of the search space: with city 0 fixed as the start, the remaining
# N-1 cities can be ordered in (N-1)! ways.
math.factorial(N-1)

362880

In [22]:
class MyEnv(gym.Env):
    """Travelling-salesman environment over the module-level distance matrix ``M``.

    The tour always starts at city 0. Action ``a`` (``0 <= a < N-1``) asks to
    move to city ``a + 1``.

    Observation (``gym.spaces.Dict``):
        ``'visited'``: binary vector of length ``N-1``; entry ``a`` is 1 once
            city ``a + 1`` has been visited.
        ``'last'``: index of the city the agent currently stands in.

    Reward:
        * moving to an unvisited city: minus the distance from the current city;
        * choosing an already visited city: ``REVISIT_PENALTY`` (no move happens);
        * on the step that completes the tour, the distance back to city 0 is
          subtracted as well.

    The episode ends when every city has been visited.
    """

    # Reward returned when the agent picks a city it has already visited.
    REVISIT_PENALTY = -10.0

    def __init__(self, env_config):
        """Build the spaces. ``env_config`` is accepted for RLlib compatibility but unused."""
        super().__init__()
        self.n = N-1
        self.action_space = gym.spaces.Discrete(self.n)
        self.observation_space = gym.spaces.Dict({
            'visited': gym.spaces.MultiBinary(self.n),
            'last': gym.spaces.Discrete(N)})

    def _observation(self):
        """Return a snapshot of the state so callers never alias internal arrays."""
        return {'visited': self.state['visited'].copy(),
                'last': self.state['last']}

    def reset(self):
        """Start a new tour at city 0 with nothing visited and return the first observation."""
        # int8 matches the dtype of the MultiBinary observation space.
        self.state = {'visited': np.zeros(self.n, dtype=np.int8), 'last': 0}
        return self._observation()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``."""
        # Policies may hand over NumPy scalars or 0-d arrays; normalise to a
        # plain int so ``state['last']`` stays a regular Python integer.
        action = int(action)
        city = action + 1

        if self.state['visited'][action] == 1:
            # Invalid choice: the agent stays where it is and is penalised.
            self.reward = self.REVISIT_PENALTY
        else:
            self.state['visited'][action] = 1
            self.reward = -float(M[self.state['last'], city])
            self.state['last'] = city

        self.done = bool(np.all(self.state['visited'] == 1))
        if self.done:
            # Close the loop: add the cost of returning to the start city.
            self.reward -= float(M[city, 0])

        return self._observation(), self.reward, self.done, {}

In [23]:
# Start from RLlib's PPO defaults and override only what this notebook needs:
# CPU-only training, a single rollout worker, the PyTorch backend, and an
# empty env_config.
config = rllib.agents.ppo.DEFAULT_CONFIG.copy()
config.update({
    "num_gpus": 0,
    "num_workers": 1,
    "framework": "torch",
    "env_config": {},
})

In [24]:
# Local environment instance used for SB3 training and for the rollouts below.
# The constructor expects the env-specific settings, not the whole trainer
# config, so pass the same "env_config" entry RLlib hands to its own copies.
env = MyEnv(config["env_config"])

In [25]:
# Train a Stable-Baselines3 PPO agent on the dict observation; the two
# timestamps bracket the run so the wall-clock cost is visible in the output.
print(datetime.datetime.now())
model = PPO(policy="MultiInputPolicy", env=env, verbose=1).learn(total_timesteps=100_000)
print(datetime.datetime.now())

2022-03-13 19:55:25.546275
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 27.3     |
|    ep_rew_mean     | -233     |
| time/              |          |
|    fps             | 1777     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 23.2        |
|    ep_rew_mean          | -193        |
| time/                   |             |
|    fps                  | 1378        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005818229 |
|    clip_fraction        | 0.00898     |
|    clip_range           | 0.2    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 12          |
|    ep_rew_mean          | -80.8       |
| time/                   |             |
|    fps                  | 1046        |
|    iterations           | 11          |
|    time_elapsed         | 21          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.018074024 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93       |
|    explained_variance   | -6.2e-05    |
|    learning_rate        | 0.0003      |
|    loss                 | 452         |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0253     |
|    value_loss           | 811         |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 11      

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.23        |
|    ep_rew_mean          | -49         |
| time/                   |             |
|    fps                  | 1037        |
|    iterations           | 21          |
|    time_elapsed         | 41          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.014562448 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.33       |
|    explained_variance   | -4.53e-06   |
|    learning_rate        | 0.0003      |
|    loss                 | 98.7        |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.0166     |
|    value_loss           | 180         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.18  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.1         |
|    ep_rew_mean          | -43.9       |
| time/                   |             |
|    fps                  | 1002        |
|    iterations           | 31          |
|    time_elapsed         | 63          |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.009600232 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.02       |
|    explained_variance   | -9.54e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 63.9        |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.0138     |
|    value_loss           | 128         |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.1   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.06        |
|    ep_rew_mean          | -40.9       |
| time/                   |             |
|    fps                  | 993         |
|    iterations           | 41          |
|    time_elapsed         | 84          |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.011049003 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.75       |
|    explained_variance   | -2.38e-07   |
|    learning_rate        | 0.0003      |
|    loss                 | 46.5        |
|    n_updates            | 400         |
|    policy_gradient_loss | -0.0106     |
|    value_loss           | 114         |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 9.04

In [28]:
# Greedy (deterministic) rollout of the SB3 policy: follow it from city 0,
# record the visited cities and accumulate the tour length `g`.
obs = env.reset()
g = 0
actions = [0]
for _step in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    # model.predict returns a NumPy value; convert once so the recorded tour
    # holds plain ints and prints the same under any NumPy version.
    action = int(action)
    city = action + 1
    obs, reward, done, info = env.step(action)
    g += M[actions[-1], city]
    actions.append(city)
    if done:
        obs = env.reset()  # leave the env ready for the next rollout
        g += M[actions[-1], 0]  # return leg back to the start city
        print(f"{actions}, g = {g}")
        break
else:
    # A deterministic policy that picks an already visited city repeats that
    # choice forever; report it instead of finishing silently.
    print("Policy did not complete a tour within 1000 steps")
env.close()

[0, 5, 2, 8, 6, 9, 3, 4, 1, 7], g = 34.55


In [27]:
# Exhaustive baseline: evaluate every tour that starts (and ends) at city 0
# and keep the cheapest one. Progress is printed every 100000 candidates.
x_min = []
g_min = 100000000
print(datetime.datetime.now())
for i, rest in enumerate(itertools.permutations(range(1, N))):
    x = (0, *rest)
    # Cost of the consecutive legs plus the closing leg back to the start.
    legs = M[x[:-1], x[1:]]
    g = legs.sum() + M[x[-1], x[0]]
    if g < g_min:
        g_min, x_min = g, x
    if i % 100000 == 0:
        print(i, x_min, g_min)
print(datetime.datetime.now())
print('Optimal solution:')
print(x_min, g_min)

2022-03-13 19:58:01.039612
0 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) 54.75999999999999
100000 (0, 3, 4, 1, 7, 5, 2, 6, 9, 8) 36.46
200000 (0, 5, 2, 8, 6, 9, 3, 4, 1, 7) 34.55
300000 (0, 5, 2, 8, 6, 9, 3, 4, 1, 7) 34.55
2022-03-13 19:58:06.565740
Optimal solution:
(0, 5, 2, 8, 6, 9, 3, 4, 1, 7) 34.55


In [32]:
# Shut down any Ray runtime left over from a previous execution of this
# notebook, then start a fresh one, so the cell can be re-run safely.
ray.shutdown()
ray.init()

{'node_ip_address': '127.0.0.1',
 'raylet_ip_address': '127.0.0.1',
 'redis_address': '127.0.0.1:63062',
 'object_store_address': '/tmp/ray/session_2022-03-13_20-03-32_400944_23142/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-03-13_20-03-32_400944_23142/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-03-13_20-03-32_400944_23142',
 'metrics_export_port': 64725,
 'gcs_address': '127.0.0.1:59736',
 'node_id': '6bad5deaf6db8bf243f707c7d8fedd276054729843fc4a681c988362'}

In [33]:
# RLlib PPO trainer built from the overrides in `config`; the environment is
# passed as a class (not an instance), unlike the SB3 model above.
agent = rllib.agents.ppo.PPOTrainer(config=config, env=MyEnv)



In [34]:
# Run 31 PPO training iterations with RLlib. Every 10th iteration (0, 10, 20,
# 30) a short summary of the episode statistics is printed and a checkpoint
# is written. The timestamps bracket the whole run.
print(datetime.datetime.now())
for i in range(31):
    result = agent.train()
    if i % 10 != 0:
        continue

    print('i: ', i)
    for label, metric in (
        ('mean episode length:', 'episode_len_mean'),
        ('max episode reward:', 'episode_reward_max'),
        ('mean episode reward:', 'episode_reward_mean'),
        ('min episode reward:', 'episode_reward_min'),
        ('total episodes:', 'episodes_total'),
    ):
        print(label, result[metric])
    print()

    checkpoint = agent.save()
print(datetime.datetime.now())

2022-03-13 20:03:46.564128
i:  0
mean episode length: 25.35031847133758
max episode reward: -72.99000000000001
mean episode reward: -214.08828025477706
min episode reward: -661.95
total episodes: 157

i:  10
mean episode length: 9.2830626450116
max episode reward: -37.42
mean episode reward: -49.96266821345708
min episode reward: -81.33999999999997
total episodes: 3438

i:  20
mean episode length: 9.018018018018019
max episode reward: -34.95
mean episode reward: -41.2081981981982
min episode reward: -58.39
total episodes: 7849

i:  30
mean episode length: 9.00900900900901
max episode reward: -34.55
mean episode reward: -38.070923423423416
min episode reward: -53.459999999999994
total episodes: 12286

2022-03-13 20:09:12.743320


In [44]:
# Greedy rollout of the RLlib policy (exploration disabled): follow it from
# city 0, record the visited cities and accumulate the tour length `g`.
obs = env.reset()
g = 0
actions = [0]
for _step in range(1000):
    # Convert to a plain int so the recorded tour prints the same regardless
    # of whether the trainer returns a Python or a NumPy integer.
    action = int(agent.compute_single_action(obs, explore=False))
    city = action + 1
    obs, reward, done, info = env.step(action)
    g += M[actions[-1], city]
    actions.append(city)
    if done:
        obs = env.reset()  # leave the env ready for the next rollout
        g += M[actions[-1], 0]  # return leg back to the start city
        print(f"{actions}, g = {g}")
        break
else:
    # A deterministic policy that picks an already visited city repeats that
    # choice forever; report it instead of finishing silently.
    print("Policy did not complete a tour within 1000 steps")
env.close()

[0, 3, 6, 9, 8, 1, 7, 5, 4, 2], g = 35.96


In [50]:
def env_creator(env_config):
    """Factory for the registry: build a MyEnv from the given ``env_config``."""
    return MyEnv(env_config)  # a new environment instance on every call

# Make the environment available under the name "my_env" so it can be
# referenced by string in a config.
register_env("my_env", env_creator)

In [60]:
# Train PPO on the registered "my_env" through Ray Tune: CPU only, one
# rollout worker, PyTorch backend.
ppo_tune_config = {
    "env": "my_env",
    "num_gpus": 0,
    "num_workers": 1,
    "framework": "torch",
}

experiment = tune.run(
    rllib.agents.ppo.PPOTrainer,
    config=ppo_tune_config,
    # Stop after 50 training iterations and keep a final checkpoint.
    stop={"training_iteration": 50},
    checkpoint_at_end=True,
    # Trials are ranked by the highest mean episode reward.
    metric="episode_reward_mean",
    mode="max",
)

Trial name,status,loc
PPOTrainer_my_env_246f0_00000,PENDING,


[2m[36m(PPOTrainer pid=25313)[0m 2022-03-13 20:58:23,620	INFO ppo.py:249 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=25313)[0m 2022-03-13 20:58:23,621	INFO trainer.py:790 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313




Trial name,status,loc
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313


Trial name,status,loc
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2022-03-13_20-58-44
  done: false
  episode_len_mean: 24.169696969696968
  episode_media: {}
  episode_reward_max: -67.26000000000002
  episode_reward_mean: -202.11
  episode_reward_min: -547.58
  episodes_this_iter: 165
  episodes_total: 165
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.1869085142689366
          entropy_coeff: 0.0
          kl: 0.010416611763871889
          policy_loss: -0.017940580628571973
          total_loss: 17717.23331548219
          vf_explained_var: -0.005470780147019253
          vf_loss: 17717.249311155912
        model: {}
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,1,10.7653,4000,-202.11,-67.26,-547.58,24.1697


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,1,10.7653,4000,-202.11,-67.26,-547.58,24.1697


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2022-03-13_20-58-53
  done: false
  episode_len_mean: 20.50769230769231
  episode_media: {}
  episode_reward_max: -61.769999999999996
  episode_reward_mean: -165.90123076923078
  episode_reward_min: -505.57
  episodes_this_iter: 195
  episodes_total: 360
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.154120001997999
          entropy_coeff: 0.0
          kl: 0.012742371890986056
          policy_loss: -0.03244756844135061
          total_loss: 10110.85639123404
          vf_explained_var: -0.0022907166070835565
          vf_loss: 10110.886298460602
        model: {}
    num_agent_steps_sampled: 8000
    num_agent_step

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,2,20.6346,8000,-165.901,-61.77,-505.57,20.5077


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,2,20.6346,8000,-165.901,-61.77,-505.57,20.5077


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2022-03-13_20-59-04
  done: false
  episode_len_mean: 18.341013824884794
  episode_media: {}
  episode_reward_max: -57.99000000000001
  episode_reward_mean: -144.20225806451612
  episode_reward_min: -348.49
  episodes_this_iter: 217
  episodes_total: 577
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.1048850702983075
          entropy_coeff: 0.0
          kl: 0.015234115387509546
          policy_loss: -0.047512277585243984
          total_loss: 6102.27501968876
          vf_explained_var: -0.0010356727466788343
          vf_loss: 6102.319471527428
        model: {}
    num_agent_steps_sampled: 12000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,3,31.6271,12000,-144.202,-57.99,-348.49,18.341


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,3,31.6271,12000,-144.202,-57.99,-348.49,18.341


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,3,31.6271,12000,-144.202,-57.99,-348.49,18.341


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2022-03-13_20-59-17
  done: false
  episode_len_mean: 15.307984790874524
  episode_media: {}
  episode_reward_max: -41.849999999999994
  episode_reward_mean: -113.18239543726234
  episode_reward_min: -417.45
  episodes_this_iter: 263
  episodes_total: 840
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 2.0255061267524637
          entropy_coeff: 0.0
          kl: 0.019774362284651254
          policy_loss: -0.04597226970538657
          total_loss: 3060.0108110981605
          vf_explained_var: -0.0006109822180963331
          vf_loss: 3060.052819889848
        model: {}
    num_agent_steps_sampled: 16000
    num_agent_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,4,44.1217,16000,-113.182,-41.85,-417.45,15.308


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,4,44.1217,16000,-113.182,-41.85,-417.45,15.308


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2022-03-13_20-59-27
  done: false
  episode_len_mean: 13.634812286689419
  episode_media: {}
  episode_reward_max: -44.99
  episode_reward_mean: -96.64587030716723
  episode_reward_min: -257.02000000000004
  episodes_this_iter: 293
  episodes_total: 1133
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.929888112827014
          entropy_coeff: 0.0
          kl: 0.018203599315653254
          policy_loss: -0.04980972930638781
          total_loss: 1403.4335122385332
          vf_explained_var: -0.00042828231729486935
          vf_loss: 1403.47967601489
        model: {}
    num_agent_steps_sampled: 20000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,5,54.3173,20000,-96.6459,-44.99,-257.02,13.6348


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,5,54.3173,20000,-96.6459,-44.99,-257.02,13.6348


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2022-03-13_20-59-38
  done: false
  episode_len_mean: 12.148484848484848
  episode_media: {}
  episode_reward_max: -41.54
  episode_reward_mean: -81.37893939393939
  episode_reward_min: -224.70999999999998
  episodes_this_iter: 330
  episodes_total: 1463
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.8097208298662657
          entropy_coeff: 0.0
          kl: 0.022804446145701544
          policy_loss: -0.050020931622073536
          total_loss: 815.7091121878675
          vf_explained_var: -0.0003478830860507104
          vf_loss: 815.7545713363155
        model: {}
    num_agent_steps_sampled: 24000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,6,65.3151,24000,-81.3789,-41.54,-224.71,12.1485


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,6,65.3151,24000,-81.3789,-41.54,-224.71,12.1485


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2022-03-13_20-59-49
  done: false
  episode_len_mean: 10.705093833780161
  episode_media: {}
  episode_reward_max: -38.650000000000006
  episode_reward_mean: -66.68739946380697
  episode_reward_min: -137.01
  episodes_this_iter: 373
  episodes_total: 1836
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3
          cur_lr: 5.0000000000000016e-05
          entropy: 1.662133575126689
          entropy_coeff: 0.0
          kl: 0.022450827190429002
          policy_loss: -0.05199338691609521
          total_loss: 460.6173517042591
          vf_explained_var: -0.00035958956646662887
          vf_loss: 460.66261065083165
        model: {}
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,7,75.6713,28000,-66.6874,-38.65,-137.01,10.7051


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,7,75.6713,28000,-66.6874,-38.65,-137.01,10.7051


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2022-03-13_20-59-59
  done: false
  episode_len_mean: 10.0175
  episode_media: {}
  episode_reward_max: -38.5
  episode_reward_mean: -58.8716
  episode_reward_min: -96.22
  episodes_this_iter: 400
  episodes_total: 2236
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.5355883276590736
          entropy_coeff: 0.0
          kl: 0.019613028891350647
          policy_loss: -0.04002921180123643
          total_loss: 320.93981356056787
          vf_explained_var: -0.0003111586775830997
          vf_loss: 320.9710161106561
        model: {}
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
    num_steps_sa

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,8,85.8265,32000,-58.8716,-38.5,-96.22,10.0175


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,8,85.8265,32000,-58.8716,-38.5,-96.22,10.0175


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2022-03-13_21-00-10
  done: false
  episode_len_mean: 9.523809523809524
  episode_media: {}
  episode_reward_max: -38.09
  episode_reward_mean: -53.725904761904765
  episode_reward_min: -89.61
  episodes_this_iter: 420
  episodes_total: 2656
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.418420535518277
          entropy_coeff: 0.0
          kl: 0.015737991138505925
          policy_loss: -0.03441798966978827
          total_loss: 246.68678047426286
          vf_explained_var: -0.00023680835641840453
          vf_loss: 246.7141150033602
        model: {}
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,9,96.5772,36000,-53.7259,-38.09,-89.61,9.52381


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,9,96.5772,36000,-53.7259,-38.09,-89.61,9.52381


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2022-03-13_21-00-20
  done: false
  episode_len_mean: 9.341121495327103
  episode_media: {}
  episode_reward_max: -38.32
  episode_reward_mean: -51.07794392523365
  episode_reward_min: -83.49000000000001
  episodes_this_iter: 428
  episodes_total: 3084
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.3249229505497921
          entropy_coeff: 0.0
          kl: 0.01386727163819735
          policy_loss: -0.031868641348856114
          total_loss: 199.4613044984879
          vf_explained_var: -0.00016164215662146128
          vf_loss: 199.486933308263
        model: {}
    num_agent_steps_sampled: 40000
    num_agent_steps

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,10,107.455,40000,-51.0779,-38.32,-83.49,9.34112


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,10,107.455,40000,-51.0779,-38.32,-83.49,9.34112


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2022-03-13_21-00-31
  done: false
  episode_len_mean: 9.190804597701149
  episode_media: {}
  episode_reward_max: -36.57
  episode_reward_mean: -48.108827586206885
  episode_reward_min: -77.14
  episodes_this_iter: 435
  episodes_total: 3519
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.2650363245318013
          entropy_coeff: 0.0
          kl: 0.014320323320877137
          policy_loss: -0.0316133506941579
          total_loss: 170.04851935602002
          vf_explained_var: -0.00010505222505138766
          vf_loss: 170.07368890905892
        model: {}
    num_agent_steps_sampled: 44000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,11,117.509,44000,-48.1088,-36.57,-77.14,9.1908


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,11,117.509,44000,-48.1088,-36.57,-77.14,9.1908


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2022-03-13_21-00-41
  done: false
  episode_len_mean: 9.132420091324201
  episode_media: {}
  episode_reward_max: -37.379999999999995
  episode_reward_mean: -46.91664383561643
  episode_reward_min: -78.66
  episodes_this_iter: 438
  episodes_total: 3957
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.2046924210363819
          entropy_coeff: 0.0
          kl: 0.014924376245917173
          policy_loss: -0.029970368137082426
          total_loss: 161.16663153863723
          vf_explained_var: -6.591402074342132e-05
          vf_loss: 161.18988537532027
        model: {}
    num_agent_steps_sampled: 48000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,12,127.874,48000,-46.9166,-37.38,-78.66,9.13242


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 52000
  custom_metrics: {}
  date: 2022-03-13_21-00-51
  done: false
  episode_len_mean: 9.125284738041003
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -45.94683371298405
  episode_reward_min: -74.25
  episodes_this_iter: 439
  episodes_total: 4396
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.1334780238007987
          entropy_coeff: 0.0
          kl: 0.012472789236653476
          policy_loss: -0.02372160964164763
          total_loss: 149.4071323353757
          vf_explained_var: -4.750336370160503e-05
          vf_loss: 149.42524162210444
        model: {}
    num_agent_steps_sampled: 52000
    num_agent_steps_trained: 5

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,13,137.814,52000,-45.9468,-34.55,-74.25,9.12528


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,13,137.814,52000,-45.9468,-34.55,-74.25,9.12528


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 56000
  custom_metrics: {}
  date: 2022-03-13_21-01-01
  done: false
  episode_len_mean: 9.058956916099774
  episode_media: {}
  episode_reward_max: -36.43
  episode_reward_mean: -44.33743764172335
  episode_reward_min: -62.68
  episodes_this_iter: 441
  episodes_total: 4837
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.0681496679141957
          entropy_coeff: 0.0
          kl: 0.009453748714856295
          policy_loss: -0.0208396596809028
          total_loss: 137.50833036361203
          vf_explained_var: -2.84197509929698e-05
          vf_loss: 137.5249160028273
        model: {}
    num_agent_steps_sampled: 56000
    num_agent_steps_trained: 560

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,14,147.759,56000,-44.3374,-36.43,-62.68,9.05896


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,14,147.759,56000,-44.3374,-36.43,-62.68,9.05896


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 60000
  custom_metrics: {}
  date: 2022-03-13_21-01-11
  done: false
  episode_len_mean: 9.036117381489841
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -43.27645598194131
  episode_reward_min: -59.27
  episodes_this_iter: 443
  episodes_total: 5280
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 1.0331625364800936
          entropy_coeff: 0.0
          kl: 0.010514178109263401
          policy_loss: -0.01808428455003968
          total_loss: 127.6260109932192
          vf_explained_var: -1.9936023219939203e-05
          vf_loss: 127.63936386108398
        model: {}
    num_agent_steps_sampled: 60000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,15,157.392,60000,-43.2765,-34.55,-59.27,9.03612


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,15,157.392,60000,-43.2765,-34.55,-59.27,9.03612


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 64000
  custom_metrics: {}
  date: 2022-03-13_21-01-20
  done: false
  episode_len_mean: 9.033860045146726
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -42.65223476297969
  episode_reward_min: -66.69
  episodes_this_iter: 443
  episodes_total: 5723
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.9582410618182151
          entropy_coeff: 0.0
          kl: 0.010972671590146522
          policy_loss: -0.019844784853999972
          total_loss: 126.2795539855957
          vf_explained_var: -1.2753727615520518e-05
          vf_loss: 126.29446093651556
        model: {}
    num_agent_steps_sampled: 64000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,16,167.285,64000,-42.6522,-34.55,-66.69,9.03386


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,16,167.285,64000,-42.6522,-34.55,-66.69,9.03386


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 68000
  custom_metrics: {}
  date: 2022-03-13_21-01-30
  done: false
  episode_len_mean: 9.042986425339366
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -41.824321266968326
  episode_reward_min: -65.05000000000001
  episodes_this_iter: 442
  episodes_total: 6165
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.9012484795303755
          entropy_coeff: 0.0
          kl: 0.01056406904204488
          policy_loss: -0.020728957375151014
          total_loss: 120.30025172900127
          vf_explained_var: -7.474294272802209e-06
          vf_loss: 120.31622677874822
        model: {}
    num_agent_steps_sampled: 68000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,17,177.22,68000,-41.8243,-34.55,-65.05,9.04299


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,17,177.22,68000,-41.8243,-34.55,-65.05,9.04299


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 72000
  custom_metrics: {}
  date: 2022-03-13_21-01-41
  done: false
  episode_len_mean: 9.024830699774267
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -41.028939051918734
  episode_reward_min: -58.95
  episodes_this_iter: 443
  episodes_total: 6608
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8671911156946613
          entropy_coeff: 0.0
          kl: 0.008321157953894721
          policy_loss: -0.015891157643949633
          total_loss: 114.28608715098392
          vf_explained_var: -5.159839507072202e-06
          vf_loss: 114.29823350803827
        model: {}
    num_agent_steps_sampled: 72000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,18,187.337,72000,-41.0289,-34.55,-58.95,9.02483


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,18,187.337,72000,-41.0289,-34.55,-58.95,9.02483


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 76000
  custom_metrics: {}
  date: 2022-03-13_21-01-51
  done: false
  episode_len_mean: 9.029345372460497
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -41.035553047404065
  episode_reward_min: -57.410000000000004
  episodes_this_iter: 443
  episodes_total: 7051
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.8212604757278196
          entropy_coeff: 0.0
          kl: 0.007697947748606133
          policy_loss: -0.021308956577152174
          total_loss: 113.86837054016769
          vf_explained_var: -3.3572155942199053e-06
          vf_loss: 113.88621566116169
        model: {}
    num_agent_steps_sampled: 76000
    num_agent

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,19,197.465,76000,-41.0356,-34.55,-57.41,9.02935


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,19,197.465,76000,-41.0356,-34.55,-57.41,9.02935


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 80000
  custom_metrics: {}
  date: 2022-03-13_21-02-01
  done: false
  episode_len_mean: 9.018018018018019
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -40.27934684684684
  episode_reward_min: -54.9
  episodes_this_iter: 444
  episodes_total: 7495
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.7534793317958873
          entropy_coeff: 0.0
          kl: 0.007773673441396784
          policy_loss: -0.013847881583597071
          total_loss: 112.83649703815419
          vf_explained_var: -1.9691323721280663e-06
          vf_loss: 112.84684665638913
        model: {}
    num_agent_steps_sampled: 80000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,20,207.525,80000,-40.2793,-34.55,-54.9,9.01802


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,20,207.525,80000,-40.2793,-34.55,-54.9,9.01802


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 84000
  custom_metrics: {}
  date: 2022-03-13_21-02-12
  done: false
  episode_len_mean: 9.013513513513514
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -39.702432432432424
  episode_reward_min: -53.260000000000005
  episodes_this_iter: 444
  episodes_total: 7939
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.7127429850639836
          entropy_coeff: 0.0
          kl: 0.007662637106209064
          policy_loss: -0.014094210320442755
          total_loss: 106.75508580361644
          vf_explained_var: -1.277462128669985e-06
          vf_loss: 106.7657316556541
        model: {}
    num_agent_steps_sampled: 84000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,21,218.343,84000,-39.7024,-34.55,-53.26,9.01351


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,21,218.343,84000,-39.7024,-34.55,-53.26,9.01351


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 88000
  custom_metrics: {}
  date: 2022-03-13_21-02-22
  done: false
  episode_len_mean: 9.01805869074492
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -39.457336343115124
  episode_reward_min: -54.13
  episodes_this_iter: 443
  episodes_total: 8382
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.677664577832786
          entropy_coeff: 0.0
          kl: 0.010512921081901978
          policy_loss: -0.017637275876377218
          total_loss: 106.32807055237473
          vf_explained_var: -7.575558077904486e-07
          vf_loss: 106.34097711706674
        model: {}
    num_agent_steps_sampled: 88000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,22,228.433,88000,-39.4573,-34.55,-54.13,9.01806


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,22,228.433,88000,-39.4573,-34.55,-54.13,9.01806


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 92000
  custom_metrics: {}
  date: 2022-03-13_21-02-32
  done: false
  episode_len_mean: 9.006756756756756
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.52716216216216
  episode_reward_min: -49.5
  episodes_this_iter: 444
  episodes_total: 8826
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6271405819923647
          entropy_coeff: 0.0
          kl: 0.007320765488184168
          policy_loss: -0.013221965879902885
          total_loss: 104.08047031689716
          vf_explained_var: -4.31076172859438e-07
          vf_loss: 104.09039821829847
        model: {}
    num_agent_steps_sampled: 92000
    num_agent_steps_trained: 9

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,23,238.564,92000,-38.5272,-34.55,-49.5,9.00676


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 96000
  custom_metrics: {}
  date: 2022-03-13_21-02-42
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.59234234234234
  episode_reward_min: -50.14
  episodes_this_iter: 444
  episodes_total: 9270
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6179040621044816
          entropy_coeff: 0.0
          kl: 0.007664411338800447
          policy_loss: -0.01834332604632182
          total_loss: 102.13607667492282
          vf_explained_var: -2.826413800639491e-07
          vf_loss: 102.15097071329753
        model: {}
    num_agent_steps_sampled: 96000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,24,248.601,96000,-38.5923,-34.55,-50.14,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,24,248.601,96000,-38.5923,-34.55,-50.14,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 100000
  custom_metrics: {}
  date: 2022-03-13_21-02-52
  done: false
  episode_len_mean: 9.018018018018019
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.937139639639646
  episode_reward_min: -61.72
  episodes_this_iter: 444
  episodes_total: 9714
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.6368687182344416
          entropy_coeff: 0.0
          kl: 0.005887084357165662
          policy_loss: -0.01290008273145925
          total_loss: 102.54351006784746
          vf_explained_var: -2.407258556735131e-07
          vf_loss: 102.5537614719842
        model: {}
    num_agent_steps_sampled: 100000
    num_agent_steps_trained

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,25,258.544,100000,-38.9371,-34.55,-61.72,9.01802


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,25,258.544,100000,-38.9371,-34.55,-61.72,9.01802


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 104000
  custom_metrics: {}
  date: 2022-03-13_21-03-02
  done: false
  episode_len_mean: 9.018018018018019
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.34132882882883
  episode_reward_min: -51.96
  episodes_this_iter: 444
  episodes_total: 10158
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5717040165137219
          entropy_coeff: 0.0
          kl: 0.005894402486425325
          policy_loss: -0.01187083242581256
          total_loss: 100.84093860913349
          vf_explained_var: -1.113901856125042e-07
          vf_loss: 100.85015719013829
        model: {}
    num_agent_steps_sampled: 104000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,26,268.791,104000,-38.3413,-34.55,-51.96,9.01802


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,26,268.791,104000,-38.3413,-34.55,-51.96,9.01802


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 108000
  custom_metrics: {}
  date: 2022-03-13_21-03-13
  done: false
  episode_len_mean: 9.00900900900901
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.399346846846846
  episode_reward_min: -52.860000000000014
  episodes_this_iter: 444
  episodes_total: 10602
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5406810501249888
          entropy_coeff: 0.0
          kl: 0.011512988323543636
          policy_loss: -0.017822712514128897
          total_loss: 101.74177713701802
          vf_explained_var: -5.486190959971438e-08
          vf_loss: 101.75441892070155
        model: {}
    num_agent_steps_sampled: 108000
    num_agen

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,27,279.131,108000,-38.3993,-34.55,-52.86,9.00901


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,27,279.131,108000,-38.3993,-34.55,-52.86,9.00901


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 112000
  custom_metrics: {}
  date: 2022-03-13_21-03-23
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.80887387387387
  episode_reward_min: -56.64
  episodes_this_iter: 444
  episodes_total: 11046
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.555786165114372
          entropy_coeff: 0.0
          kl: 0.026193735897898952
          policy_loss: -0.03420510951014015
          total_loss: 98.10125210669733
          vf_explained_var: -1.2177293018628193e-08
          vf_loss: 98.12367055749381
        model: {}
    num_agent_steps_sampled: 112000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,28,289.362,112000,-37.8089,-34.55,-56.64,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,28,289.362,112000,-37.8089,-34.55,-56.64,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 116000
  custom_metrics: {}
  date: 2022-03-13_21-03-33
  done: false
  episode_len_mean: 9.002252252252251
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.186013513513515
  episode_reward_min: -52.29
  episodes_this_iter: 444
  episodes_total: 11490
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5813959655582264
          entropy_coeff: 0.0
          kl: 0.0074123584037785375
          policy_loss: -0.01465238471845946
          total_loss: 99.20127612698462
          vf_explained_var: -1.1664564891528058e-08
          vf_loss: 99.21092509608116
        model: {}
    num_agent_steps_sampled: 116000
    num_agent_steps_train

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,29,299.39,116000,-38.186,-34.55,-52.29,9.00225


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,29,299.39,116000,-38.186,-34.55,-52.29,9.00225


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 120000
  custom_metrics: {}
  date: 2022-03-13_21-03-43
  done: false
  episode_len_mean: 9.01126126126126
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.17745495495496
  episode_reward_min: -55.59
  episodes_this_iter: 444
  episodes_total: 11934
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5795674312178807
          entropy_coeff: 0.0
          kl: 0.00623288634641976
          policy_loss: -0.013204227325816949
          total_loss: 99.61238667477843
          vf_explained_var: -1.160047387564054e-08
          vf_loss: 99.62138354393744
        model: {}
    num_agent_steps_sampled: 120000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,30,309.68,120000,-38.1775,-34.55,-55.59,9.01126


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,30,309.68,120000,-38.1775,-34.55,-55.59,9.01126


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 124000
  custom_metrics: {}
  date: 2022-03-13_21-03-53
  done: false
  episode_len_mean: 9.01126126126126
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.98198198198198
  episode_reward_min: -55.56
  episodes_this_iter: 444
  episodes_total: 12378
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.6750000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5362977639321358
          entropy_coeff: 0.0
          kl: 0.004821978095738002
          policy_loss: -0.011356109334155918
          total_loss: 98.38180839784684
          vf_explained_var: 3.3968238420383906e-09
          vf_loss: 98.38990946123677
        model: {}
    num_agent_steps_sampled: 124000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,31,319.766,124000,-37.982,-34.55,-55.56,9.01126


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,31,319.766,124000,-37.982,-34.55,-55.56,9.01126


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 128000
  custom_metrics: {}
  date: 2022-03-13_21-04-03
  done: false
  episode_len_mean: 9.0
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.41425675675676
  episode_reward_min: -49.0
  episodes_this_iter: 444
  episodes_total: 12822
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.49627011267728705
          entropy_coeff: 0.0
          kl: 0.01087451899509903
          policy_loss: -0.015948100689478138
          total_loss: 96.37075685890773
          vf_explained_var: 5.768191429876512e-09
          vf_loss: 96.38303457280641
        model: {}
    num_agent_steps_sampled: 128000
    num_agent_steps_trained: 128000
    num

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,32,329.907,128000,-37.4143,-34.55,-49,9


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,32,329.907,128000,-37.4143,-34.55,-49,9


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 132000
  custom_metrics: {}
  date: 2022-03-13_21-04-14
  done: false
  episode_len_mean: 9.004494382022472
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.858471910112364
  episode_reward_min: -51.6
  episodes_this_iter: 445
  episodes_total: 13267
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5178590604054031
          entropy_coeff: 0.0
          kl: 0.008751763272091307
          policy_loss: -0.014836361973736716
          total_loss: 95.6296647102602
          vf_explained_var: 2.307276571950605e-09
          vf_loss: 95.64154742661343
        model: {}
    num_agent_steps_sampled: 132000
    num_agent_steps_trained: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,33,340.025,132000,-37.8585,-34.55,-51.6,9.00449


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,33,340.025,132000,-37.8585,-34.55,-51.6,9.00449


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 136000
  custom_metrics: {}
  date: 2022-03-13_21-04-24
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.546036036036035
  episode_reward_min: -51.76
  episodes_this_iter: 444
  episodes_total: 13711
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.49437407860832827
          entropy_coeff: 0.0
          kl: 0.010668689614975094
          policy_loss: -0.01532051372492025
          total_loss: 95.65723401961787
          vf_explained_var: 1.8586394607379872e-09
          vf_loss: 95.66895370483398
        model: {}
    num_agent_steps_sampled: 136000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,34,350.226,136000,-37.546,-34.55,-51.76,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,34,350.226,136000,-37.546,-34.55,-51.76,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 140000
  custom_metrics: {}
  date: 2022-03-13_21-04-34
  done: false
  episode_len_mean: 9.006756756756756
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.443108108108106
  episode_reward_min: -71.72000000000001
  episodes_this_iter: 444
  episodes_total: 14155
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.4554839047052527
          entropy_coeff: 0.0
          kl: 0.01051376704030148
          policy_loss: -0.018900828534156405
          total_loss: 96.20273778771842
          vf_explained_var: -2.563640635500672e-10
          vf_loss: 96.2180903445008
        model: {}
    num_agent_steps_sampled: 140000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,35,360.344,140000,-37.4431,-34.55,-71.72,9.00676


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,35,360.344,140000,-37.4431,-34.55,-71.72,9.00676


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 144000
  custom_metrics: {}
  date: 2022-03-13_21-04-44
  done: false
  episode_len_mean: 9.00900900900901
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.16914414414415
  episode_reward_min: -48.55
  episodes_this_iter: 444
  episodes_total: 14599
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.41939333895201325
          entropy_coeff: 0.0
          kl: 0.010491754343785718
          policy_loss: -0.018556796927105195
          total_loss: 95.03955927202779
          vf_explained_var: 1.922730476625504e-09
          vf_loss: 95.05457533969674
        model: {}
    num_agent_steps_sampled: 144000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,36,370.499,144000,-37.1691,-34.55,-48.55,9.00901


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,36,370.499,144000,-37.1691,-34.55,-48.55,9.00901


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 148000
  custom_metrics: {}
  date: 2022-03-13_21-04-55
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -36.96605855855856
  episode_reward_min: -48.10000000000001
  episodes_this_iter: 444
  episodes_total: 15043
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.36476701812077594
          entropy_coeff: 0.0
          kl: 0.008131061637112498
          policy_loss: -0.018833296654385424
          total_loss: 94.35895315190797
          vf_explained_var: 1.1536382859753024e-08
          vf_loss: 94.37504206011373
        model: {}
    num_agent_steps_sampled: 148000
    num_agent_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,37,381.098,148000,-36.9661,-34.55,-48.1,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,37,381.098,148000,-36.9661,-34.55,-48.1,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 152000
  custom_metrics: {}
  date: 2022-03-13_21-05-05
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -36.472995495495496
  episode_reward_min: -61.21
  episodes_this_iter: 444
  episodes_total: 15487
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.3135625531276067
          entropy_coeff: 0.0
          kl: 0.006090770693762135
          policy_loss: -0.01688960805204847
          total_loss: 91.61960936515563
          vf_explained_var: 0.0
          vf_loss: 91.63444297544417
        model: {}
    num_agent_steps_sampled: 152000
    num_agent_steps_trained: 152000
    num_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,38,391.542,152000,-36.473,-34.55,-61.21,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,38,391.542,152000,-36.473,-34.55,-61.21,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 156000
  custom_metrics: {}
  date: 2022-03-13_21-05-16
  done: false
  episode_len_mean: 9.01126126126126
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -36.02817567567568
  episode_reward_min: -52.419999999999995
  episodes_this_iter: 444
  episodes_total: 15931
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.2763679229924756
          entropy_coeff: 0.0
          kl: 0.005527848318581739
          policy_loss: -0.010133143496369162
          total_loss: 89.41425343995454
          vf_explained_var: 0.0
          vf_loss: 89.42252097181094
        model: {}
    num_agent_steps_sampled: 156000
    num_agent_steps_trained: 15600

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,39,402.189,156000,-36.0282,-34.55,-52.42,9.01126


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,39,402.189,156000,-36.0282,-34.55,-52.42,9.01126


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,39,402.189,156000,-36.0282,-34.55,-52.42,9.01126


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 160000
  custom_metrics: {}
  date: 2022-03-13_21-05-27
  done: false
  episode_len_mean: 9.002247191011236
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -36.12597752808989
  episode_reward_min: -49.230000000000004
  episodes_this_iter: 445
  episodes_total: 16376
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.3375000000000001
          cur_lr: 5.0000000000000016e-05
          entropy: 0.3163239077694954
          entropy_coeff: 0.0
          kl: 0.023991576097822733
          policy_loss: -0.02632016957617335
          total_loss: 91.36260872297389
          vf_explained_var: 1.922730476625504e-09
          vf_loss: 91.38083115444388
        model: {}
    num_agent_steps_sampled: 160000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,40,413.619,160000,-36.126,-34.55,-49.23,9.00225


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,40,413.619,160000,-36.126,-34.55,-49.23,9.00225


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 164000
  custom_metrics: {}
  date: 2022-03-13_21-05-38
  done: false
  episode_len_mean: 9.011286681715575
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.994650112866815
  episode_reward_min: -52.42000000000001
  episodes_this_iter: 443
  episodes_total: 16819
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.4752114858678592
          entropy_coeff: 0.0
          kl: 0.013167696371975249
          policy_loss: -0.021524062772752137
          total_loss: 95.84748549102456
          vf_explained_var: 3.845460953251008e-10
          vf_loss: 95.86234349076466
        model: {}
    num_agent_steps_sampled: 164000
    num_agent_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,41,424.55,164000,-37.9947,-34.55,-52.42,9.01129


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,41,424.55,164000,-37.9947,-34.55,-52.42,9.01129


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 168000
  custom_metrics: {}
  date: 2022-03-13_21-05-50
  done: false
  episode_len_mean: 9.01126126126126
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -38.11
  episode_reward_min: -51.18000000000001
  episodes_this_iter: 444
  episodes_total: 17263
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.5005513257557346
          entropy_coeff: 0.0
          kl: 0.009043920074865921
          policy_loss: -0.020532119345741087
          total_loss: 94.8194555139029
          vf_explained_var: 7.050011747626848e-10
          vf_loss: 94.83540917673419
        model: {}
    num_agent_steps_sampled: 168000
    num_agent_steps_trained: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,42,436.585,168000,-38.11,-34.55,-51.18,9.01126


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,42,436.585,168000,-38.11,-34.55,-51.18,9.01126


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 172000
  custom_metrics: {}
  date: 2022-03-13_21-06-01
  done: false
  episode_len_mean: 9.015765765765765
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.957927927927926
  episode_reward_min: -55.760000000000005
  episodes_this_iter: 444
  episodes_total: 17707
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.4501567998560526
          entropy_coeff: 0.0
          kl: 0.007739706019120954
          policy_loss: -0.0211852927486943
          total_loss: 95.62876488470262
          vf_explained_var: 0.0
          vf_loss: 95.64603222262475
        model: {}
    num_agent_steps_sampled: 172000
    num_agent_steps_trained: 17200

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,43,447.547,172000,-37.9579,-34.55,-55.76,9.01577


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,43,447.547,172000,-37.9579,-34.55,-55.76,9.01577


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 176000
  custom_metrics: {}
  date: 2022-03-13_21-06-12
  done: false
  episode_len_mean: 9.006756756756756
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.175202702702705
  episode_reward_min: -62.809999999999995
  episodes_this_iter: 444
  episodes_total: 18151
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.41292276177355036
          entropy_coeff: 0.0
          kl: 0.0077046209196772655
          policy_loss: -0.01917504625475054
          total_loss: 94.39148707236014
          vf_explained_var: 1.922730476625504e-09
          vf_loss: 94.40676181752194
        model: {}
    num_agent_steps_sampled: 176000
    num_agent

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,44,457.919,176000,-37.1752,-34.55,-62.81,9.00676


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,44,457.919,176000,-37.1752,-34.55,-62.81,9.00676


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 180000
  custom_metrics: {}
  date: 2022-03-13_21-06-22
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.16614864864865
  episode_reward_min: -52.84
  episodes_this_iter: 444
  episodes_total: 18595
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.4167494191597867
          entropy_coeff: 0.0
          kl: 0.006074871737153603
          policy_loss: -0.015028367786397857
          total_loss: 91.91019445439821
          vf_explained_var: 5.768191429876512e-09
          vf_loss: 91.92214780417822
        model: {}
    num_agent_steps_sampled: 180000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,45,468.481,180000,-37.1661,-34.55,-52.84,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,45,468.481,180000,-37.1661,-34.55,-52.84,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 184000
  custom_metrics: {}
  date: 2022-03-13_21-06-34
  done: false
  episode_len_mean: 9.006756756756756
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -36.96858108108109
  episode_reward_min: -53.30000000000001
  episodes_this_iter: 444
  episodes_total: 19039
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.38317044832373176
          entropy_coeff: 0.0
          kl: 0.005171998956442985
          policy_loss: -0.01551863614129283
          total_loss: 93.21905070479198
          vf_explained_var: 1.922730476625504e-09
          vf_loss: 93.23195065734207
        model: {}
    num_agent_steps_sampled: 184000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,46,480.538,184000,-36.9686,-34.55,-53.3,9.00676


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,46,480.538,184000,-36.9686,-34.55,-53.3,9.00676


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,46,480.538,184000,-36.9686,-34.55,-53.3,9.00676


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 188000
  custom_metrics: {}
  date: 2022-03-13_21-06-46
  done: false
  episode_len_mean: 9.004494382022472
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -36.43761797752809
  episode_reward_min: -49.55
  episodes_this_iter: 445
  episodes_total: 19484
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.36567976229613824
          entropy_coeff: 0.0
          kl: 0.016272100371987768
          policy_loss: -0.02382963051699022
          total_loss: 91.11174592048891
          vf_explained_var: 3.845460953251008e-09
          vf_loss: 91.12733798283402
        model: {}
    num_agent_steps_sampled: 188000
    num_agent_steps_trained:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,47,491.642,188000,-36.4376,-34.55,-49.55,9.00449


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,47,491.642,188000,-36.4376,-34.55,-49.55,9.00449


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 192000
  custom_metrics: {}
  date: 2022-03-13_21-06-57
  done: false
  episode_len_mean: 9.002252252252251
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.645270270270274
  episode_reward_min: -46.97
  episodes_this_iter: 444
  episodes_total: 19928
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.41541995630469375
          entropy_coeff: 0.0
          kl: 0.010450795166338988
          policy_loss: -0.020951320271768797
          total_loss: 92.48003971551054
          vf_explained_var: 1.922730476625504e-09
          vf_loss: 92.49570021270424
        model: {}
    num_agent_steps_sampled: 192000
    num_agent_steps_traine

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,48,503.196,192000,-37.6453,-34.55,-46.97,9.00225


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,48,503.196,192000,-37.6453,-34.55,-46.97,9.00225


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 196000
  custom_metrics: {}
  date: 2022-03-13_21-07-09
  done: false
  episode_len_mean: 9.004504504504505
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.549819819819824
  episode_reward_min: -52.410000000000004
  episodes_this_iter: 444
  episodes_total: 20372
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.4291917812119248
          entropy_coeff: 0.0
          kl: 0.009751440291049154
          policy_loss: -0.02014860720548939
          total_loss: 93.43053686490623
          vf_explained_var: 0.0
          vf_loss: 93.44574877831243
        model: {}
    num_agent_steps_sampled: 196000
    num_agent_steps_trained: 1960

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,49,514.588,196000,-37.5498,-34.55,-52.41,9.0045


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,RUNNING,127.0.0.1:25313,49,514.588,196000,-37.5498,-34.55,-52.41,9.0045


Result for PPOTrainer_my_env_246f0_00000:
  agent_timesteps_total: 200000
  custom_metrics: {}
  date: 2022-03-13_21-07-20
  done: true
  episode_len_mean: 9.006756756756756
  episode_media: {}
  episode_reward_max: -34.55
  episode_reward_mean: -37.12144144144145
  episode_reward_min: -55.96000000000001
  episodes_this_iter: 444
  episodes_total: 20816
  experiment_id: b01739d93d044cc4b13cba163f16684d
  hostname: MacBook-Pro-Vladimir.local
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.5062499999999999
          cur_lr: 5.0000000000000016e-05
          entropy: 0.37140962946158584
          entropy_coeff: 0.0
          kl: 0.006195005703386621
          policy_loss: -0.016394468080452692
          total_loss: 93.04049556280977
          vf_explained_var: 1.922730476625504e-09
          vf_loss: 93.05375423636488
        model: {}
    num_agent_steps_sampled: 200000
    num_agent_st

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPOTrainer_my_env_246f0_00000,TERMINATED,127.0.0.1:25313,50,526.16,200000,-37.1214,-34.55,-55.96,9.00676


2022-03-13 21:07:21,259	INFO tune.py:636 -- Total run time: 551.92 seconds (551.35 seconds for the tuning loop).


In [63]:
# Load the state stored at `experiment.best_checkpoint` into `agent`, so the
# rollout below evaluates the trained policy rather than a fresh one.
# NOTE(review): `agent` and `experiment` are created in earlier cells that are
# not visible here; presumably `experiment` is the tune.run() analysis object — confirm.
agent.restore(experiment.best_checkpoint)

2022-03-13 21:10:13,767	INFO trainable.py:472 -- Restored on 127.0.0.1 from checkpoint: /Users/vladimirsudakov/ray_results/PPOTrainer_2022-03-13_20-58-09/PPOTrainer_my_env_246f0_00000_0_2022-03-13_20-58-09/checkpoint_000050/checkpoint-50
2022-03-13 21:10:13,769	INFO trainable.py:480 -- Current state after restoring: {'_iteration': 50, '_timesteps_total': 200000, '_time_total': 526.1604516506195, '_episodes_total': 20816}


In [64]:
# Deterministic rollout of the restored policy.
# Starting from city 0, follow the agent's greedy actions (explore=False),
# record the order in which cities are visited and accumulate the tour length
# `g` from the distance matrix `M`, including the final leg back to city 0.
obs = env.reset()
g = 0             # accumulated tour length
actions = [0]     # visited cities in order; the tour always starts at city 0
for _ in range(1000):  # hard cap so a policy that never finishes cannot hang the cell
    action = agent.compute_single_action(obs, explore=False)
    obs, reward, done, info = env.step(action)
    city = action + 1  # env actions 0..N-2 correspond to cities 1..N-1
    if city in actions:
        # MyEnv.step rejects a repeated city (penalty only, `visited`/`last`
        # unchanged), so it is not a real move: do not add it to the tour or
        # to the tour length.
        continue
    g += M[actions[-1], city]
    actions.append(city)
    if done:
        obs = env.reset()
        g += M[actions[-1], 0]  # close the tour: return to the start city
        print(f"{actions}, g = {g}")
        break
else:
    # The loop ran out of steps without the environment reporting `done`.
    print(f"Tour not completed within the step limit: {actions}, g = {g}")
env.close()

[0, 5, 2, 8, 6, 9, 3, 4, 1, 7], g = 34.55
