In [1]:
import ray, random, os 
from ray.train.xgboost import XGBoostTrainer
from ray.air.config import ScalingConfig
import pandas as pd
ray.init(_temp_dir='/Volumes/SSD980/ray')
from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter
from env.multi_agent.hrl import HRL
from ray.train.rl import RLTrainer
from ray.rllib.policy.policy import Policy, PolicySpec
from ray.tune.registry import register_env
from ray.tune import TuneConfig
from ray.tune.logger import pretty_print
from ray.tune.search.bayesopt import BayesOptSearch
from ray.air.config import RunConfig, CheckpointConfig
from ray.tune.schedulers import HyperBandScheduler
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.search.hyperopt import HyperOptSearch

2023-09-08 17:32:38,168	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [4]:
env = HRL()

def env_creator(env_config):
    return HRL(env_config)  

register_env("hrl", env_creator)

def create_policy_spec(worker_id):
    print(f"Creating policy for {worker_id} with obs space {env.observation_space[worker_id]} and action space {env.action_space[worker_id]}")
    return PolicySpec(
        observation_space=env.observation_space[worker_id],
        action_space=env.action_space[worker_id],
        config={}
    )


manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)

policies = {
    "manager_policy": manager_policy_spec,
}
for worker_id in env.workers:
    policies[worker_id] = create_policy_spec(worker_id)

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return agent_id
    else:
        return "manager_policy"

search_space = {
     "lr": tune.uniform(1e-4, 1e-1),
    "gamma": tune.uniform(0.9, 0.99),
    "lambda": tune.uniform(0.9, 1.0),
    "entropy_coeff": tune.uniform(1e-4, 1e-1),
    "vf_loss_coeff": tune.uniform(0.1, 1.0),
}

config = TuneConfig(
    metric="episode_reward_mean",
    mode="max",
    search_alg=BayesOptSearch(),
    # scheduler=HyperBandScheduler(),
    num_samples=10,
    max_concurrent_trials=3
    )

run_config = RunConfig(
    name="Test",
    storage_path="/Volumes/SSD980/ray/results/test",
    verbose=2,
    # checkpoint_config=air.CheckpointConfig(checkpoint_frequency=2),
    callbacks=None,
    stop=None,
    failure_config=None,
    sync_config=None,
    checkpoint_config=None,
    progress_reporter=CLIReporter(max_progress_rows=10,max_report_frequency=10),

)

tuner = tune.Tuner(
    trainable="A2C",
    param_space={
        "env": "hrl",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "lr": tune.uniform(1e-4, 1e-1),
        "gamma": tune.uniform(0.9, 0.99),
        "lambda": tune.uniform(0.9, 1.0),
        "entropy_coeff": tune.uniform(1e-4, 1e-1),
        "vf_loss_coeff": tune.uniform(0.1, 1.0),
        "num_workers": 3,
    },
    run_config=run_config,
    # If tune_config is custom, you might still use it, but make sure it's correctly implemented
    tune_config=config
)
results = tuner.fit()
print(results.get_best_result(metric="episode_reward_mean", mode="max").config)



Creating policy for ABT.US with obs space Dict('current_cash': Box(-inf, inf, (1,), float32), 'current_price': Box(-inf, inf, (1,), float32), 'current_stock_exposure': Box(-inf, inf, (1,), float32), 'day': Box(-inf, inf, (1,), float32), 'pnl': Box(-inf, inf, (1,), float32), 'return_per_volatility': Box(-inf, inf, (1,), float32), 'shares_held': Box(-inf, inf, (1,), float32), 'tech_indicators': Box(-inf, inf, (14,), float32), 'total_costs': Box(-inf, inf, (1,), float32), 'total_trades': Box(-inf, inf, (1,), float32)) and action space Dict('amount': Box(0.0, 1.0, (1,), float32), 'type': Discrete(3))
Creating policy for AMGN.US with obs space Dict('current_cash': Box(-inf, inf, (1,), float32), 'current_price': Box(-inf, inf, (1,), float32), 'current_stock_exposure': Box(-inf, inf, (1,), float32), 'day': Box(-inf, inf, (1,), float32), 'pnl': Box(-inf, inf, (1,), float32), 'return_per_volatility': Box(-inf, inf, (1,), float32), 'shares_held': Box(-inf, inf, (1,), float32), 'tech_indicators':

  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
2023-09-08 17:46:30,376	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2023-09-08 17:46:31 (running for 00:00:00.94)
Using FIFO scheduling algorithm.
Logical resource usage: 0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 1/10 (1 PENDING)
+------------------+----------+-------+-----------------+----------+----------+----------+-----------------+
| Trial name       | status   | loc   |   entropy_coeff |    gamma |   lambda |       lr |   vf_loss_coeff |
|------------------+----------+-------+-----------------+----------+----------+----------+-----------------|
| A2C_hrl_d82fc844 | PENDING  |       |       0.0375166 | 0.985564 | 0.973199 | 0.059906 |        0.240417 |
+------------------+----------+-------+-----------------+----------+----------+----------+-----------------+






== Status ==
Current time: 2023-09-08 17:46:42 (running for 00:00:11.71)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 1/10 (1 PENDING)
+------------------+----------+-------+-----------------+----------+----------+----------+-----------------+
| Trial name       | status   | loc   |   entropy_coeff |    gamma |   lambda |       lr |   vf_loss_coeff |
|------------------+----------+-------+-----------------+----------+----------+----------+-----------------|
| A2C_hrl_d82fc844 | PENDING  |       |       0.0375166 | 0.985564 | 0.973199 | 0.059906 |        0.240417 |
+------------------+----------+-------+-----------------+----------+----------+----------+-----------------+






== Status ==
Current time: 2023-09-08 17:46:52 (running for 00:00:21.77)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 1/10 (1 PENDING)
+------------------+----------+-------+-----------------+----------+----------+----------+-----------------+
| Trial name       | status   | loc   |   entropy_coeff |    gamma |   lambda |       lr |   vf_loss_coeff |
|------------------+----------+-------+-----------------+----------+----------+----------+-----------------|
| A2C_hrl_d82fc844 | PENDING  |       |       0.0375166 | 0.985564 | 0.973199 | 0.059906 |        0.240417 |
+------------------+----------+-------+-----------------+----------+----------+----------+-----------------+




[2m[36m(A2C pid=27142)[0m Trainable.setup took 13.166 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-09-08 17:46:57,775	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_d82fc844
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause(

Trial name
A2C_hrl_1f95c59e
A2C_hrl_422dc560
A2C_hrl_57e8e16b
A2C_hrl_6be960a3
A2C_hrl_9ffc9c5e
A2C_hrl_c02bf1da
A2C_hrl_d82fc844
A2C_hrl_df4a4657
A2C_hrl_e39df62b
A2C_hrl_f237253f


[2m[36m(A2C pid=27142)[0m 2023-09-08 17:46:57,766	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. [36mray::RolloutWorker.apply()[39m (pid=27174, ip=127.0.0.1, actor_id=603ceabe2f944b56007ed4b501000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x17c283c10>)
[2m[36m(A2C pid=27142)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/utils/actor_manager.py", line 185, in apply
[2m[36m(A2C pid=27142)[0m     raise e
[2m[36m(A2C pid=27142)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/utils/actor_manager.py", line 176, in apply
[2m[36m(A2C pid=27142)[0m     return func(self, *args, **kwargs)
[2m[36m(A2C pid=27142)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/execution/rollout_ops.py", line 86, in <lambda>
[2m[36m(A2C pid=27142)[0m     lambda w: w.sample(), local_worker=False, healthy_only

== Status ==
Current time: 2023-09-08 17:47:02 (running for 00:00:31.80)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 2/10 (1 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_c02bf1da | PENDING  |                 |       0.0156839 | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |       0.0375166 | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
Number of errored trials: 1
+---



== Status ==
Current time: 2023-09-08 17:47:12 (running for 00:00:41.95)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 2/10 (1 ERROR, 1 RUNNING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_c02bf1da | RUNNING  | 127.0.0.1:27211 |       0.0156839 | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |       0.0375166 | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
Number of errored trials: 1
+---

2023-09-08 17:47:14,657	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_c02bf1da
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27211, ip=127.0.0.1, actor_id=2a38c29e88b7baed657e7e7b01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:47:22 (running for 00:00:51.89)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 3/10 (2 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_422dc560 | PENDING  |                 |      0.00215639 | 0.987292 | 0.983244 | 0.0213127 |        0.263642 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
+------------------+----------+-



== Status ==
Current time: 2023-09-08 17:47:32 (running for 00:01:02.13)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 4/10 (2 ERROR, 1 PENDING, 1 RUNNING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_422dc560 | RUNNING  | 127.0.0.1:27259 |      0.00215639 | 0.987292 | 0.983244 | 0.0213127 |        0.263642 |
| A2C_hrl_9ffc9c5e | PENDING  |                 |      0.0184221  | 0.927382 | 0.952476 | 0.0432513 |        0.362106 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | 

2023-09-08 17:47:32,842	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_422dc560
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27259, ip=127.0.0.1, actor_id=4d1db7ba5b334534df58e3bd01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:47:42 (running for 00:01:12.13)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 4/10 (3 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_9ffc9c5e | PENDING  |                 |      0.0184221  | 0.927382 | 0.952476 | 0.0432513 |        0.362106 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 

2023-09-08 17:47:51,095	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_9ffc9c5e
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27297, ip=127.0.0.1, actor_id=d6c94dbb5f06ba7ebe9cb15001000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:47:52 (running for 00:01:22.14)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 5/10 (4 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_1f95c59e | PENDING  |                 |      0.0612241  | 0.912554 | 0.929214 | 0.0366995 |        0.510463 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 



== Status ==
Current time: 2023-09-08 17:48:02 (running for 00:01:32.21)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 5/10 (4 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_1f95c59e | PENDING  |                 |      0.0612241  | 0.912554 | 0.929214 | 0.0366995 |        0.510463 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 

2023-09-08 17:48:09,268	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_1f95c59e
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27350, ip=127.0.0.1, actor_id=551e3a496eb66dbd1be676df01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:48:12 (running for 00:01:42.30)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 6/10 (5 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_df4a4657 | PENDING  |                 |      0.0785391  | 0.917971 | 0.951423 | 0.0592822 |        0.141805 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 



== Status ==
Current time: 2023-09-08 17:48:23 (running for 00:01:53.52)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 6/10 (5 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_df4a4657 | PENDING  |                 |      0.0785391  | 0.917971 | 0.951423 | 0.0592822 |        0.141805 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 

2023-09-08 17:48:26,499	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_df4a4657
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27378, ip=127.0.0.1, actor_id=08f0719cc42b75cdaf0eb73401000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:48:33 (running for 00:02:03.56)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 7/10 (6 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_e39df62b | PENDING  |                 |      0.0607937  | 0.915347 | 0.906505 | 0.0948937 |        0.969069 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 

[2m[36m(A2C pid=27434)[0m Trainable.setup took 11.271 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-08 17:48:44 (running for 00:02:13.70)
Using FIFO scheduling algorithm.
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 8/10 (6 ERROR, 1 PENDING, 1 RUNNING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_e39df62b | RUNNING  | 127.0.0.1:27434 |      0.0607937  | 0.915347 | 0.906505 | 0.0948937 |        0.969069 |
| A2C_hrl_6be960a3 | PENDING  |                 |      0.0808589  | 0.927415 | 0.909767 | 0.0684549 |        0.496137 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | 

2023-09-08 17:48:46,180	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_e39df62b
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27434, ip=127.0.0.1, actor_id=a541dd8607220e775f17486501000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:48:54 (running for 00:02:23.78)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 8/10 (7 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_6be960a3 | PENDING  |                 |      0.0808589  | 0.927415 | 0.909767 | 0.0684549 |        0.496137 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 



== Status ==
Current time: 2023-09-08 17:49:04 (running for 00:02:33.82)
Using FIFO scheduling algorithm.
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 9/10 (7 ERROR, 1 PENDING, 1 RUNNING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_6be960a3 | RUNNING  | 127.0.0.1:27475 |      0.0808589  | 0.927415 | 0.909767 | 0.0684549 |        0.496137 |
| A2C_hrl_f237253f | PENDING  |                 |      0.0122916  | 0.944566 | 0.903439 | 0.0909411 |        0.332902 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | 

2023-09-08 17:49:04,881	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_6be960a3
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27475, ip=127.0.0.1, actor_id=3b7249bca71a33fe9c8c3c2d01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:49:17 (running for 00:02:46.95)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 9/10 (8 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_f237253f | PENDING  |                 |      0.0122916  | 0.944566 | 0.903439 | 0.0909411 |        0.332902 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 

2023-09-08 17:49:22,697	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_f237253f
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=27539, ip=127.0.0.1, actor_id=3b5ff382e173a86d2fb6b23c01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-08 17:49:27 (running for 00:02:57.01)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 10/10 (9 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_57e8e16b | PENDING  |                 |      0.066286   | 0.928054 | 0.952007 | 0.0547164 |        0.266369 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    |



== Status ==
Current time: 2023-09-08 17:49:37 (running for 00:03:07.10)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 10/10 (9 ERROR, 1 PENDING)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_57e8e16b | PENDING  |                 |      0.066286   | 0.928054 | 0.952007 | 0.0547164 |        0.266369 |
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    |

[2m[36m(A2C pid=27577)[0m Trainable.setup took 10.960 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-09-08 17:49:41,939	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_57e8e16b
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause(

== Status ==
Current time: 2023-09-08 17:49:44 (running for 00:03:14.57)
Using FIFO scheduling algorithm.
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/results/test/Test
Number of trials: 10/10 (10 ERROR)
+------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------+
| Trial name       | status   | loc             |   entropy_coeff |    gamma |   lambda |        lr |   vf_loss_coeff |
|------------------+----------+-----------------+-----------------+----------+----------+-----------+-----------------|
| A2C_hrl_d82fc844 | ERROR    | 127.0.0.1:27142 |      0.0375166  | 0.985564 | 0.973199 | 0.059906  |        0.240417 |
| A2C_hrl_c02bf1da | ERROR    | 127.0.0.1:27211 |      0.0156839  | 0.905228 | 0.986618 | 0.0601514 |        0.737265 |
| A2C_hrl_422dc560 | ERROR    | 127.0.0.1:27259 |      0.00215639 | 0.987292 | 0.983244 | 0.0213127 |        0.263642 |
| A2C_hrl_9ffc9c5e | ERROR    | 127.0.0.1

- /Volumes/SSD980/ray/results/test/Test/A2C_hrl_d82fc844_1_entropy_coeff=0.0375,env=hrl,gamma=0.9856,lambda=0.9732,lr=0.0599,ABT_US=ref_ph_0c4b22f9,AMGN_US=ref_ph_fd750e6_2023-09-08_17-46-30
- /Volumes/SSD980/ray/results/test/Test/A2C_hrl_c02bf1da_2_entropy_coeff=0.0157,env=hrl,gamma=0.9052,lambda=0.9866,lr=0.0602,ABT_US=ref_ph_0c4b22f9,AMGN_US=ref_ph_fd750e6_2023-09-08_17-46-53
- /Volumes/SSD980/ray/results/test/Test/A2C_hrl_422dc560_3_entropy_coeff=0.0022,env=hrl,gamma=0.9873,lambda=0.9832,lr=0.0213,ABT_US=ref_ph_0c4b22f9,AMGN_US=ref_ph_fd750e6_2023-09-08_17-47-12
- /Volumes/SSD980/ray/results/test/Test/A2C_hrl_9ffc9c5e_4_entropy_coeff=0.0184,env=hrl,gamma=0.9274,lambda=0.9525,lr=0.0433,ABT_US=ref_ph_0c4b22f9,AMGN_US=ref_ph_fd750e6_2023-09-08_17-47-29
- /Volumes/SSD980/ray/results/test/Test/A2C_hrl_1f95c59e_5_entropy_coeff=0.0612,env=hrl,gamma=0.9126,lambda=0.9292,lr=0.0367,ABT_US=ref_ph_0c4b22f9,AMGN_US=ref_ph_fd750e6_2023-09-08_17-47-48
- /Volumes/SSD980/ray/results/test/Test/A2C_h

RuntimeError: No best trial found for the given metric: episode_reward_mean. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.

In [2]:
env = HRL()

def env_creator(env_config):
    return HRL(env_config)  

register_env("hrl", env_creator)

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return "worker_policy"
    else:
        return "manager_policy"
    
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)

policies = {
    "worker_policy": worker_policy_spec,
    "manager_policy": manager_policy_spec,
}
 
search_space = {
     "lr": tune.uniform(1e-4, 1e-1),
    "gamma": tune.uniform(0.9, 0.99),
    "lambda": tune.uniform(0.9, 1.0),
    "entropy_coeff": tune.uniform(1e-4, 1e-1),
    "vf_loss_coeff": tune.uniform(0.1, 1.0),
}

tune_config = TuneConfig(
    metric="episode_reward_mean",
    mode="max",
    max_concurrent_trials=1,
    num_samples=10,
    search_alg=BayesOptSearch(
          search_space,
        metric="episode_reward_mean",
        mode="max",
        random_search_steps=4
    ),
    scheduler=HyperBandScheduler()
)

run_config = RunConfig(
    name="MyExperiment",
    storage_path="/Volumes/SSD980/ray/results",
    verbose=2,
    # checkpoint_config=air.CheckpointConfig(checkpoint_frequency=2),
    callbacks=None,
    stop=None,
    failure_config=None,
    sync_config=None,
    checkpoint_config=None,
    progress_reporter=CLIReporter(max_progress_rows=10,max_report_frequency=300),

)

param_space = {
     "env": "hrl",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
          "rollout_fragment_length": "auto",
          "lr": tune.uniform(1e-5,1e-4),
        "num_workers": 1,  
        "num_cpus_per_trial": 1,
}

analysis = tune.run("A2C", metric="episode_reward_mean", mode="max",config=param_space)

2023-09-08 16:26:28,300	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


0,1
Current time:,2023-09-08 17:15:10
Running for:,00:48:42.24
Memory:,6.8/8.0 GiB

Trial name,status,loc,lr,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
A2C_hrl_b2ed3_00000,RUNNING,127.0.0.1:15321,7.13523e-05,275,2853.02,140032,397985,724775,22836.6,6304




Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
A2C_hrl_b2ed3_00000,1960448,"{'ObsPreprocessorConnector_ms': 0.07513273846019398, 'StateBufferConnector_ms': 0.008488243276422674, 'ViewRequirementAgentConnector_ms': 0.3082307902249423}","{'num_env_steps_sampled': 140032, 'num_env_steps_trained': 140032, 'num_agent_steps_sampled': 1960448, 'num_agent_steps_trained': 1960448}",{},6304,{},724775,397985,22836.6,0,"{'learner': {'manager_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 40.0, 'cur_lr': 7.135231685121378e-05, 'entropy_coeff': 0.01, 'policy_entropy': 8983.3740234375, 'policy_loss': 13770.2509765625, 'vf_loss': 47.276458740234375}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 4376, 'diff_num_grad_updates_vs_sampler_policy': 4375}, 'worker_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 40.0, 'cur_lr': 7.135231685121378e-05, 'entropy_coeff': 0.01, 'policy_entropy': -79.89375305175781, 'policy_loss': -8641.462890625, 'vf_loss': 18568436.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 416, 'num_grad_updates_lifetime': 4376, 'diff_num_grad_updates_vs_sampler_policy': 4375}}, 'num_env_steps_sampled': 140032, 'num_env_steps_trained': 140032, 'num_agent_steps_sampled': 1960448, 'num_agent_steps_trained': 1960448}",1960448,1960448,140032,576,56.963,140032,576,56.963,0,1,0,0,576,"{'cpu_util_percent': 47.05714285714286, 'ram_util_percent': 84.4}","{'manager_policy': 2316.146240569651, 'worker_policy': 234259.65842725144}","{'manager_policy': 1531.562260591698, 'worker_policy': 30496.449637810987}","{'manager_policy': 418.42606826680094, 'worker_policy': -4245.862063456094}","{'mean_raw_obs_processing_ms': 1.9023988075349936, 'mean_inference_ms': 4.685624718198401, 'mean_action_processing_ms': 1.7296003001411258, 'mean_env_wait_ms': 10.432182731786636, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 724774.8262072392, 'episode_reward_min': 22836.55890055027, 'episode_reward_mean': 397985.4075521345, 'episode_len_mean': 6304.0, 'episode_media': {}, 'episodes_this_iter': 0, 'policy_reward_min': {'manager_policy': 418.42606826680094, 'worker_policy': -4245.862063456094}, 'policy_reward_max': {'manager_policy': 2316.146240569651, 'worker_policy': 234259.65842725144}, 'policy_reward_mean': {'manager_policy': 1531.562260591698, 'worker_policy': 30496.449637810987}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [22836.55890055027, 75294.97284392256, 218407.07345604093, 292957.8156847798, 289030.5981819227, 388938.3450221231, 535313.1334759434, 528103.548171909, 567298.0266862111, 724774.8262072392, 350255.54304500134, 123938.73509234312, 106774.01792322926, 385009.53468407306, 433459.7482726348, 461829.23187663266, 612211.5686681542, 533338.4988422005, 569019.358061935, 492054.0777483772, 451728.69077332667, 593105.06252841], 'episode_lengths': [6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304, 6304], 'policy_manager_policy_reward': [1291.0833038589917, 418.42606826680094, 1212.179251662019, 1459.1075042977245, 1736.1857072124258, 1482.2348092698958, 2017.3159687425941, 1693.1613261306193, 1934.4563538026996, 2316.146240569651, 1192.2397266846674, 748.5824920025752, 655.981911802548, 1259.364418295043, 1412.9701802080963, 1639.2101034089865, 1929.4038971813861, 1932.9336217306554, 1980.5909047201276, 1699.8651897053933, 1664.536649318994, 2018.3941041454673], 'policy_worker_policy_reward': [-73.442253009649, -4245.862063456094, 2975.0946349771984, -1293.4606386847809, -2162.7294140625163, 1983.1361390544334, -1125.0378930252919, -416.1013773936502, -2299.7891607308993, -2869.6066890408056, 4937.368952678276, -1772.8262352976635, 27908.73159468273, -1751.7157559391926, 448.14084369061675, 3492.874924124313, 237.4903581593535, -1948.8311281191418, 5220.9563065216935, -1608.1326365327986, 14045.073524510808, 7425.279354206665, -1685.0688433470787, 15690.333965203376, 3101.742684473862, 32208.403178703273, 11957.458706395639, 17280.48817266029, 15819.687604920418, -271.7302978154621, 375.6794700220926, 26913.234013282578, 3001.2213734844627, 34125.11922072846, 130.69488714210456, -1421.5188300964073, 19554.864965910558, 15891.014747383473, 73838.68017036069, 21000.684469451895, 172059.67899510637, -124.77382377057074, 5200.4411983937025, 7435.57402266725, 9197.700809003756, 600.9024612933863, -1347.270557583699, 10941.183158477768, -2222.7106075409683, 20656.188870206126, 11311.48159281045, 36789.62759196659, 15356.263736573615, 65957.89820343904, 21343.629033441568, 1610.2420074275396, 3645.419317068561, 28163.308542185776, 3123.8492078287527, 21402.170030558314, -2184.9782303174725, 13722.02420262812, 32628.009773427737, 19941.022687511504, 62585.55396293718, 20661.768648914283, 146901.91112536972, 11746.882929303254, 5116.958783109498, -1163.4666978842579, 13748.299546932656, 7721.088432573604, 62168.030326077074, 13325.321572099157, -1846.7446059925314, 3856.972900169536, 8302.039939315044, 96917.04731286608, 37495.62379562228, 134248.78422920813, 27329.871764535965, 7174.426774064399, 277.51177443431516, 41651.77375352817, 553.5760014534753, 104077.9851709368, 18126.31196766076, 16275.34038766344, 93938.42086778063, 11602.68878172609, 40543.50223858628, 24032.27565244562, 128143.03637796358, 28155.481869654453, 6620.701089437567, 13661.359932829233, 23649.428023091183, 2181.0187175814644, 52880.39456227668, 16346.929252930517, 21373.36289288711, 151413.1983329325, 5195.015705645841, 52758.184436102514, 24130.181466442547, 224521.8153175977, 32387.726042158734, 8675.229067123262, 1605.6159428973115, 26808.963333385967, 10654.925264977152, 8270.126580469621, 26695.778255733152, 16160.21766170772, 120623.18366919417, 15727.53505191479, 49102.272678806185, 26994.73282806884, 234259.65842725144, 45199.798609096964, 11502.625715955088, 4734.848336149385, 40478.91585007403, 11594.326274485677, 64176.43106111768, 22023.109118536697, 20278.538208995116, 125992.92898159684, 19341.669160040765, 95881.09739530098, 4258.197908882983, 9261.502169761807, 8281.72560307011, 4956.760333411396, 3729.799579866696, 11245.025396360143, 2963.763702504337, 47418.60604429245, 3963.7200585976243, 11206.77991759777, 43460.642043309286, 28402.9174095802, 169913.86315108184, 3212.44735125266, 38330.71864759922, 4283.93205909431, 5929.8442908898, 2426.0590026210994, 4450.179801095277, 5560.212257385254, 19367.012524545193, 1293.8244385570288, 8045.578993558884, 9830.89481267333, 17541.529415488243, 2917.9190055802464, 2713.8203490823507, 46969.861585870385, 2743.2121124072, 3120.801445286721, 3178.0885834395885, 4373.980130448937, 149.8463075272739, 10761.667057909071, 2786.4207562059164, 2578.7649049907923, 22202.637978613377, 2517.2862736321986, 2021.6485260128975, 5397.833954564296, 107573.88991986308, 18661.847717359662, 6259.196093962062, 6537.876810689922, 23421.655562547967, 5268.853867068887, 25555.71675394848, 8513.36112075299, 6241.682090122253, 60872.50525217876, 9167.348061210534, 100278.40306150913, 15598.132684446115, 51749.074641010724, 10542.39295241097, 8195.574069555907, 9681.476416672813, 15037.288391744718, 14187.62022552453, 29591.66031273175, 7343.288127268199, 9531.491614397615, 44140.97396180779, 20331.428864685353, 196116.37583017023, 20608.632437832654, 113009.86767225154, 29823.893041985575, 9870.183800790925, 19062.624872705666, 22621.751599571202, 12322.843913038028, 27720.63768993318, 16553.916031224653, 14760.850642334903, 68488.00308974425, 13274.625554200262, 92072.19142761081, 21310.31399447986, 167919.483069095, 23875.674114608788, 8048.830605634837, 11155.618686025031, 16838.3385889465, 8212.522430582787, 53708.07915534265, 27134.10070595448, 17474.16883829236, 78047.83971018327, 15188.123857039493, 161369.0710147877, 20361.03201040812, 213056.6287899323, 16975.84235647763, 9448.026876533171, 6678.8376156844315, 26438.57725389814, 7363.578797755181, 31168.61870053783, 11381.540478943672, 15641.492498984095, 98890.0036634483, 17969.464532900172, 56031.9216449667, 24097.491985969245, 124672.56317860057, 41628.901735873194, 9514.23920213907, 16029.07874344755, 29515.487003409886, 9061.837073451723, 68833.12254986563, 20850.506953023374, 13287.852709988743, 78374.28864257724, 25261.783517293694, 105911.61386157497, 18975.052187058725, 104287.351218611, 23321.319662841794, 8293.417637236067, 5318.530747791287, 23701.937799786218, 12092.983089114365, 52035.43139525084, 31337.177285739017, 14657.846370575484, 94384.09215165648, 13146.078696915996, 88802.99431609455, 15159.25837655738, 98440.39890390262, 32951.79341703979, 5141.891223155893, 9533.217395976477, 28423.86936487397, 6275.597803335637, 38912.77640387369, 14274.394481398165, 13696.1720701763, 52677.025629982876, 18362.797615666175, 116214.96143806863, 16544.26144805736, 175941.6157755489, 24831.277255583787, 8997.25663163187, 6824.679539553996, 32725.40687694354, 10639.687434101477, 91968.28763681039, 16991.30000422109, 11018.578937329257, 104864.99279644297, 10726.209946199771, 79013.11414184002]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 1.9023988075349936, 'mean_inference_ms': 4.685624718198401, 'mean_action_processing_ms': 1.7296003001411258, 'mean_env_wait_ms': 10.432182731786636, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0, 'connector_metrics': {'ObsPreprocessorConnector_ms': 0.07513273846019398, 'StateBufferConnector_ms': 0.008488243276422674, 'ViewRequirementAgentConnector_ms': 0.3082307902249423}}","{'training_iteration_time_ms': 613.34, 'sample_time_ms': 596.347, 'learn_time_ms': 15.738, 'learn_throughput': 2033.322, 'synch_weights_time_ms': 1.04}"


[2m[36m(RolloutWorker pid=15330)[0m HRL is done
[2m[36m(RolloutWorker pid=15330)[0m day: 6303, episode: 2
[2m[36m(RolloutWorker pid=15330)[0m Total Cash Transfers: 48
[2m[36m(RolloutWorker pid=15330)[0m total_portfolio_trades: 49654.0
[2m[36m(RolloutWorker pid=15330)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=15330)[0m End_Portfolio_Value: 15056719.0
[2m[36m(RolloutWorker pid=15330)[0m Annual Return: 12.40 %
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: ABT.US Current Stock Exposure: 756047
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: AMGN.US Current Stock Exposure: 0
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BDX.US Current Stock Exposure: 677528
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BMY.US Current Stock Exposure: 94354
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: HUM.US Current Stock Exposure: 632814
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: JNJ.US Current Stock Exposure: 124153
[2m[36m(RolloutWorker pi

[2m[36m(raylet)[0m Spilled 2083 MiB, 78 objects, write throughput 203 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


[2m[36m(RolloutWorker pid=15330)[0m HRL is done
[2m[36m(RolloutWorker pid=15330)[0m day: 6303, episode: 8
[2m[36m(RolloutWorker pid=15330)[0m Total Cash Transfers: 9
[2m[36m(RolloutWorker pid=15330)[0m total_portfolio_trades: 59550.0
[2m[36m(RolloutWorker pid=15330)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=15330)[0m End_Portfolio_Value: 64379116.0
[2m[36m(RolloutWorker pid=15330)[0m Annual Return: 22.26 %
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: ABT.US Current Stock Exposure: 4270278
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: AMGN.US Current Stock Exposure: 16941372
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BDX.US Current Stock Exposure: 2988676
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BMY.US Current Stock Exposure: 967289
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: HUM.US Current Stock Exposure: 284603
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: JNJ.US Current Stock Exposure: 4782088
[2m[36m(Rollou



[2m[36m(RolloutWorker pid=15330)[0m HRL is done
[2m[36m(RolloutWorker pid=15330)[0m day: 6303, episode: 10
[2m[36m(RolloutWorker pid=15330)[0m Total Cash Transfers: 25
[2m[36m(RolloutWorker pid=15330)[0m total_portfolio_trades: 36786.0
[2m[36m(RolloutWorker pid=15330)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=15330)[0m End_Portfolio_Value: 60118648.0
[2m[36m(RolloutWorker pid=15330)[0m Annual Return: 21.78 %
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: ABT.US Current Stock Exposure: 2578313
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: AMGN.US Current Stock Exposure: 21588166
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BDX.US Current Stock Exposure: 3549859
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BMY.US Current Stock Exposure: 1035378
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: HUM.US Current Stock Exposure: 844491
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: JNJ.US Current Stock Exposure: 2832456
[2m[36m(Rol

[2m[36m(raylet)[0m Spilled 4116 MiB, 154 objects, write throughput 262 MiB/s.


[2m[36m(RolloutWorker pid=15330)[0m HRL is done
[2m[36m(RolloutWorker pid=15330)[0m day: 6303, episode: 15
[2m[36m(RolloutWorker pid=15330)[0m Total Cash Transfers: 59
[2m[36m(RolloutWorker pid=15330)[0m total_portfolio_trades: 7084.0
[2m[36m(RolloutWorker pid=15330)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=15330)[0m End_Portfolio_Value: 40424148.0
[2m[36m(RolloutWorker pid=15330)[0m Annual Return: 19.01 %
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: ABT.US Current Stock Exposure: 476536
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: AMGN.US Current Stock Exposure: 9026491
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BDX.US Current Stock Exposure: 2148455
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BMY.US Current Stock Exposure: 663190
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: HUM.US Current Stock Exposure: 739318
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: JNJ.US Current Stock Exposure: 2652716
[2m[36m(Rollout



[2m[36m(RolloutWorker pid=15330)[0m HRL is done
[2m[36m(RolloutWorker pid=15330)[0m day: 6303, episode: 21
[2m[36m(RolloutWorker pid=15330)[0m Total Cash Transfers: 42
[2m[36m(RolloutWorker pid=15330)[0m total_portfolio_trades: 7501.0
[2m[36m(RolloutWorker pid=15330)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=15330)[0m End_Portfolio_Value: 51478380.0
[2m[36m(RolloutWorker pid=15330)[0m Annual Return: 20.69 %
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: ABT.US Current Stock Exposure: 1827636
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: AMGN.US Current Stock Exposure: 6718482
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BDX.US Current Stock Exposure: 2862534
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: BMY.US Current Stock Exposure: 924099
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: HUM.US Current Stock Exposure: 1013567
[2m[36m(RolloutWorker pid=15330)[0m Worker ID: JNJ.US Current Stock Exposure: 2915210
[2m[36m(Rollo

In [3]:
from ray.tune import CLIReporter
def env_creator(env_config):
    return HRL(env_config)  # Assuming this is your environment
 
env = HRL()

register_env("hrl", env_creator)

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return "worker_policy"
    else:
        return "manager_policy"
    
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)

policies = {
    "worker_policy": worker_policy_spec,
    "manager_policy": manager_policy_spec,
}
 
search_space = {
    "lr": tune.loguniform(1e-4, 1e-1),  # Learning rate
    "gamma": tune.uniform(0.9, 0.99),  # Discount factor
}

tune_config = TuneConfig(
    metric="episode_reward_mean",
    mode="max",
    max_concurrent_trials=1,
    num_samples=100,
    search_alg=BayesOptSearch(
          search_space,
        metric="episode_reward_mean",
        mode="max",
        random_search_steps=4
    ),
    scheduler=HyperBandScheduler()
)

run_config = RunConfig(
    name="MyExperiment",
    storage_path="/Users/floriankockler/rayresults/tuner_trial",
    verbose=2,
    # checkpoint_config=air.CheckpointConfig(checkpoint_frequency=2),
    callbacks=None,
    stop=None,
    failure_config=None,
    sync_config=None,
    checkpoint_config=None,
    progress_reporter=CLIReporter(max_progress_rows=10),# Define the policy mapping function

)

param_space = {
     "env": "hrl",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
          "rollout_fragment_length": "auto",
        "num_workers": 1,  
        "num_cpus_per_trial": 1,
}

tuner = tune.Tuner(
    "A2C",
    tune_config=tune_config,
    param_space=param_space,

    run_config=run_config,
)
results = tuner.fit()


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
2023-09-07 22:38:47,802	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2023-09-07 22:38:48 (running for 00:00:00.86)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {PENDING: 1} 
Logical resource usage: 0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 1/100 (1 PENDING)
+------------------+----------+-------+----------+-----------+
| Trial name       | status   | loc   |    gamma |        lr |
|------------------+----------+-------+----------+-----------|
| A2C_hrl_b9c02a0d | PENDING  |       | 0.933709 | 0.0950764 |
+------------------+----------+-------+----------+-----------+


== Status ==
Current time: 2023-09-07 22:38:53 (running for 00:00:05.88)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
N



== Status ==
Current time: 2023-09-07 22:38:58 (running for 00:00:10.98)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 1/100 (1 PENDING)
+------------------+----------+-------+----------+-----------+
| Trial name       | status   | loc   |    gamma |        lr |
|------------------+----------+-------+----------+-----------|
| A2C_hrl_b9c02a0d | PENDING  |       | 0.933709 | 0.0950764 |
+------------------+----------+-------+----------+-----------+




2023-09-07 22:39:03,274	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_b9c02a0d
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=80275, ip=127.0.0.1, actor_id=c52e57b604ccbdc4ae9b4f5901000000, repr=A2C)
  File "/Users/floriankockler

Trial name
A2C_hrl_74a821b2
A2C_hrl_869de91c
A2C_hrl_b9c02a0d
A2C_hrl_d077352f


[2m[36m(A2C pid=80275)[0m 2023-09-07 22:39:03,270	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. [36mray::RolloutWorker.apply()[39m (pid=80285, ip=127.0.0.1, actor_id=5432ebb410a990b79f8f682601000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x163a53b80>)
[2m[36m(A2C pid=80275)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/utils/actor_manager.py", line 185, in apply
[2m[36m(A2C pid=80275)[0m     raise e
[2m[36m(A2C pid=80275)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/utils/actor_manager.py", line 176, in apply
[2m[36m(A2C pid=80275)[0m     return func(self, *args, **kwargs)
[2m[36m(A2C pid=80275)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/execution/rollout_ops.py", line 86, in <lambda>
[2m[36m(A2C pid=80275)[0m     lambda w: w.sample(), local_worker=False, healthy_only

== Status ==
Current time: 2023-09-07 22:39:03 (running for 00:00:16.02)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 1, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 2/100 (1 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_869de91c | PENDING  |                 | 0.965879 | 0.059906  |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 1
+------------------+--------------+------------------------------------------------------------------------------------------------------------------------



== Status ==
Current time: 2023-09-07 22:39:08 (running for 00:00:21.11)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 1, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 2/100 (1 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_869de91c | PENDING  |                 | 0.965879 | 0.059906  |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 1
+------------------+--------------+------------------------------------------------------------------------------------------------------------------------



== Status ==
Current time: 2023-09-07 22:39:13 (running for 00:00:26.12)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 1, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 2/100 (1 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_869de91c | PENDING  |                 | 0.965879 | 0.059906  |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 1
+------------------+--------------+------------------------------------------------------------------------------------------------------------------------

2023-09-07 22:39:18,509	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_869de91c
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=80304, ip=127.0.0.1, actor_id=74a48364ba6cd33fc8108bbd01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-07 22:39:18 (running for 00:00:31.14)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 2, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 3/100 (2 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_74a821b2 | PENDING  |                 | 0.914042 | 0.0156839 |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
| A2C_hrl_869de91c | ERROR    | 127.0.0.1:80304 | 0.965879 | 0.059906  |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 2
+------------------+--------------+-----------------------------------------------



== Status ==
Current time: 2023-09-07 22:39:24 (running for 00:00:36.20)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 2, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 3/100 (2 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_74a821b2 | PENDING  |                 | 0.914042 | 0.0156839 |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
| A2C_hrl_869de91c | ERROR    | 127.0.0.1:80304 | 0.965879 | 0.059906  |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 2
+------------------+--------------+-----------------------------------------------



== Status ==
Current time: 2023-09-07 22:39:29 (running for 00:00:41.28)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 2, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 3/100 (2 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_74a821b2 | PENDING  |                 | 0.914042 | 0.0156839 |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
| A2C_hrl_869de91c | ERROR    | 127.0.0.1:80304 | 0.965879 | 0.059906  |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 2
+------------------+--------------+-----------------------------------------------



== Status ==
Current time: 2023-09-07 22:39:34 (running for 00:00:46.39)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 2, RUNNING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 3/100 (2 ERROR, 1 RUNNING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_74a821b2 | RUNNING  | 127.0.0.1:80328 | 0.914042 | 0.0156839 |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
| A2C_hrl_869de91c | ERROR    | 127.0.0.1:80304 | 0.965879 | 0.059906  |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 2
+------------------+--------------+-----------------------------------------------

2023-09-07 22:39:37,678	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_74a821b2
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=80328, ip=127.0.0.1, actor_id=e25146695b16b5104b4517ef01000000, repr=A2C)
  File "/Users/floriankockler

== Status ==
Current time: 2023-09-07 22:39:39 (running for 00:00:51.45)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 3, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 4/100 (3 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_d077352f | PENDING  |                 | 0.905228 | 0.086631  |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
| A2C_hrl_869de91c | ERROR    | 127.0.0.1:80304 | 0.965879 | 0.059906  |
| A2C_hrl_74a821b2 | ERROR    | 127.0.0.1:80328 | 0.914042 | 0.0156839 |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 3
+--------



== Status ==
Current time: 2023-09-07 22:39:44 (running for 00:00:56.54)
Using HyperBand: num_stopped=0 total_brackets=1
Round #0:
  Bracket(Max Size (n)=5, Milestone (r)=81, completed=0.0%): {ERROR: 3, PENDING: 1} 
Logical resource usage: 2.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuner_trial/MyExperiment
Number of trials: 4/100 (3 ERROR, 1 PENDING)
+------------------+----------+-----------------+----------+-----------+
| Trial name       | status   | loc             |    gamma |        lr |
|------------------+----------+-----------------+----------+-----------|
| A2C_hrl_d077352f | PENDING  |                 | 0.905228 | 0.086631  |
| A2C_hrl_b9c02a0d | ERROR    | 127.0.0.1:80275 | 0.933709 | 0.0950764 |
| A2C_hrl_869de91c | ERROR    | 127.0.0.1:80304 | 0.965879 | 0.059906  |
| A2C_hrl_74a821b2 | ERROR    | 127.0.0.1:80328 | 0.914042 | 0.0156839 |
+------------------+----------+-----------------+----------+-----------+
Number of errored trials: 3
+--------

2023-09-07 22:39:53,198	ERROR tune_controller.py:911 -- Trial task failed for trial A2C_hrl_d077352f
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2524, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::A2C.train()[39m (pid=80378, ip=127.0.0.1, actor_id=a32439b1bb3e80f20d59edc601000000, repr=A2C)
  File "/Users/floriankockler

In [None]:
import random
import os
from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter
from env.multi_agent.hrl import HRL

reporter = CLIReporter(max_progress_rows=10)

def env_creator(env_config):
    return HRL(env_config)  # Assuming this is your environment
 
register_env("hrl", env_creator)

manager_config = {
    "df": train_df,

}
hrl_config={
        "manager_config": manager_config
        }
env = HRL(hrl_config)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return "worker_policy"
    else:
        return "manager_policy"
 


first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)
 

parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test", action="store_true", help="Finish quickly for testing"
)
args, _ = parser.parse_known_args()

policies = {
    "worker_policy": worker_policy_spec,
    "manager_policy": manager_policy_spec,
}


def explore(config):
    # Ensure we collect enough timesteps to do sgd
    if config["train_batch_size"] < config["rollout_fragment_length"] * 2:
        config["train_batch_size"] = config["rollout_fragment_length"] * 2
    return config

hyperparam_mutations = {
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "gamma": lambda: random.uniform(0.9, 1.0),
    "entropy_coeff": [0.01, 0.1, 1.0],
    "num_envs_per_worker": [1, 2, 4, 8],
    #"rollout_fragment_length": [50, 100, 200, 400],
    "train_batch_size": lambda: random.randint(200, 1500),
    "sgd_minibatch_size": tune.choice([50, 100, 200]),

}

pbt = PopulationBasedTraining(
        time_attr="time_total_s",
        perturbation_interval=120,
        resample_probability=0.25,
        # Specifies the mutations of these hyperparams
        hyperparam_mutations=hyperparam_mutations,
        custom_explore_fn=explore,
    )

# Stop when we've reached 100 training iterations or reward=300
stopping_criteria = {"training_iteration": 100}

tuner = tune.Tuner(
    "A2C",
    tune_config=tune.TuneConfig(
        metric="episode_reward_mean",
        mode="max",
        scheduler=pbt,
        num_samples=1 if args.smoke_test else 10,
    ),
    param_space={
        "env": "hrl",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "env_config": hrl_config,
        "rollout_fragment_length": "auto",
        "framework": "tf2",
        "num_workers": 1,  # 1 for training + 4 for sampling
        "num_cpus_per_trial": 3,
        # "num_cpus": 1,  # number of CPUs to use per trial --> 6 in total = max available
        # "num_gpus": 0,  # number of GPUs to use per trial
        # These params are tuned from a fixed starting value.
        "lr": 1e-4,
        # These params start off randomly drawn from a set.
        "sgd_minibatch_size": tune.choice([50, 100, 200]),
        "train_batch_size": tune.choice([200, 400, 600]),
    },

    run_config=air.RunConfig(stop=stopping_criteria, local_dir="/Users/floriankockler/rayresults/autobatch", progress_reporter=reporter),
)
results = tuner.fit()