In [1]:
import ray, random, os 
from ray.air.config import ScalingConfig
import pandas as pd
ray.init(_temp_dir='/Volumes/SSD980/ray')
from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter
from worker_standlone import WorkerStandAlone
from multi_agent import MultiAgent
from manager import Manager
from ray.train.rl import RLTrainer
from ray.rllib.policy.policy import Policy, PolicySpec
from ray.tune.registry import register_env
from ray.tune import TuneConfig
from ray.tune.logger import pretty_print
from ray.tune.search.bayesopt import BayesOptSearch
from ray.air.config import RunConfig, CheckpointConfig
from ray.tune.schedulers import HyperBandScheduler
from ray.rllib.algorithms.ppo import PPOConfig
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler
from ray.rllib.utils import check_env
from ray.tune.logger import pretty_print

2023-09-29 15:08:32,811	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


In [2]:
from ray.rllib.utils import check_env
# Run RLlib's environment checker on a freshly constructed MultiAgent
# environment before any training is started.
check_env(MultiAgent())

In [3]:
# Directory that receives the Tune results of this experiment: the Windows
# location when os.name reports 'nt', the external-SSD location otherwise.
path_to_save = (
    "C:\\GitHub\\ray\\workertune"
    if os.name == 'nt'
    else "/Volumes/SSD980/ray/multiagent2"
)


def env_creator(env_config):
    """Return a new MultiAgent environment.

    ``env_config`` is accepted but not used: the environment is always
    constructed with its default arguments.
    """
    return MultiAgent()


# Expose the creator to RLlib under the name "MultiAgent".
register_env("MultiAgent", env_creator)

# Driver-side instance; the cell reads its spaces and agent ids further down.
env = MultiAgent()

# ASHA scheduler handed to tune.run further down; trial progress is measured
# in training iterations.
asha_scheduler = ASHAScheduler(
    time_attr='training_iteration',
    max_t=2000,  # upper bound on iterations for a single trial
    grace_period=200,  # first rung: trials are not stopped before this many iterations
    reduction_factor=3,  # rungs at 200, 600 and 1800 iterations (see the "Bracket" line in the status output)
    brackets=1,
)

def create_policy_spec(agent_id):
    """Build the PolicySpec for one agent.

    The observation and action spaces are looked up under ``agent_id`` in the
    module-level ``env``; the per-policy config override is left empty.
    """
    obs_space = env.observation_space[agent_id]
    act_space = env.action_space[agent_id]
    return PolicySpec(observation_space=obs_space, action_space=act_space, config={})

# The controller's spec is built exactly like an agent's spec: spaces looked up
# under its id ('controller') and an empty config override.
controller_policy_spec = create_policy_spec('controller')

# One policy for the controller plus one policy per entry of env.agents,
# each keyed by the agent id itself.
policies = {"controller_policy": controller_policy_spec}
for agent_id in env.agents:
    policies.update({agent_id: create_policy_spec(agent_id)})

def policy_mapping_fn(agent_id, episode=None, agent=None, **kwargs):
    """Map an agent id to the id of the policy that acts for it.

    Args:
        agent_id: Id of the agent as emitted by the environment.
        episode: Unused; accepted for the mapping-function call signature.
        agent: Unused.
        **kwargs: Any further keyword arguments are accepted and ignored.

    Returns:
        "controller_policy" for the 'controller' agent, otherwise the agent id
        itself when it is listed in ``env.agents``.

    Raises:
        ValueError: If ``agent_id`` is neither 'controller' nor in
            ``env.agents``. No "default_policy" entry exists in the
            ``policies`` dict built in this cell, so returning that id would
            only defer the failure to a less informative lookup error.
    """
    if agent_id == 'controller':
        return "controller_policy"
    if agent_id in env.agents:
        return agent_id
    raise ValueError(
        f"No policy configured for agent id {agent_id!r}: expected 'controller' "
        f"or one of env.agents"
    )

# Trial configuration for the A2C run on the registered "MultiAgent" env.
param_space = {
    "env": "MultiAgent",
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "rollout_fragment_length": "auto",
    "framework": "tf2",
    # Hyperparameters drawn uniformly from the given ranges for every trial.
    "lr": tune.uniform(1e-5, 1e-4),
    "gamma": tune.uniform(0.95, 0.9999),
    "lambda": tune.uniform(0.9, 1.0),
    "entropy_coeff": tune.uniform(0.01, 0.1),
    "vf_loss_coeff": tune.uniform(0.1, 0.3),
    "num_workers": 3,
    # Raise the verbosity here when debugging.
    "log_level": "ERROR",
    "output": "logdir",
    "monitor": True,
}

# Launch the A2C sweep: 10 sampled configurations, at most 2 running at once,
# early-stopped by the ASHA scheduler on episode_reward_mean (maximised).
analysis = tune.run(
    "A2C",
    metric="episode_reward_mean",
    num_samples=10,
    resume=False,
    mode="max",
    config=param_space,
    local_dir=path_to_save,
    search_alg=None,
    scheduler=asha_scheduler,
    progress_reporter=CLIReporter(max_progress_rows=10, max_report_frequency=120),
    max_concurrent_trials=2,
    # Checkpoint every 10 iterations and keep the 3 checkpoints with the
    # highest episode_reward_mean. A CheckpointConfig object (the documented
    # parameter type) is passed rather than a plain dict.
    checkpoint_config=CheckpointConfig(
        num_to_keep=3,
        checkpoint_score_attribute="episode_reward_mean",
        checkpoint_score_order="max",
        checkpoint_frequency=10,
    ),
)

# Report the last result and the configuration of the best trial according to
# the metric/mode given to tune.run. The analysis object exposes these as
# best_result / best_config; it has no `last_result` attribute of its own.
print(pretty_print(analysis.best_result))
print("Best hyperparameters found were: ", pretty_print(analysis.best_config))

2023-09-29 15:08:39,379	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


== Status ==
Current time: 2023-09-29 15:08:40 (running for 00:00:00.75)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1800.000: None | Iter 600.000: None | Iter 200.000: None
Logical resource usage: 0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/multiagent2/A2C
Number of trials: 2/10 (2 PENDING)
+----------------------------+----------+-------+-----------------+----------+----------+-------------+-----------------+
| Trial name                 | status   | loc   |   entropy_coeff |    gamma |   lambda |          lr |   vf_loss_coeff |
|----------------------------+----------+-------+-----------------+----------+----------+-------------+-----------------|
| A2C_MultiAgent_4e970_00000 | PENDING  |       |       0.0455513 | 0.992995 | 0.994113 | 8.77166e-05 |        0.194472 |
| A2C_MultiAgent_4e970_00001 | PENDING  |       |       0.0403279 | 0.971596 | 0.905525 | 6.33856e-05 |        0.245074 |
+----------------------------+----------+-------+-----------------+----------+---

[2m[36m(A2C pid=47403)[0m Trainable.setup took 13.968 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(A2C pid=47402)[0m Trainable.setup took 13.911 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Trial name,agent_timesteps_total,connector_metrics,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_sampled_throughput_per_sec,num_env_steps_trained,num_env_steps_trained_this_iter,num_env_steps_trained_throughput_per_sec,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
A2C_MultiAgent_4e970_00000,209216,{},"{'num_env_steps_sampled': 14944, 'num_env_steps_trained': 14944, 'num_agent_steps_sampled': 209216, 'num_agent_steps_trained': 209216}",{},,{},,,,0,"{'learner': {'controller_policy': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -0.0, 'policy_entropy': 35.155594, 'var_gnorm': 22.627424, 'vf_loss': 0.0}, 'grad_gnorm': 0.0, 'vf_explained_var': nan, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'ABT.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -84087.75, 'policy_entropy': 65.92951, 'var_gnorm': 22.655075, 'vf_loss': 35313948.0}, 'grad_gnorm': 40.0, 'vf_explained_var': 1.7821789e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'AMGN.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': 57340.15, 'policy_entropy': 78.422806, 'var_gnorm': 22.64249, 'vf_loss': 48519876.0}, 'grad_gnorm': 40.0, 'vf_explained_var': -8.237362e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'BDX.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -14233.217, 'policy_entropy': 80.95892, 'var_gnorm': 22.647688, 'vf_loss': 1779893.6}, 'grad_gnorm': 39.999996, 'vf_explained_var': -0.00021350384, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'BMY.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -23090.994, 'policy_entropy': 73.29454, 'var_gnorm': 22.650208, 'vf_loss': 2024227.8}, 'grad_gnorm': 40.0, 
'vf_explained_var': 3.5226345e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'HUM.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': 6550.0605, 'policy_entropy': 71.750946, 'var_gnorm': 22.64523, 'vf_loss': 2168264.8}, 'grad_gnorm': 39.999996, 'vf_explained_var': -3.4451485e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'JNJ.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -8103.7407, 'policy_entropy': 70.337814, 'var_gnorm': 22.653593, 'vf_loss': 3210584.0}, 'grad_gnorm': 40.0, 'vf_explained_var': 5.64456e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'LLY.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -71308.81, 'policy_entropy': 88.71971, 'var_gnorm': 22.648188, 'vf_loss': 15333857.0}, 'grad_gnorm': 40.0, 'vf_explained_var': 1.7285347e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'MDT.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -15768.619, 'policy_entropy': 65.841644, 'var_gnorm': 22.66214, 'vf_loss': 53071404.0}, 'grad_gnorm': 40.0, 'vf_explained_var': 4.7683716e-07, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'MRK.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -21369.617, 'policy_entropy': 65.1162, 'var_gnorm': 22.648367, 'vf_loss': 3386151.5}, 'grad_gnorm': 40.000004, 'vf_explained_var': 
0.0005902648, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'PFE.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -46699.54, 'policy_entropy': 66.4682, 'var_gnorm': 22.658619, 'vf_loss': 13855740.0}, 'grad_gnorm': 39.999996, 'vf_explained_var': -2.7418137e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'SYK.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': 33038.426, 'policy_entropy': 75.792816, 'var_gnorm': 22.667772, 'vf_loss': 34307956.0}, 'grad_gnorm': 40.0, 'vf_explained_var': -1.3709068e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'TMO.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -33163.703, 'policy_entropy': 87.999084, 'var_gnorm': 22.633608, 'vf_loss': 4944347.0}, 'grad_gnorm': 40.000004, 'vf_explained_var': -2.9444695e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}, 'UNH.US': {'learner_stats': {'cur_lr': 8.771660213824362e-05, 'entropy_coeff': 0.04555129259824753, 'policy_loss': -82699.11, 'policy_entropy': 75.25102, 'var_gnorm': 22.635336, 'vf_loss': 142559120.0}, 'grad_gnorm': 40.000004, 'vf_explained_var': 1.8239021e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 467, 'diff_num_grad_updates_vs_sampler_policy': 466}}, 'num_env_steps_sampled': 14944, 'num_env_steps_trained': 14944, 'num_agent_steps_sampled': 209216, 'num_agent_steps_trained': 209216}",209216,209216,14944,608,37.8188,14944,608,37.8188,0,3,0,0,608,"{'cpu_util_percent': 84.525, 'ram_util_percent': 
84.91875}",{},{},{},{},"{'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episode_media': {}, 'episodes_this_iter': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'num_faulty_episodes': 0, 'connector_metrics': {}}","{'training_iteration_time_ms': 1175.078, 'sample_time_ms': 1053.357, 'learn_time_ms': 96.455, 'learn_throughput': 331.762, 'synch_weights_time_ms': 24.576}"
A2C_MultiAgent_4e970_00001,208320,{},"{'num_env_steps_sampled': 14880, 'num_env_steps_trained': 14880, 'num_agent_steps_sampled': 208320, 'num_agent_steps_trained': 208320}",{},,{},,,,0,"{'learner': {'controller_policy': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -0.0, 'policy_entropy': 35.155594, 'var_gnorm': 22.627424, 'vf_loss': 0.0}, 'grad_gnorm': 0.0, 'vf_explained_var': nan, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'ABT.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -6996.175, 'policy_entropy': 67.18506, 'var_gnorm': 22.667051, 'vf_loss': 4748253.0}, 'grad_gnorm': 40.000004, 'vf_explained_var': 9.536743e-07, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'AMGN.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -115507.84, 'policy_entropy': 75.974434, 'var_gnorm': 22.64296, 'vf_loss': 105987040.0}, 'grad_gnorm': 40.000008, 'vf_explained_var': 3.027916e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'BDX.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': 10075.539, 'policy_entropy': 82.44299, 'var_gnorm': 22.65627, 'vf_loss': 2173541.5}, 'grad_gnorm': 40.0, 'vf_explained_var': -5.364418e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'BMY.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -3044.5532, 'policy_entropy': 89.71817, 'var_gnorm': 22.643326, 'vf_loss': 294646.94}, 'grad_gnorm': 40.000004, 
'vf_explained_var': -4.9710274e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'HUM.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -8927.56, 'policy_entropy': 76.31587, 'var_gnorm': 22.634209, 'vf_loss': 1423875.2}, 'grad_gnorm': 39.999996, 'vf_explained_var': -3.2305717e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'JNJ.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -17657.773, 'policy_entropy': 77.02626, 'var_gnorm': 22.636713, 'vf_loss': 4498182.0}, 'grad_gnorm': 39.999996, 'vf_explained_var': -2.2768974e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'LLY.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -14688.275, 'policy_entropy': 82.333145, 'var_gnorm': 22.635271, 'vf_loss': 3843661.8}, 'grad_gnorm': 40.000004, 'vf_explained_var': 4.529953e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'MDT.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': 23747.527, 'policy_entropy': 65.900475, 'var_gnorm': 22.67044, 'vf_loss': 192922050.0}, 'grad_gnorm': 40.0, 'vf_explained_var': 3.993511e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'MRK.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -34626.734, 'policy_entropy': 78.7318, 'var_gnorm': 22.647062, 'vf_loss': 10817610.0}, 'grad_gnorm': 39.999992, 
'vf_explained_var': -3.3974648e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'PFE.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -1291.4191, 'policy_entropy': 84.28026, 'var_gnorm': 22.654493, 'vf_loss': 2395295.0}, 'grad_gnorm': 40.0, 'vf_explained_var': -3.695488e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'SYK.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -38904.156, 'policy_entropy': 79.96017, 'var_gnorm': 22.645924, 'vf_loss': 181790430.0}, 'grad_gnorm': 40.0, 'vf_explained_var': -2.7418137e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'TMO.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -9684.881, 'policy_entropy': 75.947845, 'var_gnorm': 22.650301, 'vf_loss': 10917432.0}, 'grad_gnorm': 40.0, 'vf_explained_var': 5.4240227e-06, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}, 'UNH.US': {'learner_stats': {'cur_lr': 6.338561797747388e-05, 'entropy_coeff': 0.04032785817980766, 'policy_loss': -288743.03, 'policy_entropy': 74.87444, 'var_gnorm': 22.644964, 'vf_loss': 592247940.0}, 'grad_gnorm': 40.000004, 'vf_explained_var': 1.5497208e-05, 'custom_metrics': {}, 'num_agent_steps_trained': 32, 'num_grad_updates_lifetime': 465, 'diff_num_grad_updates_vs_sampler_policy': 464}}, 'num_env_steps_sampled': 14880, 'num_env_steps_trained': 14880, 'num_agent_steps_sampled': 208320, 'num_agent_steps_trained': 208320}",208320,208320,14880,224,20.4801,14880,224,20.4801,0,3,0,0,224,"{'cpu_util_percent': 92.4125, 
'ram_util_percent': 85.55000000000001}",{},{},{},{},"{'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episode_media': {}, 'episodes_this_iter': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'num_faulty_episodes': 0, 'connector_metrics': {}}","{'training_iteration_time_ms': 1231.698, 'sample_time_ms': 1082.374, 'learn_time_ms': 125.25, 'learn_throughput': 255.49, 'synch_weights_time_ms': 22.807}"


== Status ==
Current time: 2023-09-29 15:10:40 (running for 00:02:00.81)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 1800.000: None | Iter 600.000: None | Iter 200.000: None
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Volumes/SSD980/ray/multiagent2/A2C
Number of trials: 2/10 (2 RUNNING)
+----------------------------+----------+-----------------+-----------------+----------+----------+-------------+-----------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name                 | status   | loc             |   entropy_coeff |    gamma |   lambda |          lr |   vf_loss_coeff |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|----------------------------+----------+-----------------+-----------------+----------+----------+-------------+-----------------+--------+------------------+------+----------+----------------------



In [None]:
# Smoke test: drive the MultiAgent environment with random actions.
multi_agent_env = MultiAgent()


for episode in range(1):
    obs, info = multi_agent_env.reset()
    done = {"__all__": False}
    truncated = {"__all__": False}

    print(f"Episode {episode + 1}")

    # step() returns separate "terminated" and "truncated" dicts. The episode
    # is over as soon as either reports "__all__"; checking only `done` would
    # loop forever on an episode that ends by truncation.
    while not (done["__all__"] or truncated.get("__all__", False)):
        # Sample one random action for every agent present in the observation dict.
        actions = {
            agent_id: multi_agent_env.action_space[agent_id].sample()
            for agent_id in obs
        }

        # Step the environment
        obs, reward, done, truncated, info = multi_agent_env.step(actions)

    print("Episode done!")

In [None]:
def env_creator(env_config):
    """Return a new WorkerStandAlone environment.

    ``env_config`` is accepted but not forwarded, so the "env_config" entry of
    the trial config below does not reach the environment constructor.
    """
    return WorkerStandAlone()


register_env("worker", env_creator)

# Driver-side instance of the environment.
env = WorkerStandAlone()

# A2C sweep on the single-worker environment, sampling learning rate and
# train batch size; trials are ranked by episode_reward_mean (maximised).
analysis = tune.run(
    "A2C",
    metric="episode_reward_mean",
    mode="max",
    config={
        "env": "worker",
        "env_config": {"initial_capital": 1e6},
        "lr": tune.uniform(1e-5, 1e-4),
        "train_batch_size": tune.choice([10000, 20000, 40000]),
    },
)

In [None]:


n_iterations = 30000

# NOTE(review): `data` is not defined in any visible cell -- it must be loaded
# before this cell can run.
env = WorkerStandAlone(data=data)

# step() below returns five values (obs, reward, done, truncated, info); under
# that API reset() returns an (observation, info) pair, so unpack it instead
# of storing the whole tuple as the state.
state, info = env.reset()

for _ in range(n_iterations):
    action = env.action_space.sample()  # random action drawn from the env's action space

    obs, reward, done, truncated, info = env.step(action)

    # An episode ends on termination or on truncation; start a new one in either case.
    if done or truncated:
        print("Episode finished!")
        state, info = env.reset()
    else:
        state = obs

In [None]:
#Mit vielen policies
import sys
# Redirect the standard error to the log file

# Create a log file
log_file = open("console_logs.txt", "w")


# Redirect the standard output to the log file
sys.stdout = log_file


env = HRL()
def env_creator(env_config):
    return HRL(env_config)  

register_env("hrl", env_creator)

def create_policy_spec(worker_id):
    # print(f"Creating policy for {worker_id} with obs space {env.observation_space[worker_id]} and action space {env.action_space[worker_id]}")
    return PolicySpec(
        observation_space=env.observation_space[worker_id],
        action_space=env.action_space[worker_id],
        config={}
    )

manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)

policies = {
    "manager_policy": manager_policy_spec,
}

for worker_id in env.workers:
    policies[worker_id] = create_policy_spec(worker_id)

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id == 'manager':
        # print(f"!!!!!! policy mapping manager: {agent_id}")
        return "manager_policy"
    elif agent_id in env.workers:
        return agent_id
    else:
        print("defaul policy triggered")
        return "default_policy"

param_space = {
     "env": "hrl",
    "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "rollout_fragment_length": "auto",
        "framework": "tf2",
        "lr": tune.uniform(1e-5,1e-4),
        "gamma": tune.uniform(0.95, 0.9999),
        "lambda": tune.uniform(0.9,1.0),
        "entropy_coeff": tune.uniform(0.01,0.1),
        "vf_loss_coeff": tune.uniform(0.1,0.3),
        "num_workers": 3, 
        #Change for Debugging
        "log_level": "DEBUG",
        "output": "logdir",
        "monitor": True,
}

analysis = tune.run(
    "A2C",
    metric="episode_reward_mean", 
    num_samples=5,
    mode="max",
    config=param_space, 
    local_dir="/Volumes/SSD980/ray/results/tunerun2",
    search_alg=None,
    scheduler=None,
    progress_reporter=CLIReporter(max_progress_rows=5,max_report_frequency=120),
    max_concurrent_trials=2,
    #checkpoint_config not checked yet
    checkpoint_config={
        "num_to_keep": 1,
        "checkpoint_score_attribute": "episode_reward_mean",
        "checkpoint_score_order": "max",
        "checkpoint_frequency": 10
    }
)

In [None]:
def env_creator(env_config):
    """Create an HRL environment, passing ``env_config`` to its constructor."""
    return HRL(env_config)


# Expose the creator to RLlib under the name "hrl".
register_env("hrl", env_creator)

# Driver-side instance; its spaces and worker ids are read by the code below.
env = HRL()

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Map every id in ``env.workers`` to the shared "worker_policy"; any other id gets "manager_policy"."""
    return "worker_policy" if agent_id in env.workers else "manager_policy"
    
# All workers share one policy; its spaces are taken from the first worker
# that iterating env.workers yields.
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={},
)

# The manager has its own spaces and therefore its own policy.
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={},
)

policies = dict(
    worker_policy=worker_policy_spec,
    manager_policy=manager_policy_spec,
)
 
# Candidate starting configuration.
# NOTE(review): `initial_params` is not referenced anywhere else in this cell.
# NOTE(review): "fcnet_hiddens" is a bare int here, while the commented-out
# search space below uses lists of layer sizes -- confirm the intended form.
initial_params = [{
    "lr": 0.001,
    "gamma": 0.92,
    "lambda": 0.95,
    "entropy_coeff": 1e-3,
    "vf_loss_coeff": 0.5,
    "model": {
        "fcnet_hiddens": 64,
        "fcnet_activation":"relu",
    },
}]

# Search space handed to the HyperOpt searcher below.
search_space = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "gamma": tune.uniform(0.9, 0.99),
    "lambda": tune.uniform(0.9, 1.0),
    "entropy_coeff": tune.loguniform(1e-4, 1e-1),
    "vf_loss_coeff": tune.uniform(0.1, 1.0),
    # "model": {
    #     "fcnet_hiddens": tune.grid_search([[64, 64], [128, 128], [256, 256]]),
    #     "fcnet_activation": tune.choice(["relu", "tanh"]),
    # },
}


# HyperOpt-based searcher over `search_space`, maximising episode_reward_mean.
# NOTE(review): the tune.run call at the end of this cell does not pass
# `search_alg`, so this searcher is currently unused.
algo = HyperOptSearch(space=search_space,metric="episode_reward_mean", mode="max",)


# Trial configuration for the A2C run on the registered "hrl" environment.
# Only this definition of param_space is kept: an earlier dict assigned to the
# same name was overwritten before any use and therefore had no effect.
param_space = {
    "env": "hrl",
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "rollout_fragment_length": "auto",
    # Hyperparameters drawn uniformly from the given ranges for every trial.
    "lr": tune.uniform(1e-5, 1e-4),
    "lambda": tune.uniform(0.95, 1.0),
    "vf_loss_coeff": tune.uniform(0.1, 1.0),
    "entropy_coeff": tune.uniform(1e-4, 1e-1),
    "num_workers": 1,
    "num_envs_per_worker": 1,
}

# A2C sweep: 10 sampled configurations, each stopped after 100 training
# iterations; the 2 checkpoints with the highest episode_reward_mean are kept.
analysis = tune.run(
    "A2C",
    metric="episode_reward_mean",
    mode="max",
    config=param_space,
    num_samples=10,
    stop={"training_iteration": 100},
    progress_reporter=CLIReporter(max_progress_rows=10, max_report_frequency=600),
    storage_path="/Users/floriankockler/Documents/GitHub.nosync/raystorage",
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="episode_reward_mean",
        checkpoint_score_order="max",
    ),
)

# Show the final reported result of the best trial.
best = analysis.best_trial
print(pretty_print(best.last_result))

In [None]:

# Smoke test: drive the HRL environment with randomly sampled actions.
env = HRL()

n_iterations = 1

# step() below returns five values; under that API reset() returns an
# (observation, info) pair, so unpack it instead of storing the whole tuple.
state, info = env.reset()

for _ in range(n_iterations):

    action = env.action_space.sample()

    obs, reward, done, truncated, info = env.step(action)

    # `done` and `truncated` are per-agent dicts; "__all__" marks the end of
    # the whole episode. Treat truncation like termination so the environment
    # is reset in both cases.
    if done["__all__"] or truncated.get("__all__", False):
        print("Episode finished!")
        state, info = env.reset()
    else:
        state = obs

In [None]:
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return "worker_policy" for ids found in ``env.workers`` and "manager_policy" for all others."""
    is_worker = agent_id in env.workers
    if not is_worker:
        return "manager_policy"
    return "worker_policy"
    
# NOTE(review): this cell does not create `env`; it relies on the instance
# left behind by an earlier cell.
# All workers share one policy whose spaces are taken from the first worker
# that iterating env.workers yields.
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
# The manager agent gets a policy of its own, built from its own spaces.
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)
 



# Policy id -> spec mapping handed to the PPO config below.
policies = {
    "worker_policy": worker_policy_spec,
    "manager_policy": manager_policy_spec,
}

    
# Build a PPO algorithm directly on the HRL class with the two shared policies
# and a train batch size of 4000, then run and print one training iteration.
algo = (
    PPOConfig()
    .environment(env=HRL)
    .multi_agent(policies=policies, policy_mapping_fn=policy_mapping_fn)
    .training(train_batch_size=4000)
    .build()
)

print(algo.train())

In [None]:
def env_creator(env_config):
    """Construct an HRL environment from ``env_config``."""
    return HRL(env_config)


# Register the creator with RLlib as "hrl".
register_env("hrl", env_creator)

# Instance used on the driver to read spaces and worker ids.
env = HRL()

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Select the policy id for an agent: "worker_policy" when the id is in ``env.workers``, else "manager_policy"."""
    policy_id = "manager_policy"
    if agent_id in env.workers:
        policy_id = "worker_policy"
    return policy_id
    
# Spaces for the shared worker policy come from the first worker that
# iterating env.workers yields.
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
# Separate policy for the 'manager' agent, built from its own spaces.
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)

# Policy id -> spec mapping used in the multiagent section of param_space.
policies = {
    "worker_policy": worker_policy_spec,
    "manager_policy": manager_policy_spec,
}
 
# Search space given to the BayesOpt searcher below.
search_space = {
    "lr": tune.loguniform(1e-4, 1e-1),  # Learning rate
    "gamma": tune.uniform(0.9, 0.99),  # Discount factor
}

# Tuning setup: 100 samples, one trial at a time, BayesOpt search combined with
# a HyperBand scheduler, maximising episode_reward_mean.
# NOTE(review): the tune.run call at the end of this cell passes neither
# `tune_config` nor `run_config`, so both objects are currently unused.
tune_config = TuneConfig(
    metric="episode_reward_mean",
    mode="max",
    max_concurrent_trials=1,
    num_samples=100,
    search_alg=BayesOptSearch(
          search_space,
        metric="episode_reward_mean",
        mode="max",
        random_search_steps=4
    ),
    scheduler=HyperBandScheduler()
)

# Run-level settings: experiment name, storage location, verbosity and
# progress reporting. The None-valued arguments are passed explicitly.
run_config = RunConfig(
    name="MyExperiment",
    storage_path="/Users/floriankockler/rayresults/tuner_trial",
    verbose=2,
    # checkpoint_config=air.CheckpointConfig(checkpoint_frequency=2),
    callbacks=None,
    stop=None,
    failure_config=None,
    sync_config=None,
    checkpoint_config=None,
    progress_reporter=CLIReporter(max_progress_rows=10),  # show at most 10 trial rows

)

# Trial configuration for A2C on the registered "hrl" environment; only the
# learning rate is sampled.
param_space = {
    "env": "hrl",
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "rollout_fragment_length": "auto",
    "lr": tune.uniform(1e-5, 1e-4),
    "num_workers": 1,
    "num_cpus_per_trial": 1,
}

# Run the sweep, ranking trials by episode_reward_mean (maximised).
analysis = tune.run(
    "A2C",
    metric="episode_reward_mean",
    mode="max",
    config=param_space,
)

In [None]:
from ray.rllib.algorithms.ppo import PPOConfig
# Unfinished PPO configuration: "policy_1" maps to an empty tuple placeholder
# and build() is never called, so `algo` holds the result of the config chain
# rather than a built algorithm.
algo = PPOConfig().environment(env=HRL).multi_agent(
    policies={
        "policy_1": ()
    }
)

In [None]:
from ray.tune import CLIReporter
def env_creator(env_config):
    """Create an HRL environment, passing ``env_config`` to its constructor."""
    return HRL(env_config)  # NOTE(review): HRL is not imported in any visible cell
 
# Driver-side instance; its spaces and worker ids are read by the code below.
env = HRL()

# Register the creator with RLlib under the name "hrl".
register_env("hrl", env_creator)

def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Map an agent id to its policy id.

    Ids contained in the module-level `env.workers` get "worker_policy";
    every other id gets "manager_policy". `episode`, `worker` and any extra
    keyword arguments are accepted but not used.
    """
    return "worker_policy" if agent_id in env.workers else "manager_policy"
    
# First id yielded when iterating `env.workers`; its spaces are used for the
# "worker_policy" spec.
worker_ids = iter(env.workers)
first_worker_tic = next(worker_ids)

worker_policy_spec = PolicySpec(
    config={},
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
)

# The "manager_policy" spec uses the spaces keyed by 'manager'.
manager_policy_spec = PolicySpec(
    config={},
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
)

# Policy-id -> spec mapping used as the "policies" entry of the trainer config.
policies = {"worker_policy": worker_policy_spec, "manager_policy": manager_policy_spec}
 
# Ranges explored by the searcher: learning rate and discount factor.
search_space = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "gamma": tune.uniform(0.9, 0.99),
}

# NOTE(review): BayesOptSearch may not honour the log scaling of "lr" and
# treat it as a plain uniform range instead -- confirm against the Ray docs.
search_algorithm = BayesOptSearch(
    search_space,
    metric="episode_reward_mean",
    mode="max",
    random_search_steps=4,
)
trial_scheduler = HyperBandScheduler()

# 100 samples, at most one trial at a time, maximising "episode_reward_mean".
tune_config = TuneConfig(
    metric="episode_reward_mean",
    mode="max",
    max_concurrent_trials=1,
    num_samples=100,
    search_alg=search_algorithm,
    scheduler=trial_scheduler,
)

# Experiment-level settings: name, result location, verbosity and a CLI
# reporter whose progress table is limited to 10 rows.
run_config = RunConfig(
    name="MyExperiment",
    storage_path="/Users/floriankockler/rayresults/tuner_trial",
    verbose=2,
    progress_reporter=CLIReporter(max_progress_rows=10),
    # Everything below is explicitly passed as None. To checkpoint
    # periodically, checkpoint_config could instead be e.g.
    # air.CheckpointConfig(checkpoint_frequency=2).
    checkpoint_config=None,
    callbacks=None,
    stop=None,
    failure_config=None,
    sync_config=None,
)

# Multi-agent section: which policies exist and how agent ids map onto them.
multiagent_settings = {
    "policies": policies,
    "policy_mapping_fn": policy_mapping_fn,
}

# Fixed trainer config for the "hrl" env; it contains no Tune sample
# objects, the tuned ranges live in the searcher given to `tune_config`.
param_space = {
    "env": "hrl",
    "multiagent": multiagent_settings,
    "rollout_fragment_length": "auto",
    "num_workers": 1,
    # NOTE(review): "num_cpus_per_trial" does not look like an RLlib config
    # key -- confirm it has any effect.
    "num_cpus_per_trial": 1,
}

# Run the A2C trainable with the search, scheduling and run settings above.
tuner = tune.Tuner(
    "A2C",
    param_space=param_space,
    tune_config=tune_config,
    run_config=run_config,
)
results = tuner.fit()


In [None]:
import random
import os
from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter
from env.multi_agent.hrl import HRL

# Console reporter whose progress table is limited to 10 rows.
reporter = CLIReporter(max_progress_rows=10)


def env_creator(env_config):
    """Creator registered under "hrl": builds an HRL env from `env_config`."""
    hrl_env = HRL(env_config)
    return hrl_env


register_env("hrl", env_creator)

# Manager settings carry the `train_df` object under "df".
# NOTE(review): `train_df` is not defined in this cell -- it must already
# exist in the kernel when the cell runs.
manager_config = {"df": train_df}

# Environment config: the manager settings nested under "manager_config".
hrl_config = {"manager_config": manager_config}

# Driver-side instance built from the same config that is later passed to the
# trainer as "env_config".
env = HRL(hrl_config)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return the policy id responsible for `agent_id`.

    Ids found in the module-level `env.workers` are served by
    "worker_policy"; any other id is served by "manager_policy".
    `episode`, `worker` and extra keyword arguments are ignored.
    """
    if agent_id not in env.workers:
        return "manager_policy"
    return "worker_policy"
 


# First id yielded when iterating `env.workers`; its spaces are used for the
# worker policy spec.
worker_ids = iter(env.workers)
first_worker_tic = next(worker_ids)

worker_policy_spec = PolicySpec(
    config={},
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
)

# The manager policy spec uses the spaces keyed by 'manager'.
manager_policy_spec = PolicySpec(
    config={},
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
)
 

# Single optional flag; `args.smoke_test` is False unless --smoke-test is given.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    help="Finish quickly for testing",
    action="store_true",
)

# parse_known_args() returns (namespace, leftover_args); leftovers that the
# parser does not recognise are discarded instead of raising an error.
args, _ = parser.parse_known_args()

# Policy-id -> spec mapping used as the "policies" entry of the trainer config.
policies = {"worker_policy": worker_policy_spec, "manager_policy": manager_policy_spec}


def explore(config):
    """Post-process a perturbed config so the train batch is large enough.

    Raises ``config["train_batch_size"]`` to twice the rollout fragment
    length when it is smaller than that. The config is modified in place and
    also returned.

    When ``"rollout_fragment_length"`` is not a number (this notebook sets it
    to ``"auto"``) or is absent, there is no numeric bound to enforce and the
    config is returned unchanged. Comparing the batch size against
    ``"auto" * 2`` would raise a TypeError.

    Args:
        config: Trial config dict containing "train_batch_size".

    Returns:
        The same dict object, possibly with "train_batch_size" increased.
    """
    fragment_length = config.get("rollout_fragment_length")
    if not isinstance(fragment_length, (int, float)):
        return config

    # Ensure we collect enough timesteps to do sgd.
    min_batch_size = fragment_length * 2
    if config["train_batch_size"] < min_batch_size:
        config["train_batch_size"] = min_batch_size
    return config

def _sample_gamma():
    """Draw a discount factor between 0.9 and 1.0."""
    return random.uniform(0.9, 1.0)


def _sample_train_batch_size():
    """Draw an integer train batch size from 200 to 1500, both inclusive."""
    return random.randint(200, 1500)


# Per-hyperparameter sources handed to the PBT scheduler: explicit candidate
# lists, zero-argument sampler functions, or a Tune sample object.
# NOTE(review): "sgd_minibatch_size" looks like a PPO setting; confirm it has
# any effect with the "A2C" trainable used in this cell.
hyperparam_mutations = {
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "gamma": _sample_gamma,
    "entropy_coeff": [0.01, 0.1, 1.0],
    "num_envs_per_worker": [1, 2, 4, 8],
    # "rollout_fragment_length": [50, 100, 200, 400],
    "train_batch_size": _sample_train_batch_size,
    "sgd_minibatch_size": tune.choice([50, 100, 200]),
}

# PBT scheduler: progress is measured on "time_total_s" with a perturbation
# interval of 120 on that attribute and a resample probability of 0.25.
pbt = PopulationBasedTraining(
    time_attr="time_total_s",
    perturbation_interval=120,
    resample_probability=0.25,
    # Candidate values / samplers for each mutated hyperparameter.
    hyperparam_mutations=hyperparam_mutations,
    # Post-processes every perturbed config before it is applied.
    custom_explore_fn=explore,
)

# Stop each trial once it has reached 100 training iterations.
stopping_criteria = {"training_iteration": 100}

tuner = tune.Tuner(
    "A2C",
    tune_config=tune.TuneConfig(
        metric="episode_reward_mean",
        mode="max",
        scheduler=pbt,
        # One sample when --smoke-test is given, otherwise 10.
        num_samples=1 if args.smoke_test else 10,
    ),
    param_space={
        "env": "hrl",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "env_config": hrl_config,
        "rollout_fragment_length": "auto",
        "framework": "tf2",
        "num_workers": 1,
        # NOTE(review): "num_cpus_per_trial" does not look like an RLlib
        # config key -- confirm it has any effect.
        "num_cpus_per_trial": 3,
        # "num_cpus": 1,  # number of CPUs to use per trial --> 6 in total = max available
        # "num_gpus": 0,  # number of GPUs to use per trial
        # These params are tuned from a fixed starting value.
        "lr": 1e-4,
        # These params start off randomly drawn from a set.
        "sgd_minibatch_size": tune.choice([50, 100, 200]),
        "train_batch_size": tune.choice([200, 400, 600]),
    },
    run_config=air.RunConfig(
        stop=stopping_criteria,
        # `storage_path` replaces the deprecated `local_dir` argument and
        # matches the RunConfig usage elsewhere in this notebook.
        storage_path="/Users/floriankockler/rayresults/autobatch",
        progress_reporter=reporter,
    ),
)
results = tuner.fit()