In [1]:
# !git clone https://github.com/sobiodum/quantumai3.git
# !pip3 install --no-cache-dir -r requirements.txt

In [2]:
import numpy as np
import pandas as pd
import ray
from configs.load_specific_data import pharma_basket
from env.multi_agent.worker_hrl import Worker
from env.multi_agent.hrl import HRL
from ray.rllib.utils import check_env
from ray.train.rl import RLTrainer
from ray.air.config import RunConfig, ScalingConfig
from ray.rllib.policy.policy import Policy, PolicySpec
from ray.rllib.algorithms.bc.bc import BC
from ray.rllib.algorithms import a2c
from ray.tune.registry import register_env
import gymnasium
ray.init(_temp_dir='/Volumes/SSD980/ray')
import warnings
warnings.simplefilter(action='ignore', category=Warning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
import os
os.environ['PYTHONWARNINGS'] = 'ignore'

2023-09-07 19:17:58,534	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


In [3]:
train_df, validate_df, test_df, stock_dimension, state_space, indicators = pharma_basket()
# train_df = train_df[train_df["tic"] == "PFE.US"]
# validate_df = validate_df[validate_df["tic"] == "PFE.US"]


In [4]:
train_df.to_csv("train.csv")

In [None]:

manager_config = {
    "df": train_df,

}
config={
        "manager_config": manager_config
        }

env = HRL(config)

n_iterations = 1

state = env.reset()

for _ in range(n_iterations):

    action = env.action_space.sample()

    obs, reward, done, truncated, info= env.step(action)

    # print(f"Action: {action}, Reward: {reward}, Portfolio Value: {obs[0] + obs[1] * obs[2]}")
    
    if done["__all__"]:
        print("Episode finished!")
        state = env.reset()
    else:
        state = obs

In [6]:
import os 
manager_config = {
    "df": train_df,

}
config={
        "manager_config": manager_config
        }

env = HRL(config)

def env_creator(env_config):
    return HRL(env_config)  # Assuming this is your environment
 
register_env("hierarch_env", env_creator)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return "worker_policy"
    else:
        return "manager_policy"
 

first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)
 
manager_config = {
    "df": train_df,

}
hrl_config={
        "manager_config": manager_config
        }


trainer = RLTrainer(
    run_config=RunConfig(stop={"training_iteration": 100},local_dir="/Volumes/SSD980/ray/results/"),
    scaling_config=ScalingConfig(
        num_workers=1,
        use_gpu=False,
    ),
 
    algorithm=a2c.A2C,
    config={
        "multiagent": {
            "policies": {
                "worker_policy": worker_policy_spec,
                "manager_policy": manager_policy_spec,
            },
            "policy_mapping_fn": policy_mapping_fn,
        },
        "env": "hierarch_env",
        "env_config": hrl_config,
        "framework": "tf2",
        "evaluation_num_workers": 5,
        "evaluation_interval": 1,
        "evaluation_config": {"input": "sampler"},
    },
)
result = trainer.fit()

0,1
Current time:,2023-09-06 23:21:33
Running for:,00:26:06.09
Memory:,6.9/8.0 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
AIRA2C_b4e14_00000,RUNNING,127.0.0.1:69670,7,1510.77,4096,,,,


[2m[36m(AIRA2C pid=69670)[0m Trainable.setup took 25.423 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(RolloutWorker pid=69707)[0m HRL is done
[2m[36m(RolloutWorker pid=69707)[0m day: 6304, episode: 2
[2m[36m(RolloutWorker pid=69707)[0m Total Cash Transfers: 56
[2m[36m(RolloutWorker pid=69707)[0m total_portfolio_trades: 48700.0
[2m[36m(RolloutWorker pid=69707)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=69707)[0m End_Portfolio_Value: 4860617.5
[2m[36m(RolloutWorker pid=69707)[0m Annual Return: 5.28 %
[2m[36m(RolloutWorker pid=69707)[0m Worker ID: ABT.US Current Stock Exposure: 0
[2m[36m(RolloutWorker pid=69707)[0m Worker ID: AMGN.US Current Stock Exposure: 54822
[2m[36m(RolloutWorker pid=69707)[0m Worker ID: BDX.US Current Stock Exposure: 7567
[2m[36m(RolloutWorker pid=69707)[0m Worker ID: BMY.US Current Stock Exposure: 75302
[2m[36m(RolloutWorker pid=69707)[0m Worker ID: HUM.US Current Stock Exposure: 345350
[2m[36m(RolloutWorker pid=69707)[0m Worker ID: JNJ.US Current Stock Exposure: 711953
[2m[36m(RolloutWorker pid=697



[2m[36m(RolloutWorker pid=69703)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m day: 6304, episode: 3[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m Total Cash Transfers: 65[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m total_portfolio_trades: 47155.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m End_Portfolio_Value: 9417435.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m Annual Return: 9.38 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m Worker ID: ABT.US Current Stock Exposure: 0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[0m Worker ID: AMGN.US Current Stock Exposure: 1674876[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69703)[



[2m[36m(RolloutWorker pid=69704)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m day: 6304, episode: 4[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Total Cash Transfers: 63[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m total_portfolio_trades: 43826.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m End_Portfolio_Value: 13047182.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Annual Return: 11.47 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: ABT.US Current Stock Exposure: 759784[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: AMGN.US Current Stock Exposure: 265677[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69



[2m[36m(RolloutWorker pid=69705)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m day: 6304, episode: 5[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m Total Cash Transfers: 58[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m total_portfolio_trades: 38418.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m End_Portfolio_Value: 10711219.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m Annual Return: 10.20 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m Worker ID: ABT.US Current Stock Exposure: 565558[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)[0m Worker ID: AMGN.US Current Stock Exposure: 0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=69705)



[2m[36m(RolloutWorker pid=69704)[0m HRL is done
[2m[36m(RolloutWorker pid=69704)[0m day: 6304, episode: 6
[2m[36m(RolloutWorker pid=69704)[0m Total Cash Transfers: 45
[2m[36m(RolloutWorker pid=69704)[0m total_portfolio_trades: 36186.0
[2m[36m(RolloutWorker pid=69704)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=69704)[0m End_Portfolio_Value: 12264601.0
[2m[36m(RolloutWorker pid=69704)[0m Annual Return: 11.07 %
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: ABT.US Current Stock Exposure: 687968
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: AMGN.US Current Stock Exposure: 71163
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: BDX.US Current Stock Exposure: 52092
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: BMY.US Current Stock Exposure: 311914
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: HUM.US Current Stock Exposure: 852972
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: JNJ.US Current Stock Exposure: 1257533
[2m[36m(RolloutWork



[2m[36m(RolloutWorker pid=69704)[0m HRL is done[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m day: 6304, episode: 7[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Total Cash Transfers: 42[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m total_portfolio_trades: 36248.0[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m End_Portfolio_Value: 14436736.0[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Annual Return: 12.12 %[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: ABT.US Current Stock Exposure: 92829[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: AMGN.US Current Stock Exposure: 3057962[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=69



[2m[36m(RolloutWorker pid=69704)[0m HRL is done
[2m[36m(RolloutWorker pid=69704)[0m day: 6304, episode: 8
[2m[36m(RolloutWorker pid=69704)[0m Total Cash Transfers: 43
[2m[36m(RolloutWorker pid=69704)[0m total_portfolio_trades: 36300.0
[2m[36m(RolloutWorker pid=69704)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=69704)[0m End_Portfolio_Value: 18918238.0
[2m[36m(RolloutWorker pid=69704)[0m Annual Return: 13.89 %
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: ABT.US Current Stock Exposure: 451647
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: AMGN.US Current Stock Exposure: 1793158
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: BDX.US Current Stock Exposure: 2186966
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: BMY.US Current Stock Exposure: 1413645
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: HUM.US Current Stock Exposure: 1495154
[2m[36m(RolloutWorker pid=69704)[0m Worker ID: JNJ.US Current Stock Exposure: 953943
[2m[36m(Rollou

In [None]:
train_df.columns

Index(['date', 'tic', 'open', 'high', 'low', 'close', 'volume', 'avgvol_50',
       'sma_10', 'sma_50', 'sma_100', 'sma_200', 'wma_50', 'rsi_14',
       'volatility_30', 'volatility_100', 'stddev_30', 'dmi_14', 'adx_14',
       'macd', 'atr_14'],
      dtype='object')

In [None]:
# worker_df = train_df[train_df["tic"] == "PFE.US"]

# config={"df": worker_df}
        
# env = Worker(env_config=config)
# check_env(env)



# n_iterations = 500

# state = env.reset()

# for _ in range(n_iterations):

#     action = env.action_space.sample()

#     obs, reward, done, truncated, info= env.step(action)

#     # print(f"Action: {action}, Reward: {reward}, Portfolio Value: {obs[0] + obs[1] * obs[2]}")
    
#     if done:
#         print("Episode finished!")
#         state = env.reset()
#     else:
#         state = obs

In [None]:
manager_config = {
    "df": train_df,

}
config={



        "manager_config": manager_config
        }
env = HRL(env_config=config)
check_env(env)

In [3]:
import random
import os
from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter
from env.multi_agent.hrl import HRL

reporter = CLIReporter(max_progress_rows=10)

def env_creator(env_config):
    return HRL(env_config)  # Assuming this is your environment
 
register_env("hrl", env_creator)

manager_config = {
    "df": train_df,

}
hrl_config={
        "manager_config": manager_config
        }
env = HRL(hrl_config)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return "worker_policy"
    else:
        return "manager_policy"
 


first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)
 

parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test", action="store_true", help="Finish quickly for testing"
)
args, _ = parser.parse_known_args()

policies = {
    "worker_policy": worker_policy_spec,
    "manager_policy": manager_policy_spec,
}


def explore(config):
    # Ensure we collect enough timesteps to do sgd
    if config["train_batch_size"] < config["rollout_fragment_length"] * 2:
        config["train_batch_size"] = config["rollout_fragment_length"] * 2
    return config

hyperparam_mutations = {
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "gamma": lambda: random.uniform(0.9, 1.0),
    "entropy_coeff": [0.01, 0.1, 1.0],
    "num_envs_per_worker": [1, 2, 4, 8],
    #"rollout_fragment_length": [50, 100, 200, 400],
    "train_batch_size": lambda: random.randint(200, 1500),
    "sgd_minibatch_size": tune.choice([50, 100, 200]),

}

pbt = PopulationBasedTraining(
        time_attr="time_total_s",
        perturbation_interval=120,
        resample_probability=0.25,
        # Specifies the mutations of these hyperparams
        hyperparam_mutations=hyperparam_mutations,
        custom_explore_fn=explore,
    )

# Stop when we've reached 100 training iterations or reward=300
stopping_criteria = {"training_iteration": 100}

tuner = tune.Tuner(
    "A2C",
    tune_config=tune.TuneConfig(
        metric="episode_reward_mean",
        mode="max",
        scheduler=pbt,
        num_samples=1 if args.smoke_test else 10,
    ),
    param_space={
        "env": "hrl",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
        "env_config": hrl_config,
        "rollout_fragment_length": "auto",
        "framework": "tf2",
        "num_workers": 1,  # 1 for training + 4 for sampling
        "num_cpus_per_trial": 3,
        # "num_cpus": 1,  # number of CPUs to use per trial --> 6 in total = max available
        # "num_gpus": 0,  # number of GPUs to use per trial
        # These params are tuned from a fixed starting value.
        "lr": 1e-4,
        # These params start off randomly drawn from a set.
        "sgd_minibatch_size": tune.choice([50, 100, 200]),
        "train_batch_size": tune.choice([200, 400, 600]),
    },

    run_config=air.RunConfig(stop=stopping_criteria, local_dir="/Users/floriankockler/rayresults/autobatch", progress_reporter=reporter),
)
results = tuner.fit()

2023-09-06 23:22:30,957	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2023-09-06 23:22:36 (running for 00:00:05.97)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (10 PENDING)
+---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------+
| Trial name          | status   | loc   |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |
|---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------|
| A2C_hrl_7cd10_00000 | PENDING  |       |            0.1  | 0.99721  |                     1 |                  100 |                200 |
| A2C_hrl_7cd10_00001 | PENDING  |       |            1    | 0.929554 |                     2 |                  100 |                200 |
| A2C_hrl_7cd10_00002 | PENDING  |



== Status ==
Current time: 2023-09-06 23:22:47 (running for 00:00:16.06)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (10 PENDING)
+---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------+
| Trial name          | status   | loc   |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |
|---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------|
| A2C_hrl_7cd10_00000 | PENDING  |       |            0.1  | 0.99721  |                     1 |                  100 |                200 |
| A2C_hrl_7cd10_00001 | PENDING  |       |            1    | 0.929554 |                     2 |                  100 |                200 |
| A2C_hrl_7cd10_00002 | PENDING 



== Status ==
Current time: 2023-09-06 23:22:52 (running for 00:00:21.16)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (10 PENDING)
+---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------+
| Trial name          | status   | loc   |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |
|---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------|
| A2C_hrl_7cd10_00000 | PENDING  |       |            0.1  | 0.99721  |                     1 |                  100 |                200 |
| A2C_hrl_7cd10_00001 | PENDING  |       |            1    | 0.929554 |                     2 |                  100 |                200 |
| A2C_hrl_7cd10_00002 | PENDING 



== Status ==
Current time: 2023-09-06 23:23:02 (running for 00:00:31.25)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (10 PENDING)
+---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------+
| Trial name          | status   | loc   |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |
|---------------------+----------+-------+-----------------+----------+-----------------------+----------------------+--------------------|
| A2C_hrl_7cd10_00000 | PENDING  |       |            0.1  | 0.99721  |                     1 |                  100 |                200 |
| A2C_hrl_7cd10_00001 | PENDING  |       |            1    | 0.929554 |                     2 |                  100 |                200 |
| A2C_hrl_7cd10_00002 | PENDING 

[2m[36m(A2C pid=74250)[0m Trainable.setup took 16.283 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:23:07 (running for 00:00:36.40)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------|
| A2C_hrl_7cd10_00000 | RUNNING  | 127.0.0.1:74250 |            0.1  | 0.99721  |                     1 |                  100 |                200 |
| A2C_hrl_7cd10_00001 | RUNNING  | 127.0.0.1:74251 |            1    | 0.929554 |                     2 |                 

[2m[36m(A2C pid=74253)[0m Trainable.setup took 16.832 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.[32m [repeated 3x across cluster][0m


== Status ==
Current time: 2023-09-06 23:23:12 (running for 00:00:41.32)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------|
| A2C_hrl_7cd10_00000 | RUNNING  | 127.0.0.1:74250 |            0.1  | 0.99721  |                     1 |                  100 |                200 |
| A2C_hrl_7cd10_00001 | RUNNING  | 127.0.0.1:74251 |            1    | 0.929554 |                     2 |                 



== Status ==
Current time: 2023-09-06 23:23:22 (running for 00:00:51.38)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+-



== Status ==
Current time: 2023-09-06 23:24:59 (running for 00:02:28.74)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+-



== Status ==
Current time: 2023-09-06 23:25:13 (running for 00:02:42.64)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+-



== Status ==
Current time: 2023-09-06 23:26:00 (running for 00:03:29.73)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (3 PAUSED, 6 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:26:15 (running for 00:03:44.91)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (4 PAUSED, 6 PENDING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+--

[2m[36m(A2C pid=75242)[0m Trainable.setup took 18.252 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:26:20 (running for 00:03:49.96)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (4 PAUSED, 4 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=75251)[0m Trainable.setup took 19.251 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:26:45 (running for 00:04:14.50)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (4 PAUSED, 4 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=75306)[0m Trainable.setup took 30.820 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:26:50 (running for 00:04:19.52)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (4 PAUSED, 2 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=75311)[0m Trainable.setup took 31.509 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:27:12 (running for 00:04:41.90)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (4 PAUSED, 2 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:27:17 (running for 00:04:46.94)
PopulationBasedTraining: 1 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (4 PAUSED, 2 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:28:39 (running for 00:06:08.64)
PopulationBasedTraining: 2 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (5 PAUSED, 2 PENDING, 3 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(raylet)[0m Spilled 2071 MiB, 54 objects, write throughput 74 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


== Status ==
Current time: 2023-09-06 23:28:44 (running for 00:06:13.66)
PopulationBasedTraining: 3 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PAUSED, 2 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:28:58 (running for 00:06:27.98)
PopulationBasedTraining: 4 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 2 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:29:08 (running for 00:06:38.02)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (8 PAUSED, 2 PENDING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+--

[2m[36m(A2C pid=75974)[0m Trainable.setup took 16.303 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:29:14 (running for 00:06:43.07)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=75968)[0m Trainable.setup took 16.765 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:29:19 (running for 00:06:48.12)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:29:26 (running for 00:06:55.96)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:29:38 (running for 00:07:07.23)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=76088)[0m Trainable.setup took 18.035 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(A2C pid=76088)[0m Restored on 127.0.0.1 from checkpoint: /var/folders/_2/x__vfndx65bd1j3y57l3q4wc0000gn/T/checkpoint_tmp_fcf85e00d8d8414fa8432f2cf9945a2f
[2m[36m(A2C pid=76088)[0m Current state after restoring: {'_iteration': 8, '_timesteps_total': None, '_time_total': 121.78621697425842, '_episodes_total': 0}


== Status ==
Current time: 2023-09-06 23:29:43 (running for 00:07:12.29)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PAUSED, 1 PENDING, 3 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:29:48 (running for 00:07:17.31)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PAUSED, 1 PENDING, 3 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:29:53 (running for 00:07:22.31)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PAUSED, 1 PENDING, 3 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=76190)[0m Trainable.setup took 21.432 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


== Status ==
Current time: 2023-09-06 23:30:14 (running for 00:07:43.16)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PAUSED, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+--

[2m[36m(A2C pid=76190)[0m Restored on 127.0.0.1 from checkpoint: /var/folders/_2/x__vfndx65bd1j3y57l3q4wc0000gn/T/checkpoint_tmp_2ec591f134cd485eb3cbb8f683e1d762
[2m[36m(A2C pid=76190)[0m Current state after restoring: {'_iteration': 9, '_timesteps_total': None, '_time_total': 125.7581434249878, '_episodes_total': 0}


== Status ==
Current time: 2023-09-06 23:30:33 (running for 00:08:02.20)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (5 PAUSED, 1 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:30:38 (running for 00:08:07.21)
PopulationBasedTraining: 5 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (5 PAUSED, 1 PENDING, 4 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:31:34 (running for 00:09:03.71)
PopulationBasedTraining: 6 checkpoints, 0 perturbs
Logical resource usage: 8.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (6 PAUSED, 1 PENDING, 3 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:31:44 (running for 00:09:13.80)
PopulationBasedTraining: 7 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:31:50 (running for 00:09:19.69)
PopulationBasedTraining: 8 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (8 PAUSED, 1 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:32:00 (running for 00:09:29.71)
PopulationBasedTraining: 8 checkpoints, 0 perturbs
Logical resource usage: 4.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (8 PAUSED, 1 PENDING, 1 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=76837)[0m Trainable.setup took 15.194 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(raylet)[0m Spilled 4200 MiB, 99 objects, write throughput 109 MiB/s.
[2m[36m(A2C pid=76837)[0m Restored on 127.0.0.1 from checkpoint: /var/folders/_2/x__vfndx65bd1j3y57l3q4wc0000gn/T/checkpoint_tmp_92c3545acd704311a424d5bed73c5f17
[2m[36m(A2C pid=76837)[0m Current state after restoring: {'_iteration': 8, '_timesteps_total': None, '_time_total': 127.39275121688843, '_episodes_total': 0}


== Status ==
Current time: 2023-09-06 23:32:05 (running for 00:09:34.71)
PopulationBasedTraining: 8 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:32:10 (running for 00:09:39.76)
PopulationBasedTraining: 8 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:32:15 (running for 00:09:44.82)
PopulationBasedTraining: 8 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--



== Status ==
Current time: 2023-09-06 23:32:29 (running for 00:09:58.33)
PopulationBasedTraining: 8 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/autobatch/A2C
Number of trials: 10/10 (7 PAUSED, 1 PENDING, 2 RUNNING)
+---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name          | status   | loc             |   entropy_coeff |    gamma |   num_envs_per_worker |   sgd_minibatch_size |   train_batch_size |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|---------------------+----------+-----------------+-----------------+----------+-----------------------+----------------------+--------------------+--------+------------------+------+--

[2m[36m(A2C pid=76993)[0m Trainable.setup took 17.968 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-09-06 23:32:33,730	INFO pbt.py:809 -- 

[PopulationBasedTraining] [Exploit] Cloning trial 7cd10_00007 (score =  nan) into trial 7cd10_00001 (score =  nan)



TuneError: The Ray Tune run failed. Please inspect the previous error messages for a cause. After fixing the issue, you can restart the run from scratch or continue this run. To continue this run, you can use `tuner = Tuner.restore("/Users/floriankockler/rayresults/autobatch/A2C", trainable=...)`.

In [None]:

manager_config = {
    "df": train_df,

}
config={
        "manager_config": manager_config
        }

env = HRL(config)°
env.observation_space.sample()

In [7]:
 
def env_creator(env_config):
    return HRL(env_config)  # Assuming this is your environment
 
register_env("hierarch_env", env_creator)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    if agent_id in env.workers:
        return "worker_policy"
    else:
        return "manager_policy"
 

first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={}
)
 
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={}
)
 
manager_config = {
    "df": train_df,

}
hrl_config={
        "manager_config": manager_config
        }


trainer = RLTrainer(
    run_config=RunConfig(stop={"training_iteration": 5},local_dir="/Users/floriankockler/rayresults/training"),
    scaling_config=ScalingConfig(
        num_workers=2,
        use_gpu=False,
    ),
 
    algorithm=a2c.A2C,
    config={
        "multiagent": {
            "policies": {
                "worker_policy": worker_policy_spec,
                "manager_policy": manager_policy_spec,
            },
            "policy_mapping_fn": policy_mapping_fn,
        },
        "env": "hierarch_env",
        "env_config": hrl_config,
        "framework": "tf",
        "evaluation_num_workers": 1,
        "evaluation_interval": 1,
        "evaluation_config": {"input": "sampler"},
    },
)
result = trainer.fit()

0,1
Current time:,2023-09-06 13:25:07
Running for:,00:00:36.74
Memory:,6.8/8.0 GiB

Trial name,# failures,error file
AIRA2C_f2578_00000,1,/Users/floriankockler/rayresults/training/AIRA2C_2023-09-06_13-24-30/AIRA2C_f2578_00000_0_2023-09-06_13-24-30/error.txt

Trial name,status,loc
AIRA2C_f2578_00000,ERROR,127.0.0.1:92326


[2m[36m(AIRA2C pid=92326)[0m Trainable.setup took 28.210 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-09-06 13:25:07,129	ERROR tune_controller.py:911 -- Trial task failed for trial AIRA2C_f2578_00000
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_c

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = RLTrainer.restore("/Users/floriankockler/rayresults/training/AIRA2C_2023-09-06_13-24-30")`.
To start a new run that will retry on training failures, set `air.RunConfig(failure_config=air.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [None]:
# worker_config =   {
#         "cash_initial": 1000000, 
#         "trading_cost": 0.001, 
#         "tech_indicator_list": indicators,
#         "print_verbosity": 1,
#         "initial_shares_held": 0,
#         "df": df,  }


# env = Worker(env_config=worker_config)
# check_env(env)

In [None]:


worker_config =   {
        "cash_initial": 1000000, 
        "trading_cost": 0.001, 
        "tech_indicator_list": indicators,
        "print_verbosity": 1,
        "initial_shares_held": 0,
        "df": train_df,  }


env = Worker(env_config=worker_config)

n_iterations = 30

state = env.reset()

for _ in range(n_iterations):

    action = env.action_space.sample()

    obs, reward, done, truncated, info= env.step(action)

    # print(f"Action: {action}, Reward: {reward}, Portfolio Value: {obs[0] + obs[1] * obs[2]}")
    
    if done:
        print("Episode finished!")
        state = env.reset()
    else:
        state = obs

In [4]:
import random

from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter

reporter = CLIReporter(max_progress_rows=10)

def env_creator(env_config):
    # Assuming this is your environment
    return Worker(env_config)

register_env("Single_Stock", env_creator)


parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test", action="store_true", help="Finish quickly for testing"
)
args, _ = parser.parse_known_args()

worker_config =   {
        "cash_initial": 1000000, 
        "trading_cost": 0.001, 
        "tech_indicator_list": indicators,
        "print_verbosity": 1,
        "initial_shares_held": 0,
        "df": train_df,  
        }

def explore(config):
    # Ensure we collect enough timesteps to do sgd
    if config["train_batch_size"] < config["rollout_fragment_length"] * 2:
        config["train_batch_size"] = config["rollout_fragment_length"] * 2
    return config

hyperparam_mutations = {
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "gamma": lambda: random.uniform(0.9, 1.0),
    "entropy_coeff": [0.01, 0.1, 1.0],
    "num_envs_per_worker": [1, 2, 4, 8],
    "rollout_fragment_length": [75],
    "train_batch_size": lambda: random.randint(1000, 50000),
    "sgd_minibatch_size": tune.choice([50, 100, 200]),
    "train_batch_size": tune.choice([200, 400, 600]),
}

pbt = PopulationBasedTraining(
        time_attr="time_total_s",
        perturbation_interval=120,
        resample_probability=0.25,
        # Specifies the mutations of these hyperparams
        hyperparam_mutations=hyperparam_mutations,
        custom_explore_fn=explore,
    )

# Stop when we've reached 100 training iterations or reward=300
stopping_criteria = {"training_iteration": 100}

tuner = tune.Tuner(
    "A2C",
    tune_config=tune.TuneConfig(
        metric="episode_reward_mean",
        mode="max",
        scheduler=pbt,
        num_samples=1 if args.smoke_test else 10,
    ),
    param_space={
        "env": "Single_Stock",
        "env_config": worker_config,
        "rollout_fragment_length": "auto",
        "framework": "tf2",
        "num_workers": 5,  # 1 for training + 4 for sampling
        "num_cpus": 1,  # number of CPUs to use per trial --> 6 in total = max available
        "num_gpus": 0,  # number of GPUs to use per trial
        # These params are tuned from a fixed starting value.
        "lr": 1e-4,
        # These params start off randomly drawn from a set.
        "sgd_minibatch_size": tune.choice([50, 100, 200]),
        "train_batch_size": tune.choice([200, 400, 600]),
    },

    run_config=air.RunConfig(stop=stopping_criteria, local_dir="/Users/floriankockler/rayresults/tuning3", progress_reporter=reporter),
)
results = tuner.fit()

2023-09-06 23:50:44,830	INFO tune.py:657 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2023-09-06 23:50:47 (running for 00:00:02.93)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (10 PENDING)


== Status ==
Current time: 2023-09-06 23:50:52 (running for 00:00:08.03)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (10 PENDING)






== Status ==
Current time: 2023-09-06 23:50:57 (running for 00:00:13.13)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (10 PENDING)


== Status ==
Current time: 2023-09-06 23:51:03 (running for 00:00:18.16)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (10 PENDING)




[2m[36m(A2C pid=80798)[0m 2023-09-06 23:51:06,623	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=80812, ip=127.0.0.1, actor_id=4d937945e844fb743679ca2501000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x17ccfba00>)
[2m[36m(A2C pid=80798)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/utils/pre_checks/env.py", line 209, in check_gym_environments
[2m[36m(A2C pid=80798)[0m     raise ValueError(
[2m[36m(A2C pid=80798)[0m ValueError: The observation collected from env.reset() was not contained within your env's observation space. It is possible that there was a type mismatch, or that one of the sub-observations was out of bounds:
[2m[36m(A2C pid=80798)[0m  path: 'current_price'
[2m[36m(A2C pid=80798)[0m  (sub-)obs: [[ 0.8285  0.192   2.1572  2.0991 15.9956  1.489   2.394

== Status ==
Current time: 2023-09-06 23:51:17 (running for 00:00:32.67)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (10 PENDING)


== Status ==
Current time: 2023-09-06 23:51:22 (running for 00:00:37.74)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (1 ERROR, 9 PENDING)
Number of errored trials: 1
+------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                   |   # failures | error file                                                                                                                                            |
|-----------------------------



== Status ==
Current time: 2023-09-06 23:51:32 (running for 00:00:48.12)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (1 ERROR, 9 PENDING)
Number of errored trials: 1
+------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                   |   # failures | error file                                                                                                                                            |
|------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------|
| A2C_Single_Stock_6e71a_00000 |            1 | /Users/floriankockler/rayresults/tuning3/A2C/A2C_S



== Status ==
Current time: 2023-09-06 23:51:39 (running for 00:00:54.64)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (1 ERROR, 9 PENDING)
Number of errored trials: 1
+------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                   |   # failures | error file                                                                                                                                            |
|------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------|
| A2C_Single_Stock_6e71a_00000 |            1 | /Users/floriankockler/rayresults/tuning3/A2C/A2C_S



== Status ==
Current time: 2023-09-06 23:51:44 (running for 00:00:59.69)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (1 ERROR, 9 PENDING)
Number of errored trials: 1
+------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                   |   # failures | error file                                                                                                                                            |
|------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------|
| A2C_Single_Stock_6e71a_00000 |            1 | /Users/floriankockler/rayresults/tuning3/A2C/A2C_S

[2m[36m(A2C pid=80874)[0m 2023-09-06 23:51:44,819	ERROR actor_manager.py:500 -- Ray error, taking actor 1 out of service. The actor died because of an error raised in its creation task, [36mray::RolloutWorker.__init__()[39m (pid=80908, ip=127.0.0.1, actor_id=6771037b223f8c60f191019b01000000, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x177bdfa60>)
[2m[36m(A2C pid=80874)[0m   File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/rllib/utils/pre_checks/env.py", line 209, in check_gym_environments
[2m[36m(A2C pid=80874)[0m     raise ValueError(
[2m[36m(A2C pid=80874)[0m ValueError: The observation collected from env.reset() was not contained within your env's observation space. It is possible that there was a type mismatch, or that one of the sub-observations was out of bounds:
[2m[36m(A2C pid=80874)[0m  path: 'current_price'
[2m[36m(A2C pid=80874)[0m  (sub-)obs: [[ 0.8285  0.192   2.1572  2.0991 15.9956  1.489   2.394

== Status ==
Current time: 2023-09-06 23:51:49 (running for 00:01:04.83)
PopulationBasedTraining: 0 checkpoints, 0 perturbs
Logical resource usage: 6.0/8 CPUs, 0/0 GPUs
Result logdir: /Users/floriankockler/rayresults/tuning3/A2C
Number of trials: 10/10 (2 ERROR, 8 PENDING)
Number of errored trials: 2
+------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                   |   # failures | error file                                                                                                                                            |
|------------------------------+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------|
| A2C_Single_Stock_6e71a_00000 |            1 | /Users/floriankockler/rayresults/tuning3/A2C/A2C_S