In [1]:
import numpy as np
import pandas as pd
import ray
from configs.load_specific_data import pharma_basket
from env.multi_agent.worker_hrl import Worker
from env.multi_agent.hrl import HRL
from ray.rllib.utils import check_env
from ray.train.rl import RLTrainer
from ray.air.config import RunConfig, ScalingConfig
from ray.rllib.policy.policy import Policy, PolicySpec
from ray.rllib.algorithms.bc.bc import BC
from ray.rllib.algorithms import a2c
from ray.tune.registry import register_env
import gymnasium
import os
import warnings

# Silence warnings *before* starting Ray: the processes Ray launches inherit
# the environment that exists when ray.init() runs, so setting PYTHONWARNINGS
# afterwards would only affect this notebook process.
os.environ['PYTHONWARNINGS'] = 'ignore'
# `Warning` is the base class of FutureWarning and DeprecationWarning, so a
# single filter on it covers every warning category.
warnings.simplefilter(action='ignore', category=Warning)

# ignore_reinit_error lets this cell be re-run without restarting the kernel.
# NOTE(review): the temp dir is a machine-specific absolute path.
ray.init(_temp_dir='/Users/floriankockler/rayresults/', ignore_reinit_error=True)

2023-09-06 18:48:00,624	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [2]:
# Unpack the six values returned by pharma_basket().
(
    train_df,
    validate_df,
    test_df,
    stock_dimension,
    state_space,
    indicators,
) = pharma_basket()
# Optional: restrict the frames to a single ticker while debugging.
# train_df = train_df[train_df["tic"] == "PFE.US"]
# validate_df = validate_df[validate_df["tic"] == "PFE.US"]


In [None]:

# Smoke test: drive the hierarchical environment with random actions.
manager_config = {
    "df": train_df,
}
config = {
    "manager_config": manager_config,
}

env = HRL(config)

n_iterations = 7000

# reset() follows the Gymnasium convention and returns (observation, info);
# only the observation is kept as the current state.
state, _reset_info = env.reset()

for _ in range(n_iterations):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)

    # An episode is over when it terminates *or* is truncated. RLlib's
    # multi-agent API reports both as dicts carrying an "__all__" entry.
    if done["__all__"] or truncated["__all__"]:
        print("Episode finished!")
        state, _reset_info = env.reset()
    else:
        state = obs

In [3]:
import os

# Driver-side HRL instance; the cells below read its agent ids and per-agent
# observation/action spaces from it.
manager_config = {"df": train_df}
config = {"manager_config": manager_config}
env = HRL(config)

def env_creator(env_config):
    """Create an HRL environment from the given ``env_config`` dict."""
    hierarchical_env = HRL(env_config)
    return hierarchical_env


# Register the factory under the name referenced by the "env" config key.
register_env("hierarch_env", env_creator)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return the policy name for ``agent_id``.

    Ids found in ``env.workers`` share "worker_policy"; every other id is
    handled by "manager_policy". ``episode``, ``worker`` and extra keyword
    arguments are accepted but not used.
    """
    return "worker_policy" if agent_id in env.workers else "manager_policy"
 

# Every worker id maps to "worker_policy", so the first worker's spaces serve
# as the template for that shared policy.
# NOTE(review): assumes all workers expose identical spaces — confirm in HRL.
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={},
)
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={},
)

# Environment config passed to the registered env factory via "env_config".
manager_config = {"df": train_df}
hrl_config = {"manager_config": manager_config}

multiagent_settings = {
    "policies": {
        "worker_policy": worker_policy_spec,
        "manager_policy": manager_policy_spec,
    },
    "policy_mapping_fn": policy_mapping_fn,
}
algorithm_settings = {
    "multiagent": multiagent_settings,
    "env": "hierarch_env",
    "env_config": hrl_config,
    "framework": "tf2",
    "evaluation_num_workers": 5,
    "evaluation_interval": 1,
    "evaluation_config": {"input": "sampler"},
}

# Train A2C for 100 iterations, evaluating after every iteration.
trainer = RLTrainer(
    run_config=RunConfig(
        stop={"training_iteration": 100},
        local_dir="/Users/floriankockler/rayresults/trainingtest1",
    ),
    scaling_config=ScalingConfig(num_workers=1, use_gpu=False),
    algorithm=a2c.A2C,
    config=algorithm_settings,
)
result = trainer.fit()

0,1
Current time:,2023-09-06 21:14:01
Running for:,02:25:58.76
Memory:,6.9/8.0 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
AIRA2C_24e79_00000,RUNNING,127.0.0.1:34492,31,8518.13,14848,198375,343269,53481.6,6305


[2m[36m(AIRA2C pid=34492)[0m Trainable.setup took 19.419 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(RolloutWorker pid=34524)[0m HRL is done
[2m[36m(RolloutWorker pid=34524)[0m day: 6304, episode: 2
[2m[36m(RolloutWorker pid=34524)[0m Total Cash Transfers: 60
[2m[36m(RolloutWorker pid=34524)[0m total_portfolio_trades: 49328.0
[2m[36m(RolloutWorker pid=34524)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=34524)[0m End_Portfolio_Value: 7681662.0
[2m[36m(RolloutWorker pid=34524)[0m Annual Return: 8.10 %
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: ABT.US Current Stock Exposure: 0
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: AMGN.US Current Stock Exposure: 219005
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: BDX.US Current Stock Exposure: 858463
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: BMY.US Current Stock Exposure: 433827
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: HUM.US Current Stock Exposure: 421826
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: JNJ.US Current Stock Exposure: 36554
[2m[36m(RolloutWorker pid=



[2m[36m(RolloutWorker pid=34522)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m day: 6304, episode: 3[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Total Cash Transfers: 52[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m total_portfolio_trades: 48317.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m End_Portfolio_Value: 7598908.5[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Annual Return: 8.03 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: ABT.US Current Stock Exposure: 84903[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: AMGN.US Current Stock Exposure: 2403055[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=3452



[2m[36m(RolloutWorker pid=34524)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m day: 6304, episode: 4[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Total Cash Transfers: 54[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m total_portfolio_trades: 48607.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m End_Portfolio_Value: 11525966.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Annual Return: 10.67 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: ABT.US Current Stock Exposure: 850475[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: AMGN.US Current Stock Exposure: 181660[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34



[2m[36m(RolloutWorker pid=34523)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m day: 6304, episode: 6[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Total Cash Transfers: 57[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m total_portfolio_trades: 51009.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m End_Portfolio_Value: 15913318.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Annual Return: 12.76 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Worker ID: ABT.US Current Stock Exposure: 340482[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Worker ID: AMGN.US Current Stock Exposure: 803114[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34



[2m[36m(RolloutWorker pid=34524)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m day: 6304, episode: 8[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Total Cash Transfers: 41[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m total_portfolio_trades: 50664.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m End_Portfolio_Value: 28381108.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Annual Return: 16.60 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: ABT.US Current Stock Exposure: 2605965[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: AMGN.US Current Stock Exposure: 965877[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=3



[2m[36m(RolloutWorker pid=34524)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m day: 6304, episode: 9[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Total Cash Transfers: 42[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m total_portfolio_trades: 53108.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m End_Portfolio_Value: 39938088.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Annual Return: 18.93 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: ABT.US Current Stock Exposure: 240075[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: AMGN.US Current Stock Exposure: 15185033[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=



[2m[36m(RolloutWorker pid=34522)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m day: 6304, episode: 10[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Total Cash Transfers: 43[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m total_portfolio_trades: 55331.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m End_Portfolio_Value: 15703899.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Annual Return: 12.67 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: ABT.US Current Stock Exposure: 543120[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: AMGN.US Current Stock Exposure: 3064612[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=



[2m[36m(RolloutWorker pid=34523)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m day: 6304, episode: 11[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Total Cash Transfers: 49[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m total_portfolio_trades: 55284.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m End_Portfolio_Value: 19663456.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Annual Return: 14.15 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Worker ID: ABT.US Current Stock Exposure: 1179695[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34523)[0m Worker ID: AMGN.US Current Stock Exposure: 2006486[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid



[2m[36m(RolloutWorker pid=34524)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m day: 6304, episode: 14[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Total Cash Transfers: 35[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m total_portfolio_trades: 54055.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m End_Portfolio_Value: 29988476.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Annual Return: 16.97 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: ABT.US Current Stock Exposure: 2321239[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: AMGN.US Current Stock Exposure: 2334203[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid



[2m[36m(RolloutWorker pid=34522)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m day: 6304, episode: 18[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Total Cash Transfers: 32[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m total_portfolio_trades: 54793.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m End_Portfolio_Value: 24359650.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Annual Return: 15.57 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: ABT.US Current Stock Exposure: 872583[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: AMGN.US Current Stock Exposure: 1658495[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=



[2m[36m(RolloutWorker pid=34524)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m day: 6304, episode: 19[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Total Cash Transfers: 37[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m total_portfolio_trades: 55135.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m End_Portfolio_Value: 45969944.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Annual Return: 19.90 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: ABT.US Current Stock Exposure: 618603[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34524)[0m Worker ID: AMGN.US Current Stock Exposure: 8587102[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=



[2m[36m(RolloutWorker pid=34522)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m day: 6304, episode: 24[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Total Cash Transfers: 31[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m total_portfolio_trades: 57727.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m End_Portfolio_Value: 59386508.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Annual Return: 21.69 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: ABT.US Current Stock Exposure: 1377675[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: AMGN.US Current Stock Exposure: 19136816[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pi



[2m[36m(RolloutWorker pid=34521)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m day: 6304, episode: 26[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Total Cash Transfers: 35[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m total_portfolio_trades: 59496.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m End_Portfolio_Value: 43424996.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Annual Return: 19.50 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: ABT.US Current Stock Exposure: 2060116[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: AMGN.US Current Stock Exposure: 7840838[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid



[2m[36m(RolloutWorker pid=34525)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m day: 6304, episode: 28[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m Total Cash Transfers: 48[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m total_portfolio_trades: 57466.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m End_Portfolio_Value: 71315544.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m Annual Return: 22.99 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m Worker ID: ABT.US Current Stock Exposure: 1453297[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34525)[0m Worker ID: AMGN.US Current Stock Exposure: 1498894[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid



[2m[36m(RolloutWorker pid=34522)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m day: 6304, episode: 29[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Total Cash Transfers: 35[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m total_portfolio_trades: 58192.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m End_Portfolio_Value: 50966864.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Annual Return: 20.62 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: ABT.US Current Stock Exposure: 1383219[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34522)[0m Worker ID: AMGN.US Current Stock Exposure: 12417726[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pi



[2m[36m(RolloutWorker pid=34502)[0m HRL is done[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m day: 6304, episode: 3[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m Total Cash Transfers: 35[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m total_portfolio_trades: 57287.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m End_Portfolio_Value: 43850136.0[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m Annual Return: 19.57 %[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m Worker ID: ABT.US Current Stock Exposure: 507977[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=34502)[0m Worker ID: AMGN.US Current Stock Exposure: 8583047[32m [repeated 5x across cluster][0m
[2m[36m(RolloutWorker pid=3



[2m[36m(RolloutWorker pid=34521)[0m HRL is done[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m day: 6304, episode: 32[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Total Cash Transfers: 30[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m total_portfolio_trades: 61167.0[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m End_Portfolio_Value: 61262196.0[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Annual Return: 21.91 %[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: ABT.US Current Stock Exposure: 1694172[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: AMGN.US Current Stock Exposure: 11377117[32m [repeated 4x across cluster][0m
[2m[36m(RolloutWorker pi



[2m[36m(RolloutWorker pid=34521)[0m HRL is done[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m day: 6304, episode: 33[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Total Cash Transfers: 30[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m total_portfolio_trades: 60251.0[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Beginn_Portfolio_Value: 2000000[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m End_Portfolio_Value: 63964432.0[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Annual Return: 22.21 %[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: ABT.US Current Stock Exposure: 4267427[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: AMGN.US Current Stock Exposure: 0[32m [repeated 2x across cluster][0m
[2m[36m(RolloutWorker pid=34521



[2m[36m(RolloutWorker pid=34521)[0m HRL is done
[2m[36m(RolloutWorker pid=34521)[0m day: 6304, episode: 34
[2m[36m(RolloutWorker pid=34521)[0m Total Cash Transfers: 24
[2m[36m(RolloutWorker pid=34521)[0m total_portfolio_trades: 60213.0
[2m[36m(RolloutWorker pid=34521)[0m Beginn_Portfolio_Value: 2000000
[2m[36m(RolloutWorker pid=34521)[0m End_Portfolio_Value: 59368540.0
[2m[36m(RolloutWorker pid=34521)[0m Annual Return: 21.69 %
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: ABT.US Current Stock Exposure: 2366515
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: AMGN.US Current Stock Exposure: 13822830
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: BDX.US Current Stock Exposure: 3620598
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: BMY.US Current Stock Exposure: 1081811
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: HUM.US Current Stock Exposure: 1682572
[2m[36m(RolloutWorker pid=34521)[0m Worker ID: JNJ.US Current Stock Exposure: 2637619
[2m[36m(Ro



In [None]:
# Display the column index of the training frame.
train_df.columns

Index(['date', 'tic', 'open', 'high', 'low', 'close', 'volume', 'avgvol_50',
       'sma_10', 'sma_50', 'sma_100', 'sma_200', 'wma_50', 'rsi_14',
       'volatility_30', 'volatility_100', 'stddev_30', 'dmi_14', 'adx_14',
       'macd', 'atr_14'],
      dtype='object')

In [None]:
# worker_df = train_df[train_df["tic"] == "PFE.US"]

# config={"df": worker_df}
        
# env = Worker(env_config=config)
# check_env(env)



# n_iterations = 500

# state = env.reset()

# for _ in range(n_iterations):

#     action = env.action_space.sample()

#     obs, reward, done, truncated, info= env.step(action)

#     # print(f"Action: {action}, Reward: {reward}, Portfolio Value: {obs[0] + obs[1] * obs[2]}")
    
#     if done:
#         print("Episode finished!")
#         state = env.reset()
#     else:
#         state = obs

In [None]:
# Run RLlib's check_env on a freshly built HRL instance.
manager_config = {"df": train_df}
config = {"manager_config": manager_config}
env = HRL(env_config=config)
check_env(env)

In [None]:
import random
import os
from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter
from env.multi_agent.hrl import HRL

# Console progress table limited to ten rows.
reporter = CLIReporter(max_progress_rows=10)


def env_creator(env_config):
    """Create an HRL environment from the given ``env_config`` dict."""
    hierarchical_env = HRL(env_config)
    return hierarchical_env


# Register the factory under the name referenced by the "env" config key.
register_env("hrl", env_creator)

# Environment config shared by the driver-side instance below and the tuner.
manager_config = {"df": train_df}
hrl_config = {"manager_config": manager_config}
env = HRL(hrl_config)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return the policy name for ``agent_id``.

    Ids found in ``env.workers`` share "worker_policy"; every other id is
    handled by "manager_policy". ``episode``, ``worker`` and extra keyword
    arguments are accepted but not used.
    """
    return "worker_policy" if agent_id in env.workers else "manager_policy"
 


# Every worker id maps to "worker_policy", so the first worker's spaces serve
# as the template for that shared policy.
# NOTE(review): assumes all workers expose identical spaces — confirm in HRL.
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={},
)
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={},
)

# parse_known_args() tolerates arguments it does not recognise, so only the
# optional --smoke-test flag is picked up.
parser = argparse.ArgumentParser()
parser.add_argument("--smoke-test", action="store_true", help="Finish quickly for testing")
args, _ = parser.parse_known_args()

policies = {"worker_policy": worker_policy_spec, "manager_policy": manager_policy_spec}


def explore(config):
    """Raise ``train_batch_size`` to at least twice ``rollout_fragment_length``.

    Used as the PBT ``custom_explore_fn``. The given ``config`` dict is
    modified in place and also returned.
    """
    batch_size = config["train_batch_size"]
    minimum_batch = config["rollout_fragment_length"] * 2
    if batch_size < minimum_batch:
        config["train_batch_size"] = minimum_batch
    return config

# Values PBT may assign when it perturbs or resamples a trial's hyperparameters.
# NOTE(review): "sgd_minibatch_size" is a PPO setting in RLlib — confirm that
# A2C actually reads it.
hyperparam_mutations = {
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "gamma": lambda: random.uniform(0.9, 1.0),
    "entropy_coeff": [0.01, 0.1, 1.0],
    "num_envs_per_worker": [1, 2, 4, 8],
    "rollout_fragment_length": [50, 100, 200, 400],
    "train_batch_size": lambda: random.randint(200, 1500),
    "sgd_minibatch_size": tune.choice([50, 100, 200]),
}

# Perturb every 120 units of "time_total_s"; `explore` post-processes each
# perturbed config.
pbt = PopulationBasedTraining(
    time_attr="time_total_s",
    perturbation_interval=120,
    resample_probability=0.25,
    hyperparam_mutations=hyperparam_mutations,
    custom_explore_fn=explore,
)

# The iteration budget is the only stopping rule.
stopping_criteria = {"training_iteration": 100}

tune_config = tune.TuneConfig(
    metric="episode_reward_mean",
    mode="max",
    scheduler=pbt,
    num_samples=1 if args.smoke_test else 10,
)

param_space = {
    "env": "hrl",
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "env_config": hrl_config,
    "rollout_fragment_length": "auto",
    "framework": "tf2",
    "num_workers": 1,
    # Fixed starting value; PBT mutates it afterwards.
    "lr": 1e-4,
    # Initial values drawn at random from these sets.
    "sgd_minibatch_size": tune.choice([50, 100, 200]),
    "train_batch_size": tune.choice([200, 400, 600]),
}

run_config = air.RunConfig(
    stop=stopping_criteria,
    local_dir="/Users/floriankockler/rayresults/falsetry1",
    progress_reporter=reporter,
)

tuner = tune.Tuner(
    "A2C",
    tune_config=tune_config,
    param_space=param_space,
    run_config=run_config,
)
results = tuner.fit()

In [None]:

# Build an HRL instance and draw one random sample from its observation space.
manager_config = {
    "df": train_df,
}
config = {
    "manager_config": manager_config,
}

# A stray "°" character after this call made the cell a syntax error.
env = HRL(config)
env.observation_space.sample()

In [7]:
 
def env_creator(env_config):
    """Create an HRL environment from the given ``env_config`` dict."""
    hierarchical_env = HRL(env_config)
    return hierarchical_env


# Register the factory under the name referenced by the "env" config key.
register_env("hierarch_env", env_creator)
 
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return the policy name for ``agent_id``.

    Ids found in ``env.workers`` share "worker_policy"; every other id is
    handled by "manager_policy". ``episode``, ``worker`` and extra keyword
    arguments are accepted but not used.
    """
    return "worker_policy" if agent_id in env.workers else "manager_policy"
 

# NOTE(review): `env` is not created in this cell; it must already exist from
# an earlier cell for the lines below to run.
# Every worker id maps to "worker_policy", so the first worker's spaces serve
# as the template for that shared policy.
first_worker_tic = next(iter(env.workers))
worker_policy_spec = PolicySpec(
    observation_space=env.observation_space[first_worker_tic],
    action_space=env.action_space[first_worker_tic],
    config={},
)
manager_policy_spec = PolicySpec(
    observation_space=env.observation_space['manager'],
    action_space=env.action_space['manager'],
    config={},
)

# Environment config passed to the registered env factory via "env_config".
manager_config = {"df": train_df}
hrl_config = {"manager_config": manager_config}

multiagent_settings = {
    "policies": {
        "worker_policy": worker_policy_spec,
        "manager_policy": manager_policy_spec,
    },
    "policy_mapping_fn": policy_mapping_fn,
}
algorithm_settings = {
    "multiagent": multiagent_settings,
    "env": "hierarch_env",
    "env_config": hrl_config,
    "framework": "tf",
    "evaluation_num_workers": 1,
    "evaluation_interval": 1,
    "evaluation_config": {"input": "sampler"},
}

# Short five-iteration run, evaluating after every iteration.
trainer = RLTrainer(
    run_config=RunConfig(
        stop={"training_iteration": 5},
        local_dir="/Users/floriankockler/rayresults/training",
    ),
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
    algorithm=a2c.A2C,
    config=algorithm_settings,
)
result = trainer.fit()

0,1
Current time:,2023-09-06 13:25:07
Running for:,00:00:36.74
Memory:,6.8/8.0 GiB

Trial name,# failures,error file
AIRA2C_f2578_00000,1,/Users/floriankockler/rayresults/training/AIRA2C_2023-09-06_13-24-30/AIRA2C_f2578_00000_0_2023-09-06_13-24-30/error.txt

Trial name,status,loc
AIRA2C_f2578_00000,ERROR,127.0.0.1:92326


[2m[36m(AIRA2C pid=92326)[0m Trainable.setup took 28.210 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-09-06 13:25:07,129	ERROR tune_controller.py:911 -- Trial task failed for trial AIRA2C_f2578_00000
Traceback (most recent call last):
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/Users/floriankockler/anaconda3/envs/py310/lib/python3.10/site-packages/ray/_private/worker.py", line 2493, in get
    raise value.as_instanceof_c

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = RLTrainer.restore("/Users/floriankockler/rayresults/training/AIRA2C_2023-09-06_13-24-30")`.
To start a new run that will retry on training failures, set `air.RunConfig(failure_config=air.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [None]:
# worker_config =   {
#         "cash_initial": 1000000, 
#         "trading_cost": 0.001, 
#         "tech_indicator_list": indicators,
#         "print_verbosity": 1,
#         "initial_shares_held": 0,
#         "df": df,  }


# env = Worker(env_config=worker_config)
# check_env(env)

In [None]:


# Smoke test: drive a single Worker environment with random actions.
# NOTE(review): the commented-out Worker cell in this notebook filtered the
# frame to one ticker first; confirm the unfiltered train_df is intended here.
worker_config = {
    "cash_initial": 1000000,
    "trading_cost": 0.001,
    "tech_indicator_list": indicators,
    "print_verbosity": 1,
    "initial_shares_held": 0,
    "df": train_df,
}

env = Worker(env_config=worker_config)

n_iterations = 30

# reset() follows the Gymnasium convention and returns (observation, info);
# only the observation is kept as the current state.
state, _reset_info = env.reset()

for _ in range(n_iterations):
    action = env.action_space.sample()
    obs, reward, done, truncated, info = env.step(action)

    # An episode is over when it terminates *or* is truncated.
    if done or truncated:
        print("Episode finished!")
        state, _reset_info = env.reset()
    else:
        state = obs

In [None]:
import random

from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining
import argparse
from ray.tune import CLIReporter

# Console progress table limited to ten rows.
reporter = CLIReporter(max_progress_rows=10)


def env_creator(env_config):
    """Create a Worker environment from the given ``env_config`` dict."""
    single_stock_env = Worker(env_config)
    return single_stock_env


# Register the factory under the name referenced by the "env" config key.
register_env("Single_Stock", env_creator)


# parse_known_args() tolerates arguments it does not recognise, so only the
# optional --smoke-test flag is picked up.
parser = argparse.ArgumentParser()
parser.add_argument("--smoke-test", action="store_true", help="Finish quickly for testing")
args, _ = parser.parse_known_args()

# Settings passed to every Worker environment via "env_config".
worker_config = {
    "cash_initial": 1000000,
    "trading_cost": 0.001,
    "tech_indicator_list": indicators,
    "print_verbosity": 1,
    "initial_shares_held": 0,
    "df": train_df,
}

def explore(config):
    """Raise ``train_batch_size`` to at least twice ``rollout_fragment_length``.

    Used as the PBT ``custom_explore_fn``. The given ``config`` dict is
    modified in place and also returned.
    """
    batch_size = config["train_batch_size"]
    minimum_batch = config["rollout_fragment_length"] * 2
    if batch_size < minimum_batch:
        config["train_batch_size"] = minimum_batch
    return config

# Values PBT may assign when it perturbs or resamples a trial's hyperparameters.
# "train_batch_size" used to appear twice in this dict; Python keeps only the
# last occurrence, so the earlier `random.randint(1000, 50000)` entry was dead
# code. The entry that actually took effect is the one kept here.
# NOTE(review): "sgd_minibatch_size" is a PPO setting in RLlib — confirm that
# A2C actually reads it.
hyperparam_mutations = {
    "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5],
    "gamma": lambda: random.uniform(0.9, 1.0),
    "entropy_coeff": [0.01, 0.1, 1.0],
    "num_envs_per_worker": [1, 2, 4, 8],
    "rollout_fragment_length": [50, 100, 200, 400],
    "sgd_minibatch_size": tune.choice([50, 100, 200]),
    "train_batch_size": tune.choice([200, 400, 600]),
}

# Perturb every 120 units of "time_total_s"; `explore` post-processes each
# perturbed config.
pbt = PopulationBasedTraining(
    time_attr="time_total_s",
    perturbation_interval=120,
    resample_probability=0.25,
    hyperparam_mutations=hyperparam_mutations,
    custom_explore_fn=explore,
)

# The iteration budget is the only stopping rule.
stopping_criteria = {"training_iteration": 100}

tuner = tune.Tuner(
    "A2C",
    tune_config=tune.TuneConfig(
        metric="episode_reward_mean",
        mode="max",
        scheduler=pbt,
        num_samples=1 if args.smoke_test else 10,
    ),
    param_space={
        "env": "Single_Stock",
        "env_config": worker_config,
        "rollout_fragment_length": "auto",
        "framework": "tf2",
        "num_workers": 5,  # rollout workers per trial
        # NOTE(review): "num_cpus" / "num_gpus" look like resource requests;
        # confirm these are keys the A2C config recognises.
        "num_cpus": 1,
        "num_gpus": 0,
        # Fixed starting value; PBT mutates it afterwards.
        "lr": 1e-4,
        # Initial values drawn at random from these sets.
        "sgd_minibatch_size": tune.choice([50, 100, 200]),
        "train_batch_size": tune.choice([200, 400, 600]),
    },
    run_config=air.RunConfig(
        stop=stopping_criteria,
        local_dir="/Users/floriankockler/rayresults/tuning3",
        progress_reporter=reporter,
    ),
)
results = tuner.fit()