In [1]:
from CybORG import CybORG
from CybORG.Simulator.Scenarios import EnterpriseScenarioGenerator
from CybORG.Agents.Wrappers import EnterpriseMAE
from CybORG.Agents import SleepAgent, EnterpriseGreenAgent, FiniteStateRedAgent
from Wrappers import EnterpriseMAEMaskWrapper, MaskWrapper
from CustomRLLib import TorchActionMaskModel, CustomModel

In [2]:
from ray.tune import register_env
from ray.rllib.algorithms.ppo import PPOConfig, PPO
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.policy.policy import PolicySpec
from ray.rllib.models import ModelCatalog

In [3]:
def env_creator_CC4(env_config: dict):
    """Build one mask-wrapped CybORG environment for the 'CC4' registration.

    The scenario uses ``SleepAgent`` for blue, ``EnterpriseGreenAgent`` for
    green and ``FiniteStateRedAgent`` for red, and the resulting CybORG
    instance is wrapped in ``EnterpriseMAEMaskWrapper``.

    Args:
        env_config: Optional settings mapping. The only key read is
            ``"steps"``, which is forwarded as the scenario generator's
            ``steps`` argument. When the key is absent (or ``env_config`` is
            empty/``None``) the previous hard-coded value of 500 is used.

    Returns:
        The ``EnterpriseMAEMaskWrapper`` instance wrapping the new CybORG env.
    """
    # Previously `env_config` was ignored entirely; honour it while keeping
    # the old behaviour as the default.
    steps = (env_config or {}).get("steps", 500)

    scenario = EnterpriseScenarioGenerator(
        blue_agent_class=SleepAgent,
        green_agent_class=EnterpriseGreenAgent,
        red_agent_class=FiniteStateRedAgent,
        steps=steps,
    )
    cyborg = CybORG(scenario_generator=scenario)

    # Alternative wrappers tried during development: EnterpriseMAE, MaskWrapper.
    return EnterpriseMAEMaskWrapper(env=cyborg)

In [4]:
# Expose the environment to RLlib under the name 'CC4'. The creator already
# takes a single config argument, so it is handed over directly.
register_env('CC4', env_creator_CC4)

# Local instance built from an empty config; later cells query it for the
# per-agent observation and action spaces.
env = env_creator_CC4({})

In [5]:
# Number of blue agents that get their own policy.
NUM_AGENTS = 5

# CybORG agent name ("blue_agent_<i>") -> RLlib policy ID ("Agent<i>").
POLICY_MAP = {f"blue_agent_{i}": f"Agent{i}" for i in range(NUM_AGENTS)}


def policy_mapper(agent_id, episode, worker=None, **kwargs):
    """Return the RLlib policy ID that controls ``agent_id``.

    ``worker`` is optional so the function can be invoked both as
    ``policy_mapper(agent_id, episode, worker)`` and as
    ``policy_mapper(agent_id, episode)``; any extra keyword arguments are
    accepted and ignored.

    Args:
        agent_id: CybORG agent name; must be a key of ``POLICY_MAP``.
        episode: Unused; accepted for caller compatibility.
        worker: Unused; accepted for caller compatibility.
        **kwargs: Unused.

    Returns:
        The policy ID string stored in ``POLICY_MAP`` for ``agent_id``.

    Raises:
        KeyError: If ``agent_id`` is not present in ``POLICY_MAP``.
    """
    return POLICY_MAP[agent_id]

In [6]:
# Register the project's custom models under the string names that an
# algorithm config can reference via model={'custom_model': <name>}.
for model_name, model_cls in (
    ("custom_pytorch_model", CustomModel),
    ("torch_action_mask_model", TorchActionMaskModel),
):
    ModelCatalog.register_custom_model(model_name, model_cls)
# ModelCatalog.register_custom_model("action_mask_model", ActionMaskModel)


# ray.init()
# algo = ppo.PPO(env="CartPole-v1", config={
#     "model": {
#         "custom_model": "my_tf_model",
#         # Extra kwargs to be passed to your model's c'tor.
#         "custom_model_config": {},
#     },
# })

In [7]:
# One PolicySpec per blue agent, keyed by its RLlib policy ID. The spaces are
# read from the local `env` using the matching CybORG agent name.
blue_policy_specs = {
    ray_agent: PolicySpec(
        policy_class=None,
        observation_space=env.observation_space(cyborg_agent),
        action_space=env.action_space(cyborg_agent),
        config={'gamma': 0.85},
    )
    for cyborg_agent, ray_agent in POLICY_MAP.items()
}

# Logger settings handed to .debugging().
tb_logger_config = {'logdir':'./logs/PPO_Example', 'type': 'ray.tune.logger.TBXLogger'}

# PPO on the registered 'CC4' env, one policy per blue agent, using the
# registered action-mask model.
algo_config = (
    PPOConfig()
    .environment(env="CC4")
    .debugging(logger_config=tb_logger_config)
    .multi_agent(policies=blue_policy_specs, policy_mapping_fn=policy_mapper)
    .training(model={'custom_model': "torch_action_mask_model"})
)

In [8]:
# Show which policy class the spec registered for 'Agent0' carries.
agent0_spec = algo_config.policies['Agent0']
print(agent0_spec.policy_class)

None


In [9]:
import time
# Print the current local time in the same format used for checkpoint names.
now_stamp = time.strftime("%Y-%m-%d_%H:%M:%S")
print(now_stamp)

2024-04-16_16:18:10


In [10]:
# Instantiate the algorithm from the config assembled in the earlier cell.
algo = algo_config.build()

# Run two training iterations; `train_info` ends up holding the result of
# the last call only.
for i in range(2):
    train_info=algo.train()

# Save a checkpoint into a directory stamped with the time of saving.
# NOTE(review): the stamp contains ':' characters, which are not valid in
# Windows paths — consider a different separator if portability matters.
algo.save(f'./Submissions/results_{time.strftime("%Y-%m-%d_%H:%M:%S")}/staging/')

The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  object_ = constructor(*ctor_args, **ctor_kwargs)
2024-04-16 16:18:12,912	INFO worker.py:1724 -- Started a local Ray instance.
[36m(RolloutWorker pid=69429)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


!!!! space_to_dict(self.observation_space):  {'space': {'space': 'box', 'low': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghob9o3gUj+JRTAgDAJnVaEo=', 'high': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghgb7UTyKR/EoJoQBlgviuw==', 'shape': (267,), 'dtype': '<f4'}, 'original_space': {'space': {'space': 'dict', 'spaces': {'action_mask': {'space': 'multi-binary', 'n': 82}, 'observations': {'space': 'multi-discrete', 'nvec': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2mhrqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RqWRjqaOgq1CuQDLmYGCGAapUfpYUgDAGpCHDI=', 'dtype': '<i8'}}}}}
!!!! space_to_dict(self.observation_space):  {'space': {'space': 'box', 'low': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghob9o3gUj+JRTAgDAJnVaEo=', 'high': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghgb7UTyKR/EoJoQBlgviuw=

TrainingResult(checkpoint=Checkpoint(filesystem=local, path=./Submissions/results_2024-04-16_16:20:24/staging/), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'Agent1': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.9360178337432444, 'cur_kl_coeff': 0.20000000000000004, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.863586792349816, 'policy_loss': -0.04361353814262354, 'vf_loss': 9.903740325570107, 'vf_explained_var': -6.073899567127228e-06, 'kl': 0.01730008400710033, 'entropy': 4.032264269391695, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 125.0, 'num_grad_updates_lifetime': 1440.5, 'diff_num_grad_updates_vs_sampler_policy': 479.5}, 'Agent3': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.9649675623203318, 'cur_kl_coeff': 0.29999999999999993, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.868787041306495, 'policy_loss': -0.03893161678133765, 'vf_loss': 9.902448217074076, 'vf_explained

In [11]:
# Compare the environment-side spaces for blue_agent_0 with the observation
# space reported by its policy ('Agent0').
blue0 = 'blue_agent_0'
policy = algo.get_policy(policy_id='Agent0')
# policy.compute_actions()
blue0_obs_space = env.observation_space(blue0)

print(blue0_obs_space)
print(len(blue0_obs_space.sample()))  # length of one sampled observation
print(env.action_space(blue0))
print(policy.observation_space)

Dict('action_mask': MultiBinary(82), 'observations': MultiDiscrete([3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]))
2
Discrete(82)
Box(-1.0, 1.0, (267,), float32)


In [12]:
# Draw and show one random sample from blue_agent_0's observation space.
blue0_sample = env.observation_space('blue_agent_0').sample()
print(blue0_sample)

OrderedDict([('action_mask', array([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1], dtype=int8)), ('observations', array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1]))])


In [13]:
# Show blue_agent_4's observation space for comparison with blue_agent_0.
blue4_obs_space = env.observation_space('blue_agent_4')
print(blue4_obs_space)

Dict('action_mask': MultiBinary(242), 'observations': MultiDiscrete([3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]))


In [14]:
# for i in range(5):
#     id = f'Agent{i}'
#     # Retrieve the Policy object from an Algorithm.
#     # Note that for normal, single-agent Algorithms, the Policy ID is "default_policy".
#     policy = algo.get_policy(policy_id=id)
#     print(policy)

#     # Tell RLlib to store an individual policy checkpoint (only for "pol1") inside
#     # /tmp/my_policy_checkpoint
#     policy.export_checkpoint(f"./Submissions/2_rounds/staging/policies/Agent{i}")

In [15]:
from ray.rllib.policy.policy import PolicySpec, Policy
# Restore a single policy from an on-disk policy checkpoint.
# NOTE(review): this path has a different prefix ("mask_results_") from the
# directory written by algo.save() in this notebook — presumably a checkpoint
# from an earlier run; confirm it exists before running on a fresh kernel.
restored_ckpt_dir = "./Submissions/mask_results_2024-04-16_02:01:29/staging/policies/Agent0"
my_restored_policy = Policy.from_checkpoint(restored_ckpt_dir)

In [16]:
# new_algo = algo_config.build()

# for i in range(2):
#     train_info=new_algo.train()

# new_algo.save('new_results')

The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  object_ = constructor(*ctor_args, **ctor_kwargs)
2024-04-16 16:36:48,303	INFO trainable.py:164 -- Trainable.setup took 983.492 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[36m(RolloutWorker pid=69571)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


!!!! space_to_dict(self.observation_space):  {'space': {'space': 'box', 'low': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghob9o3gUj+JRTAgDAJnVaEo=', 'high': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghgb7UTyKR/EoJoQBlgviuw==', 'shape': (267,), 'dtype': '<f4'}, 'original_space': {'space': {'space': 'dict', 'spaces': {'action_mask': {'space': 'multi-binary', 'n': 82}, 'observations': {'space': 'multi-discrete', 'nvec': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2mhrqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RqWRjqaOgq1CuQDLmYGCGAapUfpYUgDAGpCHDI=', 'dtype': '<i8'}}}}}
!!!! space_to_dict(self.observation_space):  {'space': {'space': 'box', 'low': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghob9o3gUj+JRTAgDAJnVaEo=', 'high': 'eJyb7BfqGxDJyFDGUK2eklqcXKRupaBuk2airqOgnpZfVFKUmBefX5SSChJ3S8wpTgWKF2ckFqQC+RpGZuY6mjoKtQpkAy4Ghgb7UTyKR/EoJoQBlgviuw=

TrainingResult(checkpoint=Checkpoint(filesystem=local, path=new_results), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'Agent1': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.8229975630218784, 'cur_kl_coeff': 0.29999999999999993, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.918857701619466, 'policy_loss': -0.03447158923396879, 'vf_loss': 9.949556578199068, 'vf_explained_var': -1.5843535463015237e-05, 'kl': 0.012575650896635876, 'entropy': 4.029745149115722, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 125.0, 'num_grad_updates_lifetime': 1440.5, 'diff_num_grad_updates_vs_sampler_policy': 479.5}, 'Agent3': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.8679699104900161, 'cur_kl_coeff': 0.29999999999999993, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.918698250253994, 'policy_loss': -0.03387821316791815, 'vf_loss': 9.948247214158377, 'vf_explained_var': -9.839609265327454e-06, 'kl': 

In [17]:
# new_algo.export_model('new_results_model')

TuneError: Unsupported import/export format: new_results_model

In [None]:
# output = new_algo.evaluate()

# print(output)
# print(
#     "Avg episode length for trained agent: %.1f"
#     % output["evaluation"]["episode_len_mean"]
# )

ValueError: Cannot evaluate w/o an evaluation worker set in the Algorithm or w/o an env on the local worker!
Try one of the following:
1) Set `evaluation_interval` >= 0 to force creating a separate evaluation worker set.
2) Set `create_env_on_driver=True` to force the local (non-eval) worker to have an environment to evaluate on.

In [None]:
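# NOTE: 'blue_agent_1' is an agent ID; the policies are registered under the
# IDs 'Agent0'..'Agent4' (see POLICY_MAP), so get_policy() returned None and
# the export call below failed. Use policy_id='Agent1' instead.
# policy = new_algo.get_policy(policy_id='blue_agent_1')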
# print(policy)
# path = policy.export_model('saved_policies')
# print(path)

None


AttributeError: 'NoneType' object has no attribute 'export_model'

In [None]:
# algo.save("experiment1a")
# output = algo.evaluate()

# print(output)
# print(
#     "Avg episode length for trained agent: %.1f"
#     % output["evaluation"]["episode_len_mean"]
# )

ValueError: Cannot evaluate w/o an evaluation worker set in the Algorithm or w/o an env on the local worker!
Try one of the following:
1) Set `evaluation_interval` >= 0 to force creating a separate evaluation worker set.
2) Set `create_env_on_driver=True` to force the local (non-eval) worker to have an environment to evaluate on.

In [None]:
# import pickle
# import os
# import torch

In [None]:
# with open(os.path.dirname(f'/Users/rll249/Documents/CAGE/cage-4-playground/Submissions/staging/policies/Agent0/') + "/policy_state.pkl", 'rb') as f:
#     model = pickle.load(f)

# print(model)

{'weights': {'_logits._model.0.weight': array([[ 0.00250164,  0.00527381,  0.01931156, ...,  0.00025262,
        -0.00785647, -0.01378386],
       [ 0.00231189,  0.00124045,  0.00146192, ...,  0.00211412,
        -0.00117784,  0.00250254],
       [ 0.00341276,  0.00233744, -0.00342125, ...,  0.00335405,
        -0.00172418,  0.0129694 ],
       ...,
       [-0.00154116, -0.00215152, -0.00591432, ...,  0.00101742,
         0.00134184,  0.01044994],
       [ 0.00223243,  0.00372557,  0.00529027, ..., -0.00113381,
        -0.0052026 , -0.00424745],
       [-0.00074222,  0.00239624,  0.01122481, ...,  0.00472425,
        -0.00253568, -0.0054925 ]], dtype=float32), '_logits._model.0.bias': array([ 3.6214816e-03,  1.9464773e-03,  1.4313067e-03, -2.0742356e-03,
       -1.5992419e-03,  1.4055480e-03,  1.8314883e-03,  2.4325873e-03,
       -2.3271784e-03, -1.4031064e-03, -1.6773037e-04,  6.4351846e-04,
       -1.0132344e-03, -4.4959554e-04, -1.3190174e-03, -1.3267744e-03,
        1.1732498e-03,

In [None]:

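# NOTE: policy_state.pkl loads with pickle.load (see the previous cell), so it
# is apparently a plain pickle rather than a torch.save archive — which would
# explain the "Invalid magic number" error from torch.load below.
# torch.load(os.path.dirname(f'/Users/rll249/Documents/CAGE/cage-4-playground/Submissions/staging/policies/Agent0/') + "/policy_state.pkl")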

RuntimeError: Invalid magic number; corrupt file?