Env configuration update to ray 1.12 standard #30

Open · wants to merge 4 commits into base: master

385 changes: 385 additions & 0 deletions JSS/Backup_JSSEnv.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion JSS/CustomCallbacks.py
@@ -14,7 +14,8 @@ def __init__(self, legacy_callbacks_dict: Dict[str, callable] = None):
    def on_episode_end(self, worker: "RolloutWorker", base_env: BaseEnv,
                       policies: Dict[PolicyID, Policy],
                       episode: MultiAgentEpisode, **kwargs):
        env = base_env.get_unwrapped()[0]
        # env = base_env.get_unwrapped()[0] --> deprecated
        env = base_env.get_sub_environments()[0]
        if env.last_time_step != float('inf'):
            episode.custom_metrics['make_span'] = env.last_time_step
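
For context, a minimal sketch (not part of this diff) of how such a callback is typically attached to a trainer under Ray 1.12, so the custom metric shows up in training results; the class and env names are taken from this repo and are assumptions as far as this hunk is concerned:

from ray.rllib.agents.ppo import PPOTrainer
from CustomCallbacks import CustomCallbacks

trainer = PPOTrainer(config={
    "env": "JSSEnv-v1",            # assumes the env was registered via register_env
    "callbacks": CustomCallbacks,  # RLlib instantiates this class on each worker
    "framework": "tf",
})
result = trainer.train()
# The metric set in on_episode_end surfaces as
# result["custom_metrics"]["make_span_mean"] (plus _min/_max variants).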

Binary file modified JSS/__pycache__/CustomCallbacks.cpython-38.pyc
Binary file not shown.
Binary file modified JSS/__pycache__/models.cpython-38.pyc
Binary file not shown.
220 changes: 220 additions & 0 deletions JSS/demo.py
@@ -0,0 +1,220 @@
# -*- coding: utf-8 -*-
"""
Created on Sun May 1 01:39:26 2022

@author: Philipp
"""

"""
Example of a custom gym environment and model. Run this for a demo.

This example shows:
- using a custom environment
- using a custom model
- using Tune for grid search to try different learning rates

You can visualize experiment results in ~/ray_results using TensorBoard.

Run example with defaults:
$ python demo.py
For CLI options:
$ python demo.py --help
"""
import argparse
import gym
from gym.spaces import Discrete, Box
import numpy as np
import os
import random

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.env.env_context import EnvContext
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.logger import pretty_print

from ray.tune.registry import register_env

tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()

parser = argparse.ArgumentParser()
parser.add_argument(
    "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use."
)
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.",
)
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.",
)
parser.add_argument(
    "--stop-iters", type=int, default=50, help="Number of iterations to train."
)
parser.add_argument(
    "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward", type=float, default=0.1, help="Reward at which we stop training."
)
parser.add_argument(
    "--no-tune",
    action="store_true",
    help="Run without Tune using a manual train loop instead. In this case,"
    "use PPO without grid search and no TensorBoard.",
)
parser.add_argument(
    "--local-mode",
    action="store_true",
    help="Init Ray in local mode for easier debugging.",
)


class SimpleCorridor(gym.Env):
"""Example of a custom env in which you have to walk down a corridor.

You can configure the length of the corridor via the env config."""

def __init__(self, config: EnvContext):
self.end_pos = config["corridor_length"]
self.cur_pos = 0
self.action_space = Discrete(2)
self.observation_space = Box(0.0, self.end_pos, shape=(1,), dtype=np.float32)
# Set the seed. This is only used for the final (reach goal) reward.
self.seed(config.worker_index * config.num_workers)

def reset(self):
self.cur_pos = 0
return [self.cur_pos]

def step(self, action):
assert action in [0, 1], action
if action == 0 and self.cur_pos > 0:
self.cur_pos -= 1
elif action == 1:
self.cur_pos += 1
done = self.cur_pos >= self.end_pos
# Produce a random reward when we reach the goal.
return [self.cur_pos], random.random() * 2 if done else -0.1, done, {}

def seed(self, seed=None):
random.seed(seed)


class CustomModel(TFModelV2):
"""Example of a keras custom model that just delegates to an fc-net."""

def __init__(self, obs_space, action_space, num_outputs, model_config, name):
super(CustomModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name
)
self.model = FullyConnectedNetwork(
obs_space, action_space, num_outputs, model_config, name
)

def forward(self, input_dict, state, seq_lens):
return self.model.forward(input_dict, state, seq_lens)

def value_function(self):
return self.model.value_function()


class TorchCustomModel(TorchModelV2, nn.Module):
"""Example of a PyTorch custom model that just delegates to a fc-net."""

def __init__(self, obs_space, action_space, num_outputs, model_config, name):
TorchModelV2.__init__(
self, obs_space, action_space, num_outputs, model_config, name
)
nn.Module.__init__(self)

self.torch_sub_model = TorchFC(
obs_space, action_space, num_outputs, model_config, name
)

def forward(self, input_dict, state, seq_lens):
input_dict["obs"] = input_dict["obs"].float()
fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
return fc_out, []

def value_function(self):
return torch.reshape(self.torch_sub_model.value_function(), [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    print(f"Running with following CLI options: {args}")

    ray.init(local_mode=args.local_mode)

    # Can also register the env creator function explicitly with:
    register_env("corridor", lambda config: SimpleCorridor(config))
    ModelCatalog.register_custom_model(
        "my_model", TorchCustomModel if args.framework == "torch" else CustomModel
    )

    config = {
        # "env": SimpleCorridor,  # or "corridor" if registered above
        "env": "corridor",
        "env_config": {
            "corridor_length": 5,
        },
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "model": {
            "custom_model": "my_model",
            "vf_share_layers": True,
        },
        "num_workers": 1,  # parallelism
        "framework": args.framework,
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    if args.no_tune:
        # manual training with train loop using PPO and fixed learning rate
        if args.run != "PPO":
            raise ValueError("Only support --run PPO with --no-tune.")
        print("Running manual train loop without Ray Tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        # use fixed learning rate instead of grid search (needs tune)
        ppo_config["lr"] = 1e-3
        trainer = ppo.PPOTrainer(config=ppo_config, env=SimpleCorridor)
        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if (
                result["timesteps_total"] >= args.stop_timesteps
                or result["episode_reward_mean"] >= args.stop_reward
            ):
                break
    else:
        # automated run with Tune and grid search and TensorBoard
        print("Training automatically with Ray Tune")
        results = tune.run(args.run, config=config, stop=stop)

        if args.as_test:
            print("Checking if learning goals were achieved")
            check_learning_achieved(results, args.stop_reward)

    ray.shutdown()
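
As a side note on the config flow this demo relies on, a minimal sketch (not part of the diff): RLlib hands "env_config" to the env constructor wrapped in an EnvContext, a dict subclass that also carries worker metadata such as worker_index and num_workers (used above for seeding). The EnvContext signature is assumed from Ray 1.12.

from ray.rllib.env.env_context import EnvContext

ctx = EnvContext({"corridor_length": 5}, worker_index=1, num_workers=1)
env = SimpleCorridor(ctx)              # the same call RLlib makes on a worker
obs = env.reset()                      # -> [0]
obs, reward, done, info = env.step(1)  # move one step toward the goal
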
56 changes: 33 additions & 23 deletions JSS/main.py
@@ -1,30 +1,32 @@
import time

import ray
import wandb

import random

import numpy as np
import multiprocessing as mp

import ray.tune.integration.wandb as wandb_tune
import numpy as np

from ray.rllib.agents.ppo import PPOTrainer
from typing import Dict, Tuple

from CustomCallbacks import *
from models import *
from models import FCMaskedActionsModelTF

from typing import Dict, Tuple
from JSSEnv.envs import JssEnv

import multiprocessing as mp
import ray
from ray.tune.utils import flatten_dict
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents import with_common_config
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

import wandb
import ray.tune.integration.wandb as wandb_tune

from ray.tune.utils import flatten_dict
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()

tf1, tf, tfv = try_import_tf()

_exclude_results = ["done", "should_checkpoint", "config"]

@@ -59,13 +61,18 @@ def _handle_result(result: Dict) -> Tuple[Dict, Dict]:


def train_func():

    # Alternative to the class name in the config: register the env and connect it to the callable constructor
    register_env("JSSEnv-v1", lambda config: JssEnv(config))

    default_config = {
        'env': 'JSSEnv:jss-v1',
        'env': "JSSEnv-v1",  # the name defined in register_env
        'env_config': {
            'instance_path': '/instances/ta41'},
        'seed': 0,
        'framework': 'tf',
        'log_level': 'WARN',
        'num_gpus': 1,
        'instance_path': 'instances/ta41',
        'num_gpus': 1,
        'evaluation_interval': None,
        'metrics_smoothing_episodes': 2000,
        'gamma': 1.0,
@@ -95,7 +102,7 @@ def train_func():
"use_critic": True,
"use_gae": True,
"shuffle_sequences": True,
"vf_share_layers": False,
# "vf_share_layers": False, # leads to deprecated warning and run issues
"observation_filter": "NoFilter",
"simple_optimizer": False,
"_fake_gpus": False,
@@ -110,16 +117,18 @@ def train_func():
    config = wandb.config

    ModelCatalog.register_custom_model("fc_masked_model_tf", FCMaskedActionsModelTF)

    config['model'] = {
        "fcnet_activation": "relu",
        "custom_model": "fc_masked_model_tf",
        'fcnet_hiddens': [config['layer_size'] for k in range(config['layer_nb'])],
        "vf_share_layers": False,
    }
    config['env_config'] = {
        'env_config': {'instance_path': config['instance_path']}
    }

    # Deactivated: the instance path is now passed as an env_config item in the master config
    # config['env_config'] = {
    #     'env_config': {'instance_path': config['instance_path']}
    # }

    config = with_common_config(config)
    config['seed'] = 0
@@ -132,7 +141,8 @@ def train_func():
    config['entropy_coeff'] = config['entropy_start']
    config['entropy_coeff_schedule'] = [[0, config['entropy_start']], [15000000, config['entropy_end']]]

    config.pop('instance_path', None)
    # Deactivated: the instance path is passed as an env_config item in the master config
    # config.pop('instance_path', None)
    config.pop('layer_size', None)
    config.pop('layer_nb', None)
    config.pop('lr_start', None)
@@ -144,15 +154,15 @@
"time_total_s": 10 * 60,
}


start_time = time.time()
trainer = PPOTrainer(config=config)
while start_time + stop['time_total_s'] > time.time():
result = trainer.train()
result = wandb_tune._clean_log(result)
log, config_update = _handle_result(result)
wandb.log(log)
# wandb.config.update(config_update, allow_val_change=True)
# trainer.export_policy_model("/home/jupyter/JSS/JSS/models/")
wandb.config.update(config_update, allow_val_change=True)

ray.shutdown()

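On the register_env change above, a minimal sketch (not part of the diff) of how the registered name and the 'env_config' entry reach the JssEnv constructor; it assumes JssEnv accepts a dict with an 'instance_path' key, as the config in this file suggests:

from ray.tune.registry import register_env
from JSSEnv.envs import JssEnv

register_env("JSSEnv-v1", lambda config: JssEnv(config))

# Roughly what happens on a worker with this PR's config
# (RLlib wraps the 'env_config' dict in an EnvContext before the call):
env = JssEnv({'instance_path': '/instances/ta41'})
obs = env.reset()
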
4 changes: 2 additions & 2 deletions JSS/models.py
@@ -23,7 +23,7 @@ def __init__(self,
        self.action_embed_model = FullyConnectedNetwork(
            obs_space=true_obs_space, action_space=action_space, num_outputs=action_space.n,
            model_config=model_config, name=name + "action_model")
        self.register_variables(self.action_embed_model.variables())
        # self.register_variables(self.action_embed_model.variables())

    def forward(self, input_dict, state, seq_lens):
        action_mask = input_dict["obs"]["action_mask"]
@@ -32,7 +32,7 @@ def forward(self, input_dict, state, seq_lens):
        raw_actions, _ = self.action_embed_model({
            "obs": input_dict["obs"]["real_obs"]
        })
        #inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        # inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        logits = tf.where(tf.math.equal(action_mask, 1), raw_actions, tf.float32.min)
        return logits, state

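For clarity, a small standalone sketch (not part of this PR) of the masking approach kept in forward(): illegal actions are pushed to tf.float32.min before the softmax, which has essentially the same effect as the commented-out log-mask line.

import tensorflow as tf

raw_actions = tf.constant([[1.2, 0.3, -0.5, 2.0]])
action_mask = tf.constant([[1.0, 0.0, 1.0, 0.0]])  # 1 = legal, 0 = illegal

logits = tf.where(tf.math.equal(action_mask, 1), raw_actions, tf.float32.min)
probs = tf.nn.softmax(logits)
# probs ≈ [[0.846, 0.0, 0.154, 0.0]] -- masked actions can no longer be sampled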