Env configuration update to ray 1.12 standard #30

Open · wants to merge 4 commits into base: master

385 changes: 385 additions & 0 deletions JSS/Backup_JSSEnv.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion JSS/CustomCallbacks.py
@@ -14,7 +14,8 @@ def __init__(self, legacy_callbacks_dict: Dict[str, callable] = None):
    def on_episode_end(self, worker: "RolloutWorker", base_env: BaseEnv,
                       policies: Dict[PolicyID, Policy],
                       episode: MultiAgentEpisode, **kwargs):
        env = base_env.get_unwrapped()[0]
        # env = base_env.get_unwrapped()[0] --> deprecated
        env = base_env.get_sub_environments()[0]
        if env.last_time_step != float('inf'):
            episode.custom_metrics['make_span'] = env.last_time_step
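
For context, a minimal sketch (not part of this diff) of how such a callback is typically attached to a trainer under Ray 1.12, so the custom metric shows up in training results; the class and env names are taken from this repo and are assumptions as far as this hunk is concerned:

from ray.rllib.agents.ppo import PPOTrainer
from CustomCallbacks import CustomCallbacks

trainer = PPOTrainer(config={
    "env": "JSSEnv-v1",            # assumes the env was registered via register_env
    "callbacks": CustomCallbacks,  # RLlib instantiates this class on each worker
    "framework": "tf",
})
result = trainer.train()
# The metric set in on_episode_end surfaces as
# result["custom_metrics"]["make_span_mean"] (plus _min/_max variants).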

Binary file modified JSS/__pycache__/CustomCallbacks.cpython-38.pyc
Binary file not shown.
Binary file modified JSS/__pycache__/models.cpython-38.pyc
Binary file not shown.
220 changes: 220 additions & 0 deletions JSS/demo.py
@@ -0,0 +1,220 @@
# -*- coding: utf-8 -*-
"""
Created on Sun May 1 01:39:26 2022

@author: Philipp
"""

"""
Example of a custom gym environment and model. Run this for a demo.

This example shows:
- using a custom environment
- using a custom model
- using Tune for grid search to try different learning rates

You can visualize experiment results in ~/ray_results using TensorBoard.

Run example with defaults:
$ python demo.py
For CLI options:
$ python demo.py --help
"""
import argparse
import gym
from gym.spaces import Discrete, Box
import numpy as np
import os
import random

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.env.env_context import EnvContext
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.logger import pretty_print

from ray.tune.registry import register_env

tf1, tf, tfv = try_import_tf()
torch, nn = try_import_torch()

parser = argparse.ArgumentParser()
parser.add_argument(
    "--run", type=str, default="PPO", help="The RLlib-registered algorithm to use."
)
parser.add_argument(
    "--framework",
    choices=["tf", "tf2", "tfe", "torch"],
    default="tf",
    help="The DL framework specifier.",
)
parser.add_argument(
    "--as-test",
    action="store_true",
    help="Whether this script should be run as a test: --stop-reward must "
    "be achieved within --stop-timesteps AND --stop-iters.",
)
parser.add_argument(
    "--stop-iters", type=int, default=50, help="Number of iterations to train."
)
parser.add_argument(
    "--stop-timesteps", type=int, default=100000, help="Number of timesteps to train."
)
parser.add_argument(
    "--stop-reward", type=float, default=0.1, help="Reward at which we stop training."
)
parser.add_argument(
    "--no-tune",
    action="store_true",
    help="Run without Tune using a manual train loop instead. In this case,"
    "use PPO without grid search and no TensorBoard.",
)
parser.add_argument(
    "--local-mode",
    action="store_true",
    help="Init Ray in local mode for easier debugging.",
)


class SimpleCorridor(gym.Env):
"""Example of a custom env in which you have to walk down a corridor.

You can configure the length of the corridor via the env config."""

def __init__(self, config: EnvContext):
self.end_pos = config["corridor_length"]
self.cur_pos = 0
self.action_space = Discrete(2)
self.observation_space = Box(0.0, self.end_pos, shape=(1,), dtype=np.float32)
# Set the seed. This is only used for the final (reach goal) reward.
self.seed(config.worker_index * config.num_workers)

def reset(self):
self.cur_pos = 0
return [self.cur_pos]

def step(self, action):
assert action in [0, 1], action
if action == 0 and self.cur_pos > 0:
self.cur_pos -= 1
elif action == 1:
self.cur_pos += 1
done = self.cur_pos >= self.end_pos
# Produce a random reward when we reach the goal.
return [self.cur_pos], random.random() * 2 if done else -0.1, done, {}

def seed(self, seed=None):
random.seed(seed)


class CustomModel(TFModelV2):
"""Example of a keras custom model that just delegates to an fc-net."""

def __init__(self, obs_space, action_space, num_outputs, model_config, name):
super(CustomModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name
)
self.model = FullyConnectedNetwork(
obs_space, action_space, num_outputs, model_config, name
)

def forward(self, input_dict, state, seq_lens):
return self.model.forward(input_dict, state, seq_lens)

def value_function(self):
return self.model.value_function()


class TorchCustomModel(TorchModelV2, nn.Module):
"""Example of a PyTorch custom model that just delegates to a fc-net."""

def __init__(self, obs_space, action_space, num_outputs, model_config, name):
TorchModelV2.__init__(
self, obs_space, action_space, num_outputs, model_config, name
)
nn.Module.__init__(self)

self.torch_sub_model = TorchFC(
obs_space, action_space, num_outputs, model_config, name
)

def forward(self, input_dict, state, seq_lens):
input_dict["obs"] = input_dict["obs"].float()
fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
return fc_out, []

def value_function(self):
return torch.reshape(self.torch_sub_model.value_function(), [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    print(f"Running with following CLI options: {args}")

    ray.init(local_mode=args.local_mode)

    # Can also register the env creator function explicitly with:
    register_env("corridor", lambda config: SimpleCorridor(config))
    ModelCatalog.register_custom_model(
        "my_model", TorchCustomModel if args.framework == "torch" else CustomModel
    )

    config = {
        # "env": SimpleCorridor,  # or "corridor" if registered above
        "env": "corridor",
        "env_config": {
            "corridor_length": 5,
        },
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "model": {
            "custom_model": "my_model",
            "vf_share_layers": True,
        },
        "num_workers": 1,  # parallelism
        "framework": args.framework,
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    if args.no_tune:
        # manual training with train loop using PPO and fixed learning rate
        if args.run != "PPO":
            raise ValueError("Only support --run PPO with --no-tune.")
        print("Running manual train loop without Ray Tune.")
        ppo_config = ppo.DEFAULT_CONFIG.copy()
        ppo_config.update(config)
        # use fixed learning rate instead of grid search (needs tune)
        ppo_config["lr"] = 1e-3
        trainer = ppo.PPOTrainer(config=ppo_config, env=SimpleCorridor)
        # run manual training loop and print results after each iteration
        for _ in range(args.stop_iters):
            result = trainer.train()
            print(pretty_print(result))
            # stop training if the target train steps or reward are reached
            if (
                result["timesteps_total"] >= args.stop_timesteps
                or result["episode_reward_mean"] >= args.stop_reward
            ):
                break
    else:
        # automated run with Tune and grid search and TensorBoard
        print("Training automatically with Ray Tune")
        results = tune.run(args.run, config=config, stop=stop)

        if args.as_test:
            print("Checking if learning goals were achieved")
            check_learning_achieved(results, args.stop_reward)

    ray.shutdown()
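
As a side note on the config flow this demo relies on, a minimal sketch (not part of the diff): RLlib hands "env_config" to the env constructor wrapped in an EnvContext, a dict subclass that also carries worker metadata such as worker_index and num_workers (used above for seeding). The EnvContext signature is assumed from Ray 1.12.

from ray.rllib.env.env_context import EnvContext

ctx = EnvContext({"corridor_length": 5}, worker_index=1, num_workers=1)
env = SimpleCorridor(ctx)              # the same call RLlib makes on a worker
obs = env.reset()                      # -> [0]
obs, reward, done, info = env.step(1)  # move one step toward the goal
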
56 changes: 33 additions & 23 deletions JSS/main.py
@@ -1,30 +1,32 @@
import time

import ray
import wandb

import random

import numpy as np
import multiprocessing as mp

import ray.tune.integration.wandb as wandb_tune
import numpy as np

from ray.rllib.agents.ppo import PPOTrainer
from typing import Dict, Tuple

from CustomCallbacks import *
from models import *
from models import FCMaskedActionsModelTF

from typing import Dict, Tuple
from JSSEnv.envs import JssEnv

import multiprocessing as mp
import ray
from ray.tune.utils import flatten_dict
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents import with_common_config
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

import wandb
import ray.tune.integration.wandb as wandb_tune

from ray.tune.utils import flatten_dict
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()

tf1, tf, tfv = try_import_tf()

_exclude_results = ["done", "should_checkpoint", "config"]

@@ -59,13 +61,18 @@ def _handle_result(result: Dict) -> Tuple[Dict, Dict]:


def train_func():

    # Alternative to the class name in the config: register the env and connect it to the callable constructor
    register_env("JSSEnv-v1", lambda config: JssEnv(config))

    default_config = {
        'env': 'JSSEnv:jss-v1',
        'env': "JSSEnv-v1",  # the name defined in register_env
        'env_config': {
            'instance_path': '/instances/ta41'},
        'seed': 0,
        'framework': 'tf',
        'log_level': 'WARN',
        'num_gpus': 1,
        'instance_path': 'instances/ta41',
        'num_gpus': 1,
        'evaluation_interval': None,
        'metrics_smoothing_episodes': 2000,
        'gamma': 1.0,
@@ -95,7 +102,7 @@ def train_func():
"use_critic": True,
"use_gae": True,
"shuffle_sequences": True,
"vf_share_layers": False,
# "vf_share_layers": False, # leads to deprecated warning and run issues
"observation_filter": "NoFilter",
"simple_optimizer": False,
"_fake_gpus": False,
@@ -110,16 +117,18 @@ def train_func():
    config = wandb.config

    ModelCatalog.register_custom_model("fc_masked_model_tf", FCMaskedActionsModelTF)

    config['model'] = {
        "fcnet_activation": "relu",
        "custom_model": "fc_masked_model_tf",
        'fcnet_hiddens': [config['layer_size'] for k in range(config['layer_nb'])],
        "vf_share_layers": False,
    }
    config['env_config'] = {
        'env_config': {'instance_path': config['instance_path']}
    }

    # Deactivated: the instance path is now passed as an env_config item in the master config
    # config['env_config'] = {
    #     'env_config': {'instance_path': config['instance_path']}
    # }

    config = with_common_config(config)
    config['seed'] = 0
@@ -132,7 +141,8 @@ def train_func():
    config['entropy_coeff'] = config['entropy_start']
    config['entropy_coeff_schedule'] = [[0, config['entropy_start']], [15000000, config['entropy_end']]]

    config.pop('instance_path', None)
    # Deactivated: the instance path is passed as an env_config item in the master config
    # config.pop('instance_path', None)
    config.pop('layer_size', None)
    config.pop('layer_nb', None)
    config.pop('lr_start', None)
@@ -144,15 +154,15 @@
"time_total_s": 10 * 60,
}


start_time = time.time()
trainer = PPOTrainer(config=config)
while start_time + stop['time_total_s'] > time.time():
result = trainer.train()
result = wandb_tune._clean_log(result)
log, config_update = _handle_result(result)
wandb.log(log)
# wandb.config.update(config_update, allow_val_change=True)
# trainer.export_policy_model("/home/jupyter/JSS/JSS/models/")
wandb.config.update(config_update, allow_val_change=True)

ray.shutdown()

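On the register_env change above, a minimal sketch (not part of the diff) of how the registered name and the 'env_config' entry reach the JssEnv constructor; it assumes JssEnv accepts a dict with an 'instance_path' key, as the config in this file suggests:

from ray.tune.registry import register_env
from JSSEnv.envs import JssEnv

register_env("JSSEnv-v1", lambda config: JssEnv(config))

# Roughly what happens on a worker with this PR's config
# (RLlib wraps the 'env_config' dict in an EnvContext before the call):
env = JssEnv({'instance_path': '/instances/ta41'})
obs = env.reset()
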
4 changes: 2 additions & 2 deletions JSS/models.py
@@ -23,7 +23,7 @@ def __init__(self,
        self.action_embed_model = FullyConnectedNetwork(
            obs_space=true_obs_space, action_space=action_space, num_outputs=action_space.n,
            model_config=model_config, name=name + "action_model")
        self.register_variables(self.action_embed_model.variables())
        # self.register_variables(self.action_embed_model.variables())

    def forward(self, input_dict, state, seq_lens):
        action_mask = input_dict["obs"]["action_mask"]
@@ -32,7 +32,7 @@ def forward(self, input_dict, state, seq_lens):
        raw_actions, _ = self.action_embed_model({
            "obs": input_dict["obs"]["real_obs"]
        })
        #inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        # inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        logits = tf.where(tf.math.equal(action_mask, 1), raw_actions, tf.float32.min)
        return logits, state

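For clarity, a small standalone sketch (not part of this PR) of the masking approach kept in forward(): illegal actions are pushed to tf.float32.min before the softmax, which has essentially the same effect as the commented-out log-mask line.

import tensorflow as tf

raw_actions = tf.constant([[1.2, 0.3, -0.5, 2.0]])
action_mask = tf.constant([[1.0, 0.0, 1.0, 0.0]])  # 1 = legal, 0 = illegal

logits = tf.where(tf.math.equal(action_mask, 1), raw_actions, tf.float32.min)
probs = tf.nn.softmax(logits)
# probs ≈ [[0.846, 0.0, 0.154, 0.0]] -- masked actions can no longer be sampled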