[RLlib] Create a set of performance benchmark tests to run nightly. (#19945)

* Create a core set of algorithm tests to run nightly.

* Run release tests under tf, tf2, and torch frameworks.

* Fix

* Add eager_tracing option for tf2 framework.

* Make sure core tests can run in parallel.

* CQL.

* Report progress while running nightly/weekly tests.

* Include SAC in nightly lineup.

* Revert changes to learning_tests

* Rebrand to performance test.

* Update build_pipeline.py with the new performance_tests name.

* Record stats.

* Bug fix: need to populate the experiments dict.

* Alphabetize yaml files.

* Allow specifying frameworks, and do not run tf2 by default.

* Remove some debugging code.

* Fix.

* Undo testing changes.

* Do not run CQL regression for now.

* LINT.

Co-authored-by: sven1977 <svenmika1977@gmail.com>
gjoliver and sven1977 authored Nov 8, 2021
1 parent b1f2476 commit d8a61f8
Showing 8 changed files with 257 additions and 38 deletions.
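The core mechanism of this commit is the new frameworks key in the test YAML files: an experiment listing frameworks: [ "tf", "tf2", "torch" ] gets expanded into one Tune experiment per framework. The sketch below is a simplified, illustrative rendering of the logic added to rllib/utils/test_utils.py later in this diff; the function name expand_experiments and the yaml_experiments argument are not part of the commit.

import copy

def expand_experiments(yaml_experiments):
    """Sketch: build one Tune experiment per requested framework."""
    experiments = {}
    for name, exp in yaml_experiments.items():
        # Default is tf + torch; tf2 is skipped by default because its
        # multi-GPU support is not complete yet.
        frameworks = exp.pop("frameworks", ["tf", "torch"])
        for framework in frameworks:
            exp_copy = copy.deepcopy(exp)
            exp_copy["config"]["framework"] = framework
            if framework == "tf2":
                # tf2 always runs with eager tracing enabled.
                exp_copy["config"]["eager_tracing"] = True
            # Keys become e.g. "sac-halfcheetahbulletenv-v0-torch".
            experiments[name + "-" + framework] = exp_copy
    return experiments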
1 change: 1 addition & 0 deletions release/.buildkite/build_pipeline.py
@@ -158,6 +158,7 @@ def __init__(self, name: str, retry: int = 0):
"~/ray/release/rllib_tests/rllib_tests.yaml": [
SmokeTest("learning_tests"),
SmokeTest("stress_tests"),
"performance_tests",
"multi_gpu_learning_tests",
"multi_gpu_with_lstm_learning_tests",
"multi_gpu_with_attention_learning_tests",
21 changes: 21 additions & 0 deletions release/rllib_tests/12gpus_192cpus.yaml
@@ -0,0 +1,21 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 12

head_node_type:
name: head_node
instance_type: m5.xlarge

worker_node_types:
- name: worker_node
instance_type: g3.4xlarge
min_workers: 12
max_workers: 12
use_spot: false

aws:
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 500
@@ -273,7 +273,7 @@ ddpg-hopperbulletenv-v0:
# time_total_s: 3600
# config:
# # DDPPO only supports PyTorch so far.
# framework: torch
# frameworks: [ "torch" ]
# # Worker config: 10 workers, each of which requires a GPU.
# num_workers: 16
# # Workers require GPUs, but share 1 GPU amongst 2 workers.
136 changes: 136 additions & 0 deletions release/rllib_tests/performance_tests/performance_tests.yaml
@@ -0,0 +1,136 @@
apex-breakoutnoframeskip-v4:
env: BreakoutNoFrameskip-v4
run: APEX
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 3600
config:
double_q: false
dueling: false
num_atoms: 1
noisy: false
n_step: 3
lr: .0001
adam_epsilon: .00015
hiddens: [512]
buffer_size: 1000000
exploration_config:
epsilon_timesteps: 200000
final_epsilon: 0.01
prioritized_replay_alpha: 0.5
final_prioritized_replay_beta: 1.0
prioritized_replay_beta_annealing_timesteps: 2000000
num_gpus: 1
num_workers: 8
num_envs_per_worker: 8
rollout_fragment_length: 20
train_batch_size: 512
target_network_update_freq: 50000
timesteps_per_iteration: 25000

appo-pongnoframeskip-v4:
env: PongNoFrameskip-v4
run: APPO
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 2000
config:
vtrace: True
use_kl_loss: False
rollout_fragment_length: 50
train_batch_size: 750
num_workers: 31
broadcast_interval: 1
max_sample_requests_in_flight_per_worker: 1
num_multi_gpu_tower_stacks: 1
num_envs_per_worker: 8
num_sgd_iter: 2
vf_loss_coeff: 1.0
clip_param: 0.3
num_gpus: 1
grad_clip: 10
model:
dim: 42

# Bring the CQL test back after we make sure it learns.
#cql-halfcheetahbulletenv-v0:
# env: HalfCheetahBulletEnv-v0
# run: CQL
# frameworks: [ "tf", "tf2", "torch" ]
# stop:
# time_total_s: 1800
# config:
# # Use input produced by expert SAC algo.
# input: ["~/halfcheetah_expert_sac.zip"]
# actions_in_input_normalized: true
#
# soft_horizon: False
# horizon: 1000
# Q_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# policy_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# tau: 0.005
# target_entropy: auto
# no_done_at_end: false
# n_step: 3
# rollout_fragment_length: 1
# prioritized_replay: false
# train_batch_size: 256
# target_network_update_freq: 0
# timesteps_per_iteration: 1000
# learning_starts: 256
# optimization:
# actor_learning_rate: 0.0001
# critic_learning_rate: 0.0003
# entropy_learning_rate: 0.0001
# num_workers: 0
# num_gpus: 1
# metrics_smoothing_episodes: 5
#
# # CQL Configs
# min_q_weight: 5.0
# bc_iters: 20000
# temperature: 1.0
# num_actions: 10
# lagrangian: False
#
# # Switch on online evaluation.
# evaluation_interval: 3
# evaluation_config:
# input: sampler

sac-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
run: SAC
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 3600
config:
horizon: 1000
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
tau: 0.005
target_entropy: auto
no_done_at_end: false
n_step: 3
rollout_fragment_length: 1
prioritized_replay: true
train_batch_size: 256
target_network_update_freq: 1
timesteps_per_iteration: 1000
learning_starts: 10000
optimization:
actor_learning_rate: 0.0003
critic_learning_rate: 0.0003
entropy_learning_rate: 0.0003
num_workers: 0
num_gpus: 1
metrics_smoothing_episodes: 5
29 changes: 29 additions & 0 deletions release/rllib_tests/performance_tests/run.py
@@ -0,0 +1,29 @@
"""Core Learning regression tests for RLlib (torch and tf).
Runs Atari/PyBullet benchmarks for the most popular algorithms.
"""

import json
import os
from pathlib import Path

from ray.rllib.utils.test_utils import run_learning_tests_from_yaml

if __name__ == "__main__":
# Get path of this very script to look for yaml files.
abs_yaml_path = Path(__file__).parent
print("abs_yaml_path={}".format(abs_yaml_path))

yaml_files = abs_yaml_path.rglob("*.yaml")
yaml_files = sorted(
map(lambda path: str(path.absolute()), yaml_files), reverse=True)

# Run all tests in the found yaml files.
results = run_learning_tests_from_yaml(yaml_files=yaml_files)

test_output_json = os.environ.get("TEST_OUTPUT_JSON",
"/tmp/rllib_learning_test_core.json")
with open(test_output_json, "wt") as f:
json.dump(results, f)

print("Ok.")
14 changes: 14 additions & 0 deletions release/rllib_tests/rllib_tests.yaml
@@ -79,3 +79,17 @@
use_connect: True
timeout: 3000
script: python connect_tests/run_connect_tests.py

# Nightly performance regression for popular algorithms.
# These algorithms run nightly for a pre-determined amount of time
# without any pass criteria.
# Performance metrics, such as reward achieved and throughput, are then
# collected and tracked over time.
- name: performance_tests
cluster:
app_config: app_config.yaml
compute_template: 12gpus_192cpus.yaml

run:
timeout: 7200
script: python performance_tests/run.py
@@ -15,7 +15,7 @@ atari-impala:
stop:
time_total_s: 3600
config:
framework: tf
frameworks: [ "tf" ]

Review comment from xwjiang2010 (Contributor), Nov 10, 2021:

Nightly is failing?
https://buildkite.com/ray-project/periodic-ci/builds/1574#9dfda9ad-dcbf-4ec7-b248-401c7fc477af

Exception: Unknown config parameter frameworks

num_gpus: 1
num_cpus_for_driver: 0
rollout_fragment_length: 50
90 changes: 54 additions & 36 deletions rllib/utils/test_utils.py
@@ -14,7 +14,7 @@
from ray.rllib.utils.framework import try_import_jax, try_import_tf, \
try_import_torch
from ray.rllib.utils.typing import PartialTrainerConfigDict
from ray.tune import run_experiments
from ray.tune import CLIReporter, run_experiments

jax, _ = try_import_jax()
tf1, tf, tfv = try_import_tf()
@@ -588,6 +588,8 @@ def run_learning_tests_from_yaml(
experiments = {}
# The results per experiment.
checks = {}
# Metrics per experiment.
stats = {}

start_time = time.monotonic()

@@ -600,14 +602,18 @@
for k, e in tf_experiments.items():
# If framework explicitly given, only test for that framework.
# Some algos do not have both versions available.
if "framework" in e["config"]:
frameworks = [e["config"]["framework"]]
if "frameworks" in e:
frameworks = e["frameworks"]
else:
# By default we don't run tf2, because tf2's multi-gpu support
# isn't complete yet.
frameworks = ["tf", "torch"]
e["config"]["framework"] = "tf"
# Pop frameworks key to not confuse Tune.
e.pop("frameworks", None)

e["stop"] = e["stop"] or {}
e["pass_criteria"] = e["pass_criteria"] or {}
e["stop"] = e["stop"] if "stop" in e else {}
e["pass_criteria"] = e[
"pass_criteria"] if "pass_criteria" in e else {}

# For smoke-tests, we just run for n min.
if smoke_test:
@@ -623,39 +629,30 @@
if min_reward is not None:
e["stop"]["episode_reward_mean"] = min_reward

keys = []
# Generate the torch copy of the experiment.
if len(frameworks) == 2:
e_torch = copy.deepcopy(e)
e_torch["config"]["framework"] = "torch"
keys.append(re.sub("^(\\w+)-", "\\1-tf-", k))
keys.append(re.sub("-tf-", "-torch-", keys[0]))
experiments[keys[0]] = e
experiments[keys[1]] = e_torch
# tf-only.
elif frameworks[0] == "tf":
keys.append(re.sub("^(\\w+)-", "\\1-tf-", k))
experiments[keys[0]] = e
# torch-only.
else:
keys.append(re.sub("^(\\w+)-", "\\1-torch-", k))
experiments[keys[0]] = e
# Generate `checks` dict for all experiments
# (tf, tf2 and/or torch).
for framework in frameworks:
k_ = k + "-" + framework
ec = copy.deepcopy(e)
ec["config"]["framework"] = framework
if framework == "tf2":
ec["config"]["eager_tracing"] = True

# Generate `checks` dict for all experiments (tf and/or torch).
for k_ in keys:
e = experiments[k_]
checks[k_] = {
"min_reward": e["pass_criteria"].get(
"episode_reward_mean"),
"min_throughput": e["pass_criteria"].get(
"min_reward": ec["pass_criteria"].get(
"episode_reward_mean", 0.0),
"min_throughput": ec["pass_criteria"].get(
"timesteps_total", 0.0) /
(e["stop"].get("time_total_s", 1.0) or 1.0),
"time_total_s": e["stop"].get("time_total_s"),
(ec["stop"].get("time_total_s", 1.0) or 1.0),
"time_total_s": ec["stop"].get("time_total_s"),
"failures": 0,
"passed": False,
}
# This key would break tune.
e.pop("pass_criteria", None)
ec.pop("pass_criteria", None)

# One experiment to run.
experiments[k_] = ec

# Print out the actual config.
print("== Test config ==")
@@ -679,7 +676,22 @@
print(f"Starting learning test iteration {i}...")

# Run remaining experiments.
trials = run_experiments(experiments_to_run, resume=False, verbose=2)
trials = run_experiments(
experiments_to_run,
resume=False,
verbose=2,
progress_reporter=CLIReporter(
metric_columns={
"training_iteration": "iter",
"time_total_s": "time_total_s",
"timesteps_total": "ts",
"episodes_this_iter": "train_episodes",
"episode_reward_mean": "reward_mean",
},
sort_by_metric=True,
max_report_frequency=30,
))

all_trials.extend(trials)

# Check each experiment for whether it passed.
@@ -735,10 +747,15 @@
for t in trials_for_experiment
])

# TODO(jungong) : track trainer and env throughput separately.
throughput = timesteps_total / (total_time_s or 1.0)
desired_throughput = None
# TODO(Jun): Stop checking throughput for now.
# desired_throughput = checks[experiment]["min_throughput"]
desired_throughput = checks[experiment]["min_throughput"]

# Record performance.
stats[experiment] = {
"episode_reward_mean": episode_reward_mean,
"throughput": throughput,
}

print(f" ... Desired reward={desired_reward}; "
f"desired throughput={desired_throughput}")
@@ -767,6 +784,7 @@
"time_taken": time_taken,
"trial_states": dict(Counter([trial.status for trial in all_trials])),
"last_update": time.time(),
"stats": stats,
"passed": [k for k, exp in checks.items() if exp["passed"]],
"failures": {
k: exp["failures"]
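To make the throughput check in the new checks dict concrete: min_throughput is derived from an experiment's pass criteria and its time budget. A minimal, self-contained sketch with hypothetical numbers (a 2,000,000-timestep pass criterion over a 3,600 s run; these values are for illustration only, not from any test in this commit):

# Hypothetical pass criteria and stop condition, for illustration only.
pass_criteria = {"episode_reward_mean": 20.0, "timesteps_total": 2000000}
stop = {"time_total_s": 3600}

# Mirrors the checks[...] construction above: the trial must sustain
# roughly 555.6 sampled timesteps per second on average to pass.
min_throughput = pass_criteria.get("timesteps_total", 0.0) / (
    stop.get("time_total_s", 1.0) or 1.0)
print(min_throughput)  # -> 555.55...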
