[RLlib] Create a set of performance benchmark tests to run nightly. (#19945)

* Create a core set of algorithm tests to run nightly.

* Run release tests under tf, tf2, and torch frameworks.

* Fix

* Add eager_tracing option for tf2 framework.

* Make sure core tests can run in parallel.

* CQL.

* Report progress while running nightly/weekly tests.

* Include SAC in nightly lineup.

* Revert changes to learning_tests

* Rebrand to performance test.

* Update build_pipeline.py with the new performance_tests name.

* Record stats.

* Bug fix: need to populate the experiments dict.

* Alphabetize yaml files.

* Allow specifying frameworks, and do not run tf2 by default.

* Remove some debugging code.

* Fix.

* Undo testing changes.

* Do not run CQL regression for now.

* LINT.

Co-authored-by: sven1977 <svenmika1977@gmail.com>
gjoliver and sven1977 authored Nov 8, 2021
1 parent b1f2476 commit d8a61f8
Showing 8 changed files with 257 additions and 38 deletions.
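The core mechanism of this commit is the new frameworks key in the test YAML files: an experiment listing frameworks: [ "tf", "tf2", "torch" ] gets expanded into one Tune experiment per framework. The sketch below is a simplified, illustrative rendering of the logic added to rllib/utils/test_utils.py later in this diff; the function name expand_experiments and the yaml_experiments argument are not part of the commit.

import copy

def expand_experiments(yaml_experiments):
    """Sketch: build one Tune experiment per requested framework."""
    experiments = {}
    for name, exp in yaml_experiments.items():
        # Default is tf + torch; tf2 is skipped by default because its
        # multi-GPU support is not complete yet.
        frameworks = exp.pop("frameworks", ["tf", "torch"])
        for framework in frameworks:
            exp_copy = copy.deepcopy(exp)
            exp_copy["config"]["framework"] = framework
            if framework == "tf2":
                # tf2 always runs with eager tracing enabled.
                exp_copy["config"]["eager_tracing"] = True
            # Keys become e.g. "sac-halfcheetahbulletenv-v0-torch".
            experiments[name + "-" + framework] = exp_copy
    return experiments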
1 change: 1 addition & 0 deletions release/.buildkite/build_pipeline.py
@@ -158,6 +158,7 @@ def __init__(self, name: str, retry: int = 0):
"~/ray/release/rllib_tests/rllib_tests.yaml": [
SmokeTest("learning_tests"),
SmokeTest("stress_tests"),
"performance_tests",
"multi_gpu_learning_tests",
"multi_gpu_with_lstm_learning_tests",
"multi_gpu_with_attention_learning_tests",
21 changes: 21 additions & 0 deletions release/rllib_tests/12gpus_192cpus.yaml
@@ -0,0 +1,21 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 12

head_node_type:
name: head_node
instance_type: m5.xlarge

worker_node_types:
- name: worker_node
instance_type: g3.4xlarge
min_workers: 12
max_workers: 12
use_spot: false

aws:
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 500
@@ -273,7 +273,7 @@ ddpg-hopperbulletenv-v0:
# time_total_s: 3600
# config:
# # DDPPO only supports PyTorch so far.
# framework: torch
# frameworks: [ "torch" ]
# # Worker config: 10 workers, each of which requires a GPU.
# num_workers: 16
# # Workers require GPUs, but share 1 GPU amongst 2 workers.
136 changes: 136 additions & 0 deletions release/rllib_tests/performance_tests/performance_tests.yaml
@@ -0,0 +1,136 @@
apex-breakoutnoframeskip-v4:
env: BreakoutNoFrameskip-v4
run: APEX
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 3600
config:
double_q: false
dueling: false
num_atoms: 1
noisy: false
n_step: 3
lr: .0001
adam_epsilon: .00015
hiddens: [512]
buffer_size: 1000000
exploration_config:
epsilon_timesteps: 200000
final_epsilon: 0.01
prioritized_replay_alpha: 0.5
final_prioritized_replay_beta: 1.0
prioritized_replay_beta_annealing_timesteps: 2000000
num_gpus: 1
num_workers: 8
num_envs_per_worker: 8
rollout_fragment_length: 20
train_batch_size: 512
target_network_update_freq: 50000
timesteps_per_iteration: 25000

appo-pongnoframeskip-v4:
env: PongNoFrameskip-v4
run: APPO
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 2000
config:
vtrace: True
use_kl_loss: False
rollout_fragment_length: 50
train_batch_size: 750
num_workers: 31
broadcast_interval: 1
max_sample_requests_in_flight_per_worker: 1
num_multi_gpu_tower_stacks: 1
num_envs_per_worker: 8
num_sgd_iter: 2
vf_loss_coeff: 1.0
clip_param: 0.3
num_gpus: 1
grad_clip: 10
model:
dim: 42

# Bring the CQL test back after we make sure it learns.
#cql-halfcheetahbulletenv-v0:
# env: HalfCheetahBulletEnv-v0
# run: CQL
# frameworks: [ "tf", "tf2", "torch" ]
# stop:
# time_total_s: 1800
# config:
# # Use input produced by expert SAC algo.
# input: ["~/halfcheetah_expert_sac.zip"]
# actions_in_input_normalized: true
#
# soft_horizon: False
# horizon: 1000
# Q_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# policy_model:
# fcnet_activation: relu
# fcnet_hiddens: [256, 256, 256]
# tau: 0.005
# target_entropy: auto
# no_done_at_end: false
# n_step: 3
# rollout_fragment_length: 1
# prioritized_replay: false
# train_batch_size: 256
# target_network_update_freq: 0
# timesteps_per_iteration: 1000
# learning_starts: 256
# optimization:
# actor_learning_rate: 0.0001
# critic_learning_rate: 0.0003
# entropy_learning_rate: 0.0001
# num_workers: 0
# num_gpus: 1
# metrics_smoothing_episodes: 5
#
# # CQL Configs
# min_q_weight: 5.0
# bc_iters: 20000
# temperature: 1.0
# num_actions: 10
# lagrangian: False
#
# # Switch on online evaluation.
# evaluation_interval: 3
# evaluation_config:
# input: sampler

sac-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
run: SAC
frameworks: [ "tf", "tf2", "torch" ]
stop:
time_total_s: 3600
config:
horizon: 1000
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
tau: 0.005
target_entropy: auto
no_done_at_end: false
n_step: 3
rollout_fragment_length: 1
prioritized_replay: true
train_batch_size: 256
target_network_update_freq: 1
timesteps_per_iteration: 1000
learning_starts: 10000
optimization:
actor_learning_rate: 0.0003
critic_learning_rate: 0.0003
entropy_learning_rate: 0.0003
num_workers: 0
num_gpus: 1
metrics_smoothing_episodes: 5
29 changes: 29 additions & 0 deletions release/rllib_tests/performance_tests/run.py
@@ -0,0 +1,29 @@
"""Core Learning regression tests for RLlib (torch and tf).
Runs Atari/PyBullet benchmarks for the most popular algorithms.
"""

import json
import os
from pathlib import Path

from ray.rllib.utils.test_utils import run_learning_tests_from_yaml

if __name__ == "__main__":
# Get path of this very script to look for yaml files.
abs_yaml_path = Path(__file__).parent
print("abs_yaml_path={}".format(abs_yaml_path))

yaml_files = abs_yaml_path.rglob("*.yaml")
yaml_files = sorted(
map(lambda path: str(path.absolute()), yaml_files), reverse=True)

# Run all tests in the found yaml files.
results = run_learning_tests_from_yaml(yaml_files=yaml_files)

test_output_json = os.environ.get("TEST_OUTPUT_JSON",
"/tmp/rllib_learning_test_core.json")
with open(test_output_json, "wt") as f:
json.dump(results, f)

print("Ok.")
14 changes: 14 additions & 0 deletions release/rllib_tests/rllib_tests.yaml
@@ -79,3 +79,17 @@
use_connect: True
timeout: 3000
script: python connect_tests/run_connect_tests.py

# Nightly performance regression for popular algorithms.
# These algorithms run nightly for a pre-determined amount of time
# without any pass criteria.
# Performance metrics, such as reward achieved and throughput, are then
# collected and tracked over time.
- name: performance_tests
cluster:
app_config: app_config.yaml
compute_template: 12gpus_192cpus.yaml

run:
timeout: 7200
script: python performance_tests/run.py
@@ -15,7 +15,7 @@ atari-impala:
stop:
time_total_s: 3600
config:
framework: tf
frameworks: [ "tf" ]

Review comment from xwjiang2010 (Contributor), Nov 10, 2021:

Nightly is failing?
https://buildkite.com/ray-project/periodic-ci/builds/1574#9dfda9ad-dcbf-4ec7-b248-401c7fc477af

Exception: Unknown config parameter frameworks

num_gpus: 1
num_cpus_for_driver: 0
rollout_fragment_length: 50
90 changes: 54 additions & 36 deletions rllib/utils/test_utils.py
@@ -14,7 +14,7 @@
from ray.rllib.utils.framework import try_import_jax, try_import_tf, \
try_import_torch
from ray.rllib.utils.typing import PartialTrainerConfigDict
from ray.tune import run_experiments
from ray.tune import CLIReporter, run_experiments

jax, _ = try_import_jax()
tf1, tf, tfv = try_import_tf()
@@ -588,6 +588,8 @@ def run_learning_tests_from_yaml(
experiments = {}
# The results per experiment.
checks = {}
# Metrics per experiment.
stats = {}

start_time = time.monotonic()

@@ -600,14 +602,18 @@
for k, e in tf_experiments.items():
# If framework explicitly given, only test for that framework.
# Some algos do not have both versions available.
if "framework" in e["config"]:
frameworks = [e["config"]["framework"]]
if "frameworks" in e:
frameworks = e["frameworks"]
else:
# By default we don't run tf2, because tf2's multi-gpu support
# isn't complete yet.
frameworks = ["tf", "torch"]
e["config"]["framework"] = "tf"
# Pop frameworks key to not confuse Tune.
e.pop("frameworks", None)

e["stop"] = e["stop"] or {}
e["pass_criteria"] = e["pass_criteria"] or {}
e["stop"] = e["stop"] if "stop" in e else {}
e["pass_criteria"] = e[
"pass_criteria"] if "pass_criteria" in e else {}

# For smoke-tests, we just run for n min.
if smoke_test:
@@ -623,39 +629,30 @@
if min_reward is not None:
e["stop"]["episode_reward_mean"] = min_reward

keys = []
# Generate the torch copy of the experiment.
if len(frameworks) == 2:
e_torch = copy.deepcopy(e)
e_torch["config"]["framework"] = "torch"
keys.append(re.sub("^(\\w+)-", "\\1-tf-", k))
keys.append(re.sub("-tf-", "-torch-", keys[0]))
experiments[keys[0]] = e
experiments[keys[1]] = e_torch
# tf-only.
elif frameworks[0] == "tf":
keys.append(re.sub("^(\\w+)-", "\\1-tf-", k))
experiments[keys[0]] = e
# torch-only.
else:
keys.append(re.sub("^(\\w+)-", "\\1-torch-", k))
experiments[keys[0]] = e
# Generate `checks` dict for all experiments
# (tf, tf2 and/or torch).
for framework in frameworks:
k_ = k + "-" + framework
ec = copy.deepcopy(e)
ec["config"]["framework"] = framework
if framework == "tf2":
ec["config"]["eager_tracing"] = True

# Generate `checks` dict for all experiments (tf and/or torch).
for k_ in keys:
e = experiments[k_]
checks[k_] = {
"min_reward": e["pass_criteria"].get(
"episode_reward_mean"),
"min_throughput": e["pass_criteria"].get(
"min_reward": ec["pass_criteria"].get(
"episode_reward_mean", 0.0),
"min_throughput": ec["pass_criteria"].get(
"timesteps_total", 0.0) /
(e["stop"].get("time_total_s", 1.0) or 1.0),
"time_total_s": e["stop"].get("time_total_s"),
(ec["stop"].get("time_total_s", 1.0) or 1.0),
"time_total_s": ec["stop"].get("time_total_s"),
"failures": 0,
"passed": False,
}
# This key would break tune.
e.pop("pass_criteria", None)
ec.pop("pass_criteria", None)

# One experiment to run.
experiments[k_] = ec

# Print out the actual config.
print("== Test config ==")
@@ -679,7 +676,22 @@
print(f"Starting learning test iteration {i}...")

# Run remaining experiments.
trials = run_experiments(experiments_to_run, resume=False, verbose=2)
trials = run_experiments(
experiments_to_run,
resume=False,
verbose=2,
progress_reporter=CLIReporter(
metric_columns={
"training_iteration": "iter",
"time_total_s": "time_total_s",
"timesteps_total": "ts",
"episodes_this_iter": "train_episodes",
"episode_reward_mean": "reward_mean",
},
sort_by_metric=True,
max_report_frequency=30,
))

all_trials.extend(trials)

# Check each experiment for whether it passed.
@@ -735,10 +747,15 @@
for t in trials_for_experiment
])

# TODO(jungong) : track trainer and env throughput separately.
throughput = timesteps_total / (total_time_s or 1.0)
desired_throughput = None
# TODO(Jun): Stop checking throughput for now.
# desired_throughput = checks[experiment]["min_throughput"]
desired_throughput = checks[experiment]["min_throughput"]

# Record performance.
stats[experiment] = {
"episode_reward_mean": episode_reward_mean,
"throughput": throughput,
}

print(f" ... Desired reward={desired_reward}; "
f"desired throughput={desired_throughput}")
@@ -767,6 +784,7 @@
"time_taken": time_taken,
"trial_states": dict(Counter([trial.status for trial in all_trials])),
"last_update": time.time(),
"stats": stats,
"passed": [k for k, exp in checks.items() if exp["passed"]],
"failures": {
k: exp["failures"]
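To make the throughput check in the new checks dict concrete: min_throughput is derived from an experiment's pass criteria and its time budget. A minimal, self-contained sketch with hypothetical numbers (a 2,000,000-timestep pass criterion over a 3,600 s run; these values are for illustration only, not from any test in this commit):

# Hypothetical pass criteria and stop condition, for illustration only.
pass_criteria = {"episode_reward_mean": 20.0, "timesteps_total": 2000000}
stop = {"time_total_s": 3600}

# Mirrors the checks[...] construction above: the trial must sustain
# roughly 555.6 sampled timesteps per second on average to pass.
min_throughput = pass_criteria.get("timesteps_total", 0.0) / (
    stop.get("time_total_s", 1.0) or 1.0)
print(min_throughput)  # -> 555.55...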
