1 change: 1 addition & 0 deletions .github/actionlint.yaml
@@ -9,3 +9,4 @@ self-hosted-runner:
- windows.4xlarge
- windows.8xlarge.nvidia.gpu
- bm-runner
- linux.rocm.gpu
14 changes: 9 additions & 5 deletions .github/generated-ciflow-ruleset.json

Some generated files are not rendered by default.

185 changes: 134 additions & 51 deletions .github/scripts/generate_ci_workflows.py
@@ -2,17 +2,16 @@

from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Dict, Set, List, Iterable
from typing import Dict, Set, List, Iterable, Any

import jinja2
import json
import os
import sys
from typing_extensions import Literal
from typing_extensions import Literal, TypedDict

import generate_binary_build_matrix # type: ignore[import]

YamlShellBool = Literal["''", 1]
Arch = Literal["windows", "linux", "macos"]

DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
@@ -142,6 +141,11 @@ def generate_json(self) -> None:
outfile.write('\n')


class Config(TypedDict):
num_shards: int
runner: str


@dataclass
class CIWorkflow:
# Required fields
@@ -162,50 +166,38 @@ class CIWorkflow:
is_scheduled: str = ''
is_default: bool = False
num_test_shards: int = 1
only_run_smoke_tests_on_pull_request: bool = False
num_test_shards_on_pull_request: int = -1
distributed_test: bool = True
timeout_after: int = 240
xcode_version: str = ''
only_on_pr: bool = False
ios_arch: str = ''
ios_platform: str = ''
test_jobs: Any = field(default_factory=list)

# The following variables will be set as environment variables,
# so it's easier for both shell and Python scripts to consume it if false is represented as the empty string.
enable_jit_legacy_test: YamlShellBool = "''"
enable_distributed_test: YamlShellBool = "''"
enable_multigpu_test: YamlShellBool = "''"
enable_nogpu_no_avx_test: YamlShellBool = "''"
enable_nogpu_no_avx2_test: YamlShellBool = "''"
enable_slow_test: YamlShellBool = "''"
enable_docs_test: YamlShellBool = "''"
enable_backwards_compat_test: YamlShellBool = "''"
enable_xla_test: YamlShellBool = "''"
enable_noarch_test: YamlShellBool = "''"
enable_force_on_cpu_test: YamlShellBool = "''"
enable_default_test: bool = True
Contributor: We no longer need explicit type annotation here, do we?

Member (Author): I have no idea lol, the rules for .github are different and stricter than other folders, I can try to remove.
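On the annotation question: @dataclass only treats annotated class attributes as fields, so the explicit annotations on these flags are still required even when the type is obvious from the default. A minimal standalone sketch (not part of this PR) illustrating the difference:

```python
from dataclasses import dataclass, fields

@dataclass
class Example:
    annotated: bool = True   # recognized as a dataclass field
    unannotated = True       # plain class attribute, ignored by @dataclass

print([f.name for f in fields(Example)])  # ['annotated']
print(Example())                          # Example(annotated=True)
```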

enable_smoke_test: bool = True
enable_jit_legacy_test: bool = False
enable_distributed_test: bool = True
enable_multigpu_test: bool = False
enable_nogpu_no_avx_test: bool = False
enable_nogpu_no_avx2_test: bool = False
enable_slow_test: bool = False
enable_docs_test: bool = False
enable_backwards_compat_test: bool = False
enable_xla_test: bool = False
enable_noarch_test: bool = False
enable_force_on_cpu_test: bool = False

def __post_init__(self) -> None:
if not self.build_generates_artifacts:
self.exclude_test = True

if self.distributed_test:
self.enable_distributed_test = 1

self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu")
self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu")

if LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels:
self.is_default = True

# If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are
# only running smoke tests on the pull request.
if self.num_test_shards_on_pull_request == -1:
# Don't run the default if we are only running smoke tests
if self.only_run_smoke_tests_on_pull_request:
self.num_test_shards_on_pull_request = 0
else:
self.num_test_shards_on_pull_request = self.num_test_shards
self.test_jobs = self._gen_test_jobs()
self.assert_valid()

def assert_valid(self) -> None:
@@ -254,6 +246,83 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file.write("\n")
print(output_file_path)

def normalized_build_environment(self, suffix: str) -> str:
return self.build_environment.replace(".", "_") + suffix

def _gen_test_jobs(self) -> Any:
if self.arch == "linux":
MULTIGPU_RUNNER_TYPE = "linux.16xlarge.nvidia.gpu"
DISTRIBUTED_GPU_RUNNER_TYPE = "linux.8xlarge.nvidia.gpu"
NOGPU_RUNNER_TYPE = "linux.2xlarge"
elif self.arch == "windows":
DISTRIBUTED_GPU_RUNNER_TYPE = self.test_runner_type
NOGPU_RUNNER_TYPE = "windows.4xlarge"

test_jobs = []

configs: Dict[str, Config] = {}
if self.enable_jit_legacy_test:
configs["jit_legacy"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_multigpu_test:
configs["multigpu"] = {"num_shards": 1, "runner": MULTIGPU_RUNNER_TYPE}

if self.enable_nogpu_no_avx_test:
configs["nogpu_NO_AVX"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE}
Comment on lines +269 to +270

Contributor: Unrelated - isn't NO_AVX dead (as we only have AVX2 and AVX512 now?)

Member (Author): We still run it on linux-bionic-cuda10.2-py3.9-gcc7 at least, so not sure.

if self.enable_nogpu_no_avx2_test:
configs["nogpu_NO_AVX2"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE}
if self.enable_force_on_cpu_test:
configs["force_on_cpu"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE}
if self.enable_distributed_test:
configs["distributed"] = {
"num_shards": 1,
"runner": DISTRIBUTED_GPU_RUNNER_TYPE
if "cuda" in str(self.build_environment)
else self.test_runner_type,
}
if self.enable_slow_test:
configs["slow"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_docs_test:
configs["docs_test"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_backwards_compat_test:
configs["backwards_compat"] = {
"num_shards": 1,
"runner": self.test_runner_type,
}
if self.enable_xla_test:
configs["xla"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_noarch_test:
configs["noarch"] = {"num_shards": 1, "runner": self.test_runner_type}

if self.enable_smoke_test:
configs["smoke_tests"] = {"num_shards": 1, "runner": self.test_runner_type}

for name, config in configs.items():
for shard in range(1, config["num_shards"] + 1):
test_jobs.append(
{
"id": f"test_{name}_{shard}_{config['num_shards']}",
"name": f"test ({name}, {shard}, {config['num_shards']}, {config['runner']})",
"config": name,
"shard": shard,
"num_shards": config["num_shards"],
"runner": config["runner"],
}
)

if self.enable_default_test:
for shard in range(1, self.num_test_shards + 1):
test_jobs.append(
{
"id": f"test_default_{shard}_{config['num_shards']}",
"name": f"test (default, {shard}, {self.num_test_shards}, {self.test_runner_type})",
Member: It might also be beneficial to simplify the names here so they don't include the test_runner_type? I find that information might not be very useful to a majority of people, and we can derive it from the logs.

Member (Author): Yeah, at the moment I am just trying to replicate the same job names, to avoid churning metrics and the HUD. We can definitely change it if we want, though.

Contributor: That's actually a long-term improvement I wanted to have in HUD - when it can combine history over renames (for example, today we have old XLA job names and new XLA job names and there is no continuation).

Member (Author): Yeah, one thing that might be interesting is that we are allowed to control the display name separately from the ID. So we can try to come up with some stable scheme for the ID and use that to identify things in HUD rather than the display name.

(See the sketch after _gen_test_jobs below for the id/name split.)

"config": "default",
"shard": shard,
"num_shards": self.num_test_shards,
"runner": self.test_runner_type,
}
)
return test_jobs
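To make the id/name discussion from the review thread concrete, here is a rough sketch of the entries `_gen_test_jobs()` builds and of how a template could key each job on the stable `id` while using the human-readable `name` only for display. The runner names, shard counts, and template fragment below are illustrative assumptions, not taken from the actual workflow templates:

```python
import jinja2

# Entries in the shape _gen_test_jobs() returns; values here are examples only.
example_test_jobs = [
    {"id": "test_distributed_1_1",
     "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
     "config": "distributed", "shard": 1, "num_shards": 1,
     "runner": "linux.8xlarge.nvidia.gpu"},
    {"id": "test_default_1_2",
     "name": "test (default, 1, 2, linux.2xlarge)",
     "config": "default", "shard": 1, "num_shards": 2,
     "runner": "linux.2xlarge"},
]

# Hypothetical template fragment: the stable id becomes the job key (useful for
# HUD/metrics continuity), while the display name can change without churn.
template = jinja2.Template("""\
{%- for job in test_jobs %}
  {{ job.id }}:
    name: {{ job.name }}
    runs-on: {{ job.runner }}
{%- endfor %}
""")
print(template.render(test_jobs=example_test_jobs))
```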

@dataclass
class DockerWorkflow:
build_environment: str
@@ -327,17 +396,30 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN}
),
),
CIWorkflow(
arch="windows",
build_environment="win-vs2019-cuda11.3-py3-smoke",
cuda_version="11.3",
test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
enable_default_test=False,
enable_smoke_test=True,
enable_force_on_cpu_test=True,
only_on_pr=True,
ciflow_config=CIFlowConfig(
run_on_canary=True,
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
),
),
CIWorkflow(
arch="windows",
build_environment="win-vs2019-cuda11.3-py3",
cuda_version="11.3",
test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
num_test_shards=2,
only_run_smoke_tests_on_pull_request=True,
enable_force_on_cpu_test=1,
enable_force_on_cpu_test=True,
ciflow_config=CIFlowConfig(
run_on_canary=True,
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
),
),
CIWorkflow(
@@ -346,7 +428,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
cuda_version="11.5",
test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
num_test_shards=2,
enable_force_on_cpu_test=1,
enable_force_on_cpu_test=True,
is_scheduled="45 4,10,16,22 * * *",
ciflow_config=CIFlowConfig(
run_on_canary=True,
@@ -372,9 +454,9 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
build_environment="linux-xenial-py3.7-gcc5.4",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4",
test_runner_type=LINUX_CPU_TEST_RUNNER,
enable_jit_legacy_test=1,
enable_backwards_compat_test=1,
enable_docs_test=1,
enable_jit_legacy_test=True,
enable_backwards_compat_test=True,
enable_docs_test=True,
num_test_shards=2,
ciflow_config=CIFlowConfig(
run_on_canary=True,
@@ -475,7 +557,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-asan",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=3,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_SANITIZERS, LABEL_CIFLOW_CPU},
),
@@ -486,7 +568,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ONNX, LABEL_CIFLOW_CPU},
),
@@ -496,11 +578,11 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
build_environment="linux-bionic-cuda10.2-py3.9-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
test_runner_type=LINUX_CUDA_TEST_RUNNER,
enable_jit_legacy_test=1,
enable_multigpu_test=1,
enable_nogpu_no_avx_test=1,
enable_nogpu_no_avx2_test=1,
enable_slow_test=1,
enable_jit_legacy_test=True,
enable_multigpu_test=True,
enable_nogpu_no_avx_test=True,
enable_nogpu_no_avx2_test=True,
enable_slow_test=True,
num_test_shards=2,
ciflow_config=CIFlowConfig(
run_on_canary=True,
@@ -623,8 +705,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_noarch_test=1,
enable_distributed_test=False,
enable_noarch_test=True,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_NOARCH},
),
@@ -635,7 +717,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=1,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_VULKAN},
),
@@ -646,7 +728,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
test_runner_type=LINUX_CUDA_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_distributed_test=False,
timeout_after=360,
# Only run this on master 4 times per day since it does take a while
is_scheduled="0 */4 * * *",
@@ -663,8 +745,9 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/xla_base",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_xla_test=1,
enable_distributed_test=False,
enable_xla_test=True,
enable_default_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA},
),
@@ -801,7 +884,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
xcode_version="12.4",
test_runner_type=MACOS_TEST_RUNNER_11,
num_test_shards=2,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_MACOS},
),