1 change: 1 addition & 0 deletions .github/actionlint.yaml
@@ -9,3 +9,4 @@ self-hosted-runner:
- windows.4xlarge
- windows.8xlarge.nvidia.gpu
- bm-runner
- linux.rocm.gpu
14 changes: 9 additions & 5 deletions .github/generated-ciflow-ruleset.json

Some generated files are not rendered by default.

185 changes: 134 additions & 51 deletions .github/scripts/generate_ci_workflows.py
@@ -2,17 +2,16 @@

from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Dict, Set, List, Iterable
from typing import Dict, Set, List, Iterable, Any

import jinja2
import json
import os
import sys
from typing_extensions import Literal
from typing_extensions import Literal, TypedDict

import generate_binary_build_matrix # type: ignore[import]

YamlShellBool = Literal["''", 1]
Arch = Literal["windows", "linux", "macos"]

DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
@@ -142,6 +141,11 @@ def generate_json(self) -> None:
outfile.write('\n')


class Config(TypedDict):
num_shards: int
runner: str


@dataclass
class CIWorkflow:
# Required fields
@@ -162,50 +166,38 @@ class CIWorkflow:
is_scheduled: str = ''
is_default: bool = False
num_test_shards: int = 1
only_run_smoke_tests_on_pull_request: bool = False
num_test_shards_on_pull_request: int = -1
distributed_test: bool = True
timeout_after: int = 240
xcode_version: str = ''
only_on_pr: bool = False
ios_arch: str = ''
ios_platform: str = ''
test_jobs: Any = field(default_factory=list)

# The following variables will be set as environment variables,
# so it's easier for both shell and Python scripts to consume it if false is represented as the empty string.
enable_jit_legacy_test: YamlShellBool = "''"
enable_distributed_test: YamlShellBool = "''"
enable_multigpu_test: YamlShellBool = "''"
enable_nogpu_no_avx_test: YamlShellBool = "''"
enable_nogpu_no_avx2_test: YamlShellBool = "''"
enable_slow_test: YamlShellBool = "''"
enable_docs_test: YamlShellBool = "''"
enable_backwards_compat_test: YamlShellBool = "''"
enable_xla_test: YamlShellBool = "''"
enable_noarch_test: YamlShellBool = "''"
enable_force_on_cpu_test: YamlShellBool = "''"
enable_default_test: bool = True
Contributor: We no longer need explicit type annotation here, do we?

Member (Author): I have no idea lol, the rules for .github are different and stricter than other folders, I can try to remove.
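On the annotation question: @dataclass only treats annotated class attributes as fields, so the explicit annotations on these flags are still required even when the type is obvious from the default. A minimal standalone sketch (not part of this PR) illustrating the difference:

```python
from dataclasses import dataclass, fields

@dataclass
class Example:
    annotated: bool = True   # recognized as a dataclass field
    unannotated = True       # plain class attribute, ignored by @dataclass

print([f.name for f in fields(Example)])  # ['annotated']
print(Example())                          # Example(annotated=True)
```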

enable_smoke_test: bool = True
enable_jit_legacy_test: bool = False
enable_distributed_test: bool = True
enable_multigpu_test: bool = False
enable_nogpu_no_avx_test: bool = False
enable_nogpu_no_avx2_test: bool = False
enable_slow_test: bool = False
enable_docs_test: bool = False
enable_backwards_compat_test: bool = False
enable_xla_test: bool = False
enable_noarch_test: bool = False
enable_force_on_cpu_test: bool = False

def __post_init__(self) -> None:
if not self.build_generates_artifacts:
self.exclude_test = True

if self.distributed_test:
self.enable_distributed_test = 1

self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu")
self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu")

if LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels:
self.is_default = True

# If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are
# only running smoke tests on the pull request.
if self.num_test_shards_on_pull_request == -1:
# Don't run the default if we are only running smoke tests
if self.only_run_smoke_tests_on_pull_request:
self.num_test_shards_on_pull_request = 0
else:
self.num_test_shards_on_pull_request = self.num_test_shards
self.test_jobs = self._gen_test_jobs()
self.assert_valid()

def assert_valid(self) -> None:
@@ -254,6 +246,83 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file.write("\n")
print(output_file_path)

def normalized_build_environment(self, suffix: str) -> str:
return self.build_environment.replace(".", "_") + suffix

def _gen_test_jobs(self) -> Any:
if self.arch == "linux":
MULTIGPU_RUNNER_TYPE = "linux.16xlarge.nvidia.gpu"
DISTRIBUTED_GPU_RUNNER_TYPE = "linux.8xlarge.nvidia.gpu"
NOGPU_RUNNER_TYPE = "linux.2xlarge"
elif self.arch == "windows":
DISTRIBUTED_GPU_RUNNER_TYPE = self.test_runner_type
NOGPU_RUNNER_TYPE = "windows.4xlarge"

test_jobs = []

configs: Dict[str, Config] = {}
if self.enable_jit_legacy_test:
configs["jit_legacy"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_multigpu_test:
configs["multigpu"] = {"num_shards": 1, "runner": MULTIGPU_RUNNER_TYPE}

if self.enable_nogpu_no_avx_test:
configs["nogpu_NO_AVX"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE}
Comment on lines +269 to +270

Contributor: Unrelated - isn't NO_AVX dead (as we only have AVX2 and AVX512 now?)

Member (Author): We still run it on linux-bionic-cuda10.2-py3.9-gcc7 at least, so not sure.

if self.enable_nogpu_no_avx2_test:
configs["nogpu_NO_AVX2"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE}
if self.enable_force_on_cpu_test:
configs["force_on_cpu"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE}
if self.enable_distributed_test:
configs["distributed"] = {
"num_shards": 1,
"runner": DISTRIBUTED_GPU_RUNNER_TYPE
if "cuda" in str(self.build_environment)
else self.test_runner_type,
}
if self.enable_slow_test:
configs["slow"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_docs_test:
configs["docs_test"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_backwards_compat_test:
configs["backwards_compat"] = {
"num_shards": 1,
"runner": self.test_runner_type,
}
if self.enable_xla_test:
configs["xla"] = {"num_shards": 1, "runner": self.test_runner_type}
if self.enable_noarch_test:
configs["noarch"] = {"num_shards": 1, "runner": self.test_runner_type}

if self.enable_smoke_test:
configs["smoke_tests"] = {"num_shards": 1, "runner": self.test_runner_type}

for name, config in configs.items():
for shard in range(1, config["num_shards"] + 1):
test_jobs.append(
{
"id": f"test_{name}_{shard}_{config['num_shards']}",
"name": f"test ({name}, {shard}, {config['num_shards']}, {config['runner']})",
"config": name,
"shard": shard,
"num_shards": config["num_shards"],
"runner": config["runner"],
}
)

if self.enable_default_test:
for shard in range(1, self.num_test_shards + 1):
test_jobs.append(
{
"id": f"test_default_{shard}_{config['num_shards']}",
"name": f"test (default, {shard}, {self.num_test_shards}, {self.test_runner_type})",
Member: It might also be beneficial to simplify the names here so they don't include the test_runner_type? I find that information might not be very useful to a majority of people, and we can derive it from the logs.

Member (Author): Yeah, at the moment I am just trying to replicate the same job names, to avoid churning metrics and the HUD. We can definitely change it if we want, though.

Contributor: That's actually a long-term improvement I wanted to have in HUD - when it can combine history over renames (for example, today we have old XLA job names and new XLA job names and there is no continuation).

Member (Author): Yeah, one thing that might be interesting is that we are allowed to control the display name separately from the ID. So we can try to come up with some stable scheme for the ID and use that to identify things in HUD rather than the display name.

(See the sketch after _gen_test_jobs below for the id/name split.)

"config": "default",
"shard": shard,
"num_shards": self.num_test_shards,
"runner": self.test_runner_type,
}
)
return test_jobs
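To make the id/name discussion from the review thread concrete, here is a rough sketch of the entries `_gen_test_jobs()` builds and of how a template could key each job on the stable `id` while using the human-readable `name` only for display. The runner names, shard counts, and template fragment below are illustrative assumptions, not taken from the actual workflow templates:

```python
import jinja2

# Entries in the shape _gen_test_jobs() returns; values here are examples only.
example_test_jobs = [
    {"id": "test_distributed_1_1",
     "name": "test (distributed, 1, 1, linux.8xlarge.nvidia.gpu)",
     "config": "distributed", "shard": 1, "num_shards": 1,
     "runner": "linux.8xlarge.nvidia.gpu"},
    {"id": "test_default_1_2",
     "name": "test (default, 1, 2, linux.2xlarge)",
     "config": "default", "shard": 1, "num_shards": 2,
     "runner": "linux.2xlarge"},
]

# Hypothetical template fragment: the stable id becomes the job key (useful for
# HUD/metrics continuity), while the display name can change without churn.
template = jinja2.Template("""\
{%- for job in test_jobs %}
  {{ job.id }}:
    name: {{ job.name }}
    runs-on: {{ job.runner }}
{%- endfor %}
""")
print(template.render(test_jobs=example_test_jobs))
```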

@dataclass
class DockerWorkflow:
build_environment: str
@@ -327,17 +396,30 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN}
),
),
CIWorkflow(
arch="windows",
build_environment="win-vs2019-cuda11.3-py3-smoke",
cuda_version="11.3",
test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
enable_default_test=False,
enable_smoke_test=True,
enable_force_on_cpu_test=True,
only_on_pr=True,
ciflow_config=CIFlowConfig(
run_on_canary=True,
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
),
),
CIWorkflow(
arch="windows",
build_environment="win-vs2019-cuda11.3-py3",
cuda_version="11.3",
test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
num_test_shards=2,
only_run_smoke_tests_on_pull_request=True,
enable_force_on_cpu_test=1,
enable_force_on_cpu_test=True,
ciflow_config=CIFlowConfig(
run_on_canary=True,
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN}
),
),
CIWorkflow(
@@ -346,7 +428,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
cuda_version="11.5",
test_runner_type=WINDOWS_CUDA_TEST_RUNNER,
num_test_shards=2,
enable_force_on_cpu_test=1,
enable_force_on_cpu_test=True,
is_scheduled="45 4,10,16,22 * * *",
ciflow_config=CIFlowConfig(
run_on_canary=True,
@@ -372,9 +454,9 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
build_environment="linux-xenial-py3.7-gcc5.4",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4",
test_runner_type=LINUX_CPU_TEST_RUNNER,
enable_jit_legacy_test=1,
enable_backwards_compat_test=1,
enable_docs_test=1,
enable_jit_legacy_test=True,
enable_backwards_compat_test=True,
enable_docs_test=True,
num_test_shards=2,
ciflow_config=CIFlowConfig(
run_on_canary=True,
@@ -475,7 +557,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-asan",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=3,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_SANITIZERS, LABEL_CIFLOW_CPU},
),
@@ -486,7 +568,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ONNX, LABEL_CIFLOW_CPU},
),
@@ -496,11 +578,11 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
build_environment="linux-bionic-cuda10.2-py3.9-gcc7",
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7",
test_runner_type=LINUX_CUDA_TEST_RUNNER,
enable_jit_legacy_test=1,
enable_multigpu_test=1,
enable_nogpu_no_avx_test=1,
enable_nogpu_no_avx2_test=1,
enable_slow_test=1,
enable_jit_legacy_test=True,
enable_multigpu_test=True,
enable_nogpu_no_avx_test=True,
enable_nogpu_no_avx2_test=True,
enable_slow_test=True,
num_test_shards=2,
ciflow_config=CIFlowConfig(
run_on_canary=True,
@@ -623,8 +705,8 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_noarch_test=1,
enable_distributed_test=False,
enable_noarch_test=True,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_NOARCH},
),
@@ -635,7 +717,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=1,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_VULKAN},
),
@@ -646,7 +728,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
test_runner_type=LINUX_CUDA_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_distributed_test=False,
timeout_after=360,
# Only run this on master 4 times per day since it does take a while
is_scheduled="0 */4 * * *",
@@ -663,8 +745,9 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
docker_image_base=f"{DOCKER_REGISTRY}/pytorch/xla_base",
test_runner_type=LINUX_CPU_TEST_RUNNER,
num_test_shards=2,
distributed_test=False,
enable_xla_test=1,
enable_distributed_test=False,
enable_xla_test=True,
enable_default_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA},
),
@@ -801,7 +884,7 @@ def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
xcode_version="12.4",
test_runner_type=MACOS_TEST_RUNNER_11,
num_test_shards=2,
distributed_test=False,
enable_distributed_test=False,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_MACOS},
),