Skip to content

Commit

Permalink
Update on "[AOTInductor] Implement autograd eager backend for native …
Browse files Browse the repository at this point in the history
…triton kernels"

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx chenyang78 aakhundov kadeng

[ghstack-poisoned]
  • Loading branch information
oulgen committed Oct 4, 2023
2 parents c34d126 + 996d848 commit b8a9266
Show file tree
Hide file tree
Showing 407 changed files with 4,072 additions and 2,446 deletions.
11 changes: 8 additions & 3 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,13 @@ cppcoreguidelines-*,
-facebook-hte-RelativeInclude,
hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-unused-alias-decls,
misc-unused-using-decls,
misc-*,
-misc-const-correctness,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers,
modernize-*,
-modernize-concat-nested-namespaces,
-modernize-macro-to-enum,
Expand All @@ -44,7 +49,7 @@ modernize-*,
performance-*,
readability-container-size-empty,
'
HeaderFilterRegex: '^(c10/(?!test)|torch/csrc/).*$'
HeaderFilterRegex: '^(c10/|torch/csrc/).*$'
AnalyzeTemporaryDtors: false
WarningsAsErrors: '*'
...
2 changes: 2 additions & 0 deletions .github/actionlint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ self-hosted-runner:
- macos-12-xl
- macos-12
- macos12.3-m1
- macos-latest-xlarge
- macos-13-xlarge
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3da865859d458673e733b5857a14efc1e02383ef
48f8473e21b0f3e425aabc60db201b68fedf59b3
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
blas=1.0
cmake=3.22.1
mkl=2022.1.0
mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
Expand Down
Binary file modified .github/scripts/drci_mocks.json.gz
Binary file not shown.
5 changes: 3 additions & 2 deletions .github/scripts/generate_ci_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class BinaryBuildWorkflow:
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
xcode_version: str = ""
macos_runner: str = "macos-12-xl"

def __post_init__(self) -> None:
if self.abi_version:
Expand Down Expand Up @@ -307,7 +307,8 @@ class OperatingSystem:
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=True,
cross_compile_arm64=False,
macos_runner="macos-13-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,
Expand Down
Binary file modified .github/scripts/gql_mocks.json.gz
Binary file not shown.
Binary file modified .github/scripts/rockset_mocks.json.gz
Binary file not shown.
120 changes: 47 additions & 73 deletions .github/scripts/test_trymerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from trymerge import (
categorize_checks,
find_matching_merge_rule,
FlakyRule,
get_classifications,
get_drci_classifications,
get_rockset_results,
Expand Down Expand Up @@ -226,16 +225,6 @@ def mocked_read_merge_rules_raise(repo: Any, org: str, project: str) -> List[Mer
raise RuntimeError("testing")


def empty_flaky_rules() -> List[FlakyRule]:
return []


def xla_is_flaky_rules() -> List[FlakyRule]:
return [
FlakyRule("xla", ["FAILED: Build did NOT complete successfully"]),
]


def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
return [
MergeRule(
Expand All @@ -247,6 +236,7 @@ def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
"EasyCLA",
"pull / linux-bionic-py3_8-clang8-xla / build",
"pull / linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
"inductor / cuda11.8-py3.10-gcc7-sm86 / test (inductor_torchbench_dynamic, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
],
ignore_flaky_failures=False,
),
Expand All @@ -268,7 +258,6 @@ def commit_message(self, ref: str) -> str:
return "super awsome commit message"


@mock.patch("trymerge.read_flaky_rules", side_effect=empty_flaky_rules)
@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch(
Expand Down Expand Up @@ -677,55 +666,74 @@ def test_get_merge_base(self, *args: Any) -> None:
)
class TestBypassFailures(TestCase):
def test_get_classifications(self, *args: Any) -> None:
flaky_rules = [
# Try a regex rule
FlakyRule("distributed", ["##\\[error\\]The operation [wW]as .+"])
]
pr = GitHubPR("pytorch", "pytorch", 92863)
pr = GitHubPR("pytorch", "pytorch", 109584)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
[],
)
self.assertTrue(
checks[
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"trunk / win-vs2019-cpu-py3 / test (default, 2, 3, windows.4xlarge.nonephemeral)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
"pull / linux-jammy-py3.8-gcc11 / test (distributed, 1, 2, linux.2xlarge)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
].classification
== "FLAKY"
)

# Set the threshold larger or equal to the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
checks, list(checks.keys()), ok_failed_checks_threshold=6
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

# No threshold is set, so it defaults to -1, which ignores all flaky and broken trunk failures
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

# Set the threshold lower than the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

# Set the threshold to 0 like when ignore_flaky_failures is on
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

def test_get_classifications_similar_failures(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 109750)
Expand All @@ -737,7 +745,6 @@ def test_get_classifications_similar_failures(self, *args: Any) -> None:
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
Expand All @@ -754,7 +761,6 @@ def test_get_classifications_unstable(self, *args: Any) -> None:
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[],
)
workflow_name = "linux-bionic-cuda12.1-py3.10-gcc9-bazel-test"
job_name = "build-and-test (default, 1, 1, linux.4xlarge.nvidia.gpu, unstable)"
Expand Down Expand Up @@ -812,7 +818,6 @@ def test_get_classifications_broken_trunk(self, *args: Any) -> None:
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[],
)

pending, failed, _ = categorize_checks(checks, list(checks.keys()))
Expand All @@ -832,35 +837,28 @@ def test_ignore_current(self, *args: Any) -> None:
# current checks takes place after other classifications: flaky, unstable,
# or broken trunk. Only actual new failures should be kept in the list of
# ignore current checks to use to record force merge with actual failures
flaky_rules = [
FlakyRule("distributed", ["##\\[error\\]The operation was canceled."])
]
flaky = (
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
)
flaky = "pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
broken_trunk = (
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
)

pr = GitHubPR("pytorch", "pytorch", 92863)
pr = GitHubPR("pytorch", "pytorch", 109584)
checks = pr.get_checkrun_conclusions()

# No broken trunk or flaky rules, then all failures are ignored when ic is used
# No failure is classified as broken trunk or flaky because the merge base is not set;
# these failures are counted as ignore current when ic is used
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
None,
[],
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "IGNORE_CURRENT_CHECK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
_, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(failed) == 4)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
Expand All @@ -874,46 +872,22 @@ def test_ignore_current(self, *args: Any) -> None:
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "FLAKY")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
_, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)

# Broken trunk takes precedence over ignore current (no flaky rule is set here)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

@mock.patch("trymerge.read_flaky_rules", side_effect=xla_is_flaky_rules)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
Regression test for https://github.com/pytorch/test-infra/issues/4126
"""
pr = GitHubPR("pytorch", "pytorch", 100369)
pr = GitHubPR("pytorch", "pytorch", 105312)
repo = DummyGitRepo()
# Check that failure is classified as flaky but still raises exception
with warnings.catch_warnings(record=True) as w, self.assertRaises(RuntimeError):
Expand Down

0 comments on commit b8a9266

Please sign in to comment.