From 4c4b5b29e19fce26c3f6ce0ada381c9d6f1b8cec Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 21 Nov 2025 16:36:20 -0800 Subject: [PATCH 1/9] tc --- .github/workflows/check-alerts.yml | 68 +++++++++---------- tools/torchci/check_alerts.py | 7 +- tools/torchci/queue_alert.py | 3 +- .../torchci/test_insights/daily_regression.py | 4 +- tools/torchci/utils.py | 18 ++++- 5 files changed, 62 insertions(+), 38 deletions(-) diff --git a/.github/workflows/check-alerts.yml b/.github/workflows/check-alerts.yml index 4b6595e5e4..3527e172f0 100644 --- a/.github/workflows/check-alerts.yml +++ b/.github/workflows/check-alerts.yml @@ -13,40 +13,40 @@ on: jobs: - update-alerts: - strategy: - matrix: - include: - - repo: pytorch/pytorch - branch: main - with_flaky_test_alerting: YES - job_filter_regex: "^(pull|trunk|lint|linux-binary-)" - - repo: pytorch/pytorch - branch: nightly - with_flaky_test_alerting: NO - job_filter_regex: "" - env: - REPO_TO_CHECK: ${{ matrix.repo }} - BRANCH_TO_CHECK: ${{ matrix.branch }} - WITH_FLAKY_TEST_ALERT: ${{ matrix.with_flaky_test_alerting }} - JOB_NAME_REGEX: ${{ matrix.job_filter_regex }} - # Don't do actual work on pull request - DRY_RUN: ${{ github.event_name == 'pull_request'}} - runs-on: ubuntu-24.04 - permissions: - issues: write - steps: - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Install Dependencies - run: pip3 install requests setuptools==80.9.0 - - name: Check for alerts and creates issue - run: | - cd tools - python3 -m torchci.check_alerts - env: - # NOTE: Should be a blank string for pull requests - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # update-alerts: + # strategy: + # matrix: + # include: + # - repo: pytorch/pytorch + # branch: main + # with_flaky_test_alerting: YES + # job_filter_regex: "^(pull|trunk|lint|linux-binary-)" + # - repo: pytorch/pytorch + # branch: nightly + # with_flaky_test_alerting: NO + # job_filter_regex: "" + # env: + # REPO_TO_CHECK: ${{ matrix.repo }} + # BRANCH_TO_CHECK: ${{ matrix.branch }} + # WITH_FLAKY_TEST_ALERT: ${{ matrix.with_flaky_test_alerting }} + # JOB_NAME_REGEX: ${{ matrix.job_filter_regex }} + # # Don't do actual work on pull request + # DRY_RUN: ${{ github.event_name == 'pull_request'}} + # runs-on: ubuntu-24.04 + # permissions: + # issues: write + # steps: + # - name: Checkout + # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + # - name: Install Dependencies + # run: pip3 install requests setuptools==80.9.0 + # - name: Check for alerts and creates issue + # run: | + # cd tools + # python3 -m torchci.check_alerts + # env: + # # NOTE: Should be a blank string for pull requests + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} update-queue-alert: env: diff --git a/tools/torchci/check_alerts.py b/tools/torchci/check_alerts.py index a9e4311d2a..d583df4ce8 100755 --- a/tools/torchci/check_alerts.py +++ b/tools/torchci/check_alerts.py @@ -13,6 +13,8 @@ import requests from setuptools import distutils # type: ignore[import] +from torchci.utils import fake_browser_headers + FAILURE_CHAIN_THRESHOLD = 2 FAILED_JOB_PATTERN = r"^- \[(.*)\]\(.*\)$" @@ -351,7 +353,10 @@ def create_issue(issue: Dict, dry_run: bool) -> Dict: def fetch_hud_data(repo: str, branch: str) -> Tuple[List[str], list[list[JobData]]]: - response = requests.get(f"https://hud.pytorch.org/api/hud/{repo}/{branch}/0") + response = requests.get( + f"https://hud.pytorch.org/api/hud/{repo}/{branch}/0", + headers=fake_browser_headers(), + ) response.raise_for_status() hud_data = json.loads(response.text) diff --git a/tools/torchci/queue_alert.py b/tools/torchci/queue_alert.py index 7fdcf8b7f9..d4a63b2f32 100644 --- a/tools/torchci/queue_alert.py +++ b/tools/torchci/queue_alert.py @@ -10,6 +10,7 @@ clear_alerts, close_if_too_many_comments, create_issue, + fake_browser_headers, fetch_alerts, update_issue, ) @@ -109,7 +110,7 @@ def queuing_alert(dry_run: bool) -> None: url = ( "https://hud.pytorch.org/api/clickhouse/queued_jobs_by_label?parameters=%7B%7D" ) - response = requests.get(url).json() + response = requests.get(url, headers=fake_browser_headers()).json() large_queue = filter_long_queues(response) diff --git a/tools/torchci/test_insights/daily_regression.py b/tools/torchci/test_insights/daily_regression.py index e6a881379f..9d00a37f80 100644 --- a/tools/torchci/test_insights/daily_regression.py +++ b/tools/torchci/test_insights/daily_regression.py @@ -5,6 +5,7 @@ import requests from torchci.test_insights.file_report_generator import FileReportGenerator from torchci.test_insights.weekly_notification import send_to_aws_alerting_lambda +from torchci.utils import fake_browser_headers FILE_REPORT_URL = "https://hud.pytorch.org/tests/fileReport" @@ -211,7 +212,8 @@ def get_representative_data_for_time( self, start_date, stop_date ) -> list[dict[str, Any]]: response = requests.get( - f"https://hud.pytorch.org/api/flaky-tests/fileReport?startDate={start_date}&endDate={stop_date}" + f"https://hud.pytorch.org/api/flaky-tests/fileReport?startDate={start_date}&endDate={stop_date}", + headers=fake_browser_headers(), ) if response.status_code != 200: diff --git a/tools/torchci/utils.py b/tools/torchci/utils.py index 6789d90060..b957c38efc 100644 --- a/tools/torchci/utils.py +++ b/tools/torchci/utils.py @@ -4,7 +4,7 @@ import pathlib import subprocess from hashlib import sha256 -from typing import List, Union +from typing import Any, List, Union FILE_CACHE_LIFESPAN_SECONDS = 60 * 60 * 24 # 1 day @@ -66,3 +66,19 @@ def wrapper(*args, **kwargs): return res return wrapper + + +def fake_browser_headers() -> dict[str, Any]: + # Same as + # https://github.com/pytorch/test-infra/pull/7509/files#diff-de488f3b0cabe84bb81be2693e800c5736d35372778cb5c9944fe1301992a692L63 + # TODO: get better auth tokens + return { + # Looks like a real browser instead of python-requests + "User-Agent": ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ), + "Accept": "application/json,text/html;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + } From f12c1a79966deadef383bc52af358967b48f3191 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 21 Nov 2025 16:36:34 -0800 Subject: [PATCH 2/9] lint --- tools/torchci/check_alerts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/torchci/check_alerts.py b/tools/torchci/check_alerts.py index d583df4ce8..55e36e3716 100755 --- a/tools/torchci/check_alerts.py +++ b/tools/torchci/check_alerts.py @@ -12,7 +12,6 @@ import requests from setuptools import distutils # type: ignore[import] - from torchci.utils import fake_browser_headers From 539cbaaa84e43dd64d94261d9f9adbe528943ccd Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 21 Nov 2025 16:38:02 -0800 Subject: [PATCH 3/9] print --- tools/torchci/queue_alert.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/torchci/queue_alert.py b/tools/torchci/queue_alert.py index d4a63b2f32..b6a1645bd9 100644 --- a/tools/torchci/queue_alert.py +++ b/tools/torchci/queue_alert.py @@ -110,7 +110,10 @@ def queuing_alert(dry_run: bool) -> None: url = ( "https://hud.pytorch.org/api/clickhouse/queued_jobs_by_label?parameters=%7B%7D" ) - response = requests.get(url, headers=fake_browser_headers()).json() + response = requests.get(url, headers=fake_browser_headers()) + print(response.status_code) + print(response.text) + response = response.json() large_queue = filter_long_queues(response) From 416628496528283a9f8a0f30b4323590a80780ac Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 21 Nov 2025 16:40:36 -0800 Subject: [PATCH 4/9] tc --- tools/torchci/queue_alert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/torchci/queue_alert.py b/tools/torchci/queue_alert.py index b6a1645bd9..75da26f1ce 100644 --- a/tools/torchci/queue_alert.py +++ b/tools/torchci/queue_alert.py @@ -113,6 +113,7 @@ def queuing_alert(dry_run: bool) -> None: response = requests.get(url, headers=fake_browser_headers()) print(response.status_code) print(response.text) + print(response.raise_for_status()) response = response.json() large_queue = filter_long_queues(response) From cb67ec3c43e33e78df31a79d1d8d638fe1f2a50d Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Sun, 23 Nov 2025 16:11:03 -0800 Subject: [PATCH 5/9] tc --- .github/workflows/check-alerts.yml | 70 ++++++++++--------- .github/workflows/update_test_file_report.yml | 1 + tools/torchci/check_alerts.py | 4 +- tools/torchci/queue_alert.py | 8 +-- .../torchci/test_insights/daily_regression.py | 4 +- tools/torchci/utils.py | 15 ++-- 6 files changed, 47 insertions(+), 55 deletions(-) diff --git a/.github/workflows/check-alerts.yml b/.github/workflows/check-alerts.yml index 3527e172f0..488d005e53 100644 --- a/.github/workflows/check-alerts.yml +++ b/.github/workflows/check-alerts.yml @@ -13,40 +13,41 @@ on: jobs: - # update-alerts: - # strategy: - # matrix: - # include: - # - repo: pytorch/pytorch - # branch: main - # with_flaky_test_alerting: YES - # job_filter_regex: "^(pull|trunk|lint|linux-binary-)" - # - repo: pytorch/pytorch - # branch: nightly - # with_flaky_test_alerting: NO - # job_filter_regex: "" - # env: - # REPO_TO_CHECK: ${{ matrix.repo }} - # BRANCH_TO_CHECK: ${{ matrix.branch }} - # WITH_FLAKY_TEST_ALERT: ${{ matrix.with_flaky_test_alerting }} - # JOB_NAME_REGEX: ${{ matrix.job_filter_regex }} - # # Don't do actual work on pull request - # DRY_RUN: ${{ github.event_name == 'pull_request'}} - # runs-on: ubuntu-24.04 - # permissions: - # issues: write - # steps: - # - name: Checkout - # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - # - name: Install Dependencies - # run: pip3 install requests setuptools==80.9.0 - # - name: Check for alerts and creates issue - # run: | - # cd tools - # python3 -m torchci.check_alerts - # env: - # # NOTE: Should be a blank string for pull requests - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + update-alerts: + strategy: + matrix: + include: + - repo: pytorch/pytorch + branch: main + with_flaky_test_alerting: YES + job_filter_regex: "^(pull|trunk|lint|linux-binary-)" + - repo: pytorch/pytorch + branch: nightly + with_flaky_test_alerting: NO + job_filter_regex: "" + env: + REPO_TO_CHECK: ${{ matrix.repo }} + BRANCH_TO_CHECK: ${{ matrix.branch }} + WITH_FLAKY_TEST_ALERT: ${{ matrix.with_flaky_test_alerting }} + JOB_NAME_REGEX: ${{ matrix.job_filter_regex }} + # Don't do actual work on pull request + DRY_RUN: ${{ github.event_name == 'pull_request'}} + runs-on: ubuntu-24.04 + permissions: + issues: write + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Install Dependencies + run: pip3 install requests setuptools==80.9.0 + - name: Check for alerts and creates issue + run: | + cd tools + python3 -m torchci.check_alerts + env: + # NOTE: Should be a blank string for pull requests + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HUD_API_TOKEN: ${{ secrets.HUD_API_TOKEN }} update-queue-alert: env: @@ -66,3 +67,4 @@ jobs: env: # NOTE: Should be a blank string for pull requests GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HUD_API_TOKEN: ${{ secrets.HUD_API_TOKEN }} diff --git a/.github/workflows/update_test_file_report.yml b/.github/workflows/update_test_file_report.yml index 237d564cc9..ee38419850 100644 --- a/.github/workflows/update_test_file_report.yml +++ b/.github/workflows/update_test_file_report.yml @@ -53,6 +53,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} AWS_INFRA_ALERTS_LAMBDA_URL: ${{ secrets.AWS_INFRA_ALERTS_LAMBDA_URL }} TEST_REPORT_AWS_LAMBDA_TOKEN: ${{ secrets.TEST_REPORT_AWS_LAMBDA_TOKEN }} + HUD_API_TOKEN: ${{ secrets.HUD_API_TOKEN }} run: | cd test-infra python3 tools/torchci/test_insights/daily_regression.py diff --git a/tools/torchci/check_alerts.py b/tools/torchci/check_alerts.py index 55e36e3716..6b1161f045 100755 --- a/tools/torchci/check_alerts.py +++ b/tools/torchci/check_alerts.py @@ -12,7 +12,7 @@ import requests from setuptools import distutils # type: ignore[import] -from torchci.utils import fake_browser_headers +from torchci.utils import get_hud_headers FAILURE_CHAIN_THRESHOLD = 2 @@ -354,7 +354,7 @@ def create_issue(issue: Dict, dry_run: bool) -> Dict: def fetch_hud_data(repo: str, branch: str) -> Tuple[List[str], list[list[JobData]]]: response = requests.get( f"https://hud.pytorch.org/api/hud/{repo}/{branch}/0", - headers=fake_browser_headers(), + headers=get_hud_headers(), ) response.raise_for_status() hud_data = json.loads(response.text) diff --git a/tools/torchci/queue_alert.py b/tools/torchci/queue_alert.py index 75da26f1ce..0ed99b7f40 100644 --- a/tools/torchci/queue_alert.py +++ b/tools/torchci/queue_alert.py @@ -10,10 +10,10 @@ clear_alerts, close_if_too_many_comments, create_issue, - fake_browser_headers, fetch_alerts, update_issue, ) +from torchci.utils import get_hud_headers REPO_ROOT = Path(__file__).resolve().parent.parent.parent @@ -110,11 +110,7 @@ def queuing_alert(dry_run: bool) -> None: url = ( "https://hud.pytorch.org/api/clickhouse/queued_jobs_by_label?parameters=%7B%7D" ) - response = requests.get(url, headers=fake_browser_headers()) - print(response.status_code) - print(response.text) - print(response.raise_for_status()) - response = response.json() + response = requests.get(url, headers=get_hud_headers()).json() large_queue = filter_long_queues(response) diff --git a/tools/torchci/test_insights/daily_regression.py b/tools/torchci/test_insights/daily_regression.py index 9d00a37f80..910a7a8b77 100644 --- a/tools/torchci/test_insights/daily_regression.py +++ b/tools/torchci/test_insights/daily_regression.py @@ -5,7 +5,7 @@ import requests from torchci.test_insights.file_report_generator import FileReportGenerator from torchci.test_insights.weekly_notification import send_to_aws_alerting_lambda -from torchci.utils import fake_browser_headers +from torchci.utils import get_hud_headers FILE_REPORT_URL = "https://hud.pytorch.org/tests/fileReport" @@ -213,7 +213,7 @@ def get_representative_data_for_time( ) -> list[dict[str, Any]]: response = requests.get( f"https://hud.pytorch.org/api/flaky-tests/fileReport?startDate={start_date}&endDate={stop_date}", - headers=fake_browser_headers(), + headers=get_hud_headers(), ) if response.status_code != 200: diff --git a/tools/torchci/utils.py b/tools/torchci/utils.py index b957c38efc..0323cd246c 100644 --- a/tools/torchci/utils.py +++ b/tools/torchci/utils.py @@ -68,17 +68,10 @@ def wrapper(*args, **kwargs): return wrapper -def fake_browser_headers() -> dict[str, Any]: - # Same as - # https://github.com/pytorch/test-infra/pull/7509/files#diff-de488f3b0cabe84bb81be2693e800c5736d35372778cb5c9944fe1301992a692L63 - # TODO: get better auth tokens +def get_hud_headers() -> dict[str, Any]: + # Get headers for requests to the HUD API. This includes the + # x-hud-internal-bot header which is required for authentication. return { # Looks like a real browser instead of python-requests - "User-Agent": ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36" - ), - "Accept": "application/json,text/html;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", + "x-hud-internal-bot": os.environ["HUD_API_TOKEN"], } From f30bae5e37643121b048fa8a4d4fa70a356a8934 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Sun, 23 Nov 2025 16:13:27 -0800 Subject: [PATCH 6/9] tc --- tools/torchci/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/torchci/utils.py b/tools/torchci/utils.py index 0323cd246c..346dddad07 100644 --- a/tools/torchci/utils.py +++ b/tools/torchci/utils.py @@ -69,8 +69,14 @@ def wrapper(*args, **kwargs): def get_hud_headers() -> dict[str, Any]: - # Get headers for requests to the HUD API. This includes the - # x-hud-internal-bot header which is required for authentication. + """ + Get headers for requests to the HUD API. This includes the + x-hud-internal-bot header which is required for authentication. If the + HUD_API_TOKEN environment variable is not set, this function returns an + empty dictionary. + """ + if "HUD_API_TOKEN" not in os.environ: + return {} return { # Looks like a real browser instead of python-requests "x-hud-internal-bot": os.environ["HUD_API_TOKEN"], From d686a1aa886444fdd7647912344f8c29de47b745 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Sun, 23 Nov 2025 16:16:41 -0800 Subject: [PATCH 7/9] tc --- .github/workflows/update-queue-times.yml | 2 ++ torchci/scripts/updateQueueTimes.mjs | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update-queue-times.yml b/.github/workflows/update-queue-times.yml index 0ab6d8add9..27b0b07d5b 100644 --- a/.github/workflows/update-queue-times.yml +++ b/.github/workflows/update-queue-times.yml @@ -24,3 +24,5 @@ jobs: role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_update_queue_times aws-region: us-east-1 - run: yarn node scripts/updateQueueTimes.mjs + env: + HUD_API_TOKEN: ${{ secrets.HUD_API_TOKEN }} diff --git a/torchci/scripts/updateQueueTimes.mjs b/torchci/scripts/updateQueueTimes.mjs index 0a4d019c73..326dc61a67 100644 --- a/torchci/scripts/updateQueueTimes.mjs +++ b/torchci/scripts/updateQueueTimes.mjs @@ -14,7 +14,12 @@ const s3client = getS3Client(); // %7B%7D = encoded {} const response = await fetch( - "https://hud.pytorch.org/api/clickhouse/queued_jobs_by_label?parameters=%7B%7D" + "https://hud.pytorch.org/api/clickhouse/queued_jobs_by_label?parameters=%7B%7D", + { + headers: { + "x-hud-internal-bot": process.env.HUD_API_TOKEN + }, + } ).then((r) => r.json()); for (const r of response) { const unixTime = parseInt((new Date(r.time).getTime() / 1000).toFixed(0)); From e4d447f7af7796ab60cdbe77192ad0eb53c1e9f4 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Sun, 23 Nov 2025 16:19:34 -0800 Subject: [PATCH 8/9] lint --- torchci/scripts/updateQueueTimes.mjs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchci/scripts/updateQueueTimes.mjs b/torchci/scripts/updateQueueTimes.mjs index 326dc61a67..8bce92d10d 100644 --- a/torchci/scripts/updateQueueTimes.mjs +++ b/torchci/scripts/updateQueueTimes.mjs @@ -17,7 +17,7 @@ const response = await fetch( "https://hud.pytorch.org/api/clickhouse/queued_jobs_by_label?parameters=%7B%7D", { headers: { - "x-hud-internal-bot": process.env.HUD_API_TOKEN + "x-hud-internal-bot": process.env.HUD_API_TOKEN, }, } ).then((r) => r.json()); From e90666a99df9e5e300b01cf2000628b7c2d592f1 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Sun, 23 Nov 2025 16:23:21 -0800 Subject: [PATCH 9/9] tc --- .github/scripts/update_disabled_issues.py | 5 ++++- .github/workflows/update_disabled_tests.yml | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/scripts/update_disabled_issues.py b/.github/scripts/update_disabled_issues.py index 631727fde7..00125d0db3 100755 --- a/.github/scripts/update_disabled_issues.py +++ b/.github/scripts/update_disabled_issues.py @@ -28,7 +28,10 @@ def main() -> None: with urlopen( Request( f"{HUD_URL}/api/flaky-tests/getDisabledTestsAndJobs", - headers={"Authorization": os.environ["FLAKY_TEST_BOT_KEY"]}, + headers={ + "Authorization": os.environ["FLAKY_TEST_BOT_KEY"], + "x-hud-internal-bot": os.environ["HUD_API_TOKEN"], + }, ) ) as result: if result.status != 200: diff --git a/.github/workflows/update_disabled_tests.yml b/.github/workflows/update_disabled_tests.yml index fcbf627b65..abc6fca965 100644 --- a/.github/workflows/update_disabled_tests.yml +++ b/.github/workflows/update_disabled_tests.yml @@ -31,6 +31,7 @@ jobs: # environment, we do not have access to this token so fall back to the # GITHUB_TOKEN. FLAKY_TEST_BOT_KEY: ${{ secrets.FLAKY_TEST_BOT_KEY }} + HUD_API_TOKEN: ${{ secrets.HUD_API_TOKEN }} run: | python3 -m pip install GitPython==3.1.44 python3 .github/scripts/update_disabled_issues.py