diff --git a/torchci/lib/bot/retryBot.ts b/torchci/lib/bot/retryBot.ts index 230684df94..2d4c5ddbd4 100644 --- a/torchci/lib/bot/retryBot.ts +++ b/torchci/lib/bot/retryBot.ts @@ -16,7 +16,6 @@ function retryBot(app: Probot): void { if ( ctx.payload.workflow_run.conclusion === "success" || - ctx.payload.workflow_run.head_branch !== "master" || attemptNumber > 1 || allowedWorkflowPrefixes.every( allowedWorkflow => !workflowName.toLowerCase().includes(allowedWorkflow) @@ -54,27 +53,41 @@ function retryBot(app: Probot): void { // or a widespread outage that wouldn't be helped by retries return; } + + // @ts-expect-error - we don't have types for these + let doesLookLikeUserFailure = (job, isCodeValiationStep: (step: any) => boolean) => { + // Ensure if any of the steps that failed are not infra related steps (e.g. they're lint, build or test steps) + return job.steps?.filter( + // @ts-expect-error + (step) => + step.conclusion !== null && + FAILURE_CONCLUSIONS.includes(step.conclusion) && + isCodeValiationStep(step) + ).length > 0 + } + const shouldRetry = failedJobs.filter((job) => { - // always rerun lint - if (workflowName === "lint") { - return true; - } // if the job was cancelled, it was probably an infra error, so rerun if (job.conclusion === "cancelled") { return true; } - // if no test steps failed, can rerun - if ( - job.steps?.filter( - (step) => - step.conclusion !== null && - step.name.toLowerCase().includes("test") && - FAILURE_CONCLUSIONS.includes(step.conclusion) - ).length === 0 - ) { - return true; + + // don't rerun if the linter failed on the actual linting steps, which have the nonretryable suffix + if (workflowName.toLocaleLowerCase() === "lint") { + return !doesLookLikeUserFailure(job, step => step.name.toLowerCase().includes("(nonretryable)")) + } + + // for builds, don't rerun if it failed on the actual build step + if (job.name.toLocaleLowerCase().startsWith("build") && + doesLookLikeUserFailure(job, step => step.name.toLowerCase().startsWith("build"))){ + // we continue our retry checks even if this test passes in case this is a build-and-test job + return false } + + // if no test steps failed, can rerun + return !doesLookLikeUserFailure(job, step => step.name.toLowerCase().includes("test")) }); + if (shouldRetry.length === 0) { return; } diff --git a/torchci/test/retryBot.test.ts b/torchci/test/retryBot.test.ts index 8f43a3435d..dfb96e6bcf 100644 --- a/torchci/test/retryBot.test.ts +++ b/torchci/test/retryBot.test.ts @@ -69,7 +69,7 @@ describe("retry-bot", () => { handleScope(scope); }); - test("rerun all if lint", async () => { + test("rerun lint if no nonretryable step failed", async () => { const event = requireDeepCopy("./fixtures/workflow_run.completed.json"); event.payload.workflow_run.name = "Lint"; const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json"); @@ -93,6 +93,29 @@ describe("retry-bot", () => { handleScope(scope); }); + test("don't rerun lint if nonretryable step failed", async () => { + const event = requireDeepCopy("./fixtures/workflow_run.completed.json"); + event.payload.workflow_run.name = "Lint"; + const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json"); + workflow_jobs.jobs[3].conclusion = "failure"; + workflow_jobs.jobs[3].steps[0].name = "Do the lints (nonretryable)" + workflow_jobs.jobs[3].steps[0].conclusion = "failure" + + const owner = event.payload.repository.owner.login; + const repo = event.payload.repository.name; + const attempt_number = event.payload.workflow_run.run_attempt; + const run_id = event.payload.workflow_run.id; + + const scope = nock("https://api.github.com") + .get( + `/repos/${owner}/${repo}/actions/runs/${run_id}/attempts/${attempt_number}/jobs?page=1&per_page=100` + ) + .reply(200, workflow_jobs) + + await probot.receive(event); + handleScope(scope); + }); + test("dont rerun if failed at test step if possible", async () => { const event = requireDeepCopy("./fixtures/workflow_run.completed.json"); event.payload.workflow_run.name = "pull";