Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 28 additions & 15 deletions torchci/lib/bot/retryBot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ function retryBot(app: Probot): void {

if (
ctx.payload.workflow_run.conclusion === "success" ||
ctx.payload.workflow_run.head_branch !== "master" ||
attemptNumber > 1 ||
allowedWorkflowPrefixes.every(
allowedWorkflow => !workflowName.toLowerCase().includes(allowedWorkflow)
Expand Down Expand Up @@ -54,27 +53,41 @@ function retryBot(app: Probot): void {
// or a widespread outage that wouldn't be helped by retries
return;
}

// @ts-expect-error - we don't have types for these
let doesLookLikeUserFailure = (job, isCodeValiationStep: (step: any) => boolean) => {
// Ensure if any of the steps that failed are not infra related steps (e.g. they're lint, build or test steps)
return job.steps?.filter(
// @ts-expect-error
(step) =>
step.conclusion !== null &&
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: If step before non-retriable one in the job has failed (with infra failure), what would be the conclusion of the next step? If it will be failure as well, then the logic would not work, would it? Can we add a unit test of sorts for this one (using a simulated failure in canary or ones personal repo?)

Copy link
Contributor Author

@ZainRizvi ZainRizvi Dec 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Edit: It’s status is skipped, which is ignored

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it would be good to have an example on PR description, where it works as expected, otherwise it's not really useful (i.e. if later API will change it would be good to have a reference to a runs when it was working as expected)
And in general, testing code before landing is a good idea.

FAILURE_CONCLUSIONS.includes(step.conclusion) &&
isCodeValiationStep(step)
).length > 0
}

const shouldRetry = failedJobs.filter((job) => {
// always rerun lint
if (workflowName === "lint") {
return true;
}
// if the job was cancelled, it was probably an infra error, so rerun
if (job.conclusion === "cancelled") {
return true;
}
// if no test steps failed, can rerun
if (
job.steps?.filter(
(step) =>
step.conclusion !== null &&
step.name.toLowerCase().includes("test") &&
FAILURE_CONCLUSIONS.includes(step.conclusion)
).length === 0
) {
return true;

// don't rerun if the linter failed on the actual linting steps, which have the nonretryable suffix
if (workflowName.toLocaleLowerCase() === "lint") {
return !doesLookLikeUserFailure(job, step => step.name.toLowerCase().includes("(nonretryable)"))
}

// for builds, don't rerun if it failed on the actual build step
if (job.name.toLocaleLowerCase().startsWith("build") &&
doesLookLikeUserFailure(job, step => step.name.toLowerCase().startsWith("build"))){
// we continue our retry checks even if this test passes in case this is a build-and-test job
return false
}

// if no test steps failed, can rerun
return !doesLookLikeUserFailure(job, step => step.name.toLowerCase().includes("test"))
});

if (shouldRetry.length === 0) {
return;
}
Expand Down
25 changes: 24 additions & 1 deletion torchci/test/retryBot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ describe("retry-bot", () => {
handleScope(scope);
});

test("rerun all if lint", async () => {
test("rerun lint if no nonretryable step failed", async () => {
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
event.payload.workflow_run.name = "Lint";
const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json");
Expand All @@ -93,6 +93,29 @@ describe("retry-bot", () => {
handleScope(scope);
});

test("don't rerun lint if nonretryable step failed", async () => {
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
event.payload.workflow_run.name = "Lint";
const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json");
workflow_jobs.jobs[3].conclusion = "failure";
workflow_jobs.jobs[3].steps[0].name = "Do the lints (nonretryable)"
workflow_jobs.jobs[3].steps[0].conclusion = "failure"

const owner = event.payload.repository.owner.login;
const repo = event.payload.repository.name;
const attempt_number = event.payload.workflow_run.run_attempt;
const run_id = event.payload.workflow_run.id;

const scope = nock("https://api.github.com")
.get(
`/repos/${owner}/${repo}/actions/runs/${run_id}/attempts/${attempt_number}/jobs?page=1&per_page=100`
)
.reply(200, workflow_jobs)

await probot.receive(event);
handleScope(scope);
});

test("dont rerun if failed at test step if possible", async () => {
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
event.payload.workflow_run.name = "pull";
Expand Down