Skip to content

Commit

Permalink
Fix Dr.CI flaky FP when GH fails to dispatch the workflow (#4998)
Browse files Browse the repository at this point in the history
Fixes #4987

Dr.CI logic to detect `isInfraFlakyJob` and `isLogClassifierFailed` has
a FP where it misclassifies the GH failure to dispatch the whole
workflow as flaky, for example
pytorch/pytorch#121317.

These logic should only be applicable to workflow job, not workflow run.
The way to separate them is to check the `workflowId` field where it is
set to `null` whenever it is a workflow run.

### Testing

Unit test + local curl command will mark them as legit failures:

```
curl --request POST \
--url "http://localhost:3000/api/drci/drci?prNumber=121317" \
--header "Authorization: TOKEN" \
--data 'repo=pytorch'
```
  • Loading branch information
huydhn committed Mar 12, 2024
1 parent 199a5b6 commit 630817a
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 5 deletions.
14 changes: 13 additions & 1 deletion torchci/lib/drciUtils.ts
Expand Up @@ -368,9 +368,15 @@ export async function hasSimilarFailures(
export function isInfraFlakyJob(job: RecentWorkflowsData): boolean {
// An infra flaky job is a failed job without any failure line and runner. It shows
// up as an empty job without any logs on GitHub. The failure can only be seen via
// the workflow summary tab
// the workflow summary tab.
//
// Also having a workflow ID means that this is a workflow job, not a workflow run.
// This is to prevent the case where GitHub failed to run the whole workflow, but
// was allowed to go through as flaky
return (
job.conclusion === "failure" &&
job.workflowId !== null &&
job.workflowId !== undefined &&
(job.failure_lines === null ||
job.failure_lines === undefined ||
job.failure_lines.join("") === "") &&
Expand All @@ -383,6 +389,12 @@ export function isInfraFlakyJob(job: RecentWorkflowsData): boolean {
export async function isLogClassifierFailed(
job: RecentWorkflowsData
): Promise<boolean> {
// Having no workflow ID means that this is a workflow run, not a workflow job.
// We don't want to apply the log classifier check for a workflow run
if (job.workflowId === null || job.workflowId === undefined) {
return false;
}

// This covers the case when there is no log on S3 or log classifier fails to triggered
const hasFailureLines =
job.failure_lines !== null &&
Expand Down
20 changes: 16 additions & 4 deletions torchci/test/drciUtils.test.ts
Expand Up @@ -334,6 +334,7 @@ describe("Test various utils used by Dr.CI", () => {
});

test("test isInfraFlakyJob", () => {
// Not a workflow job
const notInfraFlakyFailure: RecentWorkflowsData = {
id: "A",
name: "A",
Expand All @@ -348,8 +349,13 @@ describe("Test various utils used by Dr.CI", () => {
};
expect(isInfraFlakyJob(notInfraFlakyFailure)).toEqual(false);

// Set the workflow ID to mark this as a workflow job
notInfraFlakyFailure.workflowId = "A";
expect(isInfraFlakyJob(notInfraFlakyFailure)).toEqual(false);

const notInfraFlakyFailureAgain: RecentWorkflowsData = {
id: "A",
workflowId: "A",
name: "A",
html_url: "A",
head_sha: "A",
Expand All @@ -364,6 +370,7 @@ describe("Test various utils used by Dr.CI", () => {

const isInfraFlakyFailure: RecentWorkflowsData = {
id: "A",
workflowId: "A",
name: "A",
html_url: "A",
head_sha: "A",
Expand All @@ -381,8 +388,8 @@ describe("Test various utils used by Dr.CI", () => {
const mockJobUtils = jest.spyOn(jobUtils, "hasS3Log");
mockJobUtils.mockImplementation(() => Promise.resolve(true));

// Has log and failure lines
const validFailure: RecentWorkflowsData = {
// Not a workflow job
const mockFailure: RecentWorkflowsData = {
id: "A",
name: "A",
html_url: "A",
Expand All @@ -394,11 +401,16 @@ describe("Test various utils used by Dr.CI", () => {
head_branch: "whatever",
runnerName: "dummy",
};
expect(await isLogClassifierFailed(validFailure)).toEqual(false);
expect(await isLogClassifierFailed(mockFailure)).toEqual(false);

// Has log and failure lines and is a workflow job
mockFailure.workflowId = "A";
expect(await isLogClassifierFailed(mockFailure)).toEqual(false);

// Has log but not failure lines (log classifier not triggered)
const logClassifierNotTriggered: RecentWorkflowsData = {
id: "A",
workflowId: "A",
name: "A",
html_url: "A",
head_sha: "A",
Expand All @@ -415,7 +427,7 @@ describe("Test various utils used by Dr.CI", () => {

// No S3 log
mockJobUtils.mockImplementation(() => Promise.resolve(false));
expect(await isLogClassifierFailed(validFailure)).toEqual(true);
expect(await isLogClassifierFailed(mockFailure)).toEqual(true);
});

test("test isExcludedFromFlakiness", () => {
Expand Down

0 comments on commit 630817a

Please sign in to comment.