From 5cde38cc1e53e56f2a134c3d374255c1f63e7ba3 Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 17 Sep 2024 10:22:01 -0400 Subject: [PATCH] Elide generating annotations with default values 1. Always generate an annotation with the version of the helm chart. 2. Do not generate fault tolerance annotations unless user explicitly sets them 3. Bump chart version to 1.1.0 4. Add unit tests for annotation generation --- tools/pytorchjob-generator/chart/Chart.yaml | 2 +- tools/pytorchjob-generator/chart/README.md | 16 ++--- .../chart/templates/appwrapper.yaml | 15 +++++ .../__snapshot__/helloworld_test.yaml.snap | 64 +++---------------- .../chart/tests/helloworld_test.yaml | 32 +++++++++- .../chart/values.schema.json | 44 ++++++++----- tools/pytorchjob-generator/chart/values.yaml | 21 ++++-- 7 files changed, 106 insertions(+), 88 deletions(-) diff --git a/tools/pytorchjob-generator/chart/Chart.yaml b/tools/pytorchjob-generator/chart/Chart.yaml index 47ea0e2..e5ea414 100644 --- a/tools/pytorchjob-generator/chart/Chart.yaml +++ b/tools/pytorchjob-generator/chart/Chart.yaml @@ -2,5 +2,5 @@ apiVersion: v2 name: pytorchjob-generator description: An AppWrapper generator for PyTorchJobs type: application -version: 1.0.0 +version: 1.1.0 appVersion: "v1beta2" diff --git a/tools/pytorchjob-generator/chart/README.md b/tools/pytorchjob-generator/chart/README.md index cb78bcd..c8b01a9 100644 --- a/tools/pytorchjob-generator/chart/README.md +++ b/tools/pytorchjob-generator/chart/README.md @@ -2,7 +2,7 @@ An AppWrapper generator for PyTorchJobs -![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1beta2](https://img.shields.io/badge/AppVersion-v1beta2-informational?style=flat-square) +![Version: 1.1.0](https://img.shields.io/badge/Version-1.1.0-informational?style=flat-square) ![Type: 
application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1beta2](https://img.shields.io/badge/AppVersion-v1beta2-informational?style=flat-square) ## Overview @@ -66,12 +66,12 @@ customize the Jobs generated by the tool. | Key | Type | Default | Description | |-----|------|---------|-------------| -| admissionGracePeriodDuration | string | `"60s"` | Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| warmupGracePeriodDuration | string | `"300s"` | Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| failureGracePeriodDuration | string | `"60s"` | Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| retryPausePeriodDuration | string | `"90s"` | Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| retryLimit | integer | `3` | Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| forcefulDeletionGracePeriodDuration | string | `"600s"` | Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| deletionOnFailureGracePeriodDuration | string | `"0s"` | Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| admissionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| warmupGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| failureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the failureGracePeriod; see 
https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| retryPausePeriodDuration | string | The AppWrapper defaults will be used | Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| retryLimit | integer | The AppWrapper defaults will be used | Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| forcefulDeletionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| deletionOnFailureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | | restartPolicy | string | `"Never"` | Set Kubernertes policy for restarting failed containers "in place" (without restarting the Pod). | | terminationGracePeriodSeconds | integer | Kubernetes's default value is used | Set a non-default pod termination grace period (in seconds). 
| diff --git a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml index 4281014..1cf56d3 100644 --- a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml +++ b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml @@ -54,13 +54,28 @@ metadata: name: {{ .Values.jobName }} namespace: {{ required "Please specify a 'namespace' in the user file" .Values.namespace }} annotations: + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: "{{ .Chart.Version }}" + {{- if .Values.admissionGracePeriodDuration }} workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: "{{ .Values.admissionGracePeriodDuration }}" + {{- end }} + {{- if .Values.warmupGracePeriodDuration }} workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: "{{ .Values.warmupGracePeriodDuration }}" + {{- end }} + {{- if .Values.failureGracePeriodDuration }} workload.codeflare.dev.appwrapper/failureGracePeriodDuration: "{{ .Values.failureGracePeriodDuration }}" + {{- end }} + {{- if .Values.retryPausePeriodDuration }} workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "{{ .Values.retryPausePeriodDuration }}" + {{- end }} + {{- /* retryLimit: 0 is a valid explicit setting, so test for "unset" (null) rather than truthiness */}}{{- if not (kindIs "invalid" .Values.retryLimit) }} workload.codeflare.dev.appwrapper/retryLimit: "{{ .Values.retryLimit }}" + {{- end }} + {{- if .Values.forcefulDeletionGracePeriodDuration }} workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "{{ .Values.forcefulDeletionGracePeriodDuration }}" + {{- end }} + {{- if .Values.deletionOnFailureGracePeriodDuration }} workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "{{ .Values.deletionOnFailureGracePeriodDuration }}" + {{- end }} labels: kueue.x-k8s.io/queue-name: {{ .Values.queueName }} {{- include "mlbatch.customLabels" .
| indent 8 }} diff --git a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap index 909dba6..683036e 100644 --- a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap +++ b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap @@ -4,13 +4,7 @@ Adding Volume Mounts: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -149,13 +143,7 @@ Adding initContainers: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -300,13 +288,7 @@ AppWrapper metadata should match snapshot: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - 
workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -425,13 +407,7 @@ AppWrapper spec should match snapshot: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -550,13 +526,7 @@ Enabling NVMe: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -705,13 +675,7 @@ Enabling RoCE GDR: kind: AppWrapper metadata: annotations: 
- workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -862,13 +826,7 @@ Enabling all advanced features at once: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -1123,13 +1081,7 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: 
kueue.x-k8s.io/queue-name: default-queue name: my-job diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml index 1ad0a22..13453e7 100644 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml @@ -134,7 +134,37 @@ tests: command: ["sh", "-c", "echo hello world!"] asserts: - matchSnapshot: path: spec.components[0].template + +- it: Setting fault tolerance annotations + set: + admissionGracePeriodDuration: "10s" + warmupGracePeriodDuration: "11s" + failureGracePeriodDuration: "22s" + retryPausePeriodDuration: "17s" + retryLimit: 42 + forcefulDeletionGracePeriodDuration: "19s" + deletionOnFailureGracePeriodDuration: "2s" + asserts: + - isSubset: + path: metadata.annotations + content: + workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: "10s" + workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: "11s" + workload.codeflare.dev.appwrapper/failureGracePeriodDuration: "22s" + workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "17s" + workload.codeflare.dev.appwrapper/retryLimit: "42" + workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "19s" + workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "2s" + +- it: Setting just one tolerance annotation + set: + deletionOnFailureGracePeriodDuration: "6h" + asserts: + - isSubset: + path: metadata.annotations + content: + workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "6h" - it: Enabling all advanced features at once set: diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json index 6da4018..faede94 100644 --- a/tools/pytorchjob-generator/chart/values.schema.json +++ b/tools/pytorchjob-generator/chart/values.schema.json @@ -4,14 +4,7 @@ "required": [ "namespace", "jobName", -
"containerImage", - "admissionGracePeriodDuration", - "warmupGracePeriodDuration", - "failureGracePeriodDuration", - "retryPausePeriodDuration", - "retryLimit", - "forcefulDeletionGracePeriodDuration", - "deletionOnFailureGracePeriodDuration" + "containerImage" ], "additionalProperties": false, "properties": { @@ -125,13 +118,34 @@ { "type": "null" }, { "type": "integer", "minimum": 0 } ]}, - "admissionGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "warmupGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "failureGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "retryPausePeriodDuration": { "$ref": "#/$defs/duration" }, - "retryLimit": { "type": "integer", "minimum": 0, "maximum": 100 }, - "forcefulDeletionGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "deletionOnFailureGracePeriodDuration" : { "$ref": "#/$defs/duration" } + "admissionGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "warmupGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "failureGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "retryPausePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "retryLimit": { "oneOf" : [ + { "type": "null" }, + { "type": "integer", "minimum": 0, "maximum": 100 } + ]}, + "forcefulDeletionGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "deletionOnFailureGracePeriodDuration" : { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]} }, "if": { diff --git a/tools/pytorchjob-generator/chart/values.yaml b/tools/pytorchjob-generator/chart/values.yaml index f10dc3e..0fec5ce 100644 --- a/tools/pytorchjob-generator/chart/values.yaml +++ b/tools/pytorchjob-generator/chart/values.yaml @@ -229,31 +229,38 @@ serviceAccountName: # service account name # -- (string) Customize the admissionGracePeriod; 
see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -admissionGracePeriodDuration: "60s" +# @default -- The AppWrapper defaults will be used +admissionGracePeriodDuration: # -- (string) Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -warmupGracePeriodDuration: "300s" +# @default -- The AppWrapper defaults will be used +warmupGracePeriodDuration: # -- (string) Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -failureGracePeriodDuration: "60s" +# @default -- The AppWrapper defaults will be used +failureGracePeriodDuration: # -- (string) Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -retryPausePeriodDuration: "90s" +# @default -- The AppWrapper defaults will be used +retryPausePeriodDuration: # -- (integer) Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -retryLimit: 3 +# @default -- The AppWrapper defaults will be used +retryLimit: # -- (string) Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -forcefulDeletionGracePeriodDuration: "600s" +# @default -- The AppWrapper defaults will be used +forcefulDeletionGracePeriodDuration: # -- (string) Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -deletionOnFailureGracePeriodDuration: "0s" +# @default -- The AppWrapper defaults will be used +deletionOnFailureGracePeriodDuration: # -- (string) Set Kubernertes policy for restarting failed containers "in place" (without restarting the Pod). # @section -- Fault Tolerance