diff --git a/tools/pytorchjob-generator/chart/Chart.yaml b/tools/pytorchjob-generator/chart/Chart.yaml index 47ea0e2..e5ea414 100644 --- a/tools/pytorchjob-generator/chart/Chart.yaml +++ b/tools/pytorchjob-generator/chart/Chart.yaml @@ -2,5 +2,5 @@ apiVersion: v2 name: pytorchjob-generator description: An AppWrapper generator for PyTorchJobs type: application -version: 1.0.0 +version: 1.1.0 appVersion: "v1beta2" diff --git a/tools/pytorchjob-generator/chart/README.md b/tools/pytorchjob-generator/chart/README.md index cb78bcd..c8b01a9 100644 --- a/tools/pytorchjob-generator/chart/README.md +++ b/tools/pytorchjob-generator/chart/README.md @@ -2,7 +2,7 @@ An AppWrapper generator for PyTorchJobs -![Version: 1.0.0](https://img.shields.io/badge/Version-1.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1beta2](https://img.shields.io/badge/AppVersion-v1beta2-informational?style=flat-square) +![Version: 1.1.0](https://img.shields.io/badge/Version-1.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1beta2](https://img.shields.io/badge/AppVersion-v1beta2-informational?style=flat-square) ## Overview @@ -66,12 +66,12 @@ customize the Jobs generated by the tool. 
| Key | Type | Default | Description | |-----|------|---------|-------------| -| admissionGracePeriodDuration | string | `"60s"` | Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| warmupGracePeriodDuration | string | `"300s"` | Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| failureGracePeriodDuration | string | `"60s"` | Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| retryPausePeriodDuration | string | `"90s"` | Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| retryLimit | integer | `3` | Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| forcefulDeletionGracePeriodDuration | string | `"600s"` | Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | -| deletionOnFailureGracePeriodDuration | string | `"0s"` | Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| admissionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| warmupGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| failureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| retryPausePeriodDuration | string | The AppWrapper defaults will be used | Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| retryLimit | integer | The 
AppWrapper defaults will be used | Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| forcefulDeletionGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the forcefulDelectionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | +| deletionOnFailureGracePeriodDuration | string | The AppWrapper defaults will be used | Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ | | restartPolicy | string | `"Never"` | Set Kubernertes policy for restarting failed containers "in place" (without restarting the Pod). | | terminationGracePeriodSeconds | integer | Kubernetes's default value is used | Set a non-default pod termination grace period (in seconds). | diff --git a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml index 4281014..1cf56d3 100644 --- a/tools/pytorchjob-generator/chart/templates/appwrapper.yaml +++ b/tools/pytorchjob-generator/chart/templates/appwrapper.yaml @@ -54,13 +54,28 @@ metadata: name: {{ .Values.jobName }} namespace: {{ required "Please specify a 'namespace' in the user file" .Values.namespace }} annotations: + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: "{{ .Chart.Version }}" + {{- if .Values.admissionGracePeriodDuration }} workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: "{{ .Values.admissionGracePeriodDuration }}" + {{- end }} + {{- if .Values.warmupGracePeriodDuration }} workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: "{{ .Values.warmupGracePeriodDuration }}" + {{- end }} + {{- if .Values.failureGracePeriodDuration }} workload.codeflare.dev.appwrapper/failureGracePeriodDuration: "{{ .Values.failureGracePeriodDuration }}" + {{- end }} + {{- if .Values.retryPausePeriodDuration }} workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "{{ 
.Values.retryPausePeriodDuration }}" + {{- end }} + {{- if not (kindIs "invalid" .Values.retryLimit) }}{{/* retryLimit 0 is a valid setting (schema minimum is 0), so test for unset rather than truthiness */}} workload.codeflare.dev.appwrapper/retryLimit: "{{ .Values.retryLimit }}" + {{- end }} + {{- if .Values.forcefulDeletionGracePeriodDuration }} workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "{{ .Values.forcefulDeletionGracePeriodDuration }}" + {{- end }} + {{- if .Values.deletionOnFailureGracePeriodDuration }} workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "{{ .Values.deletionOnFailureGracePeriodDuration }}" + {{- end }} labels: kueue.x-k8s.io/queue-name: {{ .Values.queueName }} {{- include "mlbatch.customLabels" . | indent 8 }} diff --git a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap index 909dba6..683036e 100644 --- a/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap +++ b/tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap @@ -4,13 +4,7 @@ Adding Volume Mounts: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -149,13 +143,7 @@ Adding initContainers: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - 
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -300,13 +288,7 @@ AppWrapper metadata should match snapshot: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -425,13 +407,7 @@ AppWrapper spec should match snapshot: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -550,13 +526,7 @@ Enabling NVMe: kind: AppWrapper metadata: annotations: - 
workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -705,13 +675,7 @@ Enabling RoCE GDR: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ -862,13 +826,7 @@ Enabling all advanced features at once: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job @@ 
-1123,13 +1081,7 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts: kind: AppWrapper metadata: annotations: - workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: 0s - workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 60s - workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: 600s - workload.codeflare.dev.appwrapper/retryLimit: "3" - workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 90s - workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: 300s + workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.0 labels: kueue.x-k8s.io/queue-name: default-queue name: my-job diff --git a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml index 1ad0a22..13453e7 100644 --- a/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml +++ b/tools/pytorchjob-generator/chart/tests/helloworld_test.yaml @@ -134,7 +134,37 @@ tests: command: ["sh", "-c", "echo hello world!"] asserts: - matchSnapshot: - path: spec.components[0].template + path: spec.components[0].template + +- it: Setting fault tolerance annotations + set: + admissionGracePeriodDuration: "10s" + warmupGracePeriodDuration: "11s" + failureGracePeriodDuration: "22s" + retryPausePeriodDuration: "17s" + retryLimit: 42 + forcefulDeletionGracePeriodDuration: "19s" + deletionOnFailureGracePeriodDuration: "2s" + asserts: + - isSubset: + path: metadata.annotations + content: + workload.codeflare.dev.appwrapper/admissionGracePeriodDuration: "10s" + workload.codeflare.dev.appwrapper/warmupGracePeriodDuration: "11s" + workload.codeflare.dev.appwrapper/failureGracePeriodDuration: "22s" + workload.codeflare.dev.appwrapper/retryPausePeriodDuration: "17s" + workload.codeflare.dev.appwrapper/retryLimit: "42" + workload.codeflare.dev.appwrapper/forcefulDeletionGracePeriodDuration: "19s" + 
workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "2s" + +- it: Setting just one tolerance annotation + set: + deletionOnFailureGracePeriodDuration: "6h" + asserts: + - isSubset: + path: metadata.annotations + content: + workload.codeflare.dev.appwrapper/deletionOnFailureGracePeriodDuration: "6h" - it: Enabling all advanced features at once set: diff --git a/tools/pytorchjob-generator/chart/values.schema.json b/tools/pytorchjob-generator/chart/values.schema.json index 6da4018..faede94 100644 --- a/tools/pytorchjob-generator/chart/values.schema.json +++ b/tools/pytorchjob-generator/chart/values.schema.json @@ -4,14 +4,7 @@ "required": [ "namespace", "jobName", - "containerImage", - "admissionGracePeriodDuration", - "warmupGracePeriodDuration", - "failureGracePeriodDuration", - "retryPausePeriodDuration", - "retryLimit", - "forcefulDeletionGracePeriodDuration", - "deletionOnFailureGracePeriodDuration" + "containerImage" ], "additionalProperties": false, "properties": { @@ -125,13 +118,34 @@ { "type": "null" }, { "type": "integer", "minimum": 0 } ]}, - "admissionGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "warmupGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "failureGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "retryPausePeriodDuration": { "$ref": "#/$defs/duration" }, - "retryLimit": { "type": "integer", "minimum": 0, "maximum": 100 }, - "forcefulDeletionGracePeriodDuration": { "$ref": "#/$defs/duration" }, - "deletionOnFailureGracePeriodDuration" : { "$ref": "#/$defs/duration" } + "admissionGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "warmupGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "failureGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "retryPausePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "retryLimit": 
{ "oneOf" : [ + { "type": "null" }, + { "type": "integer", "minimum": 0, "maximum": 100 } + ]}, + "forcefulDeletionGracePeriodDuration": { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]}, + "deletionOnFailureGracePeriodDuration" : { "oneOf" : [ + { "type": "null" }, + { "$ref": "#/$defs/duration" } + ]} }, "if": { diff --git a/tools/pytorchjob-generator/chart/values.yaml b/tools/pytorchjob-generator/chart/values.yaml index f10dc3e..0fec5ce 100644 --- a/tools/pytorchjob-generator/chart/values.yaml +++ b/tools/pytorchjob-generator/chart/values.yaml @@ -229,31 +229,38 @@ serviceAccountName: # service account name # -- (string) Customize the admissionGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -admissionGracePeriodDuration: "60s" +# @default -- The AppWrapper defaults will be used +admissionGracePeriodDuration: # -- (string) Customize the warmupGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -warmupGracePeriodDuration: "300s" +# @default -- The AppWrapper defaults will be used +warmupGracePeriodDuration: # -- (string) Customize the failureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -failureGracePeriodDuration: "60s" +# @default -- The AppWrapper defaults will be used +failureGracePeriodDuration: # -- (string) Customize the retryPausePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -retryPausePeriodDuration: "90s" +# @default -- The AppWrapper defaults will be used +retryPausePeriodDuration: # -- (integer) Customize the retryLimit; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -retryLimit: 3 +# @default -- The AppWrapper defaults will be used +retryLimit: # -- (string) Customize the forcefulDelectionGracePeriod; see 
https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -forcefulDeletionGracePeriodDuration: "600s" +# @default -- The AppWrapper defaults will be used +forcefulDeletionGracePeriodDuration: # -- (string) Customize the deletionOnFailureGracePeriod; see https://project-codeflare.github.io/appwrapper/arch-fault-tolerance/ # @section -- Fault Tolerance -deletionOnFailureGracePeriodDuration: "0s" +# @default -- The AppWrapper defaults will be used +deletionOnFailureGracePeriodDuration: # -- (string) Set Kubernertes policy for restarting failed containers "in place" (without restarting the Pod). # @section -- Fault Tolerance