diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a1dcdad..6babbd9 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -49,7 +49,7 @@ jobs: make build-installer -e TAG=${RELEASE_VERSION} -e quay_repository=quay.io/ibm - name: Create GitHub Release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: name: Release ${{ env.RELEASE_VERSION }} generate_release_notes: true diff --git a/README.md b/README.md index 3edc862..5a23458 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ To install the latest release of AppWrapper in a Kubernetes cluster with Kueue a and configured, simply run the command: ```sh -kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v0.27.0/install.yaml +kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v0.28.0/install.yaml ``` The controller runs in the `appwrapper-system` namespace. diff --git a/docs/release_instructions.md b/docs/release_instructions.md index 9e95b18..e0c7161 100644 --- a/docs/release_instructions.md +++ b/docs/release_instructions.md @@ -24,5 +24,5 @@ will: go.mod. 5. Update the kustomization.yaml files in MLBatch to refer to the new release: - + setup.k8s-v1.25/appwrapper/kustomization.yaml + + setup.k8s-v1.27/appwrapper/kustomization.yaml + setup.k8s-v1.30/appwrapper/kustomization.yaml diff --git a/hack/create-test-cluster.sh b/hack/create-test-cluster.sh index 30fa3d5..ccc9387 100755 --- a/hack/create-test-cluster.sh +++ b/hack/create-test-cluster.sh @@ -12,17 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Create and configure a kind cluster for running the e2e tests -# Does NOT install mcad +# Create and optionally configure a kind cluster for running the e2e tests export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")" CLUSTER_STARTED="false" +CONFIGURE_CLUSTER=${CONFIGURE_CLUSTER:-"true"} source ${ROOT_DIR}/hack/e2e-util.sh -update_test_host -check_prerequisites -pull_images +if [[ "$CONFIGURE_CLUSTER" == "true" ]] +then + update_test_host + check_prerequisites + pull_images +fi + kind_up_cluster add_virtual_GPUs -configure_cluster + +if [[ "$CONFIGURE_CLUSTER" == "true" ]] +then + kind_load_images + configure_cluster +fi diff --git a/hack/default-queues.yaml b/hack/default-queues.yaml index 858b23c..23497c2 100644 --- a/hack/default-queues.yaml +++ b/hack/default-queues.yaml @@ -23,6 +23,6 @@ apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: namespace: "default" - name: "user-queue" + name: "default-queue" spec: clusterQueue: "cluster-queue" diff --git a/hack/e2e-util.sh b/hack/e2e-util.sh index bf3a637..ee4cd7d 100755 --- a/hack/e2e-util.sh +++ b/hack/e2e-util.sh @@ -14,8 +14,9 @@ export LOG_LEVEL=${TEST_LOG_LEVEL:-2} export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"} -export CLUSTER_CONTEXT="--name test" +export CLUSTER_CONTEXT=${CLUSTER_CONTEXT:-"--name test"} export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/kind-config.yaml"} +export KIND_K8S_VERSION=${KIND_K8S_VERSION:-"1.27"} export KA_BIN=_output/bin export WAIT_TIME="20s" export KUTTL_VERSION=0.15.0 @@ -61,9 +62,9 @@ function update_test_host { which kind >/dev/null 2>&1 if [ $? -ne 0 ] then - # Download kind binary (0.24.0) - echo "Downloading and installing kind v0.24.0...." 
- sudo curl -o /usr/local/bin/kind -L https://github.com/kubernetes-sigs/kind/releases/download/v0.24.0/kind-linux-${arch} && \ + # Download kind binary (0.25.0) + echo "Downloading and installing kind v0.25.0...." + sudo curl -o /usr/local/bin/kind -L https://github.com/kubernetes-sigs/kind/releases/download/v0.25.0/kind-linux-${arch} && \ sudo chmod +x /usr/local/bin/kind [ $? -ne 0 ] && echo "Failed to download kind" && exit 1 echo "Kind was sucessfully installed." @@ -154,15 +155,68 @@ function pull_images { } function kind_up_cluster { - echo "Running kind: [kind create cluster ${CLUSTER_CONTEXT} ${KIND_OPT}]" - kind create cluster ${CLUSTER_CONTEXT} ${KIND_OPT} --wait ${WAIT_TIME} + # Determine node image tag based on kind version and desired kubernetes version + KIND_ACTUAL_VERSION=$(kind version | awk '/ /{print $2}') + case $KIND_ACTUAL_VERSION in + v0.25.0) + case $KIND_K8S_VERSION in + 1.27) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.27.16@sha256:2d21a61643eafc439905e18705b8186f3296384750a835ad7a005dceb9546d20"} + ;; + 1.29) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.29.10@sha256:3b2d8c31753e6c8069d4fc4517264cd20e86fd36220671fb7d0a5855103aa84b"} + ;; + 1.30) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.30.6@sha256:b6d08db72079ba5ae1f4a88a09025c0a904af3b52387643c285442afb05ab994"} + ;; + 1.31) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.31.2@sha256:18fbefc20a7113353c7b75b5c869d7145a6abd6269154825872dc59c1329912e"} + ;; + *) + echo "Unexpected kubernetes version: $KIND_K8S_VERSION" + exit 1 + ;; + esac + ;; + + v0.24.0) + case $KIND_K8S_VERSION in + 1.27) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe"} + ;; + 1.29) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.29.8@sha256:d46b7aa29567e93b27f7531d258c372e829d7224b25e3fc6ffdefed12476d3aa"} + ;; + 1.30) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.30.4@sha256:976ea815844d5fa93be213437e3ff5754cd599b040946b5cca43ca45c2047114"} + ;; + 1.31) + KIND_NODE_TAG=${KIND_NODE_TAG:="v1.31.0@sha256:53df588e04085fd41ae12de0c3fe4c72f7013bba32a20e7325357a1ac94ba865"} + ;; + *) + echo "Unexpected kubernetes version: $KIND_K8S_VERSION" + exit 1 + ;; + esac + ;; + + *) + echo "Unexpected kind version: $KIND_ACTUAL_VERSION" + exit 1 + ;; + esac + + echo "Running kind: [kind create cluster ${CLUSTER_CONTEXT} --image kindest/node:${KIND_NODE_TAG} ${KIND_OPT}]" + kind create cluster ${CLUSTER_CONTEXT} --image kindest/node:${KIND_NODE_TAG} ${KIND_OPT} --wait ${WAIT_TIME} if [ $?
-ne 0 ] then echo "Failed to start kind cluster" exit 1 fi CLUSTER_STARTED="true" +} +function kind_load_images { for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR} ${IMAGE_KUBERAY_OPERATOR} do kind load docker-image ${image} ${CLUSTER_CONTEXT} diff --git a/hack/kind-config.yaml b/hack/kind-config.yaml index d32e663..19f7fc2 100644 --- a/hack/kind-config.yaml +++ b/hack/kind-config.yaml @@ -2,14 +2,6 @@ kind: Cluster apiVersion: kind.x-k8s.io/v1alpha4 # 1 control plane node and 2 worker nodes nodes: -# the control plane node config - role: control-plane - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe -# the workers - role: worker - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe - role: worker - # kubernetes version 1.27.17 from kind v0.24.0 - image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe diff --git a/hack/kueue-config/kustomization.yaml b/hack/kueue-config/kustomization.yaml index b5af1c5..d80e583 100644 --- a/hack/kueue-config/kustomization.yaml +++ b/hack/kueue-config/kustomization.yaml @@ -1,14 +1,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: kueue-system - resources: - "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.8.3" configMapGenerator: - name: manager-config - namespace: kueue-system behavior: replace files: - controller_manager_config.yaml diff --git a/internal/controller/appwrapper/appwrapper_controller.go b/internal/controller/appwrapper/appwrapper_controller.go index 4f487a2..8fcbff7 100644 --- a/internal/controller/appwrapper/appwrapper_controller.go +++ b/internal/controller/appwrapper/appwrapper_controller.go @@ -268,21 +268,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request) } // Detect externally deleted components and transition to Failed with no GracePeriod or retry - detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) if compStatus.deployed != compStatus.expected { - meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ - Type: string(workloadv1beta2.Unhealthy), - Status: metav1.ConditionTrue, - Reason: "MissingComponent", - Message: detailMsg, - }) - r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) - return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + // There may be a lag before created resources become visible in the cache; don't react too quickly. 
+ whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime + graceDuration := r.admissionGraceDuration(ctx, aw) + if time.Now().After(whenDeployed.Add(graceDuration)) { + detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected) + meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ + Type: string(workloadv1beta2.Unhealthy), + Status: metav1.ConditionTrue, + Reason: "MissingComponent", + Message: detailMsg, + }) + r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg) + return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed) + } } // If a component's controller has put it into a failed state, we do not need // to allow a grace period. The situation will not self-correct. - detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed) + detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed) if compStatus.failed > 0 { meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{ Type: string(workloadv1beta2.Unhealthy), diff --git a/internal/controller/appwrapper/resource_management.go b/internal/controller/appwrapper/resource_management.go index eff0a7f..afa9b95 100644 --- a/internal/controller/appwrapper/resource_management.go +++ b/internal/controller/appwrapper/resource_management.go @@ -317,6 +317,18 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload } // fall through. This is not actually an error. The object already exists and the correct appwrapper owns it. } else { + // resource not actually created; patch status to reflect that + orig := copyForStatusPatch(aw) + meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{ + Type: string(workloadv1beta2.ResourcesDeployed), + Status: metav1.ConditionFalse, + Reason: "ComponentCreationErrored", + }) + if patchErr := r.Status().Patch(ctx, aw, client.MergeFrom(orig)); patchErr != nil { + // ugh. 
Patch failed, so retry the create so we can get to a consistent state + return patchErr, false + } + // return actual error + return err, meta.IsNoMatchError(err) || apierrors.IsInvalid(err) // fatal } } diff --git a/internal/webhook/suite_test.go b/internal/webhook/suite_test.go index fb52e77..0e7543f 100644 --- a/internal/webhook/suite_test.go +++ b/internal/webhook/suite_test.go @@ -62,8 +62,8 @@ var cancel context.CancelFunc const limitedUserName = "limited-user" const limitedUserID = "8da0fcfe-6d7f-4f44-b433-d91d22cc1b8c" -const defaultQueueName = "default-queue" -const userProvidedQueueName = "user-queue" +const defaultQueueName = "system-default-queue" +const userProvidedQueueName = "user-provided-queue" func TestWebhooks(t *testing.T) { RegisterFailHandler(Fail) diff --git a/pkg/config/config.go b/pkg/config/config.go index b730965..1720567 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -143,7 +143,11 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error { config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum) } if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum { return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v", config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum) } + if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.WarmupGracePeriod { + return fmt.Errorf("AdmissionGracePeriod %v exceeds WarmupGracePeriod %v", + config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.WarmupGracePeriod) + } if config.FaultTolerance.SuccessTTL <= 0 { diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index f04ce9b..9810907 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -56,6 +56,9 @@ var _ = Describe("AppWrapper Config", func() { bad = &FaultToleranceConfig{WarmupGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second} Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) + bad = &FaultToleranceConfig{AdmissionGracePeriod: 10 * time.Second, WarmupGracePeriod: 1 * time.Second} + Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) + bad = &FaultToleranceConfig{SuccessTTL: -1 * time.Second} Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed()) }) diff --git a/samples/wrapped-deployment.yaml b/samples/wrapped-deployment.yaml index a5d0ce1..b9e6c9c 100644 --- a/samples/wrapped-deployment.yaml +++ b/samples/wrapped-deployment.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-deployment labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/samples/wrapped-failing-job.yaml b/samples/wrapped-failing-job.yaml index e65f564..130166d 100644 --- a/samples/wrapped-failing-job.yaml +++ b/samples/wrapped-failing-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-failing-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
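Editor's aside on the `pkg/config/config.go` hunk above: with the added check, `ValidateAppWrapperConfig` now rejects a configuration whose fault-tolerance durations are out of order. A minimal sketch of the resulting invariant, using only the field names visible in this diff; the helper `faultToleranceOrderingOK` is hypothetical and not part of the PR:

```go
package config // editor's sketch; written as if it lived alongside pkg/config purely for illustration

// faultToleranceOrderingOK restates the ordering that ValidateAppWrapperConfig
// enforces after this change: AdmissionGracePeriod <= WarmupGracePeriod and
// WarmupGracePeriod <= GracePeriodMaximum, with SuccessTTL still required to be positive.
func faultToleranceOrderingOK(ft *FaultToleranceConfig) bool {
	return ft.AdmissionGracePeriod <= ft.WarmupGracePeriod &&
		ft.WarmupGracePeriod <= ft.GracePeriodMaximum &&
		ft.SuccessTTL > 0
}
```

In short: the admission grace period may not exceed the warmup grace period, which in turn may not exceed the grace period ceiling; the new config_test.go case above exercises exactly the first of these relations.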
diff --git a/samples/wrapped-failing-pod.yaml b/samples/wrapped-failing-pod.yaml index f13555b..eb02df5 100644 --- a/samples/wrapped-failing-pod.yaml +++ b/samples/wrapped-failing-pod.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-failing-pod labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s diff --git a/samples/wrapped-failing-pytorch-job.yaml b/samples/wrapped-failing-pytorch-job.yaml index d7bc7a2..e6cf910 100644 --- a/samples/wrapped-failing-pytorch-job.yaml +++ b/samples/wrapped-failing-pytorch-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-failing-pytorch-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/samples/wrapped-gpu-job.yaml b/samples/wrapped-gpu-job.yaml index b606324..bc0cf44 100644 --- a/samples/wrapped-gpu-job.yaml +++ b/samples/wrapped-gpu-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-gpu-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/successTTLDuration: "1m" spec: diff --git a/samples/wrapped-job.yaml b/samples/wrapped-job.yaml index af73824..f8f4f3f 100644 --- a/samples/wrapped-job.yaml +++ b/samples/wrapped-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue annotations: workload.codeflare.dev.appwrapper/successTTLDuration: "1m" spec: diff --git a/samples/wrapped-pod.yaml b/samples/wrapped-pod.yaml index 067e0eb..7ecd87e 100644 --- a/samples/wrapped-pod.yaml +++ b/samples/wrapped-pod.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-pod labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/samples/wrapped-pytorch-job.yaml b/samples/wrapped-pytorch-job.yaml index 5577325..18f5cc6 100644 --- a/samples/wrapped-pytorch-job.yaml +++ b/samples/wrapped-pytorch-job.yaml @@ -3,7 +3,7 @@ kind: AppWrapper metadata: name: sample-pytorch-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/site/_config.yml b/site/_config.yml index f0ae32d..4a2a980 100644 --- a/site/_config.yml +++ b/site/_config.yml @@ -25,7 +25,7 @@ repository: project-codeflare/appwrapper # Variables for use in pages gh_main_url: https://github.com/project-codeflare/appwrapper/blob/main -appwrapper_version: v0.27.0 +appwrapper_version: v0.28.0 # Outputting permalink: /:categories/:title/ diff --git a/site/_pages/dev-setup.md b/site/_pages/dev-setup.md index a12aed3..ddc4b8a 100644 --- a/site/_pages/dev-setup.md +++ b/site/_pages/dev-setup.md @@ -44,7 +44,7 @@ You can verify Kueue is configured as expected with: ```sh % kubectl get localqueues,clusterqueues -o wide NAME CLUSTERQUEUE PENDING WORKLOADS ADMITTED WORKLOADS -localqueue.kueue.x-k8s.io/user-queue cluster-queue 0 0 +localqueue.kueue.x-k8s.io/default-queue cluster-queue 0 0 NAME COHORT STRATEGY PENDING WORKLOADS ADMITTED WORKLOADS clusterqueue.kueue.x-k8s.io/cluster-queue BestEffortFIFO 0 0 diff --git a/site/_pages/quick-start.md b/site/_pages/quick-start.md index e475702..e276031 100644 --- a/site/_pages/quick-start.md +++ b/site/_pages/quick-start.md @@ -48,7 +48,7 @@ apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: namespace: "default" - name: "user-queue" + name: "default-queue" spec: clusterQueue: "cluster-queue" ``` diff --git
a/site/_pages/sample-batch-job.md b/site/_pages/sample-batch-job.md index 5a91cf8..dd34016 100644 --- a/site/_pages/sample-batch-job.md +++ b/site/_pages/sample-batch-job.md @@ -11,7 +11,7 @@ kind: AppWrapper metadata: name: sample-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/site/_pages/sample-pytorch.md b/site/_pages/sample-pytorch.md index 63362e1..845edac 100644 --- a/site/_pages/sample-pytorch.md +++ b/site/_pages/sample-pytorch.md @@ -11,7 +11,7 @@ kind: AppWrapper metadata: name: sample-pytorch-job labels: - kueue.x-k8s.io/queue-name: user-queue + kueue.x-k8s.io/queue-name: default-queue spec: components: - template: diff --git a/test/e2e/appwrapper_test.go b/test/e2e/appwrapper_test.go index 362f507..6b808a4 100644 --- a/test/e2e/appwrapper_test.go +++ b/test/e2e/appwrapper_test.go @@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() { Expect(aw.Status.Retries).Should(Equal(int32(2))) }) - It("Deleting a Running Component yields a failed AppWrapper", func() { - aw := createAppWrapper(ctx, pytorchjob(2, 500)) + It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() { + aw := toAppWrapper(pytorchjob(2, 500)) + if aw.Annotations == nil { + aw.Annotations = make(map[string]string) + } + aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s" + Expect(getClient(ctx).Create(ctx, aw)).To(Succeed()) appwrappers = append(appwrappers, aw) Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning)) aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})
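Editor's closing note on the revised e2e test: it shortens the admission grace window via the `workloadv1beta2.AdmissionGracePeriodDurationAnnotation` annotation, which is the window the reconciler change above now waits out before reporting `MissingComponent`. A hedged sketch of the override-with-fallback pattern this implies; this is not the PR's `admissionGraceDuration` implementation, and the helper name, import paths, and error handling are assumptions for illustration only:

```go
package sketch // editor's illustration, not part of this PR

import (
	"time"

	workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2" // assumed import path
	"github.com/project-codeflare/appwrapper/pkg/config"
)

// admissionGraceFor resolves the admission grace period for a single AppWrapper,
// preferring the per-workload annotation (as set in the test above) and falling
// back to the configured FaultTolerance default when the annotation is absent or unparsable.
func admissionGraceFor(aw *workloadv1beta2.AppWrapper, defaults *config.FaultToleranceConfig) time.Duration {
	if s, ok := aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation]; ok {
		if d, err := time.ParseDuration(s); err == nil {
			return d
		}
	}
	return defaults.AdmissionGracePeriod
}
```

Setting the annotation to "5s", as the test does, therefore lets the "externally deleted component" scenario fail fast instead of waiting for the cluster-wide default.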