Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
make build-installer -e TAG=${RELEASE_VERSION} -e quay_repository=quay.io/ibm

- name: Create GitHub Release
uses: softprops/action-gh-release@v1
uses: softprops/action-gh-release@v2
with:
name: Release ${{ env.RELEASE_VERSION }}
generate_release_notes: true
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ To install the latest release of AppWrapper in a Kubernetes cluster with Kueue a
and configured, simply run the command:

```sh
kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v0.27.0/install.yaml
kubectl apply --server-side -f https://github.com/project-codeflare/appwrapper/releases/download/v0.28.0/install.yaml
```

The controller runs in the `appwrapper-system` namespace.
Expand Down
2 changes: 1 addition & 1 deletion docs/release_instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ will:
go.mod.

5. Update the kustomization.yaml files in MLBatch to refer to the new release:
+ setup.k8s-v1.25/appwrapper/kustomization.yaml
+ setup.k8s-v1.27/appwrapper/kustomization.yaml
+ setup.k8s-v1.30/appwrapper/kustomization.yaml
21 changes: 15 additions & 6 deletions hack/create-test-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Create and configure a kind cluster for running the e2e tests
# Does NOT install mcad
# Create and optionally configure a kind cluster for running the e2e tests

export ROOT_DIR="$(dirname "$(dirname "$(readlink -fn "$0")")")"
CLUSTER_STARTED="false"
CONFIGURE_CLUSTER=${CONFIGURE_CLUSTER:-"true"}

source ${ROOT_DIR}/hack/e2e-util.sh

update_test_host
check_prerequisites
pull_images
if [[ "$CONFIGURE_CLUSTER" == "true" ]]
then
update_test_host
check_prerequisites
pull_images
fi

kind_up_cluster
add_virtual_GPUs
configure_cluster

if [[ "$CONFIGURE_CLUSTER" == "true" ]]
then
kind_load_images
configure_cluster
fi
2 changes: 1 addition & 1 deletion hack/default-queues.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
namespace: "default"
name: "user-queue"
name: "default-queue"
spec:
clusterQueue: "cluster-queue"
66 changes: 60 additions & 6 deletions hack/e2e-util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@

export LOG_LEVEL=${TEST_LOG_LEVEL:-2}
export CLEANUP_CLUSTER=${CLEANUP_CLUSTER:-"true"}
export CLUSTER_CONTEXT="--name test"
export CLUSTER_CONTEXT=${CLUSTER_CONTEXT:-"--name test"}
export KIND_OPT=${KIND_OPT:=" --config ${ROOT_DIR}/hack/kind-config.yaml"}
export KIND_K8S_VERSION=${KIND_K8S_VERSION:-"1.27"}
export KA_BIN=_output/bin
export WAIT_TIME="20s"
export KUTTL_VERSION=0.15.0
Expand Down Expand Up @@ -61,9 +62,9 @@ function update_test_host {
which kind >/dev/null 2>&1
if [ $? -ne 0 ]
then
# Download kind binary (0.24.0)
echo "Downloading and installing kind v0.24.0...."
sudo curl -o /usr/local/bin/kind -L https://github.com/kubernetes-sigs/kind/releases/download/v0.24.0/kind-linux-${arch} && \
# Download kind binary (0.25.0)
echo "Downloading and installing kind v0.25.0...."
sudo curl -o /usr/local/bin/kind -L https://github.com/kubernetes-sigs/kind/releases/download/v0.25.0/kind-linux-${arch} && \
sudo chmod +x /usr/local/bin/kind
[ $? -ne 0 ] && echo "Failed to download kind" && exit 1
echo "Kind was successfully installed."
Expand Down Expand Up @@ -154,15 +155,68 @@ function pull_images {
}

function kind_up_cluster {
echo "Running kind: [kind create cluster ${CLUSTER_CONTEXT} ${KIND_OPT}]"
kind create cluster ${CLUSTER_CONTEXT} ${KIND_OPT} --wait ${WAIT_TIME}
# Determine node image tag based on kind version and desired kubernetes version
KIND_ACTUAL_VERSION=$(kind version | awk '/ /{print $2}')
case $KIND_ACTUAL_VERSION in
v0.25.0)
case $KIND_K8S_VERSION in
1.27)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.27.16@sha256:2d21a61643eafc439905e18705b8186f3296384750a835ad7a005dceb9546d20"}
;;
1.29)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.29.10@sha256:3b2d8c31753e6c8069d4fc4517264cd20e86fd36220671fb7d0a5855103aa84b"}
;;
1.30)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.30.6@sha256:b6d08db72079ba5ae1f4a88a09025c0a904af3b52387643c285442afb05ab994"}
;;
1.31)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.31.2@sha256:18fbefc20a7113353c7b75b5c869d7145a6abd6269154825872dc59c1329912e"}
;;
*)
echo "Unexpected kubernetes version: $KIND_K8S_VERSION"
exit 1
;;
esac
;;

v0.24.0)
case $KIND_K8S_VERSION in
1.27)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.27.16@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe"}
;;
1.29)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.29.8@sha256:d46b7aa29567e93b27f7531d258c372e829d7224b25e3fc6ffdefed12476d3aa"}
;;
1.30)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.30.4@sha256:976ea815844d5fa93be213437e3ff5754cd599b040946b5cca43ca45c2047114"}
;;
1.31)
KIND_NODE_TAG=${KIND_NODE_TAG:="v1.31.0@sha256:53df588e04085fd41ae12de0c3fe4c72f7013bba32a20e7325357a1ac94ba865"}
;;
*)
echo "Unexpected kubernetes version: $KIND_K8S_VERSION"
exit 1
;;
esac
;;

*)
echo "Unexpected kind version: $KIND_ACTUAL_VERSION"
exit 1
;;
esac

echo "Running kind: [kind create cluster ${CLUSTER_CONTEXT} --image kindest/node:${KIND_NODE_TAG} ${KIND_OPT}]"
kind create cluster ${CLUSTER_CONTEXT} --image kindest/node:${KIND_NODE_TAG} ${KIND_OPT} --wait ${WAIT_TIME}
if [ $? -ne 0 ]
then
echo "Failed to start kind cluster"
exit 1
fi
CLUSTER_STARTED="true"
}

function kind_load_images {
for image in ${IMAGE_ECHOSERVER} ${IMAGE_BUSY_BOX_LATEST} ${IMAGE_KUBEFLOW_OPERATOR} ${IMAGE_KUBERAY_OPERATOR}
do
kind load docker-image ${image} ${CLUSTER_CONTEXT}
Expand Down
8 changes: 0 additions & 8 deletions hack/kind-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,6 @@ kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
# 1 control plane node and 2 worker nodes
nodes:
# the control plane node config
- role: control-plane
# kubernetes version 1.27.17 from kind v0.24.0
image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe
# the workers
- role: worker
# kubernetes version 1.27.17 from kind v0.24.0
image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe
- role: worker
# kubernetes version 1.27.17 from kind v0.24.0
image: kindest/node:v1.27.17@sha256:3fd82731af34efe19cd54ea5c25e882985bafa2c9baefe14f8deab1737d9fabe
3 changes: 0 additions & 3 deletions hack/kueue-config/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: kueue-system

resources:
- "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.8.3"

configMapGenerator:
- name: manager-config
namespace: kueue-system
behavior: replace
files:
- controller_manager_config.yaml
Expand Down
25 changes: 15 additions & 10 deletions internal/controller/appwrapper/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,21 +268,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}

// Detect externally deleted components and transition to Failed with no GracePeriod or retry
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
if compStatus.deployed != compStatus.expected {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
// There may be a lag before created resources become visible in the cache; don't react too quickly.
whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
graceDuration := r.admissionGraceDuration(ctx, aw)
if time.Now().After(whenDeployed.Add(graceDuration)) {
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
}
}

// If a component's controller has put it into a failed state, we do not need
// to allow a grace period. The situation will not self-correct.
detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed)
detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed)
if compStatus.failed > 0 {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Expand Down
12 changes: 12 additions & 0 deletions internal/controller/appwrapper/resource_management.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,18 @@ func (r *AppWrapperReconciler) createComponent(ctx context.Context, aw *workload
}
// fall through. This is not actually an error. The object already exists and the correct appwrapper owns it.
} else {
// resource not actually created; patch status to reflect that
orig := copyForStatusPatch(aw)
meta.SetStatusCondition(&aw.Status.ComponentStatus[componentIdx].Conditions, metav1.Condition{
Type: string(workloadv1beta2.ResourcesDeployed),
Status: metav1.ConditionFalse,
Reason: "ComponentCreationErrored",
})
if patchErr := r.Status().Patch(ctx, aw, client.MergeFrom(orig)); patchErr != nil {
// ugh. Patch failed, so retry the create so we can get to a consistent state
return patchErr, false
}
// return actual error
return err, meta.IsNoMatchError(err) || apierrors.IsInvalid(err) // fatal
}
}
Expand Down
4 changes: 2 additions & 2 deletions internal/webhook/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ var cancel context.CancelFunc

const limitedUserName = "limited-user"
const limitedUserID = "8da0fcfe-6d7f-4f44-b433-d91d22cc1b8c"
const defaultQueueName = "default-queue"
const userProvidedQueueName = "user-queue"
const defaultQueueName = "system-default-queue"
const userProvidedQueueName = "user-provided-queue"

func TestWebhooks(t *testing.T) {
RegisterFailHandler(Fail)
Expand Down
6 changes: 5 additions & 1 deletion pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,11 @@ func ValidateAppWrapperConfig(config *AppWrapperConfig) error {
config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.GracePeriodMaximum)
}
if config.FaultTolerance.WarmupGracePeriod > config.FaultTolerance.GracePeriodMaximum {
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
return fmt.Errorf("WarmupGracePeriod %v exceeds GracePeriodCeiling %v",
config.FaultTolerance.WarmupGracePeriod, config.FaultTolerance.GracePeriodMaximum)
}
if config.FaultTolerance.AdmissionGracePeriod > config.FaultTolerance.WarmupGracePeriod {
return fmt.Errorf("AdmissionGracePeriod %v exceeds WarmupGracePeriod %v",
config.FaultTolerance.AdmissionGracePeriod, config.FaultTolerance.WarmupGracePeriod)
}
if config.FaultTolerance.SuccessTTL <= 0 {
Expand Down
3 changes: 3 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ var _ = Describe("AppWrapper Config", func() {
bad = &FaultToleranceConfig{WarmupGracePeriod: 10 * time.Second, GracePeriodMaximum: 1 * time.Second}
Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed())

bad = &FaultToleranceConfig{AdmissionGracePeriod: 10 * time.Second, WarmupGracePeriod: 1 * time.Second}
Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed())

bad = &FaultToleranceConfig{SuccessTTL: -1 * time.Second}
Expect(ValidateAppWrapperConfig(&AppWrapperConfig{FaultTolerance: bad})).ShouldNot(Succeed())
})
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-deployment
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
spec:
components:
- template:
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-failing-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-failing-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
annotations:
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s
workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-failing-pod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-failing-pod
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
annotations:
workload.codeflare.dev.appwrapper/failureGracePeriodDuration: 10s
workload.codeflare.dev.appwrapper/retryPausePeriodDuration: 10s
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-failing-pytorch-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-failing-pytorch-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
spec:
components:
- template:
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-gpu-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-gpu-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
annotations:
workload.codeflare.dev.appwrapper/successTTLDuration: "1m"
spec:
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
annotations:
workload.codeflare.dev.appwrapper/successTTLDuration: "1m"
spec:
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-pod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-pod
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
spec:
components:
- template:
Expand Down
2 changes: 1 addition & 1 deletion samples/wrapped-pytorch-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: AppWrapper
metadata:
name: sample-pytorch-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
spec:
components:
- template:
Expand Down
2 changes: 1 addition & 1 deletion site/_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repository: project-codeflare/appwrapper

# Variables for use in pages
gh_main_url: https://github.com/project-codeflare/appwrapper/blob/main
appwrapper_version: v0.27.0
appwrapper_version: v0.28.0

# Outputting
permalink: /:categories/:title/
Expand Down
2 changes: 1 addition & 1 deletion site/_pages/dev-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ You can verify Kueue is configured as expected with:
```sh
% kubectl get localqueues,clusterqueues -o wide
NAME CLUSTERQUEUE PENDING WORKLOADS ADMITTED WORKLOADS
localqueue.kueue.x-k8s.io/user-queue cluster-queue 0 0
localqueue.kueue.x-k8s.io/default-queue cluster-queue 0 0

NAME COHORT STRATEGY PENDING WORKLOADS ADMITTED WORKLOADS
clusterqueue.kueue.x-k8s.io/cluster-queue BestEffortFIFO 0 0
Expand Down
2 changes: 1 addition & 1 deletion site/_pages/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
namespace: "default"
name: "user-queue"
name: "default-queue"
spec:
clusterQueue: "cluster-queue"
```
Expand Down
2 changes: 1 addition & 1 deletion site/_pages/sample-batch-job.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ kind: AppWrapper
metadata:
name: sample-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
spec:
components:
- template:
Expand Down
2 changes: 1 addition & 1 deletion site/_pages/sample-pytorch.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ kind: AppWrapper
metadata:
name: sample-pytorch-job
labels:
kueue.x-k8s.io/queue-name: user-queue
kueue.x-k8s.io/queue-name: default-queue
spec:
components:
- template:
Expand Down
Loading
Loading