From a604c2e2f2b667e852a229a5de785b14e119fa32 Mon Sep 17 00:00:00 2001 From: David Grove Date: Tue, 28 Jan 2025 16:35:35 -0500 Subject: [PATCH] update vanilla Kubernetes configuration to Kueue 0.10.1 Simplify mlbatch setup by using the managedJobsNamespaceSelector feature that was added in Kueue 0.10.0. --- SETUP.md | 15 +- setup.k8s-v1.27/TEAM-SETUP.md | 96 ------------ setup.k8s-v1.27/kueue/kustomization.yaml | 53 ------- setup.k8s-v1.30/CLUSTER-SETUP.md | 137 ------------------ setup.k8s-v1.30/UNINSTALL.md | 24 --- setup.k8s-v1.30/admission-policy.yaml | 34 ----- setup.k8s-v1.30/appwrapper/config_patch.yaml | 23 --- setup.k8s-v1.30/appwrapper/kustomization.yaml | 22 --- .../appwrapper/manager_resources_patch.yaml | 18 --- .../appwrapper/remove_default_namespace.yaml | 5 - .../coscheduler-priority-patch.yaml | 3 - setup.k8s-v1.30/default-flavor.yaml | 4 - setup.k8s-v1.30/kind/kind-config.yaml | 11 -- setup.k8s-v1.30/kuberay/kustomization.yaml | 17 --- .../kuberay/manager_resources_patch.yaml | 20 --- .../kuberay/remove_default_namespace.yaml | 5 - .../kueue/controller_manager_config.yaml | 64 -------- .../kueue/manager_resources_patch.yaml | 9 -- .../kueue/mutating_webhook_patch.yaml | 9 -- .../kueue/remove_default_namespace.yaml | 5 - .../kueue/validating_webhook_patch.yaml | 7 - setup.k8s-v1.30/mlbatch-edit-role.yaml | 135 ----------------- setup.k8s-v1.30/mlbatch-priorities.yaml | 26 ---- .../training-operator/kustomization.yaml | 19 --- .../manager_resources_patch.yaml | 20 --- .../CLUSTER-SETUP.md | 21 +-- {setup.k8s-v1.30 => setup.k8s}/TEAM-SETUP.md | 1 - {setup.k8s-v1.27 => setup.k8s}/UNINSTALL.md | 0 .../appwrapper/config_patch.yaml | 0 .../appwrapper/kustomization.yaml | 0 .../appwrapper/manager_resources_patch.yaml | 0 .../appwrapper/remove_default_namespace.yaml | 0 .../coscheduler-priority-patch.yaml | 0 .../default-flavor.yaml | 0 .../kind/kind-config.yaml | 0 .../kuberay/kustomization.yaml | 0 .../kuberay/manager_resources_patch.yaml | 0 .../kuberay/remove_default_namespace.yaml | 0 .../kueue/controller_manager_config.yaml | 16 +- .../kueue/kustomization.yaml | 4 +- .../kueue/manager_resources_patch.yaml | 0 .../kueue/mutating_webhook_patch.yaml | 0 .../kueue/remove_default_namespace.yaml | 0 .../kueue/validating_webhook_patch.yaml | 0 .../mlbatch-edit-role.yaml | 0 .../mlbatch-priorities.yaml | 0 .../training-operator/kustomization.yaml | 0 .../manager_resources_patch.yaml | 0 setup.tmpl/CLUSTER-SETUP.md.tmpl | 16 +- setup.tmpl/Kubernetes-v1.27.yaml | 8 - ...{Kubernetes-v1.30.yaml => Kubernetes.yaml} | 5 +- setup.tmpl/Makefile | 6 +- setup.tmpl/TEAM-SETUP.md.tmpl | 8 +- 53 files changed, 41 insertions(+), 825 deletions(-) delete mode 100644 setup.k8s-v1.27/TEAM-SETUP.md delete mode 100644 setup.k8s-v1.27/kueue/kustomization.yaml delete mode 100644 setup.k8s-v1.30/CLUSTER-SETUP.md delete mode 100644 setup.k8s-v1.30/UNINSTALL.md delete mode 100644 setup.k8s-v1.30/admission-policy.yaml delete mode 100644 setup.k8s-v1.30/appwrapper/config_patch.yaml delete mode 100644 setup.k8s-v1.30/appwrapper/kustomization.yaml delete mode 100644 setup.k8s-v1.30/appwrapper/manager_resources_patch.yaml delete mode 100644 setup.k8s-v1.30/appwrapper/remove_default_namespace.yaml delete mode 100644 setup.k8s-v1.30/coscheduler-priority-patch.yaml delete mode 100644 setup.k8s-v1.30/default-flavor.yaml delete mode 100644 setup.k8s-v1.30/kind/kind-config.yaml delete mode 100644 setup.k8s-v1.30/kuberay/kustomization.yaml delete mode 100644 setup.k8s-v1.30/kuberay/manager_resources_patch.yaml delete 
mode 100644 setup.k8s-v1.30/kuberay/remove_default_namespace.yaml delete mode 100644 setup.k8s-v1.30/kueue/controller_manager_config.yaml delete mode 100644 setup.k8s-v1.30/kueue/manager_resources_patch.yaml delete mode 100644 setup.k8s-v1.30/kueue/mutating_webhook_patch.yaml delete mode 100644 setup.k8s-v1.30/kueue/remove_default_namespace.yaml delete mode 100644 setup.k8s-v1.30/kueue/validating_webhook_patch.yaml delete mode 100644 setup.k8s-v1.30/mlbatch-edit-role.yaml delete mode 100644 setup.k8s-v1.30/mlbatch-priorities.yaml delete mode 100644 setup.k8s-v1.30/training-operator/kustomization.yaml delete mode 100644 setup.k8s-v1.30/training-operator/manager_resources_patch.yaml rename {setup.k8s-v1.27 => setup.k8s}/CLUSTER-SETUP.md (83%) rename {setup.k8s-v1.30 => setup.k8s}/TEAM-SETUP.md (99%) rename {setup.k8s-v1.27 => setup.k8s}/UNINSTALL.md (100%) rename {setup.k8s-v1.27 => setup.k8s}/appwrapper/config_patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/appwrapper/kustomization.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/appwrapper/manager_resources_patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/appwrapper/remove_default_namespace.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/coscheduler-priority-patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/default-flavor.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kind/kind-config.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kuberay/kustomization.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kuberay/manager_resources_patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kuberay/remove_default_namespace.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kueue/controller_manager_config.yaml (78%) rename {setup.k8s-v1.30 => setup.k8s}/kueue/kustomization.yaml (97%) rename {setup.k8s-v1.27 => setup.k8s}/kueue/manager_resources_patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kueue/mutating_webhook_patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kueue/remove_default_namespace.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/kueue/validating_webhook_patch.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/mlbatch-edit-role.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/mlbatch-priorities.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/training-operator/kustomization.yaml (100%) rename {setup.k8s-v1.27 => setup.k8s}/training-operator/manager_resources_patch.yaml (100%) delete mode 100644 setup.tmpl/Kubernetes-v1.27.yaml rename setup.tmpl/{Kubernetes-v1.30.yaml => Kubernetes.yaml} (52%) diff --git a/SETUP.md b/SETUP.md index 0003773..88b419d 100644 --- a/SETUP.md +++ b/SETUP.md @@ -56,15 +56,8 @@ Instructions are provided for the following Red Hat OpenShift AI ***fast*** rele ## Kubernetes -On Kubernetes version 1.30 and later, an enhanced user experience is -available by using ValidatingAdmissionPolicies to streamline quota -enforcement. 
Follow these instructions when installing on 1.30+ clusters: - + [Kubernetes 1.30+ Cluster Setup](./setup.k8s-v1.30/CLUSTER-SETUP.md) - + [Kubernetes 1.30+ Team Setup](./setup.k8s-v1.30/TEAM-SETUP.md) - + [Kubernetes 1.30+ Uninstall](setup.k8s-v1.30/UNINSTALL.md) - -MLBatch can be installed on any Kubernetes cluster version 1.27 or later +MLBatch can be installed on any Kubernetes cluster version 1.29 or later by following these instructions: - + [Kubernetes Cluster Setup](./setup.k8s-v1.27/CLUSTER-SETUP.md) - + [Kubternets Team Setup](./setup.k8s-v1.27/TEAM-SETUP.md) - + [Kubernetes Uninstall](setup.k8s-v1.27/UNINSTALL.md) + + [Kubernetes Cluster Setup](./setup.k8s/CLUSTER-SETUP.md) + + [Kubernetes Team Setup](./setup.k8s/TEAM-SETUP.md) + + [Kubernetes Uninstall](./setup.k8s/UNINSTALL.md) diff --git a/setup.k8s-v1.27/TEAM-SETUP.md b/setup.k8s-v1.27/TEAM-SETUP.md deleted file mode 100644 index d2056f6..0000000 --- a/setup.k8s-v1.27/TEAM-SETUP.md +++ /dev/null @@ -1,96 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - -Setting up a new team requires the cluster admin to create a namespace, -a quota, a queue, and the required role bindings as described below. - -Create namespace: -```sh -kubectl create namespace team1 -``` - -For each user on the team, create a RoleBinding: -```sh -kubectl -n team1 apply -f- << EOF -kind: RoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: user-one -subjects: - - kind: User - apiGroup: rbac.authorization.k8s.io - name: user-one -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: mlbatch-edit -EOF -``` - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -kubectl apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -kubectl apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name.
- diff --git a/setup.k8s-v1.27/kueue/kustomization.yaml b/setup.k8s-v1.27/kueue/kustomization.yaml deleted file mode 100644 index 4d3b463..0000000 --- a/setup.k8s-v1.27/kueue/kustomization.yaml +++ /dev/null @@ -1,53 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mlbatch-system - -resources: -- "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.8.3" - -labels: -- pairs: - app.kubernetes.io/name: kueue - app.kubernetes.io/component: controller - includeSelectors: true - -configMapGenerator: -- name: manager-config - namespace: kueue-system - behavior: replace - files: - - controller_manager_config.yaml - -images: -- name: us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue - newName: registry.k8s.io/kueue/kueue - newTag: v0.8.3 - -patches: -- path: manager_resources_patch.yaml -- path: mutating_webhook_patch.yaml -- path: remove_default_namespace.yaml -- path: validating_webhook_patch.yaml -- target: - kind: ClusterRole - name: manager-role - patch: | - - op: add - path: /rules/- - value: - apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: - - get - - list - - watch -- target: - kind: Deployment - name: controller-manager - patch: | - - op: add - path: /spec/template/spec/containers/0/args/- - value: "--feature-gates=LendingLimit=true" diff --git a/setup.k8s-v1.30/CLUSTER-SETUP.md b/setup.k8s-v1.30/CLUSTER-SETUP.md deleted file mode 100644 index 65e90d5..0000000 --- a/setup.k8s-v1.30/CLUSTER-SETUP.md +++ /dev/null @@ -1,137 +0,0 @@ -# Cluster Setup - -The cluster setup installs and configures the following components: -+ Coscheduler -+ Kubeflow Training Operator -+ KubeRay -+ Kueue -+ AppWrappers -+ Cluster roles and priority classes - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -kubectl apply -f setup.k8s-v1.30/mlbatch-priorities.yaml -``` - -## Coscheduler - -Install Coscheduler v0.28.9 as a secondary scheduler and configure packing: -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]' -``` -Patch Coscheduler pod priorities: -```sh -kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s-v1.30/coscheduler-priority-patch.yaml scheduler-plugins-controller -kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s-v1.30/coscheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - -## Install Operators - -Create the mlbatch-system namespace -```sh -kubectl create namespace mlbatch-system -``` - -Install the Kubeflow Training Operator -```sh -kubectl apply --server-side -k setup.k8s-v1.30/training-operator -``` - -Install the KubeRay Operator -```sh -kubectl apply --server-side -k setup.k8s-v1.30/kuberay -``` - -Install Kueue -```sh -kubectl apply --server-side -k setup.k8s-v1.30/kueue -``` - -Install the AppWrapper Operator -```sh -kubectl apply --server-side -k setup.k8s-v1.30/appwrapper -``` -The provided configuration differs from the default configuration of the -operators as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to 
`scheduler-plugins-scheduler`, -- Kueue: - - `batch/job` integration is disabled, - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, - - `fairSharing` is enabled, - - `enableClusterQueueResources` metrics is enabled, -- AppWrapper operator: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler`, - - `queueName` is set to `default-queue`, -- pod priorities, resource requests and limits have been adjusted. - -## Kueue Configuration - -Create Kueue's default flavor: -```sh -kubectl apply -f setup.k8s-v1.30/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -kubectl apply -f setup.k8s-v1.30/mlbatch-edit-role.yaml -``` -## Validating Admission Policy - -Create an admission policy to enforce that all pod-creating resources -permitted by the mlbatch-edit role that are created in team namespaces -will have local queue names and thus be subject to Kueue's quota management. -```sh -kubectl apply -f setup.k8s-v1.30/admission-policy.yaml -``` - -## Slack Cluster Queue - -Create the designated slack `ClusterQueue` which will be used to automate -minor adjustments to cluster capacity caused by node failures and -scheduler maintanence. -```sh -kubectl apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - - name: "memory" - nominalQuota: 128Gi - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "nvidia.com/roce_gdr" - nominalQuota: 1 - - name: "pods" - nominalQuota: 100 -EOF -``` -Edit the above quantities to adjust the quota to the desired -values. Pod counts are optional and can be omitted from the list of -covered resources. The `lendingLimit` for each resource will be -dynamically adjusted by the MLBatch system to reflect reduced cluster -capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a -detailed discussion of the role of the slack `ClusterQueue`. 
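The label-scoped enforcement that supersedes the ValidatingAdmissionPolicy documented above lives in Kueue itself. A minimal excerpt of the relevant settings, taken from `setup.k8s/kueue/controller_manager_config.yaml` as configured later in this patch:

```yaml
# Kueue also manages workloads that omit the kueue.x-k8s.io/queue-name label,
# so everything created in a managed namespace is subject to quota.
manageJobsWithoutQueueName: true
# Limit that behavior to MLBatch team namespaces, identified by this label.
managedJobsNamespaceSelector:
  matchLabels:
    mlbatch-team-namespace: "true"
```

Workloads in namespaces without this label are only gated by Kueue when they explicitly name a queue.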
diff --git a/setup.k8s-v1.30/UNINSTALL.md b/setup.k8s-v1.30/UNINSTALL.md deleted file mode 100644 index 1796163..0000000 --- a/setup.k8s-v1.30/UNINSTALL.md +++ /dev/null @@ -1,24 +0,0 @@ -# Uninstall - -***First, remove all team namespaces and corresponding cluster queues.*** - -Then to uninstall the MLBatch controllers and reclaim the corresponding -namespaces, do the following: -```sh -# Delete operators and CRDs -kubectl delete -k setup.k8s-v1.30/appwrapper -kubectl delete -k setup.k8s-v1.30/kueue -kubectl delete -k setup.k8s-v1.30/kuberay -kubectl delete -k setup.k8s-v1.30/training-operator - -# Delete namespace -kubectl delete namespace mlbatch-system - -# Delete clusterole and admission policy -kubectl delete clusterrole mlbatch-edit -kubectl delete -f setup.k8s-v1.30/admission-policy.yaml - -# Coscheduler uninstall -helm uninstall -n scheduler-plugins scheduler-plugins -kubectl delete namespace scheduler-plugins -``` diff --git a/setup.k8s-v1.30/admission-policy.yaml b/setup.k8s-v1.30/admission-policy.yaml deleted file mode 100644 index 8155818..0000000 --- a/setup.k8s-v1.30/admission-policy.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingAdmissionPolicy -metadata: - name: mlbatch-require-queue-name -spec: - failurePolicy: Fail - matchConstraints: - resourceRules: - - apiGroups: ["kubeflow.org"] - apiVersions: ["v1"] - operations: ["CREATE", "UPDATE"] - resources: ["pytorchjobs"] - - apiGroups: ["ray.io"] - apiVersions: ["v1"] - operations: ["CREATE", "UPDATE"] - resources: ["rayjobs","rayclusters"] - matchConditions: - - name: exclude-appwrapper-owned - expression: "!(has(object.metadata.ownerReferences) && object.metadata.ownerReferences.exists(o, o.apiVersion=='workload.codeflare.dev/v1beta2'&&o.kind=='AppWrapper'&&o.controller))" - validations: - - expression: "has(object.metadata.labels) && 'kueue.x-k8s.io/queue-name' in object.metadata.labels && object.metadata.labels['kueue.x-k8s.io/queue-name'] != ''" - message: "All non-AppWrapper workloads must have a 'kueue.x-k8s.io/queue-name' label with non-empty value." 
---- -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingAdmissionPolicyBinding -metadata: - name: mlbatch-require-queue-name -spec: - policyName: mlbatch-require-queue-name - validationActions: [Deny] - matchResources: - namespaceSelector: - matchLabels: - mlbatch-team-namespace: "true" diff --git a/setup.k8s-v1.30/appwrapper/config_patch.yaml b/setup.k8s-v1.30/appwrapper/config_patch.yaml deleted file mode 100644 index be6872e..0000000 --- a/setup.k8s-v1.30/appwrapper/config_patch.yaml +++ /dev/null @@ -1,23 +0,0 @@ -kind: ConfigMap -apiVersion: v1 -metadata: - name: appwrapper-operator-config - namespace: appwrapper-system -data: - config.yaml: | - appwrapper: - enableKueueIntegrations: true - kueueJobReconciller: - manageJobsWithoutQueueName: false - waitForPodsReady: - enable: false - defaultQueueName: default-queue - schedulerName: scheduler-plugins-scheduler - slackQueueName: slack-cluster-queue - userRBACAdmissionCheck: false - controllerManager: - health: - bindAddress: ":8081" - metrics: - bindAddress: "127.0.0.1:8080" - leaderElection: true diff --git a/setup.k8s-v1.30/appwrapper/kustomization.yaml b/setup.k8s-v1.30/appwrapper/kustomization.yaml deleted file mode 100644 index 234a57d..0000000 --- a/setup.k8s-v1.30/appwrapper/kustomization.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mlbatch-system - -resources: -- "https://github.com/project-codeflare/appwrapper/config/default?ref=v0.30.0" - -labels: -- pairs: - app.kubernetes.io/name: appwrapper - app.kubernetes.io/component: controller - includeSelectors: true - -images: -- name: quay.io/ibm/appwrapper - newTag: v0.30.0 - -patches: -- path: config_patch.yaml -- path: manager_resources_patch.yaml -- path: remove_default_namespace.yaml diff --git a/setup.k8s-v1.30/appwrapper/manager_resources_patch.yaml b/setup.k8s-v1.30/appwrapper/manager_resources_patch.yaml deleted file mode 100644 index 1b26c3c..0000000 --- a/setup.k8s-v1.30/appwrapper/manager_resources_patch.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - resources: - requests: - cpu: 250m - memory: 250Mi - limits: - cpu: 1000m - memory: 1000Mi diff --git a/setup.k8s-v1.30/appwrapper/remove_default_namespace.yaml b/setup.k8s-v1.30/appwrapper/remove_default_namespace.yaml deleted file mode 100644 index b63fb95..0000000 --- a/setup.k8s-v1.30/appwrapper/remove_default_namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -$patch: delete -apiVersion: v1 -kind: Namespace -metadata: - name: appwrapper-system diff --git a/setup.k8s-v1.30/coscheduler-priority-patch.yaml b/setup.k8s-v1.30/coscheduler-priority-patch.yaml deleted file mode 100644 index 278802f..0000000 --- a/setup.k8s-v1.30/coscheduler-priority-patch.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: add - path: /spec/template/spec/priorityClassName - value: system-node-critical diff --git a/setup.k8s-v1.30/default-flavor.yaml b/setup.k8s-v1.30/default-flavor.yaml deleted file mode 100644 index 6cbccf3..0000000 --- a/setup.k8s-v1.30/default-flavor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor diff --git a/setup.k8s-v1.30/kind/kind-config.yaml b/setup.k8s-v1.30/kind/kind-config.yaml deleted file mode 100644 index dc3a857..0000000 --- a/setup.k8s-v1.30/kind/kind-config.yaml +++ /dev/null @@ 
-1,11 +0,0 @@ -kind: Cluster -apiVersion: kind.x-k8s.io/v1alpha4 -nodes: -# the control plane node config -- role: control-plane - # kubernetes version 1.30.0 from kind v0.23.0 - image: kindest/node:v1.30.0@sha256:047357ac0cfea04663786a612ba1eaba9702bef25227a794b52890dd8bcd692e -# the worker -- role: worker - # kubernetes version 1.30.0 from kind v0.23.0 - image: kindest/node:v1.30.0@sha256:047357ac0cfea04663786a612ba1eaba9702bef25227a794b52890dd8bcd692e diff --git a/setup.k8s-v1.30/kuberay/kustomization.yaml b/setup.k8s-v1.30/kuberay/kustomization.yaml deleted file mode 100644 index 0161395..0000000 --- a/setup.k8s-v1.30/kuberay/kustomization.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mlbatch-system - -resources: -- "https://github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.1.0" - -labels: -- pairs: - app.kubernetes.io/name: kuberay - app.kubernetes.io/component: controller - includeSelectors: true - -patches: -- path: remove_default_namespace.yaml -- path: manager_resources_patch.yaml diff --git a/setup.k8s-v1.30/kuberay/manager_resources_patch.yaml b/setup.k8s-v1.30/kuberay/manager_resources_patch.yaml deleted file mode 100644 index 7bb80d9..0000000 --- a/setup.k8s-v1.30/kuberay/manager_resources_patch.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kuberay-operator - namespace: system -spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: kuberay-operator - args: - - "--zap-log-level=2" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi diff --git a/setup.k8s-v1.30/kuberay/remove_default_namespace.yaml b/setup.k8s-v1.30/kuberay/remove_default_namespace.yaml deleted file mode 100644 index b5977cc..0000000 --- a/setup.k8s-v1.30/kuberay/remove_default_namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -$patch: delete -apiVersion: v1 -kind: Namespace -metadata: - name: ray-system diff --git a/setup.k8s-v1.30/kueue/controller_manager_config.yaml b/setup.k8s-v1.30/kueue/controller_manager_config.yaml deleted file mode 100644 index 4325927..0000000 --- a/setup.k8s-v1.30/kueue/controller_manager_config.yaml +++ /dev/null @@ -1,64 +0,0 @@ -apiVersion: config.kueue.x-k8s.io/v1beta1 -kind: Configuration -health: - healthProbeBindAddress: :8081 -metrics: - bindAddress: :8080 - enableClusterQueueResources: true -webhook: - port: 9443 -leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io -controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 -clientConnection: - qps: 50 - burst: 100 -#pprofBindAddress: :8083 -waitForPodsReady: - enable: false -# timeout: 5m -# blockAdmission: false -# requeuingStrategy: -# timestamp: Eviction -# backoffLimitCount: null # null indicates infinite requeuing -# backoffBaseSeconds: 60 -# backoffMaxSeconds: 3600 -#manageJobsWithoutQueueName: false -#internalCertManagement: -# enable: false -# webhookServiceName: "" -# webhookSecretName: "" -integrations: - frameworks: - - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" -# podOptions: 
-# namespaceSelector: -# matchExpressions: -# - key: kubernetes.io/metadata.name -# operator: NotIn -# values: [ kube-system, kueue-system ] -fairSharing: - enable: true - preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] -#resources: -# excludeResourcePrefixes: [] diff --git a/setup.k8s-v1.30/kueue/manager_resources_patch.yaml b/setup.k8s-v1.30/kueue/manager_resources_patch.yaml deleted file mode 100644 index 5dc7501..0000000 --- a/setup.k8s-v1.30/kueue/manager_resources_patch.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - priorityClassName: system-node-critical diff --git a/setup.k8s-v1.30/kueue/mutating_webhook_patch.yaml b/setup.k8s-v1.30/kueue/mutating_webhook_patch.yaml deleted file mode 100644 index 61d0e1d..0000000 --- a/setup.k8s-v1.30/kueue/mutating_webhook_patch.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: MutatingWebhookConfiguration -metadata: - name: mutating-webhook-configuration -webhooks: - - $patch: delete - name: mpod.kb.io - - $patch: delete - name: mjob.kb.io diff --git a/setup.k8s-v1.30/kueue/remove_default_namespace.yaml b/setup.k8s-v1.30/kueue/remove_default_namespace.yaml deleted file mode 100644 index 787ee88..0000000 --- a/setup.k8s-v1.30/kueue/remove_default_namespace.yaml +++ /dev/null @@ -1,5 +0,0 @@ -$patch: delete -apiVersion: v1 -kind: Namespace -metadata: - name: kueue-system diff --git a/setup.k8s-v1.30/kueue/validating_webhook_patch.yaml b/setup.k8s-v1.30/kueue/validating_webhook_patch.yaml deleted file mode 100644 index 711b05d..0000000 --- a/setup.k8s-v1.30/kueue/validating_webhook_patch.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: admissionregistration.k8s.io/v1 -kind: ValidatingWebhookConfiguration -metadata: - name: validating-webhook-configuration -webhooks: - - $patch: delete - name: vpod.kb.io diff --git a/setup.k8s-v1.30/mlbatch-edit-role.yaml b/setup.k8s-v1.30/mlbatch-edit-role.yaml deleted file mode 100644 index 5182a64..0000000 --- a/setup.k8s-v1.30/mlbatch-edit-role.yaml +++ /dev/null @@ -1,135 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mlbatch-edit -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - delete - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - statefulsets - verbs: - - delete - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - - secrets - - configmaps - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kueue.x-k8s.io - resources: - - "*" - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - ray.io - resources: - - rayjobs - - rayclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - delete - - get - - list - - watch -- apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - scheduling.k8s.io - resources: - - priorityclasses - verbs: - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: 
- - events - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - - pods/logs - verbs: - - get -- apiGroups: - - "" - resources: - - pods/exec - - pods/portforward - verbs: - - create diff --git a/setup.k8s-v1.30/mlbatch-priorities.yaml b/setup.k8s-v1.30/mlbatch-priorities.yaml deleted file mode 100644 index 77c8f3b..0000000 --- a/setup.k8s-v1.30/mlbatch-priorities.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low-priority -value: 1 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class for all lower priority jobs." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: default-priority -value: 5 -preemptionPolicy: PreemptLowerPriority -globalDefault: true -description: "This is the priority class for all jobs (default priority)." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high-priority -value: 10 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." diff --git a/setup.k8s-v1.30/training-operator/kustomization.yaml b/setup.k8s-v1.30/training-operator/kustomization.yaml deleted file mode 100644 index 6aa6dc2..0000000 --- a/setup.k8s-v1.30/training-operator/kustomization.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: mlbatch-system - -resources: -- "https://github.com/kubeflow/training-operator/manifests/base?ref=v1.7.0" - -labels: -- pairs: - app.kubernetes.io/name: training-operator - app.kubernetes.io/component: controller - includeSelectors: true - -images: -- name: kubeflow/training-operator - newTag: "v1-855e096" - -patches: -- path: manager_resources_patch.yaml diff --git a/setup.k8s-v1.30/training-operator/manager_resources_patch.yaml b/setup.k8s-v1.30/training-operator/manager_resources_patch.yaml deleted file mode 100644 index 5bc1f6d..0000000 --- a/setup.k8s-v1.30/training-operator/manager_resources_patch.yaml +++ /dev/null @@ -1,20 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: training-operator -spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: training-operator - args: - - "--zap-log-level=2" - - "--gang-scheduler-name=scheduler-plugins-scheduler" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi diff --git a/setup.k8s-v1.27/CLUSTER-SETUP.md b/setup.k8s/CLUSTER-SETUP.md similarity index 83% rename from setup.k8s-v1.27/CLUSTER-SETUP.md rename to setup.k8s/CLUSTER-SETUP.md index 166ec49..74f6791 100644 --- a/setup.k8s-v1.27/CLUSTER-SETUP.md +++ b/setup.k8s/CLUSTER-SETUP.md @@ -12,7 +12,7 @@ The cluster setup installs and configures the following components: Create `default-priority`, `high-priority`, and `low-priority` priority classes: ```sh -kubectl apply -f setup.k8s-v1.27/mlbatch-priorities.yaml +kubectl apply -f setup.k8s/mlbatch-priorities.yaml ``` ## Coscheduler @@ -25,8 +25,8 @@ helm install scheduler-plugins --namespace scheduler-plugins --create-namespace ``` Patch Coscheduler pod priorities: ```sh -kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s-v1.27/coscheduler-priority-patch.yaml scheduler-plugins-controller -kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s-v1.27/coscheduler-priority-patch.yaml 
scheduler-plugins-scheduler +kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/coscheduler-priority-patch.yaml scheduler-plugins-controller +kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s/coscheduler-priority-patch.yaml scheduler-plugins-scheduler ``` ## Install Operators @@ -38,30 +38,31 @@ kubectl create namespace mlbatch-system Install the Kubeflow Training Operator ```sh -kubectl apply --server-side -k setup.k8s-v1.27/training-operator +kubectl apply --server-side -k setup.k8s/training-operator ``` Install the KubeRay Operator ```sh -kubectl apply --server-side -k setup.k8s-v1.27/kuberay +kubectl apply --server-side -k setup.k8s/kuberay ``` Install Kueue ```sh -kubectl apply --server-side -k setup.k8s-v1.27/kueue +kubectl apply --server-side -k setup.k8s/kueue ``` Install the AppWrapper Operator ```sh -kubectl apply --server-side -k setup.k8s-v1.27/appwrapper +kubectl apply --server-side -k setup.k8s/appwrapper ``` The provided configuration differs from the default configuration of the operators as follows: - Kubeflow Training Operator: - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, - Kueue: - - `manageJobsWithoutQueueName` is enabled, - `batch/job` integration is disabled, + - `manageJobsWithoutQueueName` is enabled and scoped via `managedJobsNamespaceSelector` to + namespaces labeled with `mlbatch-team-namespace=true`, - `waitForPodsReady` is disabled, - `LendingLimit` feature gate is enabled, - `fairSharing` is enabled, @@ -76,14 +77,14 @@ operators as follows: Create Kueue's default flavor: ```sh -kubectl apply -f setup.k8s-v1.27/default-flavor.yaml +kubectl apply -f setup.k8s/default-flavor.yaml ``` ## Cluster Role Create `mlbatch-edit` role: ```sh -kubectl apply -f setup.k8s-v1.27/mlbatch-edit-role.yaml +kubectl apply -f setup.k8s/mlbatch-edit-role.yaml ``` ## Slack Cluster Queue diff --git a/setup.k8s-v1.30/TEAM-SETUP.md b/setup.k8s/TEAM-SETUP.md similarity index 99% rename from setup.k8s-v1.30/TEAM-SETUP.md rename to setup.k8s/TEAM-SETUP.md index 24cfbe8..3f1fc38 100644 --- a/setup.k8s-v1.30/TEAM-SETUP.md +++ b/setup.k8s/TEAM-SETUP.md @@ -8,7 +8,6 @@ for a discussion of our recommended best practices. Setting up a new team requires the cluster admin to create a namespace, a quota, a queue, and the required role bindings as described below.
- Create and label the namespace: ```sh kubectl create namespace team1 diff --git a/setup.k8s-v1.27/UNINSTALL.md b/setup.k8s/UNINSTALL.md similarity index 100% rename from setup.k8s-v1.27/UNINSTALL.md rename to setup.k8s/UNINSTALL.md diff --git a/setup.k8s-v1.27/appwrapper/config_patch.yaml b/setup.k8s/appwrapper/config_patch.yaml similarity index 100% rename from setup.k8s-v1.27/appwrapper/config_patch.yaml rename to setup.k8s/appwrapper/config_patch.yaml diff --git a/setup.k8s-v1.27/appwrapper/kustomization.yaml b/setup.k8s/appwrapper/kustomization.yaml similarity index 100% rename from setup.k8s-v1.27/appwrapper/kustomization.yaml rename to setup.k8s/appwrapper/kustomization.yaml diff --git a/setup.k8s-v1.27/appwrapper/manager_resources_patch.yaml b/setup.k8s/appwrapper/manager_resources_patch.yaml similarity index 100% rename from setup.k8s-v1.27/appwrapper/manager_resources_patch.yaml rename to setup.k8s/appwrapper/manager_resources_patch.yaml diff --git a/setup.k8s-v1.27/appwrapper/remove_default_namespace.yaml b/setup.k8s/appwrapper/remove_default_namespace.yaml similarity index 100% rename from setup.k8s-v1.27/appwrapper/remove_default_namespace.yaml rename to setup.k8s/appwrapper/remove_default_namespace.yaml diff --git a/setup.k8s-v1.27/coscheduler-priority-patch.yaml b/setup.k8s/coscheduler-priority-patch.yaml similarity index 100% rename from setup.k8s-v1.27/coscheduler-priority-patch.yaml rename to setup.k8s/coscheduler-priority-patch.yaml diff --git a/setup.k8s-v1.27/default-flavor.yaml b/setup.k8s/default-flavor.yaml similarity index 100% rename from setup.k8s-v1.27/default-flavor.yaml rename to setup.k8s/default-flavor.yaml diff --git a/setup.k8s-v1.27/kind/kind-config.yaml b/setup.k8s/kind/kind-config.yaml similarity index 100% rename from setup.k8s-v1.27/kind/kind-config.yaml rename to setup.k8s/kind/kind-config.yaml diff --git a/setup.k8s-v1.27/kuberay/kustomization.yaml b/setup.k8s/kuberay/kustomization.yaml similarity index 100% rename from setup.k8s-v1.27/kuberay/kustomization.yaml rename to setup.k8s/kuberay/kustomization.yaml diff --git a/setup.k8s-v1.27/kuberay/manager_resources_patch.yaml b/setup.k8s/kuberay/manager_resources_patch.yaml similarity index 100% rename from setup.k8s-v1.27/kuberay/manager_resources_patch.yaml rename to setup.k8s/kuberay/manager_resources_patch.yaml diff --git a/setup.k8s-v1.27/kuberay/remove_default_namespace.yaml b/setup.k8s/kuberay/remove_default_namespace.yaml similarity index 100% rename from setup.k8s-v1.27/kuberay/remove_default_namespace.yaml rename to setup.k8s/kuberay/remove_default_namespace.yaml diff --git a/setup.k8s-v1.27/kueue/controller_manager_config.yaml b/setup.k8s/kueue/controller_manager_config.yaml similarity index 78% rename from setup.k8s-v1.27/kueue/controller_manager_config.yaml rename to setup.k8s/kueue/controller_manager_config.yaml index 20c38eb..0f395ac 100644 --- a/setup.k8s-v1.27/kueue/controller_manager_config.yaml +++ b/setup.k8s/kueue/controller_manager_config.yaml @@ -12,10 +12,11 @@ leaderElection: resourceName: c1f6bfd2.kueue.x-k8s.io controller: groupKindConcurrency: - Job.batch: 5 +# Job.batch: 5 Pod: 5 Workload.kueue.x-k8s.io: 5 LocalQueue.kueue.x-k8s.io: 1 + Cohort.kueue.x-k8s.io: 1 ClusterQueue.kueue.x-k8s.io: 1 ResourceFlavor.kueue.x-k8s.io: 1 clientConnection: @@ -32,6 +33,9 @@ waitForPodsReady: # backoffBaseSeconds: 60 # backoffMaxSeconds: 3600 manageJobsWithoutQueueName: true +managedJobsNamespaceSelector: + matchLabels: + mlbatch-team-namespace: "true" #internalCertManagement: # enable: 
false # webhookServiceName: "" @@ -48,7 +52,9 @@ integrations: - "kubeflow.org/pytorchjob" - "kubeflow.org/tfjob" - "kubeflow.org/xgboostjob" - # - "pod" +# - "pod" +# - "deployment" # requires enabling pod integration +# - "statefulset" # requires enabling pod integration externalFrameworks: - "AppWrapper.v1beta2.workload.codeflare.dev" # podOptions: @@ -62,3 +68,9 @@ fairSharing: preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] #resources: # excludeResourcePrefixes: [] +# transformations: +# - input: nvidia.com/mig-4g.5gb +# strategy: Replace | Retain +# outputs: +# example.com/accelerator-memory: 5Gi +# example.com/accelerator-gpc: 4 diff --git a/setup.k8s-v1.30/kueue/kustomization.yaml b/setup.k8s/kueue/kustomization.yaml similarity index 97% rename from setup.k8s-v1.30/kueue/kustomization.yaml rename to setup.k8s/kueue/kustomization.yaml index 4d3b463..0f31cc2 100644 --- a/setup.k8s-v1.30/kueue/kustomization.yaml +++ b/setup.k8s/kueue/kustomization.yaml @@ -4,7 +4,7 @@ kind: Kustomization namespace: mlbatch-system resources: -- "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.8.3" +- "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.10.1" labels: - pairs: @@ -22,7 +22,7 @@ configMapGenerator: images: - name: us-central1-docker.pkg.dev/k8s-staging-images/kueue/kueue newName: registry.k8s.io/kueue/kueue - newTag: v0.8.3 + newTag: v0.10.1 patches: - path: manager_resources_patch.yaml diff --git a/setup.k8s-v1.27/kueue/manager_resources_patch.yaml b/setup.k8s/kueue/manager_resources_patch.yaml similarity index 100% rename from setup.k8s-v1.27/kueue/manager_resources_patch.yaml rename to setup.k8s/kueue/manager_resources_patch.yaml diff --git a/setup.k8s-v1.27/kueue/mutating_webhook_patch.yaml b/setup.k8s/kueue/mutating_webhook_patch.yaml similarity index 100% rename from setup.k8s-v1.27/kueue/mutating_webhook_patch.yaml rename to setup.k8s/kueue/mutating_webhook_patch.yaml diff --git a/setup.k8s-v1.27/kueue/remove_default_namespace.yaml b/setup.k8s/kueue/remove_default_namespace.yaml similarity index 100% rename from setup.k8s-v1.27/kueue/remove_default_namespace.yaml rename to setup.k8s/kueue/remove_default_namespace.yaml diff --git a/setup.k8s-v1.27/kueue/validating_webhook_patch.yaml b/setup.k8s/kueue/validating_webhook_patch.yaml similarity index 100% rename from setup.k8s-v1.27/kueue/validating_webhook_patch.yaml rename to setup.k8s/kueue/validating_webhook_patch.yaml diff --git a/setup.k8s-v1.27/mlbatch-edit-role.yaml b/setup.k8s/mlbatch-edit-role.yaml similarity index 100% rename from setup.k8s-v1.27/mlbatch-edit-role.yaml rename to setup.k8s/mlbatch-edit-role.yaml diff --git a/setup.k8s-v1.27/mlbatch-priorities.yaml b/setup.k8s/mlbatch-priorities.yaml similarity index 100% rename from setup.k8s-v1.27/mlbatch-priorities.yaml rename to setup.k8s/mlbatch-priorities.yaml diff --git a/setup.k8s-v1.27/training-operator/kustomization.yaml b/setup.k8s/training-operator/kustomization.yaml similarity index 100% rename from setup.k8s-v1.27/training-operator/kustomization.yaml rename to setup.k8s/training-operator/kustomization.yaml diff --git a/setup.k8s-v1.27/training-operator/manager_resources_patch.yaml b/setup.k8s/training-operator/manager_resources_patch.yaml similarity index 100% rename from setup.k8s-v1.27/training-operator/manager_resources_patch.yaml rename to setup.k8s/training-operator/manager_resources_patch.yaml diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl index 20df7b8..6a4b9ed 100644 --- 
a/setup.tmpl/CLUSTER-SETUP.md.tmpl +++ b/setup.tmpl/CLUSTER-SETUP.md.tmpl @@ -139,10 +139,9 @@ operators as follows: - Kubeflow Training Operator: - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, - Kueue: -{{- if not .VAP }} - - `manageJobsWithoutQueueName` is enabled, -{{- end }} - `batch/job` integration is disabled, + - `manageJobsWithoutQueueName` is enabled and scoped via `managedJobsNamespaceSelector` to + namespaces labeled with `mlbatch-team-namespace=true`, - `waitForPodsReady` is disabled, - `LendingLimit` feature gate is enabled, {{- if .FAIRSHARE }} @@ -171,17 +170,6 @@ Create `mlbatch-edit` role: {{ .KUBECTL }} apply -f setup.{{ .VERSION }}/mlbatch-edit-role.yaml ``` -{{- if .VAP }} -## Validating Admission Policy - -Create an admission policy to enforce that all pod-creating resources -permitted by the mlbatch-edit role that are created in team namespaces -will have local queue names and thus be subject to Kueue's quota management. -```sh -{{ .KUBECTL }} apply -f setup.{{ .VERSION }}/admission-policy.yaml -``` -{{- end }} - {{- if .SLACKCQ }} ## Slack Cluster Queue diff --git a/setup.tmpl/Kubernetes-v1.27.yaml b/setup.tmpl/Kubernetes-v1.27.yaml deleted file mode 100644 index 35682d9..0000000 --- a/setup.tmpl/Kubernetes-v1.27.yaml +++ /dev/null @@ -1,8 +0,0 @@ -# Values for Kubernetes v1.27+ - -OPENSHIFT: false -VERSION: k8s-v1.27 -KUBECTL: kubectl -VAP: false -SLACKCQ: true -FAIRSHARE: true diff --git a/setup.tmpl/Kubernetes-v1.30.yaml b/setup.tmpl/Kubernetes.yaml similarity index 52% rename from setup.tmpl/Kubernetes-v1.30.yaml rename to setup.tmpl/Kubernetes.yaml index a755af6..ed9a752 100644 --- a/setup.tmpl/Kubernetes-v1.30.yaml +++ b/setup.tmpl/Kubernetes.yaml @@ -1,8 +1,7 @@ -# Values for Kubernetes v1.30+ +# Values for Kubernetes v1.29+ OPENSHIFT: false -VERSION: k8s-v1.30 +VERSION: k8s KUBECTL: kubectl -VAP: true SLACKCQ: true FAIRSHARE: true diff --git a/setup.tmpl/Makefile b/setup.tmpl/Makefile index 25e7c78..bc86801 100644 --- a/setup.tmpl/Makefile +++ b/setup.tmpl/Makefile @@ -29,10 +29,8 @@ docs: gotmpl ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.15/TEAM-SETUP.md -values RHOAI-v2.15.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.16/CLUSTER-SETUP.md -values RHOAI-v2.16.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.16/TEAM-SETUP.md -values RHOAI-v2.16.yaml - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.27/CLUSTER-SETUP.md -values Kubernetes-v1.27.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.k8s-v1.27/TEAM-SETUP.md -values Kubernetes-v1.27.yaml - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.30/CLUSTER-SETUP.md -values Kubernetes-v1.30.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.k8s-v1.30/TEAM-SETUP.md -values Kubernetes-v1.30.yaml + ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s/CLUSTER-SETUP.md -values Kubernetes.yaml + ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.k8s/TEAM-SETUP.md -values Kubernetes.yaml ##@ Dependencies diff --git a/setup.tmpl/TEAM-SETUP.md.tmpl b/setup.tmpl/TEAM-SETUP.md.tmpl index a0512d4..327e200 100644 --- a/setup.tmpl/TEAM-SETUP.md.tmpl +++ b/setup.tmpl/TEAM-SETUP.md.tmpl @@ -29,16 +29,16 @@ Bind cluster role to group in namespace: Setting up a new team requires the cluster admin to create a namespace, a quota, a
queue, and the required role bindings as described below. -{{ if .VAP }} -Create and label the namespace: +{{ if .OPENSHIFT }} +Create the namespace: ```sh {{ .KUBECTL }} create namespace team1 -{{ .KUBECTL }} label namespace team1 'mlbatch-team-namespace=true' ``` {{- else -}} -Create namespace: +Create and label the namespace: ```sh {{ .KUBECTL }} create namespace team1 +{{ .KUBECTL }} label namespace team1 'mlbatch-team-namespace=true' ``` {{- end }}
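For reference, a short usage sketch of the namespace labeling that `managedJobsNamespaceSelector` relies on, assuming a team namespace named `team1` as in `TEAM-SETUP.md`:

```sh
# Create the team namespace and mark it as an MLBatch team namespace so that
# Kueue manages its workloads even when they omit a queue-name label.
kubectl create namespace team1
kubectl label namespace team1 'mlbatch-team-namespace=true'

# List the namespaces currently selected by managedJobsNamespaceSelector.
kubectl get namespaces -l mlbatch-team-namespace=true
```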