From 7df061bf74c8d2b74b92ed64d857011b49f54ac7 Mon Sep 17 00:00:00 2001 From: David Grove Date: Wed, 27 Nov 2024 12:07:53 -0500 Subject: [PATCH] Prune RHOAI fast stream setups by dropping 2.11 and 2.14. --- SETUP.md | 10 - setup.RHOAI-v2.11/CLUSTER-SETUP.md | 117 ------- setup.RHOAI-v2.11/TEAM-SETUP.md | 91 ------ setup.RHOAI-v2.11/UNINSTALL.md | 23 -- setup.RHOAI-v2.11/UPGRADE.md | 31 -- .../coscheduler-priority-patch.yaml | 3 - setup.RHOAI-v2.11/default-flavor.yaml | 4 - setup.RHOAI-v2.11/mlbatch-dsc.yaml | 32 -- setup.RHOAI-v2.11/mlbatch-dsci.yaml | 14 - setup.RHOAI-v2.11/mlbatch-edit-role.yaml | 142 --------- setup.RHOAI-v2.11/mlbatch-priorities.yaml | 26 -- setup.RHOAI-v2.11/mlbatch-subscription.yaml | 287 ----------------- .../mlbatch-upgrade-configmaps.yaml | 102 ------ setup.RHOAI-v2.14/CLUSTER-SETUP.md | 148 --------- setup.RHOAI-v2.14/TEAM-SETUP.md | 91 ------ setup.RHOAI-v2.14/UNINSTALL.md | 23 -- setup.RHOAI-v2.14/UPGRADE.md | 29 -- .../coscheduler-priority-patch.yaml | 3 - setup.RHOAI-v2.14/default-flavor.yaml | 4 - setup.RHOAI-v2.14/mlbatch-dsc.yaml | 32 -- setup.RHOAI-v2.14/mlbatch-dsci.yaml | 14 - setup.RHOAI-v2.14/mlbatch-edit-role.yaml | 142 --------- setup.RHOAI-v2.14/mlbatch-priorities.yaml | 26 -- setup.RHOAI-v2.14/mlbatch-subscription.yaml | 301 ------------------ setup.tmpl/Makefile | 4 - setup.tmpl/RHOAI-v2.11.yaml | 6 - setup.tmpl/RHOAI-v2.14.yaml | 6 - 27 files changed, 1711 deletions(-) delete mode 100644 setup.RHOAI-v2.11/CLUSTER-SETUP.md delete mode 100644 setup.RHOAI-v2.11/TEAM-SETUP.md delete mode 100644 setup.RHOAI-v2.11/UNINSTALL.md delete mode 100644 setup.RHOAI-v2.11/UPGRADE.md delete mode 100644 setup.RHOAI-v2.11/coscheduler-priority-patch.yaml delete mode 100644 setup.RHOAI-v2.11/default-flavor.yaml delete mode 100644 setup.RHOAI-v2.11/mlbatch-dsc.yaml delete mode 100644 setup.RHOAI-v2.11/mlbatch-dsci.yaml delete mode 100644 setup.RHOAI-v2.11/mlbatch-edit-role.yaml delete mode 100644 setup.RHOAI-v2.11/mlbatch-priorities.yaml delete mode 100644 setup.RHOAI-v2.11/mlbatch-subscription.yaml delete mode 100644 setup.RHOAI-v2.11/mlbatch-upgrade-configmaps.yaml delete mode 100644 setup.RHOAI-v2.14/CLUSTER-SETUP.md delete mode 100644 setup.RHOAI-v2.14/TEAM-SETUP.md delete mode 100644 setup.RHOAI-v2.14/UNINSTALL.md delete mode 100644 setup.RHOAI-v2.14/UPGRADE.md delete mode 100644 setup.RHOAI-v2.14/coscheduler-priority-patch.yaml delete mode 100644 setup.RHOAI-v2.14/default-flavor.yaml delete mode 100644 setup.RHOAI-v2.14/mlbatch-dsc.yaml delete mode 100644 setup.RHOAI-v2.14/mlbatch-dsci.yaml delete mode 100644 setup.RHOAI-v2.14/mlbatch-edit-role.yaml delete mode 100644 setup.RHOAI-v2.14/mlbatch-priorities.yaml delete mode 100644 setup.RHOAI-v2.14/mlbatch-subscription.yaml delete mode 100644 setup.tmpl/RHOAI-v2.11.yaml delete mode 100644 setup.tmpl/RHOAI-v2.14.yaml diff --git a/SETUP.md b/SETUP.md index 34872d2..9d06384 100644 --- a/SETUP.md +++ b/SETUP.md @@ -47,16 +47,6 @@ Instructions are provided for the following Red Hat OpenShift AI ***fast*** rele + [RHOAI 2.15 Team Setup](./setup.RHOAI-v2.15/TEAM-SETUP.md) + [UPGRADING from RHOAI 2.14](./setup.RHOAI-v2.15/UPGRADE.md) + [RHOAI 2.15 Uninstall](./setup.RHOAI-v2.15/UNINSTALL.md) -+ Red Hat OpenShift AI 2.14 - + [RHOAI 2.14 Cluster Setup](./setup.RHOAI-v2.14/CLUSTER-SETUP.md) - + [RHOAI 2.14 Team Setup](./setup.RHOAI-v2.14/TEAM-SETUP.md) - + [UPGRADING from RHOAI 2.13](./setup.RHOAI-v2.14/UPGRADE.md) - + [RHOAI 2.14 Uninstall](./setup.RHOAI-v2.14/UNINSTALL.md) -+ Red Hat OpenShift AI 2.11 - + [RHOAI 2.11 
Cluster Setup](./setup.RHOAI-v2.11/CLUSTER-SETUP.md) - + [RHOAI 2.11 Team Setup](./setup.RHOAI-v2.11/TEAM-SETUP.md) - + [UPGRADING from RHOAI 2.10](./setup.RHOAI-v2.11/UPGRADE.md) - + [RHOAI 2.11 Uninstall](./setup.RHOAI-v2.11/UNINSTALL.md) ## Kubernetes diff --git a/setup.RHOAI-v2.11/CLUSTER-SETUP.md b/setup.RHOAI-v2.11/CLUSTER-SETUP.md deleted file mode 100644 index 7a04b50..0000000 --- a/setup.RHOAI-v2.11/CLUSTER-SETUP.md +++ /dev/null @@ -1,117 +0,0 @@ -# Cluster Setup - -The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue, -cluster roles, and priority classes. - -If MLBatch is deployed on a cluster that used to run earlier versions of ODH, -[MCAD](https://github.com/project-codeflare/mcad), Red Hat OpenShift AI, or Coscheduler, -make sure to scrub traces of these installations. In particular, make sure to -delete the following custom resource definitions (CRD) if present on the -cluster. Make sure to delete all instances prior to deleting the CRDs: -```sh -# Delete old appwrappers and crd -oc delete appwrappers --all -A -oc delete crd appwrappers.workload.codeflare.dev - -# Delete old noderesourcetopologies and crd -oc delete noderesourcetopologies --all -A -oc delete crd noderesourcetopologies.topology.node.k8s.io -``` - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -oc apply -f setup.RHOAI-v2.11/mlbatch-priorities.yaml -``` - -## Coscheduler - -Install Coscheduler v0.28.9 as a secondary scheduler and configure packing: -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"}]' -``` -Patch Coscheduler pod priorities: -```sh -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.11/coscheduler-priority-patch.yaml scheduler-plugins-controller -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.11/coscheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - -## Red Hat OpenShift AI - -Create the Red Hat OpenShift AI subscription: -```sh -oc apply -f setup.RHOAI-v2.11/mlbatch-subscription.yaml -``` -Identify the install plan: -```sh -oc get ip -n redhat-ods-operator -``` -``` -NAMESPACE NAME CSV APPROVAL APPROVED -redhat-ods-operator install-kmh8w rhods-operator.2.11.0 Manual false -``` -Approve the install plan, replacing the generated plan name below with the actual -value: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w -``` -Create DSC Initialization: -```sh -oc apply -f setup.RHOAI-v2.11/mlbatch-dsci.yaml -``` -Create Data Science Cluster: -```sh -oc apply -f setup.RHOAI-v2.11/mlbatch-dsc.yaml -``` -The provided DSCI and DSC are intended to install a minimal set of Red Hat OpenShift -AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The -remaining components such as `dashboard` can be optionally enabled.
- -The configuration of the managed components differs from the default Red Hat OpenShift -AI configuration as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, -- Kueue: - - `manageJobsWithoutQueueName` is enabled, - - `batch/job` integration is disabled, - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, - - the `enableClusterQueueResources` metrics option is enabled, -- Codeflare operator: - - the AppWrapper controller is enabled and configured as follows: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler`, - - `queueName` is set to `default-queue`, -- pod priorities, resource requests and limits have been adjusted. - -To work around https://issues.redhat.com/browse/RHOAIENG-7887 (a race condition -in Red Hat OpenShift AI installation), do a rolling restart of the Kueue manager. -```sh -oc rollout restart deployment/kueue-controller-manager -n redhat-ods-applications -``` - -After doing the restart, verify that you see the following lines in the -kueue-controller-manager's log: -```sh -{"level":"info","ts":"2024-06-25T20:17:25.689638786Z","logger":"controller-runtime.builder","caller":"builder/webhook.go:189","msg":"Registering a validating webhook","GVK":"kubeflow.org/v1, Kind=PyTorchJob","path":"/validate-kubeflow-org-v1-pytorchjob"} -{"level":"info","ts":"2024-06-25T20:17:25.689698615Z","logger":"controller-runtime.webhook","caller":"webhook/server.go:183","msg":"Registering webhook","path":"/validate-kubeflow-org-v1-pytorchjob"} -{"level":"info","ts":"2024-06-25T20:17:25.689743757Z","logger":"setup","caller":"jobframework/setup.go:81","msg":"Set up controller and webhook for job framework","jobFrameworkName":"kubeflow.org/pytorchjob"} - -``` - -## Kueue Configuration - -Create Kueue's default flavor: -```sh -oc apply -f setup.RHOAI-v2.11/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -oc apply -f setup.RHOAI-v2.11/mlbatch-edit-role.yaml -``` diff --git a/setup.RHOAI-v2.11/TEAM-SETUP.md b/setup.RHOAI-v2.11/TEAM-SETUP.md deleted file mode 100644 index 85c9429..0000000 --- a/setup.RHOAI-v2.11/TEAM-SETUP.md +++ /dev/null @@ -1,91 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - - -Setting up a new team requires the cluster admin to create a project, -a user group, a quota, a queue, and the required role bindings as described below.
- -Create project: -```sh -oc new-project team1 -``` -Create user group: -```sh -oc adm groups new team1-edit-group -``` -Add users to the group, for example: -```sh -oc adm groups add-users team1-edit-group user1 -``` -Bind cluster role to group in namespace: -```sh -oc adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 -``` - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -oc apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name.
- diff --git a/setup.RHOAI-v2.11/UNINSTALL.md b/setup.RHOAI-v2.11/UNINSTALL.md deleted file mode 100644 index 776045d..0000000 --- a/setup.RHOAI-v2.11/UNINSTALL.md +++ /dev/null @@ -1,23 +0,0 @@ -# Uninstall - -***First, remove all team projects and corresponding cluster queues.*** - -Then, to uninstall the MLBatch controllers and reclaim the corresponding -namespaces, run: -```sh -# OpenShift AI uninstall -oc delete dsc mlbatch-dsc -oc delete dsci mlbatch-dsci -oc delete subscription -n redhat-ods-operator rhods-operator -oc delete csv -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator -oc delete crd featuretrackers.features.opendatahub.io \ - dscinitializations.dscinitialization.opendatahub.io \ - datascienceclusters.datasciencecluster.opendatahub.io -oc delete operators rhods-operator.redhat-ods-operator -oc delete operatorgroup -n redhat-ods-operator rhods-operator -oc delete namespace redhat-ods-applications redhat-ods-monitoring redhat-ods-operator - -# Coscheduler uninstall -helm uninstall -n scheduler-plugins scheduler-plugins -oc delete namespace scheduler-plugins -``` diff --git a/setup.RHOAI-v2.11/UPGRADE.md b/setup.RHOAI-v2.11/UPGRADE.md deleted file mode 100644 index 13821a3..0000000 --- a/setup.RHOAI-v2.11/UPGRADE.md +++ /dev/null @@ -1,31 +0,0 @@ -# Upgrading from RHOAI 2.10 - -These instructions assume you installed and configured RHOAI 2.10 following -the MLBatch [install instructions for RHOAI-v2.10](../setup.RHOAI-v2.10/CLUSTER-SETUP.md). - -Your subscription will have automatically created an unapproved -install plan to upgrade to RHOAI 2.11. - -Before beginning, verify that the expected install plan exists: -```sh -oc get ip -n redhat-ods-operator -``` -Typical output would be: -```sh -NAME CSV APPROVAL APPROVED -install-nqrbp rhods-operator.2.10.0 Manual true -install-st8vh rhods-operator.2.11.0 Manual false -``` - -Assuming the install plan exists, you can begin the upgrade process. - -First, update the MLBatch modifications to the default RHOAI configuration maps.
-```sh -oc apply -f setup.RHOAI-v2.11/mlbatch-upgrade-configmaps.yaml -``` - -Second, approve the install plan replacing the example plan name below with the actual -value on your cluster: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-st8vh -``` diff --git a/setup.RHOAI-v2.11/coscheduler-priority-patch.yaml b/setup.RHOAI-v2.11/coscheduler-priority-patch.yaml deleted file mode 100644 index 278802f..0000000 --- a/setup.RHOAI-v2.11/coscheduler-priority-patch.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: add - path: /spec/template/spec/priorityClassName - value: system-node-critical diff --git a/setup.RHOAI-v2.11/default-flavor.yaml b/setup.RHOAI-v2.11/default-flavor.yaml deleted file mode 100644 index 6cbccf3..0000000 --- a/setup.RHOAI-v2.11/default-flavor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor diff --git a/setup.RHOAI-v2.11/mlbatch-dsc.yaml b/setup.RHOAI-v2.11/mlbatch-dsc.yaml deleted file mode 100644 index 66336bc..0000000 --- a/setup.RHOAI-v2.11/mlbatch-dsc.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: datasciencecluster.opendatahub.io/v1 -kind: DataScienceCluster -metadata: - name: mlbatch-dsc -spec: - components: - codeflare: - managementState: Managed - dashboard: - managementState: Removed - datasciencepipelines: - managementState: Removed - kserve: - managementState: Removed - serving: - ingressGateway: - certificate: - type: SelfSigned - managementState: Removed - name: knative-serving - kueue: - managementState: Managed - modelmeshserving: - managementState: Removed - ray: - managementState: Managed - trainingoperator: - managementState: Managed - trustyai: - managementState: Removed - workbenches: - managementState: Removed diff --git a/setup.RHOAI-v2.11/mlbatch-dsci.yaml b/setup.RHOAI-v2.11/mlbatch-dsci.yaml deleted file mode 100644 index 77785c3..0000000 --- a/setup.RHOAI-v2.11/mlbatch-dsci.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: dscinitialization.opendatahub.io/v1 -kind: DSCInitialization -metadata: - name: mlbatch-dsci -spec: - applicationsNamespace: redhat-ods-applications - monitoring: - managementState: Managed - namespace: redhat-ods-monitoring - serviceMesh: - managementState: Removed - trustedCABundle: - customCABundle: "" - managementState: Managed diff --git a/setup.RHOAI-v2.11/mlbatch-edit-role.yaml b/setup.RHOAI-v2.11/mlbatch-edit-role.yaml deleted file mode 100644 index 42fd518..0000000 --- a/setup.RHOAI-v2.11/mlbatch-edit-role.yaml +++ /dev/null @@ -1,142 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mlbatch-edit -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - delete - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - statefulsets - verbs: - - delete - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - - secrets - - configmaps - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kueue.x-k8s.io - resources: - - "*" - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - ray.io - resources: - - rayjobs - - rayclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - delete - - get - - list - - watch -- apiGroups: - - 
workload.codeflare.dev - resources: - - appwrappers - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - scheduling.k8s.io - resources: - - priorityclasses - verbs: - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - - pods/logs - verbs: - - get -- apiGroups: - - "" - - project.openshift.io - resources: - - projects - verbs: - - get -- apiGroups: - - "" - resources: - - pods/exec - - pods/portforward - verbs: - - create diff --git a/setup.RHOAI-v2.11/mlbatch-priorities.yaml b/setup.RHOAI-v2.11/mlbatch-priorities.yaml deleted file mode 100644 index 77c8f3b..0000000 --- a/setup.RHOAI-v2.11/mlbatch-priorities.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low-priority -value: 1 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class for all lower priority jobs." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: default-priority -value: 5 -preemptionPolicy: PreemptLowerPriority -globalDefault: true -description: "This is the priority class for all jobs (default priority)." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high-priority -value: 10 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." diff --git a/setup.RHOAI-v2.11/mlbatch-subscription.yaml b/setup.RHOAI-v2.11/mlbatch-subscription.yaml deleted file mode 100644 index 45866c9..0000000 --- a/setup.RHOAI-v2.11/mlbatch-subscription.yaml +++ /dev/null @@ -1,287 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-operator ---- -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-applications ---- -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - name: rhods-operator - namespace: redhat-ods-operator ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-codeflare - namespace: redhat-ods-operator -data: - manager.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: manager - namespace: system - spec: - selector: - matchLabels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - replicas: 1 - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - spec: - priorityClassName: system-node-critical - securityContext: - runAsNonRoot: true - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 
- # seccompProfile: - # type: RuntimeDefault - containers: - - command: - - /manager - image: $(codeflare_operator_controller_image) - imagePullPolicy: Always - name: manager - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - containerPort: 8080 - protocol: TCP - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: "1" - memory: 1Gi - requests: - cpu: "1" - memory: 1Gi - serviceAccountName: controller-manager - terminationGracePeriodSeconds: 10 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: codeflare-operator-config - namespace: redhat-ods-applications -data: - config.yaml: | - appwrapper: - Config: - manageJobsWithoutQueueName: true - userRBACAdmissionCheck: false - schedulerName: scheduler-plugins-scheduler - defaultQueueName: default-queue - enabled: true ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kuberay - namespace: redhat-ods-operator -data: - kuberay-operator-image-patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: kuberay-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: kuberay-operator - image: $(image) ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kueue - namespace: redhat-ods-operator -data: - controller_manager_config.yaml: | - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8080 - enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8082 - waitForPodsReady: - enable: false - blockAdmission: false - manageJobsWithoutQueueName: true - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" - integrations: - frameworks: - # - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: controller-manager - namespace: system - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - image: $(image) - args: - - "--config=/controller_manager_config.yaml" - - "--zap-log-level=2" - - "--feature-gates=LendingLimit=true" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-training-operator - namespace: 
redhat-ods-operator -data: - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: training-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: training-operator - image: $(image) - args: - - "--zap-log-level=2" - - "--gang-scheduler-name=scheduler-plugins-scheduler" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - name: rhods-operator - namespace: redhat-ods-operator -spec: - channel: fast - installPlanApproval: Manual - name: rhods-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - startingCSV: rhods-operator.2.11.0 - config: - env: - - name: "DISABLE_DSC_CONFIG" - volumeMounts: - - name: mlbatch-codeflare - mountPath: /opt/manifests/codeflare/manager/manager.yaml - subPath: manager.yaml - - name: mlbatch-kuberay - mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml - subPath: kuberay-operator-image-patch.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml - subPath: controller_manager_config.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - - name: mlbatch-training-operator - mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - volumes: - - name: mlbatch-codeflare - configMap: - name: mlbatch-codeflare - - name: mlbatch-kuberay - configMap: - name: mlbatch-kuberay - - name: mlbatch-kueue - configMap: - name: mlbatch-kueue - - name: mlbatch-training-operator - configMap: - name: mlbatch-training-operator diff --git a/setup.RHOAI-v2.11/mlbatch-upgrade-configmaps.yaml b/setup.RHOAI-v2.11/mlbatch-upgrade-configmaps.yaml deleted file mode 100644 index e798e25..0000000 --- a/setup.RHOAI-v2.11/mlbatch-upgrade-configmaps.yaml +++ /dev/null @@ -1,102 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: codeflare-operator-config - namespace: redhat-ods-applications -data: - config.yaml: | - appwrapper: - Config: - manageJobsWithoutQueueName: true - userRBACAdmissionCheck: false - schedulerName: scheduler-plugins-scheduler - defaultQueueName: default-queue - enabled: true ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kueue - namespace: redhat-ods-operator -data: - controller_manager_config.yaml: | - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8080 - enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8082 - waitForPodsReady: - enable: false - blockAdmission: false - manageJobsWithoutQueueName: true - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" - integrations: - frameworks: - # - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - 
externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: controller-manager - namespace: system - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - image: $(image) - args: - - "--config=/controller_manager_config.yaml" - - "--zap-log-level=2" - - "--feature-gates=LendingLimit=true" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config ---- diff --git a/setup.RHOAI-v2.14/CLUSTER-SETUP.md b/setup.RHOAI-v2.14/CLUSTER-SETUP.md deleted file mode 100644 index c0c5dd9..0000000 --- a/setup.RHOAI-v2.14/CLUSTER-SETUP.md +++ /dev/null @@ -1,148 +0,0 @@ -# Cluster Setup - -The cluster setup installs Red Hat OpenShift AI and Coscheduler, configures Kueue, -cluster roles, and priority classes. - -If MLBatch is deployed on a cluster that used to run earlier versions of ODH, -[MCAD](https://github.com/project-codeflare/mcad), Red Hat OpenShift AI, or Coscheduler, -make sure to scrub traces of these installations. In particular, make sure to -delete the following custom resource definitions (CRD) if present on the -cluster. Make sure to delete all instances prior to deleting the CRDs: -```sh -# Delete old appwrappers and crd -oc delete appwrappers --all -A -oc delete crd appwrappers.workload.codeflare.dev - -# Delete old noderesourcetopologies and crd -oc delete noderesourcetopologies --all -A -oc delete crd noderesourcetopologies.topology.node.k8s.io -``` - -## Priorities - -Create `default-priority`, `high-priority`, and `low-priority` priority classes: -```sh -oc apply -f setup.RHOAI-v2.14/mlbatch-priorities.yaml -``` - -## Coscheduler - -Install Coscheduler v0.28.9 as a secondary scheduler and configure packing: -```sh -helm install scheduler-plugins --namespace scheduler-plugins --create-namespace \ - scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ \ - --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"}]' -``` -Patch Coscheduler pod priorities: -```sh -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.14/coscheduler-priority-patch.yaml scheduler-plugins-controller -oc patch deployment -n scheduler-plugins --type=json --patch-file setup.RHOAI-v2.14/coscheduler-priority-patch.yaml scheduler-plugins-scheduler -``` - -## Red Hat OpenShift AI - -Create the Red Hat OpenShift AI subscription: -```sh -oc apply -f setup.RHOAI-v2.14/mlbatch-subscription.yaml -``` -Identify the install plan: -```sh -oc get ip -n redhat-ods-operator -``` -``` -NAMESPACE NAME CSV APPROVAL APPROVED -redhat-ods-operator install-kmh8w rhods-operator.2.14.0 Manual false -``` -Approve the install plan, replacing the generated plan name below with the actual -value: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kmh8w -``` -Create DSC Initialization: -```sh -oc apply -f setup.RHOAI-v2.14/mlbatch-dsci.yaml -``` -Create Data Science Cluster: -```sh -oc apply -f
setup.RHOAI-v2.14/mlbatch-dsc.yaml -``` -The provided DSCI and DSC are intended to install a minimal set of Red Hat OpenShift -AI managed components: `codeflare`, `kueue`, `ray`, and `trainingoperator`. The -remaining components such as `dashboard` can be optionally enabled. - -The configuration of the managed components differs from the default Red Hat OpenShift -AI configuration as follows: -- Kubeflow Training Operator: - - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, -- Kueue: - - `manageJobsWithoutQueueName` is enabled, - - `batch/job` integration is disabled, - - `waitForPodsReady` is disabled, - - `LendingLimit` feature gate is enabled, - - the `enableClusterQueueResources` metrics option is enabled, -- Codeflare operator: - - the AppWrapper controller is enabled and configured as follows: - - `userRBACAdmissionCheck` is disabled, - - `schedulerName` is set to `scheduler-plugins-scheduler`, - - `queueName` is set to `default-queue`, - - `slackQueueName` is set to `slack-cluster-queue` -- pod priorities, resource requests and limits have been adjusted. - - - -## Kueue Configuration - -Create Kueue's default flavor: -```sh -oc apply -f setup.RHOAI-v2.14/default-flavor.yaml -``` - -## Cluster Role - -Create `mlbatch-edit` role: -```sh -oc apply -f setup.RHOAI-v2.14/mlbatch-edit-role.yaml -``` - -## Slack Cluster Queue - -Create the designated slack `ClusterQueue`, which will be used to automate -minor adjustments to cluster capacity caused by node failures and -scheduler maintenance. -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: slack-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - - name: "memory" - nominalQuota: 128Gi - - name: "nvidia.com/gpu" - nominalQuota: 8 - - name: "nvidia.com/roce_gdr" - nominalQuota: 1 - - name: "pods" - nominalQuota: 100 -EOF -``` -Edit the above quantities to adjust the quota to the desired -values. Pod counts are optional and can be omitted from the list of -covered resources. The `lendingLimit` for each resource will be -dynamically adjusted by the MLBatch system to reflect reduced cluster -capacity. See [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) for a -detailed discussion of the role of the slack `ClusterQueue`. diff --git a/setup.RHOAI-v2.14/TEAM-SETUP.md b/setup.RHOAI-v2.14/TEAM-SETUP.md deleted file mode 100644 index 85c9429..0000000 --- a/setup.RHOAI-v2.14/TEAM-SETUP.md +++ /dev/null @@ -1,91 +0,0 @@ -# Team Setup - -A *team* in MLBatch is a group of users that share a resource quota. - -Before setting up your teams and quotas, please read [QUOTA_MAINTENANCE.md](../QUOTA_MAINTENANCE.md) -for a discussion of our recommended best practices. - - -Setting up a new team requires the cluster admin to create a project, -a user group, a quota, a queue, and the required role bindings as described below.
- -Create project: -```sh -oc new-project team1 -``` -Create user group: -```sh -oc adm groups new team1-edit-group -``` -Add users to the group, for example: -```sh -oc adm groups add-users team1-edit-group user1 -``` -Bind cluster role to group in namespace: -```sh -oc adm policy add-role-to-group mlbatch-edit team1-edit-group --role-namespace="" --namespace team1 -``` - -Specify the intended quota for the namespace by creating a `ClusterQueue`: -```sh -oc apply -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ClusterQueue -metadata: - name: team1-cluster-queue -spec: - namespaceSelector: {} - cohort: default-cohort - preemption: - withinClusterQueue: LowerOrNewerEqualPriority - reclaimWithinCohort: Any - borrowWithinCohort: - policy: Never - resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "nvidia.com/roce_gdr", "pods"] - flavors: - - name: default-flavor - resources: - - name: "cpu" - nominalQuota: 8000m - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "memory" - nominalQuota: 128Gi - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/gpu" - nominalQuota: 16 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "nvidia.com/roce_gdr" - nominalQuota: 4 - # borrowingLimit: 0 - # lendingLimit: 0 - - name: "pods" - nominalQuota: 100 - # borrowingLimit: 0 - # lendingLimit: 0 -EOF -``` -Edit the above quantities to adjust the quota to the desired values. Pod counts -are optional and can be omitted from the list of covered resources. - -Uncomment all `borrowingLimit` lines to prevent this namespace from borrowing -quota from other namespaces. Uncomment all `lendingLimit` lines to prevent other -namespaces from borrowing quota from this namespace. - -Create a `LocalQueue` to bind the `ClusterQueue` to the namespace: -```sh -oc apply -n team1 -f- << EOF -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: default-queue -spec: - clusterQueue: team1-cluster-queue -EOF -``` -We recommend naming the local queue `default-queue` as `AppWrappers` will -default to this queue name.
- diff --git a/setup.RHOAI-v2.14/UNINSTALL.md b/setup.RHOAI-v2.14/UNINSTALL.md deleted file mode 100644 index 776045d..0000000 --- a/setup.RHOAI-v2.14/UNINSTALL.md +++ /dev/null @@ -1,23 +0,0 @@ -# Uninstall - -***First, remove all team projects and corresponding cluster queues.*** - -Then, to uninstall the MLBatch controllers and reclaim the corresponding -namespaces, run: -```sh -# OpenShift AI uninstall -oc delete dsc mlbatch-dsc -oc delete dsci mlbatch-dsci -oc delete subscription -n redhat-ods-operator rhods-operator -oc delete csv -n redhat-ods-operator -l operators.coreos.com/rhods-operator.redhat-ods-operator -oc delete crd featuretrackers.features.opendatahub.io \ - dscinitializations.dscinitialization.opendatahub.io \ - datascienceclusters.datasciencecluster.opendatahub.io -oc delete operators rhods-operator.redhat-ods-operator -oc delete operatorgroup -n redhat-ods-operator rhods-operator -oc delete namespace redhat-ods-applications redhat-ods-monitoring redhat-ods-operator - -# Coscheduler uninstall -helm uninstall -n scheduler-plugins scheduler-plugins -oc delete namespace scheduler-plugins -``` diff --git a/setup.RHOAI-v2.14/UPGRADE.md b/setup.RHOAI-v2.14/UPGRADE.md deleted file mode 100644 index 0f124ba..0000000 --- a/setup.RHOAI-v2.14/UPGRADE.md +++ /dev/null @@ -1,29 +0,0 @@ -# Upgrading from RHOAI 2.13 - -These instructions assume you installed and configured RHOAI 2.13 following -the MLBatch [install instructions for RHOAI-v2.13](../setup.RHOAI-v2.13/CLUSTER-SETUP.md) -and are subscribed to the fast channel. - -Your subscription will have automatically created an unapproved -install plan to upgrade to RHOAI 2.14. - -Before beginning, verify that the expected install plan exists: -```sh -oc get ip -n redhat-ods-operator -``` -Typical output would be: -```sh -NAME CSV APPROVAL APPROVED -install-kpzzl rhods-operator.2.14.0 Manual false -install-nqrbp rhods-operator.2.13.0 Manual true -``` - -Assuming the install plan exists, you can begin the upgrade process. - -There are no MLBatch modifications to the default RHOAI configuration maps -beyond those already made in previous installs.
Therefore, you can simply -approve the install plan replacing the example plan name below with the actual -value on your cluster: -```sh -oc patch ip -n redhat-ods-operator --type merge --patch '{"spec":{"approved":true}}' install-kpzzl -``` diff --git a/setup.RHOAI-v2.14/coscheduler-priority-patch.yaml b/setup.RHOAI-v2.14/coscheduler-priority-patch.yaml deleted file mode 100644 index 278802f..0000000 --- a/setup.RHOAI-v2.14/coscheduler-priority-patch.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- op: add - path: /spec/template/spec/priorityClassName - value: system-node-critical diff --git a/setup.RHOAI-v2.14/default-flavor.yaml b/setup.RHOAI-v2.14/default-flavor.yaml deleted file mode 100644 index 6cbccf3..0000000 --- a/setup.RHOAI-v2.14/default-flavor.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: kueue.x-k8s.io/v1beta1 -kind: ResourceFlavor -metadata: - name: default-flavor diff --git a/setup.RHOAI-v2.14/mlbatch-dsc.yaml b/setup.RHOAI-v2.14/mlbatch-dsc.yaml deleted file mode 100644 index 66336bc..0000000 --- a/setup.RHOAI-v2.14/mlbatch-dsc.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: datasciencecluster.opendatahub.io/v1 -kind: DataScienceCluster -metadata: - name: mlbatch-dsc -spec: - components: - codeflare: - managementState: Managed - dashboard: - managementState: Removed - datasciencepipelines: - managementState: Removed - kserve: - managementState: Removed - serving: - ingressGateway: - certificate: - type: SelfSigned - managementState: Removed - name: knative-serving - kueue: - managementState: Managed - modelmeshserving: - managementState: Removed - ray: - managementState: Managed - trainingoperator: - managementState: Managed - trustyai: - managementState: Removed - workbenches: - managementState: Removed diff --git a/setup.RHOAI-v2.14/mlbatch-dsci.yaml b/setup.RHOAI-v2.14/mlbatch-dsci.yaml deleted file mode 100644 index 77785c3..0000000 --- a/setup.RHOAI-v2.14/mlbatch-dsci.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: dscinitialization.opendatahub.io/v1 -kind: DSCInitialization -metadata: - name: mlbatch-dsci -spec: - applicationsNamespace: redhat-ods-applications - monitoring: - managementState: Managed - namespace: redhat-ods-monitoring - serviceMesh: - managementState: Removed - trustedCABundle: - customCABundle: "" - managementState: Managed diff --git a/setup.RHOAI-v2.14/mlbatch-edit-role.yaml b/setup.RHOAI-v2.14/mlbatch-edit-role.yaml deleted file mode 100644 index 42fd518..0000000 --- a/setup.RHOAI-v2.14/mlbatch-edit-role.yaml +++ /dev/null @@ -1,142 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: mlbatch-edit -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - delete - - get - - list - - watch -- apiGroups: - - apps - resources: - - deployments - - statefulsets - verbs: - - delete - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - - secrets - - configmaps - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - kueue.x-k8s.io - resources: - - "*" - verbs: - - get - - list - - watch -- apiGroups: - - kubeflow.org - resources: - - pytorchjobs - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - ray.io - resources: - - rayjobs - - rayclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - delete - - get - - list - - watch -- apiGroups: - - workload.codeflare.dev - resources: - - appwrappers - verbs: 
- - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - scheduling.k8s.io - resources: - - priorityclasses - verbs: - - get - - list - - watch -- apiGroups: - - scheduling.x-k8s.io - resources: - - podgroups - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - events - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - namespaces - - pods/logs - verbs: - - get -- apiGroups: - - "" - - project.openshift.io - resources: - - projects - verbs: - - get -- apiGroups: - - "" - resources: - - pods/exec - - pods/portforward - verbs: - - create diff --git a/setup.RHOAI-v2.14/mlbatch-priorities.yaml b/setup.RHOAI-v2.14/mlbatch-priorities.yaml deleted file mode 100644 index 77c8f3b..0000000 --- a/setup.RHOAI-v2.14/mlbatch-priorities.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: low-priority -value: 1 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class for all lower priority jobs." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: default-priority -value: 5 -preemptionPolicy: PreemptLowerPriority -globalDefault: true -description: "This is the priority class for all jobs (default priority)." ---- -apiVersion: scheduling.k8s.io/v1 -kind: PriorityClass -metadata: - name: high-priority -value: 10 -preemptionPolicy: PreemptLowerPriority -globalDefault: false -description: "This is the priority class defined for highly important jobs that would evict lower and default priority jobs." diff --git a/setup.RHOAI-v2.14/mlbatch-subscription.yaml b/setup.RHOAI-v2.14/mlbatch-subscription.yaml deleted file mode 100644 index 848f30d..0000000 --- a/setup.RHOAI-v2.14/mlbatch-subscription.yaml +++ /dev/null @@ -1,301 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-operator ---- -apiVersion: v1 -kind: Namespace -metadata: - name: redhat-ods-applications ---- -apiVersion: operators.coreos.com/v1 -kind: OperatorGroup -metadata: - name: rhods-operator - namespace: redhat-ods-operator ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-codeflare - namespace: redhat-ods-operator -data: - manager.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: manager - namespace: system - spec: - selector: - matchLabels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - replicas: 1 - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - app.kubernetes.io/name: codeflare-operator - app.kubernetes.io/part-of: codeflare - spec: - priorityClassName: system-node-critical - securityContext: - runAsNonRoot: true - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.20 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). 
- # seccompProfile: - # type: RuntimeDefault - containers: - - command: - - /manager - image: $(codeflare_operator_controller_image) - imagePullPolicy: Always - name: manager - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - env: - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - containerPort: 8080 - protocol: TCP - name: metrics - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: "1" - memory: 1Gi - requests: - cpu: "1" - memory: 1Gi - serviceAccountName: controller-manager - terminationGracePeriodSeconds: 10 ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: codeflare-operator-config - namespace: redhat-ods-applications -data: - config.yaml: | - appwrapper: - enabled: true - Config: - manageJobsWithoutQueueName: true - userRBACAdmissionCheck: false - schedulerName: scheduler-plugins-scheduler - defaultQueueName: default-queue - slackQueueName: slack-cluster-queue - autopilot: - injectAntiAffinities: false - resourceTaints: - nvidia.com/gpu: - - key: autopilot.ibm.com/gpuhealth - value: ERR - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: TESTING - effect: NoSchedule - - key: autopilot.ibm.com/gpuhealth - value: EVICT - effect: NoExecute ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kuberay - namespace: redhat-ods-operator -data: - kuberay-operator-image-patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: kuberay-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: kuberay-operator - image: $(image) ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-kueue - namespace: redhat-ods-operator -data: - controller_manager_config.yaml: | - apiVersion: config.kueue.x-k8s.io/v1beta1 - kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8080 - enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io - controller: - groupKindConcurrency: - Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 - LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 - ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8082 - waitForPodsReady: - enable: false - blockAdmission: false - manageJobsWithoutQueueName: true - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" - integrations: - frameworks: - # - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "kubeflow.org/mxjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - # - "pod" - externalFrameworks: - - "AppWrapper.v1beta2.workload.codeflare.dev" - # podOptions: - # namespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: controller-manager - namespace: system - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: manager - image: $(image) - args: - - "--config=/controller_manager_config.yaml" - - "--zap-log-level=2" - - 
"--feature-gates=LendingLimit=true" - volumeMounts: - - name: manager-config - mountPath: /controller_manager_config.yaml - subPath: controller_manager_config.yaml - volumes: - - name: manager-config - configMap: - name: manager-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: mlbatch-training-operator - namespace: redhat-ods-operator -data: - manager_config_patch.yaml: | - apiVersion: apps/v1 - kind: Deployment - metadata: - name: training-operator - spec: - template: - spec: - priorityClassName: system-node-critical - containers: - - name: training-operator - image: $(image) - args: - - "--zap-log-level=2" - - "--gang-scheduler-name=scheduler-plugins-scheduler" - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 500m - memory: 1000Mi ---- -apiVersion: operators.coreos.com/v1alpha1 -kind: Subscription -metadata: - name: rhods-operator - namespace: redhat-ods-operator -spec: - channel: fast - installPlanApproval: Manual - name: rhods-operator - source: redhat-operators - sourceNamespace: openshift-marketplace - startingCSV: rhods-operator.2.14.0 - config: - env: - - name: "DISABLE_DSC_CONFIG" - volumeMounts: - - name: mlbatch-codeflare - mountPath: /opt/manifests/codeflare/manager/manager.yaml - subPath: manager.yaml - - name: mlbatch-kuberay - mountPath: /opt/manifests/ray/openshift/kuberay-operator-image-patch.yaml - subPath: kuberay-operator-image-patch.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/components/manager/controller_manager_config.yaml - subPath: controller_manager_config.yaml - - name: mlbatch-kueue - mountPath: /opt/manifests/kueue/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - - name: mlbatch-training-operator - mountPath: /opt/manifests/trainingoperator/rhoai/manager_config_patch.yaml - subPath: manager_config_patch.yaml - volumes: - - name: mlbatch-codeflare - configMap: - name: mlbatch-codeflare - - name: mlbatch-kuberay - configMap: - name: mlbatch-kuberay - - name: mlbatch-kueue - configMap: - name: mlbatch-kueue - - name: mlbatch-training-operator - configMap: - name: mlbatch-training-operator diff --git a/setup.tmpl/Makefile b/setup.tmpl/Makefile index c86e8b2..1507774 100644 --- a/setup.tmpl/Makefile +++ b/setup.tmpl/Makefile @@ -23,12 +23,8 @@ help: ## Display this help. 
docs: gotmpl ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.10/CLUSTER-SETUP.md -values RHOAI-v2.10.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.10/TEAM-SETUP.md -values RHOAI-v2.10.yaml - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.11/CLUSTER-SETUP.md -values RHOAI-v2.11.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.11/TEAM-SETUP.md -values RHOAI-v2.11.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.13/CLUSTER-SETUP.md -values RHOAI-v2.13.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.13/TEAM-SETUP.md -values RHOAI-v2.13.yaml - ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.14/CLUSTER-SETUP.md -values RHOAI-v2.14.yaml - ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.14/TEAM-SETUP.md -values RHOAI-v2.14.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.RHOAI-v2.15/CLUSTER-SETUP.md -values RHOAI-v2.15.yaml ../tools/gotmpl/gotmpl -input ./TEAM-SETUP.md.tmpl -output ../setup.RHOAI-v2.15/TEAM-SETUP.md -values RHOAI-v2.15.yaml ../tools/gotmpl/gotmpl -input ./CLUSTER-SETUP.md.tmpl -output ../setup.k8s-v1.27/CLUSTER-SETUP.md -values Kubernetes-v1.27.yaml diff --git a/setup.tmpl/RHOAI-v2.11.yaml b/setup.tmpl/RHOAI-v2.11.yaml deleted file mode 100644 index 6bc7d8b..0000000 --- a/setup.tmpl/RHOAI-v2.11.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Values for RHOAI 2.11 - -OPENSHIFT: true -VERSION: RHOAI-v2.11 -KUBECTL: oc -SLACKCQ: false diff --git a/setup.tmpl/RHOAI-v2.14.yaml b/setup.tmpl/RHOAI-v2.14.yaml deleted file mode 100644 index 631b9fd..0000000 --- a/setup.tmpl/RHOAI-v2.14.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Values for RHOAI 2.14 - -OPENSHIFT: true -VERSION: RHOAI-v2.14 -KUBECTL: oc -SLACKCQ: true \ No newline at end of file