Gpu Refactor (#298)

* add chart folders to gitignore * update validation to handle --enable-helm * refactor gpu configs into components * update wordlist * skip shellcheck --------- Co-authored-by: Alexis de Talhouët <adetalhouet89@gmail.com>
redhat-cop · May 2, 2024 · 0d9c167 · 0d9c167
1 parent 38eea3b
commit 0d9c167
Show file tree

Hide file tree

Showing 58 changed files with 1,076 additions and 134 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,8 @@
 venv/
+**/charts/
 
 .DS_Store
 temp
 /.idea/
 
-dictionary.dic
+dictionary.dic
diff --git a/.wordlist-md b/.wordlist-md
@@ -55,6 +55,7 @@ Lifecycle
 Logstash
 MTA
 MachineConfig
+MachineSet
 Minio
 MultiClusterHub
 NFD
@@ -177,6 +178,8 @@ letsencrypt
 libvirt
 lifecycle
 linux
+machineset
+mig
 microservices
 namespace
 namespaces

diff --git a/gpu-operator-certified/aggregate/overlays/aws/kustomization.yaml b/gpu-operator-certified/aggregate/overlays/aws/kustomization.yaml
@@ -0,0 +1,9 @@
+# Aggregate overlay for AWS: installs the operator from the stable channel
+# overlay together with the AWS flavoured operator instance.
+kind: Kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+
+resources:
+  - ../../../operator/overlays/stable
+  - ../../../instance/overlays/aws
+
+# Applied to every resource rendered by this overlay.
+commonAnnotations:
+  argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
diff --git a/gpu-operator-certified/aggregate/overlays/default/kustomization.yaml b/gpu-operator-certified/aggregate/overlays/default/kustomization.yaml
@@ -4,8 +4,6 @@ kind: Kustomization
 commonAnnotations:
   argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
 
-namespace: nvidia-gpu-operator
-
 resources:
-  - ../../../operator/overlays/stable
   - ../../../instance/overlays/default
+  - ../../../operator/overlays/stable
diff --git a/gpu-operator-certified/instance/README.md → gpu-operator-certified/instance/INFO.md b/gpu-operator-certified/instance/README.md → gpu-operator-certified/instance/INFO.md
@@ -1,9 +1,5 @@
 # GPU Notes
 
-For more info please review the following:
-
-- [Demo GPUs on OpenShift](https://github.com/redhat-na-ssa/demo-ocp-gpu)
-
 ## Instance Types
 
 AWS GPU Types:
@@ -40,12 +36,13 @@ Time-slicing GPU can be any Nvidia type (as documented by Nvidia):
   - `g3.8xlarge`  - 2 x M60
   - `g3.16xlarge` - 4 x M60
 
+
 ## Links
 
 - [Docs - AWS GPU Instances](https://aws.amazon.com/ec2/instance-types/#Accelerated_Computing)
 - [Docs - Nvidia GPU Operator on Openshift](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/contents.html)
 - [Docs - Nvidia GPU admin dashboard](https://docs.openshift.com/container-platform/4.11/monitoring/nvidia-gpu-admin-dashboard.html)
 - [Docs - MIG support in OCP](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/openshift/mig-ocp.html)
-- [Blog - RH Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift)
+- [Blog - Red Hat Nvidia GPUs on OpenShift](https://cloud.redhat.com/blog/autoscaling-nvidia-gpus-on-red-hat-openshift)
 - [Demo - GPU DevSpaces](https://github.com/bkoz/devspaces)
 - [GPU Operator default config map](https://gitlab.com/nvidia/kubernetes/gpu-operator/-/blob/v23.6.1/assets/state-mig-manager/0400_configmap.yaml?ref_type=tags)
diff --git a/gpu-operator-certified/instance/base/cluster-policy.yaml b/gpu-operator-certified/instance/base/cluster-policy.yaml
@@ -2,6 +2,7 @@ kind: ClusterPolicy
 apiVersion: nvidia.com/v1
 metadata:
   name: gpu-cluster-policy
+  namespace: nvidia-gpu-operator
 spec:
   operator:
     defaultRuntime: crio
@@ -50,6 +51,10 @@ spec:
     updateStrategy: RollingUpdate
     rollingUpdate:
       maxUnavailable: '1'
+    tolerations:
+      - effect: NoSchedule
+        key: nvidia-gpu-only
+        operator: Exists
   devicePlugin:
     enabled: true
     config:

diff --git a/gpu-operator-certified/instance/base/device-plugin-config.yaml b/gpu-operator-certified/instance/base/device-plugin-config.yaml
@@ -2,4 +2,5 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
   name: device-plugin-config
+  namespace: nvidia-gpu-operator
 data: {}
diff --git a/gpu-operator-certified/instance/base/kustomization.yaml b/gpu-operator-certified/instance/base/kustomization.yaml
@@ -1,8 +1,9 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
-namespace: nvidia-gpu-operator
-
 resources:
   - cluster-policy.yaml
   - device-plugin-config.yaml
+
+components:
+  - ../components/monitoring-dashboard
diff --git a/gpu-operator-certified/instance/components/README.md b/gpu-operator-certified/instance/components/README.md
@@ -0,0 +1,28 @@
+# NVIDIA GPU Operator Components
+
+The included components are intended to be common patching patterns used on top of the default NVIDIA GPU operator instance to configure additional features.  Components are composable patches that can be added at the overlays layer on top of a base.
+
+This repo currently contains the following components:
+
+* [aws-gpu-machineset](aws-gpu-machineset)
+* [mig-mixed](mig-mixed)
+* [mig-single](mig-single)
+* [monitoring-dashboard](monitoring-dashboard)
+* [time-sliced](time-sliced)
+* [time-sliced-2](time-sliced-2)
+* [time-sliced-4](time-sliced-4)
+
+## Usage
+
+Components can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:
+
+```yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - ../../components/monitoring-dashboard
+```
diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/README.md b/gpu-operator-certified/instance/components/aws-gpu-machineset/README.md
@@ -0,0 +1,24 @@
+# aws-gpu-machineset
+
+## Purpose
+
+This component is designed to set up a GPU-enabled MachineSet on an AWS-based OpenShift cluster.
+
+This component triggers a job that creates a new MachineSet by cloning an existing worker MachineSet and changing its instance type.
+
+This component has been tested using AWS-based OpenShift instances provisioned by demo.redhat.com.
+
+## Usage
+
+This component can be added to a base by adding the `components` section to your overlay `kustomization.yaml` file:
+
+```yaml
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../../base
+
+components:
+  - ../../components/aws-gpu-machineset
+```
diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh b/gpu-operator-certified/instance/components/aws-gpu-machineset/job.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# shellcheck disable=SC1091,SC2120
+
+ocp_aws_cluster(){
+  TARGET_NS=kube-system
+  OBJ=secret/aws-creds
+  echo "Checking if ${OBJ} exists in ${TARGET_NS} namespace"
+  oc -n "${TARGET_NS}" get "${OBJ}" -o name > /dev/null 2>&1 || return 1
+  echo "AWS cluster detected"
+}
+
+ocp_aws_create_gpu_machineset(){
+  # https://aws.amazon.com/ec2/instance-types/g4
+  # single gpu: g4dn.{2,4,8,16}xlarge
+  # multi gpu:  g4dn.12xlarge
+  # practical:  g4ad.4xlarge
+  # a100 (MIG): p4d.24xlarge
+  # h100 (MIG): p5.48xlarge
+
+  # https://aws.amazon.com/ec2/instance-types/dl1
+  # 8 x gaudi:  dl1.24xlarge
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+
+  ocp_aws_clone_machineset "${INSTANCE_TYPE}"
+
+  MACHINE_SET_TYPE=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep "${INSTANCE_TYPE%.*}" | head -n1)
+
+  echo "Patching: ${MACHINE_SET_TYPE}"
+
+  # cosmetic
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"node-role.kubernetes.io/gpu":""}}}}}}'
+
+  # taint nodes for gpu-only workloads
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"taints":[{"key":"nvidia-gpu-only","value":"","effect":"NoSchedule"}]}}}}'
+
+  # should use the default profile
+  # oc -n openshift-machine-api \
+  #   patch "${MACHINE_SET_TYPE}" \
+  #   --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"nvidia.com/device-plugin.config":"no-time-sliced"}}}}}}'
+
+  # should help auto provisioner
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}}}}'
+
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"metadata":{"labels":{"cluster-api/accelerator":"nvidia-gpu"}}}'
+
+  oc -n openshift-machine-api \
+    patch "${MACHINE_SET_TYPE}" \
+    --type=merge --patch '{"spec":{"template":{"spec":{"providerSpec":{"value":{"instanceType":"'"${INSTANCE_TYPE}"'"}}}}}}'
+}
+
+ocp_aws_clone_machineset(){
+  # Clone the first worker machineset into a new machineset for the given
+  # instance type, created with zero replicas. Does nothing when a
+  # machineset for that instance family already exists.
+  #   $1 - AWS instance type (default g4dn.4xlarge)
+  # Returns 1 when there is no worker machineset to clone.
+  [ -z "${1}" ] && \
+  echo "
+    usage: ocp_aws_create_gpu_machineset < instance type, default g4dn.4xlarge >
+  "
+
+  INSTANCE_TYPE=${1:-g4dn.4xlarge}
+  MACHINE_SET=$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep worker | head -n1)
+
+  # check for an existing instance machine set
+  if oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | grep -q "${INSTANCE_TYPE%.*}"; then
+    echo "Exists: machineset - ${INSTANCE_TYPE}"
+  else
+    # nothing to clone from; avoid piping an empty document into oc apply
+    if [ -z "${MACHINE_SET}" ]; then
+      echo "Error: no worker machineset found to clone" >&2
+      return 1
+    fi
+
+    echo "Creating: machineset - ${INSTANCE_TYPE}"
+    # rewrite the worker names / labels, set the instance type and start
+    # with zero replicas so nothing is provisioned until scaled up
+    oc -n openshift-machine-api \
+      get "${MACHINE_SET}" -o yaml | \
+        sed '/machine/ s/-worker/-'"${INSTANCE_TYPE}"'/g
+          /name/ s/-worker/-'"${INSTANCE_TYPE%.*}"'/g
+          s/instanceType.*/instanceType: '"${INSTANCE_TYPE}"'/
+          s/replicas.*/replicas: 0/' | \
+      oc apply -f -
+  fi
+}
+
+ocp_create_machineset_autoscale(){
+  MACHINE_MIN=${1:-0}
+  MACHINE_MAX=${2:-4}
+  MACHINE_SETS=${3:-$(oc -n openshift-machine-api get machinesets.machine.openshift.io -o name | sed 's@.*/@@' )}
+
+  for set in ${MACHINE_SETS}
+  do
+cat << YAML | oc apply -f -
+apiVersion: "autoscaling.openshift.io/v1beta1"
+kind: "MachineAutoscaler"
+metadata:
+  name: "${set}"
+  namespace: "openshift-machine-api"
+spec:
+  minReplicas: ${MACHINE_MIN}
+  maxReplicas: ${MACHINE_MAX}
+  scaleTargetRef:
+    apiVersion: machine.openshift.io/v1beta1
+    kind: MachineSet
+    name: "${set}"
+YAML
+  done
+}
+
+ocp_aws_cluster || exit 0
+ocp_aws_create_gpu_machineset
+ocp_create_machineset_autoscale
diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/job.yaml b/gpu-operator-certified/instance/components/aws-gpu-machineset/job.yaml
@@ -0,0 +1,88 @@
+---
+# Identity used by the aws-gpu-machineset job pod.
+kind: ServiceAccount
+apiVersion: v1
+metadata:
+  namespace: nvidia-gpu-operator
+  name: job-aws-gpu-machineset
+---
+# Permissions for the aws-gpu-machineset job. Verbs are limited to what
+# job.sh performs (oc get / oc patch / oc apply) instead of a wildcard, so
+# the job cannot delete machinesets or autoscalers.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: job-aws-gpu-machineset
+rules:
+  # list, clone (apply) and patch machinesets
+  - apiGroups:
+      - machine.openshift.io
+    resources:
+      - machinesets
+    verbs:
+      - get
+      - list
+      - create
+      - patch
+      - update
+  # apply a MachineAutoscaler per machineset
+  - apiGroups:
+      - autoscaling.openshift.io
+    resources:
+      - machineautoscalers
+    verbs:
+      - get
+      - list
+      - create
+      - patch
+      - update
+  # read-only probe of the AWS credentials secret to detect an AWS cluster
+  - apiGroups:
+      - ''
+    resources:
+      - secrets
+    resourceNames:
+      - aws-creds
+    verbs:
+      - get
+      - list
+# - nonResourceURLs:
+#   - '*'
+#   verbs:
+#   - '*'
+---
+# Grants the job's service account the job-aws-gpu-machineset ClusterRole.
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: job-aws-gpu-machineset
+subjects:
+  - kind: ServiceAccount
+    namespace: nvidia-gpu-operator
+    name: job-aws-gpu-machineset
+roleRef:
+  kind: ClusterRole
+  apiGroup: rbac.authorization.k8s.io
+  name: job-aws-gpu-machineset
+---
+# Runs job.sh (mounted from the job-aws-gpu-machineset ConfigMap) as an
+# Argo CD Sync hook to create the GPU machineset and its autoscalers.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  # name takes precedence when both are set; generateName only applies if
+  # name is removed
+  generateName: job-aws-gpu-machineset-
+  name: job-aws-gpu-machineset
+  namespace: nvidia-gpu-operator
+  annotations:
+    argocd.argoproj.io/hook: Sync
+    # argocd.argoproj.io/hook-delete-policy: HookSucceeded
+spec:
+  template:
+    spec:
+      containers:
+        - name: job-aws-gpu-machineset
+          # image: image-registry.openshift-image-registry.svc:5000/openshift/tools:latest
+          image: registry.redhat.io/openshift4/ose-cli
+          env:
+            - name: NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          command:
+            - /bin/bash
+            - -c
+            - /scripts/job.sh
+          volumeMounts:
+            - name: scripts
+              mountPath: /scripts
+      volumes:
+        - name: scripts
+          configMap:
+            name: job-aws-gpu-machineset
+            # octal file mode: the mounted script must be executable
+            defaultMode: 0755
+      restartPolicy: Never
+      terminationGracePeriodSeconds: 30
+      # the deprecated serviceAccount alias is intentionally not set
+      serviceAccountName: job-aws-gpu-machineset
diff --git a/gpu-operator-certified/instance/components/aws-gpu-machineset/kustomization.yaml b/gpu-operator-certified/instance/components/aws-gpu-machineset/kustomization.yaml
@@ -0,0 +1,15 @@
+# Kustomize component: adds the machineset job and packages job.sh into the
+# ConfigMap that the job mounts.
+kind: Component
+apiVersion: kustomize.config.k8s.io/v1alpha1
+
+# Keep the ConfigMap name stable so job.yaml can reference it by name.
+generatorOptions:
+  disableNameSuffixHash: true
+
+configMapGenerator:
+  - namespace: nvidia-gpu-operator
+    name: job-aws-gpu-machineset
+    files:
+      - job.sh
+
+resources:
+  # - ../../../../../../scripts/library
+  - job.yaml