Add autoscale check for GPU (opendatahub-io#573)
* Add autoscale check

* Clean after rebase

* Add scalable gpus to dropdown count

* Change code to use MachineAutoscalers

* Use machineautoscalers, add logic to frontend

* check available machine replicas in max scalable calculation

* Add role change and fix availability check

* Fix machineSet get logic

* Update frontend/src/pages/notebookController/screens/server/GPUSelectField.tsx

Use the value from the reduce

* Update backend/src/routes/api/gpu/gpuUtils.ts

* Update backend/src/routes/api/gpu/gpuUtils.ts

* Update frontend/src/pages/notebookController/screens/server/GPUSelectField.tsx

* add saToken to kube, optimise gpu data getting

Co-authored-by: Andrew Ballantyne <8126518+andrewballantyne@users.noreply.github.com>
2 people authored and strangiato committed Oct 18, 2023
1 parent 4c3e6af commit 4df427f
Showing 8 changed files with 201 additions and 29 deletions.
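For orientation before the file-by-file diff: the net effect of the change is that the backend's /api/gpu endpoint stops returning a [boolean, number] tuple and instead returns a structured GPUInfo object that the frontend uses to size the GPU dropdown. A minimal sketch of that shape, with illustrative values only (not taken from the commit):

type gpuScale = { availableScale: number; gpuNumber: number };
type GPUInfo = { configured: boolean; available: number; autoscalers: gpuScale[] };

// Hypothetical response: GPUs are configured, one DCGM exporter pod currently reports 2 GPUs,
// and one autoscaled MachineSet with 4 GPUs per machine can still add a machine.
const exampleResponse: GPUInfo = {
  configured: true,
  available: 2,
  autoscalers: [{ availableScale: 1, gpuNumber: 4 }],
};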
21 changes: 21 additions & 0 deletions backend/src/plugins/kube.ts
@@ -27,6 +27,13 @@ export default fp(async (fastify: FastifyInstance) => {
fastify.log.error(e, 'Failed to retrieve current namespace');
}

let saToken;
try {
saToken = await getSAToken();
} catch (e) {
fastify.log.error(e, 'Failed to retrieve Service Account token');
}

let clusterID;
try {
const clusterVersion = await customObjectsApi.getClusterCustomObject(
@@ -67,6 +74,7 @@ export default fp(async (fastify: FastifyInstance) => {
clusterID,
clusterBranding,
rbac,
saToken,
});

// Initialize the watching of resources
@@ -93,3 +101,16 @@ const getCurrentNamespace = async () => {
}
});
};

const getSAToken = async () => {
return new Promise<string>((resolve, reject) => {
if (currentContext === 'inClusterContext') {
fs.readFile('/var/run/secrets/kubernetes.io/serviceaccount/token', (err, data) => {
if (err) {
reject(err);
}
resolve(String(data));
});
}
});
};
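For reference, the getSAToken helper added above reads the token that Kubernetes mounts into every pod at the standard in-cluster path. A standalone sketch of the same idea (a hypothetical helper, not part of the commit), using fs.promises:

import { promises as fs } from 'fs';

const SA_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token';

// Sketch: read the mounted service-account token once and reuse it.
// Outside of a cluster the file does not exist, so return undefined and
// let the caller fall back to kubeconfig credentials.
const readSAToken = async (): Promise<string | undefined> => {
  try {
    return (await fs.readFile(SA_TOKEN_PATH, 'utf8')).trim();
  } catch {
    return undefined;
  }
};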
117 changes: 94 additions & 23 deletions backend/src/routes/api/gpu/gpuUtils.ts
@@ -1,9 +1,26 @@
-import { KubeFastifyInstance, PrometheusResponse } from '../../../types';
+import {
+  MachineAutoscalerList,
+  GPUInfo,
+  KubeFastifyInstance,
+  PrometheusResponse,
+  MachineSet,
+  gpuScale,
+} from '../../../types';
import { V1PodList } from '@kubernetes/client-node';
import https from 'https';
-import * as fs from 'fs';

-export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<[boolean, number]> => {
+/** Storage to prevent heavy calls from being performed for EVERY user */
+const storage: { lastFetch: number; lastValue: GPUInfo } = {
+  lastValue: { available: 0, configured: false, autoscalers: [] },
+  lastFetch: 0,
+};
+
+export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
+  if (storage.lastFetch >= Date.now() - 30_000) {
+    fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`);
+    return storage.lastValue;
+  }
+  fastify.log.info(`Computing GPU state`);
  let maxGpuNumber = 0;
  let areGpusConfigured = false;
  const gpuPodList = await fastify.kube.coreV1Api
@@ -15,38 +32,45 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<[boolean, number]> => {
      fastify.log.error(`Exception when calling DCGM exporter pods: ${e}`);
      return { items: [] } as V1PodList;
    });
+  const scalingLimit = await getGPUScaling(fastify);
  if (gpuPodList.items.length != 0) {
    areGpusConfigured = true;
-    const token = await new Promise<string>((resolve, reject) => {
-      fs.readFile('/var/run/secrets/kubernetes.io/serviceaccount/token', (err, data) => {
-        try {
-          resolve(String(data));
-        } catch {
-          reject('');
-          fastify.log.error(err);
-        }
-      });
-    });
+    const gpuDataResponses = [];
    for (let i = 0; i < gpuPodList.items.length; i++) {
-      const data = await getGPUData(gpuPodList.items[i].status.podIP, token);
-      if (data.code === 200) {
-        const gpuNumber = data.response;
-        if (gpuNumber > maxGpuNumber) {
-          maxGpuNumber = gpuNumber;
-        }
-      } else {
-        fastify.log.warn(`Error getting GPUData ${data.response}`);
-      }
+      gpuDataResponses.push(getGPUData(gpuPodList.items[i].status.podIP, fastify.kube.saToken));
    }
-  }
-  return [areGpusConfigured, maxGpuNumber];
+
+    await Promise.all(gpuDataResponses).then((gpuDataList) => {
+      for (let i = 0; i < gpuDataList.length; i++) {
+        if (gpuDataList[i].code === 200) {
+          const gpuNumber = gpuDataList[i].response;
+          if (gpuNumber > maxGpuNumber) {
+            maxGpuNumber = gpuNumber;
+          }
+        } else {
+          fastify.log.warn(`Error getting GPUData ${gpuDataList[i].response}`);
+        }
+      }
+    });
+  } else if (scalingLimit.length != 0) {
+    areGpusConfigured = true;
+  }
+
+  const data: GPUInfo = {
+    configured: areGpusConfigured,
+    available: maxGpuNumber,
+    autoscalers: scalingLimit,
+  };
+  storage.lastFetch = Date.now();
+  storage.lastValue = data;
+  return data;
};

export const getGPUData = async (
  podIP: string,
  token: string,
): Promise<{ code: number; response: number | any }> => {
-  return await new Promise((resolve, reject) => {
+  return new Promise((resolve, reject) => {
    const options = {
      hostname: 'thanos-querier.openshift-monitoring.svc.cluster.local',
      port: 9091,
@@ -79,3 +103,50 @@ export const getGPUData = async (
httpsRequest.end();
});
};

const getGPUScaling = async (fastify: KubeFastifyInstance): Promise<gpuScale[]> => {
const scalingList: gpuScale[] = [];
const autoscalerList = (
await fastify.kube.customObjectsApi.listNamespacedCustomObject(
'autoscaling.openshift.io',
'v1beta1',
'openshift-machine-api',
'machineautoscalers',
)
).body as MachineAutoscalerList;

const machineSets = [];
for (let i = 0; i < autoscalerList.items.length; i++) {
const machineSetName = autoscalerList.items[i].spec.scaleTargetRef.name; //also gives info about kind and apiversion if needed in the future
machineSets.push(
fastify.kube.customObjectsApi
.getNamespacedCustomObject(
'machine.openshift.io',
'v1beta1',
'openshift-machine-api',
'machinesets',
machineSetName,
)
.catch((e) => {
fastify.log.warn(
`Autoscaler ${autoscalerList.items[i].metadata.name} did not contain MachineSet info. ${e.response.data.message}`,
);
return null;
}),
);
}
await Promise.all(machineSets).then((msList) => {
for (let i = 0; i < msList.length; i++) {
const machineSet = msList[i].body as MachineSet;
const gpuAmount = Number(machineSet?.metadata.annotations?.['machine.openshift.io/GPU']);
if (gpuAmount > 0) {
scalingList.push({
availableScale:
autoscalerList.items[i].spec.maxReplicas - (machineSet.status.availableReplicas || 0),
gpuNumber: gpuAmount,
});
}
}
});
return scalingList;
};
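To make getGPUScaling's arithmetic concrete: for each MachineAutoscaler, the remaining headroom is spec.maxReplicas minus the targeted MachineSet's status.availableReplicas, and the entry only counts when the MachineSet's machine.openshift.io/GPU annotation reports a positive GPU count. A self-contained sketch with hypothetical numbers:

type gpuScale = { availableScale: number; gpuNumber: number };

// Mirrors the fields getGPUScaling reads; the inputs here are made up.
const computeScale = (
  maxReplicas: number,
  availableReplicas: number | undefined,
  gpuAnnotation: string | undefined,
): gpuScale | null => {
  const gpuNumber = Number(gpuAnnotation);
  if (!(gpuNumber > 0)) {
    return null; // CPU-only MachineSet (or missing annotation): ignored
  }
  return { availableScale: maxReplicas - (availableReplicas || 0), gpuNumber };
};

console.log(computeScale(3, 1, '2')); // { availableScale: 2, gpuNumber: 2 } -- two more 2-GPU machines possible
console.log(computeScale(3, 1, undefined)); // null -- no GPU annotation, pool is skipped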
4 changes: 3 additions & 1 deletion backend/src/routes/api/status/statusUtils.ts
@@ -9,7 +9,8 @@ export const status = async (
request: FastifyRequest,
): Promise<{ kube: KubeStatus }> => {
const kubeContext = fastify.kube.currentContext;
-  const { currentContext, namespace, currentUser, clusterID, clusterBranding } = fastify.kube;
+  const { currentContext, namespace, currentUser, clusterID, clusterBranding, saToken } =
+    fastify.kube;

const userName = await getUserName(fastify, request);
const isAdmin = await isUserAdmin(fastify, userName, namespace);
@@ -33,6 +34,7 @@
clusterBranding,
isAdmin,
isAllowed,
saToken,
},
};
}
47 changes: 47 additions & 0 deletions backend/src/types.ts
@@ -185,6 +185,7 @@ export type KubeStatus = {
clusterBranding: string;
isAdmin: boolean;
isAllowed: boolean;
saToken: string;
};

export type KubeDecorator = KubeStatus & {
@@ -660,6 +661,52 @@ export type RecursivePartial<T> = {
[P in keyof T]?: RecursivePartial<T[P]>;
};

export type GPUScaleType = {
type: 'nvidia.com/gpu' | 'amd.com/gpu';
min: number;
max: number;
};

export type MachineAutoscaler = {
spec: {
maxReplicas: number;
minReplicas: number;
scaleTargetRef: {
apiversion: string;
kind: string;
name: string;
};
};
} & K8sResourceCommon;

export type MachineSet = {
status: {
availableReplicas: number;
fullyLabeledReplicas: number;
observedGeneration: number;
readyReplicas: number;
replicas: number;
};
} & K8sResourceCommon;

export type MachineAutoscalerList = {
items: MachineAutoscaler[];
} & K8sResourceCommon;

export type MachineSetList = {
items: MachineSet[];
} & K8sResourceCommon;

export type gpuScale = {
availableScale: number;
gpuNumber: number;
};

export type GPUInfo = {
configured: boolean;
available: number;
autoscalers: gpuScale[];
};
export type EnvironmentVariable = EitherNotBoth<
{ value: string | number },
{ valueFrom: Record<string, unknown> }
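For readers less familiar with the OpenShift machine API, the two resource types above map onto objects like the following (hypothetical, trimmed examples; only the fields the backend reads carry meaningful values):

// Hypothetical MachineAutoscaler: caps an existing MachineSet between 0 and 3 replicas.
const exampleAutoscaler = {
  apiVersion: 'autoscaling.openshift.io/v1beta1',
  kind: 'MachineAutoscaler',
  metadata: { name: 'gpu-autoscaler', namespace: 'openshift-machine-api' },
  spec: {
    minReplicas: 0,
    maxReplicas: 3,
    scaleTargetRef: {
      apiVersion: 'machine.openshift.io/v1beta1',
      kind: 'MachineSet',
      name: 'example-gpu-worker', // hypothetical MachineSet name
    },
  },
};

// Hypothetical MachineSet targeted by the autoscaler above; the GPU annotation and
// status.availableReplicas feed the availableScale calculation in getGPUScaling.
const exampleMachineSet = {
  apiVersion: 'machine.openshift.io/v1beta1',
  kind: 'MachineSet',
  metadata: {
    name: 'example-gpu-worker',
    namespace: 'openshift-machine-api',
    annotations: { 'machine.openshift.io/GPU': '2' },
  },
  status: { availableReplicas: 1, fullyLabeledReplicas: 1, observedGeneration: 1, readyReplicas: 1, replicas: 1 },
};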
18 changes: 14 additions & 4 deletions frontend/src/pages/notebookController/screens/server/GPUSelectField.tsx
@@ -19,13 +19,23 @@ const GPUSelectField: React.FC<GPUSelectFieldProps> = ({ value, setValue }) => {
let lastCall = 0;
let cancelled = false;
const fetchGPU = () => {
setFetching(true);
lastCall = Date.now();
-      return getGPU().then(([areGpusAvailable, size]) => {
+      return getGPU().then((gpuInfo) => {
        if (cancelled) return;
-        setAreGpusAvailable(areGpusAvailable);
-        setGpuSize(size || 0);
+        setGpuSize(gpuInfo.available || 0);
+        setAreGpusAvailable(gpuInfo.configured);
        setFetching(false);
+        let availableScaleableGPU = 0;
+        if (gpuInfo.autoscalers) {
+          availableScaleableGPU = gpuInfo.autoscalers.reduce(
+            (highestValue, { availableScale, gpuNumber }) =>
+              availableScale > 0 ? Math.max(highestValue, gpuNumber) : highestValue,
+            0,
+          );
+        }
+        if (gpuInfo.available < availableScaleableGPU) {
+          setGpuSize(availableScaleableGPU);
+        }
});
};

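Read in isolation, the dropdown sizing added above works as follows: among autoscaler pools that still have headroom (availableScale > 0), take the largest per-machine GPU count, and let it raise the dropdown's maximum if it exceeds the GPUs currently reported as available. A small sketch of that reduce with hypothetical inputs:

type gpuScale = { availableScale: number; gpuNumber: number };

// Same reduce as in the component, extracted for illustration.
const maxScalableGpus = (autoscalers: gpuScale[]): number =>
  autoscalers.reduce(
    (highestValue, { availableScale, gpuNumber }) =>
      availableScale > 0 ? Math.max(highestValue, gpuNumber) : highestValue,
    0,
  );

// The exhausted 8-GPU pool (availableScale 0) is ignored; the 4-GPU pool still has room.
console.log(
  maxScalableGpus([
    { availableScale: 0, gpuNumber: 8 },
    { availableScale: 2, gpuNumber: 4 },
  ]),
); // 4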
3 changes: 2 additions & 1 deletion frontend/src/services/gpuService.ts
@@ -1,6 +1,7 @@
import axios from 'axios';
+import { GPUInfo } from 'types';

-export const getGPU = (): Promise<[boolean, number]> => {
+export const getGPU = (): Promise<GPUInfo> => {
const url = '/api/gpu';
return axios
.get(url)
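A brief usage sketch of the updated service call from component code; the import path and the surrounding function are assumptions for illustration, not part of the commit:

// Hypothetical consumer of the richer /api/gpu payload.
import { getGPU } from '../services/gpuService'; // import path assumed

const logGpuState = async (): Promise<void> => {
  const gpuInfo = await getGPU();
  const poolsWithHeadroom = gpuInfo.autoscalers.filter((a) => a.availableScale > 0);
  console.log(
    `configured: ${gpuInfo.configured}, available now: ${gpuInfo.available}, ` +
      `autoscaler pools with headroom: ${poolsWithHeadroom.length}`,
  );
};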
11 changes: 11 additions & 0 deletions frontend/src/types.ts
@@ -644,3 +644,14 @@ export type NotebookData = {
};

export type UsernameMap<V> = { [username: string]: V };

export type gpuScale = {
availableScale: number;
gpuNumber: number;
};

export type GPUInfo = {
configured: boolean;
available: number;
autoscalers: gpuScale[];
};
9 changes: 9 additions & 0 deletions manifests/base/cluster-role.yaml
@@ -3,6 +3,15 @@ apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: odh-dashboard
rules:
- verbs:
- get
- list
apiGroups:
- machine.openshift.io
- autoscaling.openshift.io
resources:
- machineautoscalers
- machinesets
- verbs:
- get
- watch
