Add autoscale check for GPU (opendatahub-io#573)
* Add autoscale check

* Clean after rebase

* Add scalable gpus to dropdown count

* Change code to use MachineAutoscalers

* Use machineautoscalers, add logic to frontend

* check available machine replicas in max scalable calculation

* Add role change and fix availability check

* Fix machineSet get logic

* Update frontend/src/pages/notebookController/screens/server/GPUSelectField.tsx

Use the value from the reduce

* Update backend/src/routes/api/gpu/gpuUtils.ts

* Update backend/src/routes/api/gpu/gpuUtils.ts

* Update frontend/src/pages/notebookController/screens/server/GPUSelectField.tsx

* add saToken to kube, optimise gpu data getting

Co-authored-by: Andrew Ballantyne <8126518+andrewballantyne@users.noreply.github.com>
2 people authored and strangiato committed Oct 18, 2023
1 parent 4c3e6af commit 4df427f
Showing 8 changed files with 201 additions and 29 deletions.
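For orientation before the file-by-file diff: the net effect of the change is that the backend's /api/gpu endpoint stops returning a [boolean, number] tuple and instead returns a structured GPUInfo object that the frontend uses to size the GPU dropdown. A minimal sketch of that shape, with illustrative values only (not taken from the commit):

type gpuScale = { availableScale: number; gpuNumber: number };
type GPUInfo = { configured: boolean; available: number; autoscalers: gpuScale[] };

// Hypothetical response: GPUs are configured, one DCGM exporter pod currently reports 2 GPUs,
// and one autoscaled MachineSet with 4 GPUs per machine can still add a machine.
const exampleResponse: GPUInfo = {
  configured: true,
  available: 2,
  autoscalers: [{ availableScale: 1, gpuNumber: 4 }],
};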
21 changes: 21 additions & 0 deletions backend/src/plugins/kube.ts
@@ -27,6 +27,13 @@ export default fp(async (fastify: FastifyInstance) => {
fastify.log.error(e, 'Failed to retrieve current namespace');
}

let saToken;
try {
saToken = await getSAToken();
} catch (e) {
fastify.log.error(e, 'Failed to retrieve Service Account token');
}

let clusterID;
try {
const clusterVersion = await customObjectsApi.getClusterCustomObject(
@@ -67,6 +74,7 @@ export default fp(async (fastify: FastifyInstance) => {
clusterID,
clusterBranding,
rbac,
saToken,
});

// Initialize the watching of resources
@@ -93,3 +101,16 @@ const getCurrentNamespace = async () => {
}
});
};

const getSAToken = async () => {
return new Promise<string>((resolve, reject) => {
if (currentContext === 'inClusterContext') {
fs.readFile('/var/run/secrets/kubernetes.io/serviceaccount/token', (err, data) => {
if (err) {
reject(err);
}
resolve(String(data));
});
}
});
};
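For reference, the getSAToken helper added above reads the token that Kubernetes mounts into every pod at the standard in-cluster path. A standalone sketch of the same idea (a hypothetical helper, not part of the commit), using fs.promises:

import { promises as fs } from 'fs';

const SA_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token';

// Sketch: read the mounted service-account token once and reuse it.
// Outside of a cluster the file does not exist, so return undefined and
// let the caller fall back to kubeconfig credentials.
const readSAToken = async (): Promise<string | undefined> => {
  try {
    return (await fs.readFile(SA_TOKEN_PATH, 'utf8')).trim();
  } catch {
    return undefined;
  }
};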
117 changes: 94 additions & 23 deletions backend/src/routes/api/gpu/gpuUtils.ts
@@ -1,9 +1,26 @@
-import { KubeFastifyInstance, PrometheusResponse } from '../../../types';
+import {
+  MachineAutoscalerList,
+  GPUInfo,
+  KubeFastifyInstance,
+  PrometheusResponse,
+  MachineSet,
+  gpuScale,
+} from '../../../types';
import { V1PodList } from '@kubernetes/client-node';
import https from 'https';
-import * as fs from 'fs';

-export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<[boolean, number]> => {
+/** Storage to prevent heavy calls from being performed for EVERY user */
+const storage: { lastFetch: number; lastValue: GPUInfo } = {
+  lastValue: { available: 0, configured: false, autoscalers: [] },
+  lastFetch: 0,
+};
+
+export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<GPUInfo> => {
+  if (storage.lastFetch >= Date.now() - 30_000) {
+    fastify.log.info(`Returning cached gpu value (${JSON.stringify(storage)})`);
+    return storage.lastValue;
+  }
+  fastify.log.info(`Computing GPU state`);
  let maxGpuNumber = 0;
  let areGpusConfigured = false;
  const gpuPodList = await fastify.kube.coreV1Api
@@ -15,38 +32,45 @@ export const getGPUNumber = async (fastify: KubeFastifyInstance): Promise<[boolean, number]> => {
      fastify.log.error(`Exception when calling DCGM exporter pods: ${e}`);
      return { items: [] } as V1PodList;
    });
+  const scalingLimit = await getGPUScaling(fastify);
  if (gpuPodList.items.length != 0) {
    areGpusConfigured = true;
-    const token = await new Promise<string>((resolve, reject) => {
-      fs.readFile('/var/run/secrets/kubernetes.io/serviceaccount/token', (err, data) => {
-        try {
-          resolve(String(data));
-        } catch {
-          reject('');
-          fastify.log.error(err);
-        }
-      });
-    });
+    const gpuDataResponses = [];
    for (let i = 0; i < gpuPodList.items.length; i++) {
-      const data = await getGPUData(gpuPodList.items[i].status.podIP, token);
-      if (data.code === 200) {
-        const gpuNumber = data.response;
-        if (gpuNumber > maxGpuNumber) {
-          maxGpuNumber = gpuNumber;
-        }
-      } else {
-        fastify.log.warn(`Error getting GPUData ${data.response}`);
-      }
+      gpuDataResponses.push(getGPUData(gpuPodList.items[i].status.podIP, fastify.kube.saToken));
    }
-  }
-  return [areGpusConfigured, maxGpuNumber];
+
+    await Promise.all(gpuDataResponses).then((gpuDataList) => {
+      for (let i = 0; i < gpuDataList.length; i++) {
+        if (gpuDataList[i].code === 200) {
+          const gpuNumber = gpuDataList[i].response;
+          if (gpuNumber > maxGpuNumber) {
+            maxGpuNumber = gpuNumber;
+          }
+        } else {
+          fastify.log.warn(`Error getting GPUData ${gpuDataList[i].response}`);
+        }
+      }
+    });
+  } else if (scalingLimit.length != 0) {
+    areGpusConfigured = true;
+  }
+
+  const data: GPUInfo = {
+    configured: areGpusConfigured,
+    available: maxGpuNumber,
+    autoscalers: scalingLimit,
+  };
+  storage.lastFetch = Date.now();
+  storage.lastValue = data;
+  return data;
};

export const getGPUData = async (
  podIP: string,
  token: string,
): Promise<{ code: number; response: number | any }> => {
-  return await new Promise((resolve, reject) => {
+  return new Promise((resolve, reject) => {
    const options = {
      hostname: 'thanos-querier.openshift-monitoring.svc.cluster.local',
      port: 9091,
@@ -79,3 +103,50 @@ export const getGPUData = async (
httpsRequest.end();
});
};

const getGPUScaling = async (fastify: KubeFastifyInstance): Promise<gpuScale[]> => {
const scalingList: gpuScale[] = [];
const autoscalerList = (
await fastify.kube.customObjectsApi.listNamespacedCustomObject(
'autoscaling.openshift.io',
'v1beta1',
'openshift-machine-api',
'machineautoscalers',
)
).body as MachineAutoscalerList;

const machineSets = [];
for (let i = 0; i < autoscalerList.items.length; i++) {
const machineSetName = autoscalerList.items[i].spec.scaleTargetRef.name; //also gives info about kind and apiversion if needed in the future
machineSets.push(
fastify.kube.customObjectsApi
.getNamespacedCustomObject(
'machine.openshift.io',
'v1beta1',
'openshift-machine-api',
'machinesets',
machineSetName,
)
.catch((e) => {
fastify.log.warn(
`Autoscaler ${autoscalerList.items[i].metadata.name} did not contain MachineSet info. ${e.response.data.message}`,
);
return null;
}),
);
}
await Promise.all(machineSets).then((msList) => {
for (let i = 0; i < msList.length; i++) {
const machineSet = msList[i].body as MachineSet;
const gpuAmount = Number(machineSet?.metadata.annotations?.['machine.openshift.io/GPU']);
if (gpuAmount > 0) {
scalingList.push({
availableScale:
autoscalerList.items[i].spec.maxReplicas - (machineSet.status.availableReplicas || 0),
gpuNumber: gpuAmount,
});
}
}
});
return scalingList;
};
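To make getGPUScaling's arithmetic concrete: for each MachineAutoscaler, the remaining headroom is spec.maxReplicas minus the targeted MachineSet's status.availableReplicas, and the entry only counts when the MachineSet's machine.openshift.io/GPU annotation reports a positive GPU count. A self-contained sketch with hypothetical numbers:

type gpuScale = { availableScale: number; gpuNumber: number };

// Mirrors the fields getGPUScaling reads; the inputs here are made up.
const computeScale = (
  maxReplicas: number,
  availableReplicas: number | undefined,
  gpuAnnotation: string | undefined,
): gpuScale | null => {
  const gpuNumber = Number(gpuAnnotation);
  if (!(gpuNumber > 0)) {
    return null; // CPU-only MachineSet (or missing annotation): ignored
  }
  return { availableScale: maxReplicas - (availableReplicas || 0), gpuNumber };
};

console.log(computeScale(3, 1, '2')); // { availableScale: 2, gpuNumber: 2 } -- two more 2-GPU machines possible
console.log(computeScale(3, 1, undefined)); // null -- no GPU annotation, pool is skipped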
4 changes: 3 additions & 1 deletion backend/src/routes/api/status/statusUtils.ts
@@ -9,7 +9,8 @@ export const status = async (
request: FastifyRequest,
): Promise<{ kube: KubeStatus }> => {
const kubeContext = fastify.kube.currentContext;
-  const { currentContext, namespace, currentUser, clusterID, clusterBranding } = fastify.kube;
+  const { currentContext, namespace, currentUser, clusterID, clusterBranding, saToken } =
+    fastify.kube;

const userName = await getUserName(fastify, request);
const isAdmin = await isUserAdmin(fastify, userName, namespace);
@@ -33,6 +34,7 @@
clusterBranding,
isAdmin,
isAllowed,
saToken,
},
};
}
47 changes: 47 additions & 0 deletions backend/src/types.ts
@@ -185,6 +185,7 @@ export type KubeStatus = {
clusterBranding: string;
isAdmin: boolean;
isAllowed: boolean;
saToken: string;
};

export type KubeDecorator = KubeStatus & {
@@ -660,6 +661,52 @@ export type RecursivePartial<T> = {
[P in keyof T]?: RecursivePartial<T[P]>;
};

export type GPUScaleType = {
type: 'nvidia.com/gpu' | 'amd.com/gpu';
min: number;
max: number;
};

export type MachineAutoscaler = {
spec: {
maxReplicas: number;
minReplicas: number;
scaleTargetRef: {
apiversion: string;
kind: string;
name: string;
};
};
} & K8sResourceCommon;

export type MachineSet = {
status: {
availableReplicas: number;
fullyLabeledReplicas: number;
observedGeneration: number;
readyReplicas: number;
replicas: number;
};
} & K8sResourceCommon;

export type MachineAutoscalerList = {
items: MachineAutoscaler[];
} & K8sResourceCommon;

export type MachineSetList = {
items: MachineSet[];
} & K8sResourceCommon;

export type gpuScale = {
availableScale: number;
gpuNumber: number;
};

export type GPUInfo = {
configured: boolean;
available: number;
autoscalers: gpuScale[];
};
export type EnvironmentVariable = EitherNotBoth<
{ value: string | number },
{ valueFrom: Record<string, unknown> }
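For readers less familiar with the OpenShift machine API, the two resource types above map onto objects like the following (hypothetical, trimmed examples; only the fields the backend reads carry meaningful values):

// Hypothetical MachineAutoscaler: caps an existing MachineSet between 0 and 3 replicas.
const exampleAutoscaler = {
  apiVersion: 'autoscaling.openshift.io/v1beta1',
  kind: 'MachineAutoscaler',
  metadata: { name: 'gpu-autoscaler', namespace: 'openshift-machine-api' },
  spec: {
    minReplicas: 0,
    maxReplicas: 3,
    scaleTargetRef: {
      apiVersion: 'machine.openshift.io/v1beta1',
      kind: 'MachineSet',
      name: 'example-gpu-worker', // hypothetical MachineSet name
    },
  },
};

// Hypothetical MachineSet targeted by the autoscaler above; the GPU annotation and
// status.availableReplicas feed the availableScale calculation in getGPUScaling.
const exampleMachineSet = {
  apiVersion: 'machine.openshift.io/v1beta1',
  kind: 'MachineSet',
  metadata: {
    name: 'example-gpu-worker',
    namespace: 'openshift-machine-api',
    annotations: { 'machine.openshift.io/GPU': '2' },
  },
  status: { availableReplicas: 1, fullyLabeledReplicas: 1, observedGeneration: 1, readyReplicas: 1, replicas: 1 },
};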
18 changes: 14 additions & 4 deletions frontend/src/pages/notebookController/screens/server/GPUSelectField.tsx
@@ -19,13 +19,23 @@ const GPUSelectField: React.FC<GPUSelectFieldProps> = ({ value, setValue }) => {
let lastCall = 0;
let cancelled = false;
const fetchGPU = () => {
setFetching(true);
lastCall = Date.now();
-      return getGPU().then(([areGpusAvailable, size]) => {
+      return getGPU().then((gpuInfo) => {
        if (cancelled) return;
-        setAreGpusAvailable(areGpusAvailable);
-        setGpuSize(size || 0);
+        setGpuSize(gpuInfo.available || 0);
+        setAreGpusAvailable(gpuInfo.configured);
        setFetching(false);
+        let availableScaleableGPU = 0;
+        if (gpuInfo.autoscalers) {
+          availableScaleableGPU = gpuInfo.autoscalers.reduce(
+            (highestValue, { availableScale, gpuNumber }) =>
+              availableScale > 0 ? Math.max(highestValue, gpuNumber) : highestValue,
+            0,
+          );
+        }
+        if (gpuInfo.available < availableScaleableGPU) {
+          setGpuSize(availableScaleableGPU);
+        }
});
};

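Read in isolation, the dropdown sizing added above works as follows: among autoscaler pools that still have headroom (availableScale > 0), take the largest per-machine GPU count, and let it raise the dropdown's maximum if it exceeds the GPUs currently reported as available. A small sketch of that reduce with hypothetical inputs:

type gpuScale = { availableScale: number; gpuNumber: number };

// Same reduce as in the component, extracted for illustration.
const maxScalableGpus = (autoscalers: gpuScale[]): number =>
  autoscalers.reduce(
    (highestValue, { availableScale, gpuNumber }) =>
      availableScale > 0 ? Math.max(highestValue, gpuNumber) : highestValue,
    0,
  );

// The exhausted 8-GPU pool (availableScale 0) is ignored; the 4-GPU pool still has room.
console.log(
  maxScalableGpus([
    { availableScale: 0, gpuNumber: 8 },
    { availableScale: 2, gpuNumber: 4 },
  ]),
); // 4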
3 changes: 2 additions & 1 deletion frontend/src/services/gpuService.ts
@@ -1,6 +1,7 @@
import axios from 'axios';
+import { GPUInfo } from 'types';

-export const getGPU = (): Promise<[boolean, number]> => {
+export const getGPU = (): Promise<GPUInfo> => {
const url = '/api/gpu';
return axios
.get(url)
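A brief usage sketch of the updated service call from component code; the import path and the surrounding function are assumptions for illustration, not part of the commit:

// Hypothetical consumer of the richer /api/gpu payload.
import { getGPU } from '../services/gpuService'; // import path assumed

const logGpuState = async (): Promise<void> => {
  const gpuInfo = await getGPU();
  const poolsWithHeadroom = gpuInfo.autoscalers.filter((a) => a.availableScale > 0);
  console.log(
    `configured: ${gpuInfo.configured}, available now: ${gpuInfo.available}, ` +
      `autoscaler pools with headroom: ${poolsWithHeadroom.length}`,
  );
};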
11 changes: 11 additions & 0 deletions frontend/src/types.ts
@@ -644,3 +644,14 @@ export type NotebookData = {
};

export type UsernameMap<V> = { [username: string]: V };

export type gpuScale = {
availableScale: number;
gpuNumber: number;
};

export type GPUInfo = {
configured: boolean;
available: number;
autoscalers: gpuScale[];
};
9 changes: 9 additions & 0 deletions manifests/base/cluster-role.yaml
@@ -3,6 +3,15 @@ apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: odh-dashboard
rules:
- verbs:
- get
- list
apiGroups:
- machine.openshift.io
- autoscaling.openshift.io
resources:
- machineautoscalers
- machinesets
- verbs:
- get
- watch
