Skip to content

Commit

Permalink
Kind gpu support in controller (#211)
Browse files Browse the repository at this point in the history
I verified it's working by using the following server manifest:
```
apiVersion: substratus.ai/v1
kind: Server
metadata:
  name: falcon-7b-instruct
spec:
  build:
    git:
      url: https://github.com/substratusai/images
      path: model-server-basaran
      branch: basaran-4bit-mode
        #image: substratusai/model-server-basaran
  model:
    name: falcon-7b-instruct
  resources:
    gpu:
      count: 1
```

Note that I leave out the GPU type. This is intentional, because everyone
might be running a different GPU locally.
  • Loading branch information
samos123 committed Aug 23, 2023
1 parent ea0270f commit 8361996
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 7 deletions.
5 changes: 2 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,8 @@ $(PROTOC): $(LOCALBIN)
skaffold: $(SKAFFOLD)
$(SKAFFOLD): $(LOCALBIN)
@ test -s $(LOCALBIN)/skaffold || \
curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-$(SKAFFOLD_PLATFORM) && \
chmod +x skaffold && \
mv skaffold $(LOCALBIN)/skaffold
curl -Lo $(LOCALBIN)/skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-$(SKAFFOLD_PLATFORM) && \
chmod +x $(LOCALBIN)/skaffold

.PHONY: envsubst
envsubst:
Expand Down
1 change: 0 additions & 1 deletion config/sci/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ spec:
containers:
- name: sci
image: sci
imagePullPolicy: Always
envFrom:
- configMapRef:
name: system
Expand Down
11 changes: 11 additions & 0 deletions internal/resources/gpu_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@ type GPUInfo struct {
NodeSelector map[string]string
}

// GetGPUInfo looks up the GPU scheduling details for the given cloud and GPU type.
//
// For the kind cloud the gpuType argument is not consulted: a GPUInfo using the
// "nvidia.com/gpu" resource name and an empty node selector is always returned
// together with true. For every other cloud the result comes from the cloudGPUs
// table, and the boolean reports whether an entry exists for that cloud/type pair.
func GetGPUInfo(cloudName string, gpuType apiv1.GPUType) (*GPUInfo, bool) {
	if cloudName != cloud.KindName {
		// Indexing a missing cloud yields a nil inner map, so the second
		// lookup simply reports not-found instead of panicking.
		info, found := cloudGPUs[cloudName][gpuType]
		return info, found
	}

	// A fresh value is built on every call, so callers never share the
	// NodeSelector map.
	kindInfo := &GPUInfo{
		ResourceName: corev1.ResourceName("nvidia.com/gpu"),
		NodeSelector: map[string]string{},
	}
	return kindInfo, true
}

var cloudGPUs = map[string]map[apiv1.GPUType]*GPUInfo{
cloud.GCPName: {
// https://cloud.google.com/compute/docs/gpus#nvidia_t4_gpus
Expand Down
4 changes: 1 addition & 3 deletions internal/resources/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func Apply(podMetadata *metav1.ObjectMeta, podSpec *corev1.PodSpec, containerNam
resources.Requests[corev1.ResourceMemory] = *resource.NewQuantity(res.Memory*gigabyte, resource.BinarySI)

if res.GPU != nil {
gpuInfo, ok := cloudGPUs[cloudName][res.GPU.Type]
gpuInfo, ok := GetGPUInfo(cloudName, res.GPU.Type)
if !ok {
return fmt.Errorf("GPU %s is not supported on cloud %s", res.GPU.Type, cloudName)
}
Expand All @@ -47,9 +47,7 @@ func Apply(podMetadata *metav1.ObjectMeta, podSpec *corev1.PodSpec, containerNam
podSpec.NodeSelector = map[string]string{}
}

// TODO: Make spot configurable.
// TODO: Move this GCP code into cloud-specific configuration.
podSpec.NodeSelector["cloud.google.com/gke-spot"] = "true"
// Toleration is needed to trigger NAP
// https://cloud.google.com/kubernetes-engine/docs/how-to/node-auto-provisioning#support_for_spot_vms
podSpec.Tolerations = append(podSpec.Tolerations, corev1.Toleration{
Expand Down
3 changes: 3 additions & 0 deletions skaffold.kind.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ build:
dockerfile: Dockerfile.sci-kind
local:
push: false
deploy:
kubectl:
defaultNamespace: substratus

0 comments on commit 8361996

Please sign in to comment.