Skip to content

Commit

Permalink
A100 40GB Support (#165)
Browse files Browse the repository at this point in the history
* NAP does not support A100 yet
* Removes the GPUInfo.Memory field since it's no longer needed
* Rename GPUTypeNvidiaTeslaT4 to GPUTypeNvidiaT4 and `nvidia-tesla-t4`
to `nvidia-t4`

Potential issue: changing the example to A100 might cause problems due to
frequent A100 stockouts. We can switch the example back to L4 if needed.
  • Loading branch information
samos123 committed Aug 3, 2023
1 parent ddd3606 commit 578f58f
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 8 deletions.
5 changes: 3 additions & 2 deletions api/v1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,9 @@ type Resources struct {
type GPUType string

const (
GPUTypeNvidiaTeslaT4 = GPUType("nvidia-tesla-t4")
GPUTypeNvidiaL4 = GPUType("nvidia-l4")
GPUTypeNvidiaA100 = GPUType("nvidia-a100")
GPUTypeNvidiaT4 = GPUType("nvidia-t4")
GPUTypeNvidiaL4 = GPUType("nvidia-l4")
)

type GPUResources struct {
Expand Down
4 changes: 2 additions & 2 deletions examples/falcon-7b-instruct/finetuned-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ spec:
save_steps: 5
resources:
gpu:
count: 4
type: nvidia-l4
count: 1
type: nvidia-a100
155 changes: 155 additions & 0 deletions install/terraform/gcp/cluster_node_pools_gpu_a100.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
locals {
a100_locations = {
"asia-northeast1" = ["asia-northeast1-a", "asia-northeast1-c"]
"asia-northeast3" = ["asia-northeast3-a", "asia-northeast3-b"]
"asia-southeast1" = ["asia-southeast1-b", "asia-southeast1-c"]
"europe-west4" = ["europe-west4-a", "europe-west4-b"]
"me-west1" = ["me-west1-b", "me-west1-c"]
"us-central1" = ["us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"]
"us-east1" = ["us-east1-b"]
"us-west1" = ["us-west1-b"]
"us-west3" = ["us-west3-b"]
"us-west4" = ["us-west4-b"]
}
}

resource "google_container_node_pool" "a2-highgpu-1g" {
name = "a2-highgpu-1g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-1g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}


resource "google_container_node_pool" "a2-highgpu-2g" {
name = "a2-highgpu-2g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-2g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}

resource "google_container_node_pool" "a2-highgpu-4g" {
name = "a2-highgpu-4g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-4g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}

resource "google_container_node_pool" "a2-highgpu-8g" {
name = "a2-highgpu-8g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-8g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}
11 changes: 7 additions & 4 deletions internal/resources/gpu_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,31 @@ import (
)

type GPUInfo struct {
Memory int64
ResourceName corev1.ResourceName
NodeSelector map[string]string
}

var cloudGPUs = map[string]map[apiv1.GPUType]*GPUInfo{
cloud.GCPName: {
// https://cloud.google.com/compute/docs/gpus#nvidia_t4_gpus
apiv1.GPUTypeNvidiaTeslaT4: {
Memory: 16 * gigabyte,
apiv1.GPUTypeNvidiaT4: {
ResourceName: corev1.ResourceName("nvidia.com/gpu"),
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-tesla-t4",
},
},
// https://cloud.google.com/compute/docs/gpus#l4-gpus
apiv1.GPUTypeNvidiaL4: {
Memory: 24 * gigabyte,
ResourceName: corev1.ResourceName("nvidia.com/gpu"),
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-l4",
},
},
apiv1.GPUTypeNvidiaA100: {
ResourceName: corev1.ResourceName("nvidia.com/gpu"),
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-tesla-a100",
},
},
},
}

0 comments on commit 578f58f

Please sign in to comment.