Skip to content

Commit

Permalink
A100 40GB Support (#165)
Browse files Browse the repository at this point in the history
* NAP does not support A100 yet
* Removes the GPUInfo.Memory field since it's no longer needed
* Rename GPUTypeNvidiaTeslaT4 to GPUTypeNvidiaT4 and `nvidia-tesla-t4`
to `nvidia-t4`

Potential issue: changing the example to A100 might cause problems due to
frequent A100 stockouts. We can switch the example back to L4 if needed.
  • Loading branch information
samos123 committed Aug 3, 2023
1 parent ddd3606 commit 578f58f
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 8 deletions.
5 changes: 3 additions & 2 deletions api/v1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,9 @@ type Resources struct {
type GPUType string

const (
GPUTypeNvidiaTeslaT4 = GPUType("nvidia-tesla-t4")
GPUTypeNvidiaL4 = GPUType("nvidia-l4")
GPUTypeNvidiaA100 = GPUType("nvidia-a100")
GPUTypeNvidiaT4 = GPUType("nvidia-t4")
GPUTypeNvidiaL4 = GPUType("nvidia-l4")
)

type GPUResources struct {
Expand Down
4 changes: 2 additions & 2 deletions examples/falcon-7b-instruct/finetuned-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ spec:
save_steps: 5
resources:
gpu:
count: 4
type: nvidia-l4
count: 1
type: nvidia-a100
155 changes: 155 additions & 0 deletions install/terraform/gcp/cluster_node_pools_gpu_a100.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
locals {
a100_locations = {
"asia-northeast1" = ["asia-northeast1-a", "asia-northeast1-c"]
"asia-northeast3" = ["asia-northeast3-a", "asia-northeast3-b"]
"asia-southeast1" = ["asia-southeast1-b", "asia-southeast1-c"]
"europe-west4" = ["europe-west4-a", "europe-west4-b"]
"me-west1" = ["me-west1-b", "me-west1-c"]
"us-central1" = ["us-central1-a", "us-central1-b", "us-central1-c", "us-central1-f"]
"us-east1" = ["us-east1-b"]
"us-west1" = ["us-west1-b"]
"us-west3" = ["us-west3-b"]
"us-west4" = ["us-west4-b"]
}
}

resource "google_container_node_pool" "a2-highgpu-1g" {
name = "a2-highgpu-1g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-1g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}


resource "google_container_node_pool" "a2-highgpu-2g" {
name = "a2-highgpu-2g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-2g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}

resource "google_container_node_pool" "a2-highgpu-4g" {
name = "a2-highgpu-4g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-4g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}

resource "google_container_node_pool" "a2-highgpu-8g" {
name = "a2-highgpu-8g"
count = var.attach_gpu_nodepools ? 1 : 0

cluster = google_container_cluster.main.id
initial_node_count = 0
node_locations = local.a100_locations[var.region]

autoscaling {
min_node_count = 0
max_node_count = 3
location_policy = "ANY"
}
management {
auto_repair = true
auto_upgrade = true
}

node_config {
spot = true
machine_type = "a2-highgpu-8g"
ephemeral_storage_local_ssd_config {
local_ssd_count = 1
}
gcfs_config {
enabled = true
}
}
lifecycle {
ignore_changes = [
initial_node_count
]
}
}
11 changes: 7 additions & 4 deletions internal/resources/gpu_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,31 @@ import (
)

type GPUInfo struct {
Memory int64
ResourceName corev1.ResourceName
NodeSelector map[string]string
}

var cloudGPUs = map[string]map[apiv1.GPUType]*GPUInfo{
cloud.GCPName: {
// https://cloud.google.com/compute/docs/gpus#nvidia_t4_gpus
apiv1.GPUTypeNvidiaTeslaT4: {
Memory: 16 * gigabyte,
apiv1.GPUTypeNvidiaT4: {
ResourceName: corev1.ResourceName("nvidia.com/gpu"),
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-tesla-t4",
},
},
// https://cloud.google.com/compute/docs/gpus#l4-gpus
apiv1.GPUTypeNvidiaL4: {
Memory: 24 * gigabyte,
ResourceName: corev1.ResourceName("nvidia.com/gpu"),
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-l4",
},
},
apiv1.GPUTypeNvidiaA100: {
ResourceName: corev1.ResourceName("nvidia.com/gpu"),
NodeSelector: map[string]string{
"cloud.google.com/gke-accelerator": "nvidia-tesla-a100",
},
},
},
}

0 comments on commit 578f58f

Please sign in to comment.