From 59a9980b8234284ca65411f1faeb2120ceb82bf5 Mon Sep 17 00:00:00 2001 From: ilyee <493647673@qq.com> Date: Thu, 25 Feb 2021 15:24:28 +0800 Subject: [PATCH] support entire mount - an entire mount pod has only one slave pod with all gpus attached to it - make an entire mount by setting param is_entire_mount=true - an entire mount pod is not able to mount more gpus before unmounting - unmounting an entire mount pod will deallocate all gpus --- pkg/api/gpu-mount/api.pb.go | 61 ++++++++++++++---------- pkg/api/gpu-mount/api.proto | 1 + pkg/server/gpu-mount/server.go | 13 ++++- pkg/util/gpu/allocator/allocator.go | 43 ++++++++++++++--- pkg/util/gpu/allocator/allocator_test.go | 3 +- pkg/util/gpu/collector/collector.go | 5 +- pkg/util/util.go | 3 +- 7 files changed, 92 insertions(+), 37 deletions(-) diff --git a/pkg/api/gpu-mount/api.pb.go b/pkg/api/gpu-mount/api.pb.go index 9ee6dee..8505efe 100644 --- a/pkg/api/gpu-mount/api.pb.go +++ b/pkg/api/gpu-mount/api.pb.go @@ -87,6 +87,7 @@ type AddGPURequest struct { PodName string `protobuf:"bytes,1,opt,name=pod_name,json=podName,proto3" json:"pod_name,omitempty"` Namespace string `protobuf:"bytes,2,opt,name=namespace,proto3" json:"namespace,omitempty"` GpuNum int32 `protobuf:"varint,3,opt,name=gpu_num,json=gpuNum,proto3" json:"gpu_num,omitempty"` + IsEntireMount bool `protobuf:"varint,4,opt,name=is_entire_mount,json=isEntireMount,proto3" json:"is_entire_mount,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` XXX_unrecognized []byte `json:"-"` XXX_sizecache int32 `json:"-"` @@ -138,6 +139,13 @@ func (m *AddGPURequest) GetGpuNum() int32 { return 0 } +func (m *AddGPURequest) GetIsEntireMount() bool { + if m != nil { + return m.IsEntireMount + } + return false +} + type AddGPUResponse struct { AddGpuResult AddGPUResponse_AddGPUResult `protobuf:"varint,1,opt,name=add_gpu_result,json=addGpuResult,proto3,enum=gpu_mount.AddGPUResponse_AddGPUResult" json:"add_gpu_result,omitempty"` XXX_NoUnkeyedLiteral struct{} `json:"-"` @@ -291,32 +299,33 @@ func
init() { func init() { proto.RegisterFile("api.proto", fileDescriptor_00212fb1f9d3bf1c) } var fileDescriptor_00212fb1f9d3bf1c = []byte{ - // 386 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x93, 0xcf, 0x4e, 0x83, 0x40, - 0x10, 0x87, 0x4b, 0xff, 0x51, 0xa6, 0xb5, 0xd0, 0xd5, 0x44, 0xaa, 0x3d, 0x10, 0x0e, 0x86, 0x83, - 0xe9, 0xa1, 0x3e, 0x80, 0xa9, 0x07, 0xd1, 0x44, 0x1b, 0x42, 0x43, 0xe2, 0xc1, 0xa4, 0x41, 0x76, - 0xdb, 0x90, 0x14, 0x76, 0x65, 0xd9, 0x1a, 0x1f, 0xc7, 0xf7, 0xf0, 0xe1, 0x0c, 0x60, 0x5b, 0x6a, - 0xab, 0x27, 0x8f, 0xf3, 0xed, 0xce, 0x6f, 0xc8, 0x37, 0x2c, 0x28, 0x3e, 0x0b, 0x87, 0x2c, 0xa1, - 0x29, 0x45, 0xca, 0x82, 0x89, 0x59, 0x44, 0x45, 0x9c, 0x9a, 0x3e, 0x1c, 0x8d, 0x31, 0xb6, 0x1d, - 0xcf, 0x25, 0xaf, 0x82, 0xf0, 0x14, 0xf5, 0xa1, 0xc5, 0x28, 0x9e, 0xc5, 0x7e, 0x44, 0x74, 0xc9, - 0x90, 0x2c, 0xc5, 0x95, 0x19, 0xc5, 0x13, 0x3f, 0x22, 0x68, 0x00, 0x4a, 0x86, 0x39, 0xf3, 0x03, - 0xa2, 0x57, 0xf3, 0xb3, 0x2d, 0x40, 0xa7, 0x20, 0x67, 0xb1, 0xb1, 0x88, 0xf4, 0x9a, 0x21, 0x59, - 0x0d, 0xb7, 0xb9, 0x60, 0x62, 0x22, 0x22, 0xf3, 0x43, 0x82, 0xee, 0x7a, 0x06, 0x67, 0x34, 0xe6, - 0x04, 0x3d, 0x40, 0xd7, 0xc7, 0x78, 0x96, 0xdd, 0x4f, 0x08, 0x17, 0xcb, 0x34, 0x1f, 0xd5, 0x1d, - 0x5d, 0x0c, 0x37, 0x5f, 0x36, 0xdc, 0x6d, 0xd9, 0x96, 0x62, 0x99, 0xba, 0x1d, 0x1f, 0x63, 0x9b, - 0x89, 0xa2, 0x32, 0xc7, 0xd0, 0x29, 0x9f, 0xa2, 0x36, 0xc8, 0x53, 0x11, 0x04, 0x84, 0x73, 0xad, - 0x82, 0x8e, 0x41, 0xbd, 0x8f, 0xb9, 0x98, 0xcf, 0xc3, 0x20, 0x24, 0x71, 0x6a, 0x3b, 0x9e, 0x26, - 0x21, 0x15, 0xda, 0x0e, 0xc5, 0x13, 0x9a, 0xde, 0x52, 0x11, 0x63, 0xad, 0x6a, 0xbe, 0x81, 0xe6, - 0x92, 0x88, 0xae, 0xc8, 0x7f, 0x98, 0x38, 0x81, 0x86, 0x10, 0x21, 0xe6, 0x7a, 0xcd, 0xa8, 0x59, - 0x8a, 0x5b, 0x14, 0x19, 0x9d, 0xd3, 0x24, 0x20, 0x7a, 0xdd, 0x90, 0xac, 0x96, 0x5b, 0x14, 0xe6, - 0xa7, 0x04, 0xbd, 0xd2, 0xe4, 0x6f, 0x3f, 0x4f, 0xd0, 0x4b, 0x72, 0xb8, 0xaf, 0xe8, 0xb2, 0xa4, - 0x68, 0xaf, 0x71, 0x87, 0x64, 0xa2, 0xd4, 0x22, 
0x66, 0xeb, 0xea, 0x11, 0xd4, 0x1f, 0x77, 0x76, - 0x75, 0xb5, 0x41, 0xb6, 0x1d, 0xef, 0x46, 0xf0, 0xf7, 0x03, 0x9a, 0x32, 0x60, 0x3b, 0xde, 0x06, - 0xd4, 0x47, 0xce, 0xfa, 0xf7, 0x99, 0x92, 0x64, 0x15, 0x06, 0x04, 0x5d, 0x43, 0xb3, 0x00, 0x48, - 0x3f, 0xb0, 0xcb, 0x5c, 0xec, 0x59, 0xff, 0xd7, 0x2d, 0x9b, 0x95, 0xd1, 0x73, 0x69, 0x13, 0xeb, - 0xd0, 0x3b, 0x50, 0x36, 0x0c, 0x9d, 0x1f, 0x16, 0x50, 0x44, 0x0f, 0xfe, 0xb2, 0x63, 0x56, 0x5e, - 0x9a, 0xf9, 0x03, 0xb8, 0xfa, 0x0a, 0x00, 0x00, 0xff, 0xff, 0xeb, 0x1e, 0xac, 0xb3, 0x0d, 0x03, - 0x00, 0x00, + // 415 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xac, 0x93, 0xc1, 0x6e, 0xd3, 0x40, + 0x10, 0x86, 0xb3, 0x4d, 0x1b, 0xd7, 0x93, 0x36, 0x76, 0x17, 0x24, 0x5c, 0xe8, 0xc1, 0xf2, 0xa1, + 0xf2, 0x01, 0xe5, 0x10, 0x1e, 0x00, 0x15, 0x09, 0x0c, 0x12, 0x8d, 0x2c, 0x57, 0x96, 0x38, 0x20, + 0x59, 0xc6, 0x3b, 0xa9, 0x56, 0xaa, 0x77, 0x17, 0xef, 0x6e, 0x11, 0x8f, 0xc0, 0x63, 0xf0, 0x1e, + 0x3c, 0x1c, 0xb2, 0xdd, 0x24, 0x0e, 0x0d, 0x9c, 0x38, 0xce, 0xb7, 0xde, 0x7f, 0xc6, 0xdf, 0xd8, + 0xe0, 0x96, 0x8a, 0xcf, 0x55, 0x23, 0x8d, 0xa4, 0xee, 0xad, 0xb2, 0x45, 0x2d, 0xad, 0x30, 0xd1, + 0x0f, 0x02, 0xa7, 0x57, 0x8c, 0x25, 0x69, 0x9e, 0xe1, 0x57, 0x8b, 0xda, 0xd0, 0x73, 0x38, 0x56, + 0x92, 0x15, 0xa2, 0xac, 0x31, 0x20, 0x21, 0x89, 0xdd, 0xcc, 0x51, 0x92, 0x2d, 0xcb, 0x1a, 0xe9, + 0x05, 0xb8, 0x2d, 0xd6, 0xaa, 0xac, 0x30, 0x38, 0xe8, 0xce, 0xb6, 0x80, 0x3e, 0x03, 0xa7, 0xcd, + 0x15, 0xb6, 0x0e, 0xc6, 0x21, 0x89, 0x8f, 0xb2, 0xc9, 0xad, 0xb2, 0x4b, 0x5b, 0xd3, 0x4b, 0xf0, + 0xb8, 0x2e, 0x50, 0x18, 0xde, 0x60, 0xdf, 0x36, 0x38, 0x0c, 0x49, 0x7c, 0x9c, 0x9d, 0x72, 0xfd, + 0xb6, 0xa3, 0xd7, 0xdd, 0x2c, 0x3f, 0x09, 0xcc, 0xd6, 0xb3, 0x68, 0x25, 0x85, 0x46, 0xfa, 0x11, + 0x66, 0x25, 0x63, 0x45, 0x9b, 0xdb, 0xa0, 0xb6, 0x77, 0xa6, 0x1b, 0x69, 0xb6, 0xb8, 0x9c, 0x6f, + 0x5e, 0x61, 0xbe, 0x7b, 0x65, 0x5b, 0xda, 0x3b, 0x93, 0x9d, 0x94, 0x8c, 0x25, 0xca, 0xf6, 0x55, + 0x74, 0x05, 0x27, 0xc3, 
0x53, 0x3a, 0x05, 0xe7, 0xc6, 0x56, 0x15, 0x6a, 0xed, 0x8f, 0xe8, 0x13, + 0xf0, 0x3e, 0x08, 0x6d, 0x57, 0x2b, 0x5e, 0x71, 0x14, 0x26, 0x49, 0x73, 0x9f, 0x50, 0x0f, 0xa6, + 0xa9, 0x64, 0x4b, 0x69, 0xde, 0x49, 0x2b, 0x98, 0x7f, 0x10, 0x7d, 0x03, 0x3f, 0xc3, 0x5a, 0xde, + 0xe3, 0xff, 0x30, 0xf6, 0x14, 0x8e, 0xac, 0xe5, 0x4c, 0x07, 0xe3, 0x70, 0x1c, 0xbb, 0x59, 0x5f, + 0xb4, 0x74, 0x25, 0x9b, 0x0a, 0x1f, 0x24, 0xf5, 0x45, 0xf4, 0x8b, 0xc0, 0xd9, 0xa0, 0xf3, 0x83, + 0x9f, 0x4f, 0x70, 0xd6, 0x74, 0xf0, 0xb1, 0xa2, 0x97, 0x03, 0x45, 0x8f, 0x2e, 0xee, 0x90, 0x56, + 0x94, 0xd7, 0xc7, 0x6c, 0x5d, 0x5d, 0x83, 0xf7, 0xc7, 0x33, 0xbb, 0xba, 0xa6, 0xe0, 0x24, 0x69, + 0xfe, 0xc6, 0xea, 0xef, 0x7b, 0x34, 0xb5, 0x20, 0x49, 0xf3, 0x0d, 0x38, 0x5c, 0xa4, 0xeb, 0xcf, + 0xec, 0x06, 0x9b, 0x7b, 0x5e, 0x21, 0x7d, 0x0d, 0x93, 0x1e, 0xd0, 0x60, 0xcf, 0x2e, 0x3b, 0xb1, + 0xcf, 0xcf, 0xff, 0xba, 0xe5, 0x68, 0xb4, 0xf8, 0x3c, 0xd8, 0xc4, 0x3a, 0xf4, 0x3d, 0xb8, 0x1b, + 0x46, 0x5f, 0xec, 0x17, 0xd0, 0x47, 0x5f, 0xfc, 0xcb, 0x4e, 0x34, 0xfa, 0x32, 0xe9, 0xfe, 0x94, + 0x57, 0xbf, 0x03, 0x00, 0x00, 0xff, 0xff, 0xb4, 0xfe, 0x20, 0xcc, 0x36, 0x03, 0x00, 0x00, } // Reference imports to suppress errors if they are not otherwise used. diff --git a/pkg/api/gpu-mount/api.proto b/pkg/api/gpu-mount/api.proto index 56f7901..f8d6eaa 100644 --- a/pkg/api/gpu-mount/api.proto +++ b/pkg/api/gpu-mount/api.proto @@ -5,6 +5,7 @@ message AddGPURequest { string pod_name = 1; string namespace = 2; int32 gpu_num = 3; + bool is_entire_mount = 4; } message AddGPUResponse { diff --git a/pkg/server/gpu-mount/server.go b/pkg/server/gpu-mount/server.go index 089bc16..83a371e 100644 --- a/pkg/server/gpu-mount/server.go +++ b/pkg/server/gpu-mount/server.go @@ -9,6 +9,7 @@ import ( . 
"GPUMounter/pkg/util/log" "context" "errors" + k8s_error "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -53,8 +54,18 @@ func (gpuMountImpl GPUMountImpl) AddGPU(_ context.Context, request *gpu_mount.Ad } Logger.Info("Successfully get Pod: " + request.Namespace + " in cluster") + // if target pod is already entire mounted, it's not allowed to mount more gpu + if gpuMountImpl.IsEntireMount(targetPod) { + Logger.Error("Pod already entire mounted, not allowed to mount other gpu before unmount") + return nil, errors.New(gpu.FailedCreated) + } + gpuNum := int(request.GpuNum) - gpuResources, err := gpuMountImpl.GetAvailableGPU(targetPod, gpuNum) + gpuNumPerPod := 1 + if request.IsEntireMount { + gpuNumPerPod = gpuNum + } + gpuResources, err := gpuMountImpl.GetAvailableGPU(targetPod, gpuNum, gpuNumPerPod) if err != nil { if err.Error() == gpu.InsufficientGPU { diff --git a/pkg/util/gpu/allocator/allocator.go b/pkg/util/gpu/allocator/allocator.go index 0d18c0c..c7bcb5b 100644 --- a/pkg/util/gpu/allocator/allocator.go +++ b/pkg/util/gpu/allocator/allocator.go @@ -11,12 +11,13 @@ import ( "crypto/rand" "errors" "fmt" + "strconv" + "strings" + corev1 "k8s.io/api/core/v1" k8s_errors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "strconv" - "strings" ) type GPUAllocator struct { @@ -36,7 +37,7 @@ func NewGPUAllocator() (*GPUAllocator, error) { return gpuAllocator, nil } -func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, gpuNum int) ([]*device.NvidiaGPU, error) { +func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, totalGpuNum int, gpuNumPerPod int) ([]*device.NvidiaGPU, error) { clientset, err := config.GetClientSet() if err != nil { Logger.Error(err) @@ -45,9 +46,9 @@ func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, gpuNum i } var slavePodNames []string - for idx := 0; idx < gpuNum; idx++ { + for 
idx := 0; idx < totalGpuNum/gpuNumPerPod; idx++ { // try create a gpu pod on specify node - slavePod := newGPUSlavePod(ownerPod, 1) + slavePod := newGPUSlavePod(ownerPod, gpuNumPerPod) slavePod, err = clientset.CoreV1().Pods(slavePod.Namespace).Create(context.TODO(), slavePod, metav1.CreateOptions{}) if err != nil { Logger.Error(err) @@ -105,11 +106,14 @@ func (gpuAllocator *GPUAllocator) GetRemoveGPU(ownerPod *corev1.Pod, uuids []str Logger.Error("Failed to Get Pod: ", ownerPod.Name, " Namespace: ", ownerPod.Namespace, " GPU resources") return nil, err } + var removeGPUs []*device.NvidiaGPU + isEntireMount := gpuAllocator.IsEntireMount(ownerPod) for _, gpuDev := range gpuResources { // GPU Mounter can only unmount the gpu mounted by GPU Mounter // so the removed gpu should belong to slave pod - if util.ContainString(uuids, gpuDev.UUID) && gpuDev.PodName != ownerPod.Name { + // if entire mount pod, remove all gpu + if (isEntireMount || util.ContainString(uuids, gpuDev.UUID)) && gpuDev.PodName != ownerPod.Name { removeGPUs = append(removeGPUs, gpuDev) } } @@ -150,6 +154,33 @@ func (gpuAllocator *GPUAllocator) DeleteSlavePods(slavePodNames []string) error return errors.New("Unkown status from checking goroutine ") } + +func (gpuAllocator *GPUAllocator) IsEntireMount(pod *corev1.Pod) bool { + Logger.Info("Check whether pod %s/%s is entire mount", pod.Namespace, pod.Name) + gpuResources, err := gpuAllocator.GetPodGPUResources(pod.Name, pod.Namespace) + if err != nil { + Logger.Error(err) + Logger.Error("Failed to Check Pod: ", pod.Name, " Namespace: ", pod.Namespace, " is entire mount or not") + return false + } + // an entire mount pod has fewer slave pods than gpus + slavePodNames := make(map[string]interface{}, 0) + gpuNum := 0 + for _, gpuDev := range gpuResources { + if gpuDev.PodName != pod.Name { + slavePodNames[gpuDev.PodName] = struct{}{} + } + gpuNum++ + } + + // TODO: here we regard a mount as entire mount if the pod has fewer slave pods than gpus, + // is it
possible to find a better method? + if len(slavePodNames) < gpuNum { + return true + } + return false +} + func newGPUSlavePod(ownerPod *corev1.Pod, gpuNum int) *corev1.Pod { // generate random ID randBytes := make([]byte, 3) diff --git a/pkg/util/gpu/allocator/allocator_test.go b/pkg/util/gpu/allocator/allocator_test.go index ebcf094..8ddc62a 100644 --- a/pkg/util/gpu/allocator/allocator_test.go +++ b/pkg/util/gpu/allocator/allocator_test.go @@ -4,8 +4,9 @@ import ( "GPUMounter/pkg/config" . "GPUMounter/pkg/util/log" "context" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestGetAvailableGPU(t *testing.T) { diff --git a/pkg/util/gpu/collector/collector.go b/pkg/util/gpu/collector/collector.go index fcee85f..ae93f3b 100644 --- a/pkg/util/gpu/collector/collector.go +++ b/pkg/util/gpu/collector/collector.go @@ -7,12 +7,13 @@ import ( . "GPUMounter/pkg/util/log" "context" "fmt" - "google.golang.org/grpc" - podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" "net" "os" "strings" "time" + + "google.golang.org/grpc" + podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" ) type GPUCollector struct { diff --git a/pkg/util/util.go b/pkg/util/util.go index 3a39e4f..6c5e527 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -7,9 +7,10 @@ import ( . "GPUMounter/pkg/util/log" "GPUMounter/pkg/util/namespace" "errors" - corev1 "k8s.io/api/core/v1" "strconv" "strings" + + corev1 "k8s.io/api/core/v1" ) func MountGPU(pod *corev1.Pod, gpu *device.NvidiaGPU) error {