Skip to content

Commit

Permalink
Merge pull request #15 from ilyee/ilyee/entire-mount
Browse files Browse the repository at this point in the history
add a parameter `IsEntireMount` to support gang scheduling.
  • Loading branch information
pokerfaceSad committed Apr 19, 2021
2 parents fed987d + 59a9980 commit a80f501
Show file tree
Hide file tree
Showing 7 changed files with 92 additions and 37 deletions.
61 changes: 35 additions & 26 deletions pkg/api/gpu-mount/api.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/api/gpu-mount/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ message AddGPURequest {
string pod_name = 1;
string namespace = 2;
int32 gpu_num = 3;
bool is_entire_mount = 4;
}

message AddGPUResponse {
Expand Down
13 changes: 12 additions & 1 deletion pkg/server/gpu-mount/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
. "GPUMounter/pkg/util/log"
"context"
"errors"

k8s_error "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
Expand Down Expand Up @@ -53,8 +54,18 @@ func (gpuMountImpl GPUMountImpl) AddGPU(_ context.Context, request *gpu_mount.Ad
}
Logger.Info("Successfully get Pod: " + request.Namespace + " in cluster")

// if target pod is already entire mounted, it's not allowed to mount more gpu
if gpuMountImpl.IsEntireMount(targetPod) {
Logger.Error("Pod already entire mounted, not allowed to mount other gpu before unmount")
return nil, errors.New(gpu.FailedCreated)
}

gpuNum := int(request.GpuNum)
gpuResources, err := gpuMountImpl.GetAvailableGPU(targetPod, gpuNum)
gpuNumPerPod := 1
if request.IsEntireMount {
gpuNumPerPod = gpuNum
}
gpuResources, err := gpuMountImpl.GetAvailableGPU(targetPod, gpuNum, gpuNumPerPod)

if err != nil {
if err.Error() == gpu.InsufficientGPU {
Expand Down
43 changes: 37 additions & 6 deletions pkg/util/gpu/allocator/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ import (
"crypto/rand"
"errors"
"fmt"
"strconv"
"strings"

corev1 "k8s.io/api/core/v1"
k8s_errors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"strconv"
"strings"
)

type GPUAllocator struct {
Expand All @@ -36,7 +37,7 @@ func NewGPUAllocator() (*GPUAllocator, error) {
return gpuAllocator, nil
}

func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, gpuNum int) ([]*device.NvidiaGPU, error) {
func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, totalGpuNum int, gpuNumPerPod int) ([]*device.NvidiaGPU, error) {
clientset, err := config.GetClientSet()
if err != nil {
Logger.Error(err)
Expand All @@ -45,9 +46,9 @@ func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, gpuNum i
}

var slavePodNames []string
for idx := 0; idx < gpuNum; idx++ {
for idx := 0; idx < totalGpuNum/gpuNumPerPod; idx++ {
// try create a gpu pod on specify node
slavePod := newGPUSlavePod(ownerPod, 1)
slavePod := newGPUSlavePod(ownerPod, gpuNumPerPod)
slavePod, err = clientset.CoreV1().Pods(slavePod.Namespace).Create(context.TODO(), slavePod, metav1.CreateOptions{})
if err != nil {
Logger.Error(err)
Expand Down Expand Up @@ -105,11 +106,14 @@ func (gpuAllocator *GPUAllocator) GetRemoveGPU(ownerPod *corev1.Pod, uuids []str
Logger.Error("Failed to Get Pod: ", ownerPod.Name, " Namespace: ", ownerPod.Namespace, " GPU resources")
return nil, err
}

var removeGPUs []*device.NvidiaGPU
isEntireMount := gpuAllocator.IsEntireMount(ownerPod)
for _, gpuDev := range gpuResources {
// GPU Mounter can only unmount the gpu mounted by GPU Mounter
// so the removed gpu should belong to slave pod
if util.ContainString(uuids, gpuDev.UUID) && gpuDev.PodName != ownerPod.Name {
// if entire mount pod, remove all gpu
if (isEntireMount || util.ContainString(uuids, gpuDev.UUID)) && gpuDev.PodName != ownerPod.Name {
removeGPUs = append(removeGPUs, gpuDev)
}
}
Expand Down Expand Up @@ -150,6 +154,33 @@ func (gpuAllocator *GPUAllocator) DeleteSlavePods(slavePodNames []string) error
return errors.New("Unkown status from checking goroutine ")

}

func (gpuAllocator *GPUAllocator) IsEntireMount(pod *corev1.Pod) bool {
Logger.Info("Check whether pod %s/%s is entire mount", pod.Namespace, pod.Name)
gpuResources, err := gpuAllocator.GetPodGPUResources(pod.Name, pod.Namespace)
if err != nil {
Logger.Error(err)
Logger.Error("Failed to Check Pod: ", pod.Name, " Namespace: ", pod.Namespace, " is entire mount or not")
return false
}
// entire mount pod has less slave pod than its gpu num
slavePodNames := make(map[string]interface{}, 0)
gpuNum := 0
for _, gpuDev := range gpuResources {
if gpuDev.PodName != pod.Name {
slavePodNames[gpuDev.PodName] = struct{}{}
}
gpuNum++
}

// TODO: here we regard a mount as entire mount if pod's gpu num less than slave pods,
// is it possible to find a better method?
if len(slavePodNames) < gpuNum {
return true
}
return false
}

func newGPUSlavePod(ownerPod *corev1.Pod, gpuNum int) *corev1.Pod {
// generate random ID
randBytes := make([]byte, 3)
Expand Down
3 changes: 2 additions & 1 deletion pkg/util/gpu/allocator/allocator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ import (
"GPUMounter/pkg/config"
. "GPUMounter/pkg/util/log"
"context"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"testing"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestGetAvailableGPU(t *testing.T) {
Expand Down
5 changes: 3 additions & 2 deletions pkg/util/gpu/collector/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import (
. "GPUMounter/pkg/util/log"
"context"
"fmt"
"google.golang.org/grpc"
podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1"
"net"
"os"
"strings"
"time"

"google.golang.org/grpc"
podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1"
)

type GPUCollector struct {
Expand Down
3 changes: 2 additions & 1 deletion pkg/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ import (
. "GPUMounter/pkg/util/log"
"GPUMounter/pkg/util/namespace"
"errors"
corev1 "k8s.io/api/core/v1"
"strconv"
"strings"

corev1 "k8s.io/api/core/v1"
)

func MountGPU(pod *corev1.Pod, gpu *device.NvidiaGPU) error {
Expand Down

0 comments on commit a80f501

Please sign in to comment.