Skip to content

Commit

Permalink
make the cgroup driver configurable, as mentioned in issue #19
Browse files Browse the repository at this point in the history
* add environment variable `CGROUP_DRIVER` (default: cgroupfs) to set the cgroup driver
* fix log format error in allocator.go L84
  • Loading branch information
pokerfaceSad committed Feb 17, 2022
1 parent 7036133 commit 163ef7b
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 5 deletions.
4 changes: 4 additions & 0 deletions deploy/gpu-mounter-workers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ spec:
- containerPort: 1200
command: ["/bin/bash"]
args: ["-c", "/GPUMounter/GPUMounter-worker"]
env:
- name: CGROUP_DRIVER
value: "cgroupfs"
# value: "systemd"
volumeMounts:
- name: cgroup
mountPath: /sys/fs/cgroup
Expand Down
8 changes: 8 additions & 0 deletions pkg/util/cgroup/cgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ func (cgroupName CgroupName) ToCgroupfs() string {
return "/" + path.Join(cgroupName...)
}

// GetCgroupDriver returns the cgroup driver selected through the
// CGROUP_DRIVER environment variable.
//
// An unset or empty variable falls back to "cgroupfs", so deployments that
// do not define the variable keep working with the default driver. The only
// other accepted value is "systemd"; any other value yields an empty string
// and an "unsupported cgroup driver" error.
func GetCgroupDriver() (string, error) {
	cgroupDriver := os.Getenv("CGROUP_DRIVER")
	switch cgroupDriver {
	case "":
		// os.Getenv cannot tell "unset" from "empty"; treat both as a
		// request for the default driver instead of rejecting them.
		return "cgroupfs", nil
	case "cgroupfs", "systemd":
		return cgroupDriver, nil
	default:
		return "", fmt.Errorf("unsupported cgroup driver: %s", cgroupDriver)
	}
}

func GetCgroupName(cgroupDriver string, pod *corev1.Pod, containerID string) (string, error) {
containerRoot := NewCgroupName([]string{}, "kubepods")
PodCgroupNamePrefix := "pod"
Expand Down
7 changes: 6 additions & 1 deletion pkg/util/cgroup/cgroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ func TestGetCgroupName(t *testing.T) {
containerID := pod.Status.ContainerStatuses[0].ContainerID
containerID = strings.Replace(containerID, "docker://", "", 1)
fmt.Println(containerID)
cgroupName, err := GetCgroupName("cgroupfs", pod, containerID)
cgroupDriver, err := GetCgroupDriver()
if err != nil {
fmt.Println("Get cgroup driver failed")
panic(err)
}
cgroupName, err := GetCgroupName(cgroupDriver, pod, containerID)
if err != nil {
panic(err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/gpu/allocator/allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (gpuAllocator *GPUAllocator) GetAvailableGPU(ownerPod *corev1.Pod, totalGpu
}
return nil, errors.New(gpu.FailedCreated)
case gpu.SuccessfullyCreated:
Logger.Info("Successfully create Slave Pod: %s, for Owner Pod: %s ", strings.Join(slavePodNames, ", "), ownerPod.Name)
Logger.Infof("Successfully create Slave Pod: %s, for Owner Pod: %s ", strings.Join(slavePodNames, ", "), ownerPod.Name)
var availableGPUResource []*device.NvidiaGPU
for _, slavePodName := range slavePodNames {
gpuResources, err := gpuAllocator.GetPodGPUResources(slavePodName, gpu.GPUPoolNamespace)
Expand Down
21 changes: 18 additions & 3 deletions pkg/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@ func MountGPU(pod *corev1.Pod, gpu *device.NvidiaGPU) error {
containerID := pod.Status.ContainerStatuses[0].ContainerID
containerID = strings.Replace(containerID, "docker://", "", 1)
Logger.Info("Pod :" + pod.Name + " container ID: " + containerID)
cgroupPath, err := cgroup.GetCgroupName("cgroupfs", pod, containerID)
cgroupDriver, err := cgroup.GetCgroupDriver()
if err != nil {
Logger.Error("Get cgroup driver failed")
return err
}
cgroupPath, err := cgroup.GetCgroupName(cgroupDriver, pod, containerID)
if err != nil {
Logger.Error("Get cgroup path for Pod: " + pod.Name + " failed")
return err
Expand Down Expand Up @@ -72,7 +77,12 @@ func UnmountGPU(pod *corev1.Pod, gpu *device.NvidiaGPU, forceRemove bool) error
containerID := pod.Status.ContainerStatuses[0].ContainerID
containerID = strings.Replace(containerID, "docker://", "", 1)
Logger.Info("Pod :" + pod.Name + " container ID: " + containerID)
cgroupPath, err := cgroup.GetCgroupName("cgroupfs", pod, containerID)
cgroupDriver, err := cgroup.GetCgroupDriver()
if err != nil {
Logger.Error("Get cgroup driver failed")
return err
}
cgroupPath, err := cgroup.GetCgroupName(cgroupDriver, pod, containerID)
if err != nil {
Logger.Error("Get cgroup path for Pod: " + pod.Name + " failed")
return err
Expand Down Expand Up @@ -144,7 +154,12 @@ func GetPodGPUProcesses(pod *corev1.Pod, gpu *device.NvidiaGPU) ([]string, error
containerID := pod.Status.ContainerStatuses[0].ContainerID
containerID = strings.Replace(containerID, "docker://", "", 1)
Logger.Info("Pod: " + pod.Name + " container ID: " + containerID)
cgroupPath, err := cgroup.GetCgroupName("cgroupfs", pod, containerID)
cgroupDriver, err := cgroup.GetCgroupDriver()
if err != nil {
Logger.Error("Get cgroup driver failed")
return nil, err
}
cgroupPath, err := cgroup.GetCgroupName(cgroupDriver, pod, containerID)
if err != nil {
Logger.Error("Get cgroup path for Pod: " + pod.Name + " failed")
return nil, err
Expand Down

0 comments on commit 163ef7b

Please sign in to comment.