Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use ray start block in Pod's entrypoint #77

Merged
merged 7 commits into from
Dec 2, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions ray-operator/config/samples/ray-cluster.complete.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ spec:
dashboard-host: '0.0.0.0'
num-cpus: '1' # can be auto-completed from the limits
node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
block: 'true'
#pod template
template:
metadata:
Expand All @@ -43,11 +44,6 @@ spec:
containers:
- name: ray-head
image: rayproject/ray:1.8.0
# you can have any command and args here to run your code.
Jeffwan marked this conversation as resolved.
Show resolved Hide resolved
# the below command/args will be appended after the Ray start command and it args, and executed after Ray start.
command: ["python3"]
args:
- '/opt/sample_code.py'
env:
- name: CPU_REQUEST
valueFrom:
Expand Down Expand Up @@ -119,6 +115,7 @@ spec:
rayStartParams:
redis-password: 'LetMeInRay'
node-ip-address: $MY_POD_IP
block: 'true'
#pod template
template:
metadata:
Expand Down
6 changes: 1 addition & 5 deletions ray-operator/config/samples/ray-cluster.getting-started.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ spec:
dashboard-host: '0.0.0.0'
num-cpus: '1' # can be auto-completed from the limits
node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
block: 'true'
#pod template
template:
metadata:
Expand All @@ -43,11 +44,6 @@ spec:
containers:
- name: ray-head
image: rayproject/ray:1.8.0
# you can have any command and args here to run your code.
# the below command/args will be appended after the Ray start command and it args, and executed after Ray start.
command: ["python"]
args:
- '/opt/sample_code.py'
env:
- name: MY_POD_IP
valueFrom:
Expand Down
8 changes: 3 additions & 5 deletions ray-operator/config/samples/ray-cluster.heterogeneous.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ spec:
dashboard-host: '0.0.0.0'
num-cpus: '1' # can be auto-completed from the limits
node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
block: 'true'
#pod template
template:
metadata:
Expand All @@ -43,11 +44,6 @@ spec:
containers:
- name: ray-head
image: rayproject/ray:1.8.0
# you can have any command and args here to run your code.
# the below command/args will be appended after the Ray start command and it args, and executed after Ray start.
command: ["sleep"]
args:
- "infinity"
env:
- name: MY_POD_IP
valueFrom:
Expand Down Expand Up @@ -91,6 +87,7 @@ spec:
rayStartParams:
redis-password: 'LetMeInRay'
node-ip-address: $MY_POD_IP
block: 'true'
#pod template
template:
metadata:
Expand Down Expand Up @@ -165,6 +162,7 @@ spec:
rayStartParams:
redis-password: 'LetMeInRay'
node-ip-address: $MY_POD_IP
block: "true"
#pod template
template:
metadata:
Expand Down
6 changes: 1 addition & 5 deletions ray-operator/config/samples/ray-cluster.mini.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ spec:
dashboard-host: '0.0.0.0'
num-cpus: '1' # can be auto-completed from the limits
node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
block: 'true'
#pod template
template:
metadata:
Expand All @@ -47,11 +48,6 @@ spec:
image: rayproject/ray:1.8.0
#image: rayproject/ray:nightly
#image: bonsaidev.azurecr.io/bonsai/lazer-0-9-0-cpu:dev
# you can have any command and args here to run your code.
# the below command/args will be appended after the Ray start command and it args, and executed after Ray start.
command: ["sleep"]
args:
- "infinity"
env:
- name: MY_POD_IP
valueFrom:
Expand Down
27 changes: 21 additions & 6 deletions ray-operator/controllers/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,13 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN
Spec: podTemplateSpec.Spec,
}
index := getRayContainerIndex(pod)
cont := concatenateContainerCommand(rayNodeType, rayStartParams)

addEmptyDir(&pod.Spec.Containers[index], &pod)
cleanupInvalidVolumeMounts(&pod.Spec.Containers[index], &pod)
if len(pod.Spec.InitContainers) > index {
cleanupInvalidVolumeMounts(&pod.Spec.InitContainers[index], &pod)
}

// saving temporarily the old command and args
var cmd, args string
if len(pod.Spec.Containers[index].Command) > 0 {
cmd = convertCmdToString(pod.Spec.Containers[index].Command)
Expand All @@ -89,17 +87,23 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN
cmd += convertCmdToString(pod.Spec.Containers[index].Args)
}
if !strings.Contains(cmd, "ray start") {
cont := concatenateContainerCommand(rayNodeType, rayStartParams)
// replacing the old command
pod.Spec.Containers[index].Command = []string{"/bin/bash", "-c", "--"}
if cmd != "" {
// sleep infinity is used to keep the pod `running` after the last command exits, and not go into `completed` state
args = fmt.Sprintf("%s && %s && %s", cont, cmd, "sleep infinity")
args = fmt.Sprintf("%s && %s", cont, cmd)
} else {
args = fmt.Sprintf("%s && %s", cont, "sleep infinity")
args = cont
}

if !isRayStartWithBlock(rayStartParams) {
// sleep infinity is used to keep the pod `running` after the last command exits, and not go into `completed` state
args = args + " && sleep infinity"
}

pod.Spec.Containers[index].Args = []string{args}
}

for index := range pod.Spec.InitContainers {
setInitContainerEnvVars(&pod.Spec.InitContainers[index], svcName)
}
Expand All @@ -109,6 +113,13 @@ func BuildPod(podTemplateSpec v1.PodTemplateSpec, rayNodeType rayiov1alpha1.RayN
return pod
}

func isRayStartWithBlock(rayStartParams map[string]string) bool {
if blockValue, exist := rayStartParams["block"]; exist {
return strings.ToLower(blockValue) == "true"
}
return false
}

func convertCmdToString(cmdArr []string) (cmd string) {
cmdAggr := new(bytes.Buffer)
for _, v := range cmdArr {
Expand Down Expand Up @@ -264,7 +275,11 @@ func concatenateContainerCommand(nodeType rayiov1alpha1.RayNodeType, rayStartPar
func convertParamMap(rayStartParams map[string]string) (s string) {
flags := new(bytes.Buffer)
for k, v := range rayStartParams {
fmt.Fprintf(flags, " --%s=%s ", k, v)
if strings.ToLower(v) == "true" {
fmt.Fprintf(flags, " --%s ", k)
} else {
fmt.Fprintf(flags, " --%s=%s ", k, v)
}
}
return flags.String()
}
Expand Down
31 changes: 23 additions & 8 deletions ray-operator/controllers/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package common
import (
"fmt"
"reflect"
"sort"
"strings"
"testing"

Expand Down Expand Up @@ -43,10 +44,8 @@ var instance = &rayiov1alpha1.RayCluster{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
corev1.Container{
Name: "ray-head",
Image: "rayproject/autoscaler",
Command: []string{"python"},
Args: []string{"/opt/code.py"},
Name: "ray-head",
Image: "rayproject/autoscaler",
Env: []corev1.EnvVar{
corev1.EnvVar{
Name: "MY_POD_IP",
Expand All @@ -72,6 +71,7 @@ var instance = &rayiov1alpha1.RayCluster{
"port": "6379",
"redis-password": "LetMeInRay",
"num-cpus": "1",
"block": "true",
},
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -84,10 +84,8 @@ var instance = &rayiov1alpha1.RayCluster{
Spec: corev1.PodSpec{
Containers: []corev1.Container{
corev1.Container{
Name: "ray-worker",
Image: "rayproject/autoscaler",
Command: []string{"echo"},
Args: []string{"Hello Ray"},
Name: "ray-worker",
Image: "rayproject/autoscaler",
Env: []corev1.EnvVar{
corev1.EnvVar{
Name: "MY_POD_IP",
Expand Down Expand Up @@ -141,4 +139,21 @@ func TestBuildPod(t *testing.T) {
if !reflect.DeepEqual(expectedResult, actualResult) {
t.Fatalf("Expected `%v` but got `%v`", expectedResult, actualResult)
}

expectedCommandArg := splitAndSort("ulimit -n 65536; ray start --block --num-cpus=1 --address=raycluster-sample-head-svc:6379 --port=6379 --redis-password=LetMeInRay")
if !reflect.DeepEqual(expectedCommandArg, splitAndSort(pod.Spec.Containers[0].Args[0])) {
t.Fatalf("Expected `%v` but got `%v`", expectedCommandArg, pod.Spec.Containers[0].Args)
}
}

func splitAndSort(s string) []string {
strs := strings.Split(s, " ")
result := make([]string, 0, len(strs))
for _, s := range strs {
if len(s) > 0 {
result = append(result, s)
}
}
sort.Strings(result)
return result
}