Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Use image of Ray head container as the default Ray Autoscaler container #1401

Merged
merged 2 commits into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6074,8 +6074,8 @@ spec:
type: string
type: object
rayVersion:
description: RayVersion is the version of ray being used. This determines
the autoscaler's image version.
description: RayVersion is used to determine the command for the Kubernetes
Job managed by RayJob
type: string
workerGroupSpecs:
description: WorkerGroupSpecs are the specs for the worker pods
Expand Down
4 changes: 2 additions & 2 deletions helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6342,8 +6342,8 @@ spec:
type: string
type: object
rayVersion:
description: RayVersion is the version of ray being used. This
determines the autoscaler's image version.
description: RayVersion is used to determine the command for the
Kubernetes Job managed by RayJob
type: string
workerGroupSpecs:
description: WorkerGroupSpecs are the specs for the worker pods
Expand Down
4 changes: 2 additions & 2 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6316,8 +6316,8 @@ spec:
type: string
type: object
rayVersion:
description: RayVersion is the version of ray being used. This
determines the autoscaler's image version.
description: RayVersion is used to determine the command for the
Kubernetes Job managed by RayJob
type: string
workerGroupSpecs:
description: WorkerGroupSpecs are the specs for the worker pods
Expand Down
5 changes: 3 additions & 2 deletions ray-operator/apis/ray/v1alpha1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ type RayClusterSpec struct {
HeadGroupSpec HeadGroupSpec `json:"headGroupSpec"`
// WorkerGroupSpecs are the specs for the worker pods
WorkerGroupSpecs []WorkerGroupSpec `json:"workerGroupSpecs,omitempty"`
// RayVersion is the version of ray being used. This determines the autoscaler's image version.
// RayVersion is used to determine the command for the Kubernetes Job managed by RayJob
RayVersion string `json:"rayVersion,omitempty"`
// EnableInTreeAutoscaling indicates whether operator should create in tree autoscaling configs
EnableInTreeAutoscaling *bool `json:"enableInTreeAutoscaling,omitempty"`
Expand Down Expand Up @@ -86,12 +86,13 @@ type AutoscalerOptions struct {
// More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/
SecurityContext *v1.SecurityContext `json:"securityContext,omitempty"`
// IdleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
// Defaults to 60 (one minute).
// Defaults to 60 (one minute). It is not read by the KubeRay operator but by the Ray autoscaler.
IdleTimeoutSeconds *int32 `json:"idleTimeoutSeconds,omitempty"`
// UpscalingMode is "Conservative", "Default", or "Aggressive."
// Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
// Default: Upscaling is not rate-limited.
// Aggressive: An alias for Default; upscaling is not rate-limited.
// It is not read by the KubeRay operator but by the Ray autoscaler.
UpscalingMode *UpscalingMode `json:"upscalingMode,omitempty"`
}

Expand Down
1 change: 0 additions & 1 deletion ray-operator/apis/ray/v1alpha1/raycluster_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ var myRayCluster = &RayCluster{
Namespace: "default",
},
Spec: RayClusterSpec{
RayVersion: "1.0",
HeadGroupSpec: HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
RayStartParams: map[string]string{
Expand Down
4 changes: 1 addition & 3 deletions ray-operator/apis/ray/v1alpha1/rayservice_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ var myRayService = &RayService{
},
},
RayClusterSpec: RayClusterSpec{
RayVersion: "1.12.1",
HeadGroupSpec: HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
RayStartParams: map[string]string{
Expand Down Expand Up @@ -355,8 +354,7 @@ var expected = `{

}
}
],
"rayVersion":"1.12.1"
]
}
},
"status":{
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6074,8 +6074,8 @@ spec:
type: string
type: object
rayVersion:
description: RayVersion is the version of ray being used. This determines
the autoscaler's image version.
description: RayVersion is used to determine the command for the Kubernetes
Job managed by RayJob
type: string
workerGroupSpecs:
description: WorkerGroupSpecs are the specs for the worker pods
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6342,8 +6342,8 @@ spec:
type: string
type: object
rayVersion:
description: RayVersion is the version of ray being used. This
determines the autoscaler's image version.
description: RayVersion is used to determine the command for the
Kubernetes Job managed by RayJob
type: string
workerGroupSpecs:
description: WorkerGroupSpecs are the specs for the worker pods
Expand Down
4 changes: 2 additions & 2 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6316,8 +6316,8 @@ spec:
type: string
type: object
rayVersion:
description: RayVersion is the version of ray being used. This
determines the autoscaler's image version.
description: RayVersion is used to determine the command for the
Kubernetes Job managed by RayJob
type: string
workerGroupSpecs:
description: WorkerGroupSpecs are the specs for the worker pods
Expand Down
2 changes: 0 additions & 2 deletions ray-operator/controllers/ray/common/ingress_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ var instanceWithIngressEnabled = &rayv1alpha1.RayCluster{
},
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
Template: corev1.PodTemplateSpec{
Expand All @@ -49,7 +48,6 @@ var instanceWithIngressEnabledWithoutIngressClass = &rayv1alpha1.RayCluster{
Namespace: "default",
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
Template: corev1.PodTemplateSpec{
Expand Down
43 changes: 2 additions & 41 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,8 @@ func DefaultHeadPodTemplate(instance rayv1alpha1.RayCluster, headSpec rayv1alpha
// set custom service account with proper roles bound.
// utils.CheckName clips the name to match the behavior of reconcileAutoscalerServiceAccount
podTemplate.Spec.ServiceAccountName = utils.CheckName(utils.GetHeadGroupServiceAccountName(&instance))
rayHeadImage := podTemplate.Spec.Containers[RayContainerIndex].Image
// Determine the default image to use for the Ray container.
autoscalerImage := getAutoscalerImage(rayHeadImage, instance.Spec.RayVersion)
// Use the same image as Ray head container by default.
autoscalerImage := podTemplate.Spec.Containers[RayContainerIndex].Image
// inject autoscaler container into head pod
autoscalerContainer := BuildAutoscalerContainer(autoscalerImage)
// Merge the user overrides from autoscalerOptions into the autoscaler container config.
Expand All @@ -138,44 +137,6 @@ func DefaultHeadPodTemplate(instance rayv1alpha1.RayCluster, headSpec rayv1alpha
return podTemplate
}

// getAutoscalerImage determines the default autoscaler image
func getAutoscalerImage(rayHeadImage string, rayVersion string) string {
if autoscalerSupportIsStable(rayVersion) {
// For Ray versions >= 2.0.0, use the Ray head's image to run the autoscaler.
return rayHeadImage
} else {
// For older Ray versions, use the Ray 2.0.0 image to run the autoscaler.
return FallbackDefaultAutoscalerImage
}
}

// Determine if autoscaler support is stable in the given rayVersion.
// Return false exactly when the major version is successfully parsed and less than 2.
// Example rayVersion inputs that return true: "2.0.0", "2.0", "2", "2.0.0rc1", "nightly", "latest", "unknown".
// Example inputs that return false: "1.13.0", "1.12", "1".
func autoscalerSupportIsStable(rayVersion string) bool {
// Try to determine major version by extracting everything that comes before the first "."
firstDotIndex := strings.Index(rayVersion, ".")
var majorVersionString string
if firstDotIndex == -1 {
// If there is no ".", try parsing the entire rayVersion as the major version.
majorVersionString = rayVersion
} else {
// Everything up to the first "."
majorVersionString = rayVersion[:firstDotIndex]
}

if majorVersion, err := strconv.Atoi(majorVersionString); err == nil {
return majorVersion >= 2
} else {
// If in doubt, just assume that the Ray version is >= 2.0.0,
// so that we use the Ray image to run the autoscaler.
// Currently, there is a lot of "doubt," since the version string is not validated.
// Users can always override the operator's choice of image with autoscalerOptions.image.
return true
}
}

func getEnableInitContainerInjection() bool {
if s := os.Getenv(EnableInitContainerInjectionEnvKey); strings.ToLower(s) == "false" {
return false
Expand Down
54 changes: 28 additions & 26 deletions ray-operator/controllers/ray/common/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ var instance = rayv1alpha1.RayCluster{
Namespace: "default",
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "2.0.0",
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
RayStartParams: map[string]string{
Expand Down Expand Up @@ -281,31 +280,6 @@ func TestAddEmptyDirVolumes(t *testing.T) {
assert.Equal(t, len(testPod.Spec.Volumes), 2)
}

func TestGetAutoscalerImage(t *testing.T) {
// rayVersion strings for which we judge autoscaler support is stable and thus
// use the same image for the autoscaler as for the Ray container.
newRayVersions := []string{"2.0.0", "2.0.0rc0", "2.0", "2", "latest", "nightly", "what's this"}
rayImage := "repo/image:tag"
for _, rayVersion := range newRayVersions {
expectedAutoscalerImage := rayImage
actualAutoscalerImage := getAutoscalerImage(rayImage, rayVersion)
if actualAutoscalerImage != expectedAutoscalerImage {
t.Fatalf("Expected `%v` but got `%v`", expectedAutoscalerImage, actualAutoscalerImage)
}
}

// rayVersion strings for which we judge autoscaler support is not stable and thus
// use the default Ray 2.0.0 image to run the autoscaler.
oldRayVersions := []string{"1", "1.13", "1.13.0"}
for _, rayVersion := range oldRayVersions {
expectedAutoscalerImage := "rayproject/ray:2.0.0"
actualAutoscalerImage := getAutoscalerImage(rayImage, rayVersion)
if actualAutoscalerImage != expectedAutoscalerImage {
t.Fatalf("Expected `%v` but got `%v`", expectedAutoscalerImage, actualAutoscalerImage)
}
}
}

func TestGetHeadPort(t *testing.T) {
headStartParams := make(map[string]string)
actualResult := GetHeadPort(headStartParams)
Expand Down Expand Up @@ -683,6 +657,34 @@ func TestHeadPodTemplate_WithAutoscalingEnabled(t *testing.T) {
}
}

func TestHeadPodTemplate_AutoscalerImage(t *testing.T) {
cluster := instance.DeepCopy()
cluster.Spec.EnableInTreeAutoscaling = &trueFlag
cluster.Spec.AutoscalerOptions = nil
podName := strings.ToLower(cluster.Name + DashSymbol + string(rayv1alpha1.HeadNode) + DashSymbol + utils.FormatInt32(0))

// Case 1: If `AutoscalerOptions.Image` is not set, the Autoscaler container should use the Ray head container's image by default.
expectedAutoscalerImage := cluster.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Image
podTemplateSpec := DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
pod := v1.Pod{
Spec: podTemplateSpec.Spec,
}
autoscalerContainerIndex := getAutoscalerContainerIndex(pod)
assert.Equal(t, expectedAutoscalerImage, podTemplateSpec.Spec.Containers[autoscalerContainerIndex].Image)

// Case 2: If `AutoscalerOptions.Image` is set, the Autoscaler container should use the specified image.
customAutoscalerImage := "custom-autoscaler-xxx"
cluster = instance.DeepCopy()
cluster.Spec.EnableInTreeAutoscaling = &trueFlag
cluster.Spec.AutoscalerOptions = &rayv1alpha1.AutoscalerOptions{
Image: &customAutoscalerImage,
}
podTemplateSpec = DefaultHeadPodTemplate(*cluster, cluster.Spec.HeadGroupSpec, podName, "6379")
pod.Spec = podTemplateSpec.Spec
autoscalerContainerIndex = getAutoscalerContainerIndex(pod)
assert.Equal(t, customAutoscalerImage, podTemplateSpec.Spec.Containers[autoscalerContainerIndex].Image)
}

// If no service account is specified in the RayCluster,
// the head pod's service account should be an empty string.
func TestHeadPodTemplate_WithNoServiceAccount(t *testing.T) {
Expand Down
1 change: 0 additions & 1 deletion ray-operator/controllers/ray/common/route_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ var instanceWithRouteEnabled = &rayv1alpha1.RayCluster{
},
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
EnableIngress: pointer.BoolPtr(true),
Expand Down
2 changes: 0 additions & 2 deletions ray-operator/controllers/ray/common/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ var (
},
Spec: rayv1alpha1.RayServiceSpec{
RayClusterSpec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
ServiceType: corev1.ServiceTypeClusterIP,
},
Expand All @@ -40,7 +39,6 @@ var (
Namespace: "default",
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
HeadServiceAnnotations: map[string]string{
headServiceAnnotationKey1: headServiceAnnotationValue1,
headServiceAnnotationKey2: headServiceAnnotationValue2,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,6 @@ func setupTest(t *testing.T) {
Namespace: namespaceStr,
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
EnableInTreeAutoscaling: &enableInTreeAutoscaling,
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
Replicas: pointer.Int32Ptr(1),
Expand Down
1 change: 0 additions & 1 deletion ray-operator/controllers/ray/raycluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ var _ = Context("Inside the default namespace", func() {
Namespace: "default",
},
Spec: rayv1alpha1.RayClusterSpec{
RayVersion: "1.0",
EnableInTreeAutoscaling: &enableInTreeAutoscaling,
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
RayStartParams: map[string]string{
Expand Down
Loading