Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug][breaking change] Unauthorized 401 error on fetching Ray Custom Resources from K8s API server #1128

Merged
merged 5 commits into from
Jun 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions helm-chart/ray-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ head:
# cpu: "500m"
# memory: "512Mi"
labels: {}
# Note: From KubeRay v0.6.0, users need to create the ServiceAccount by themselves if they specify the `serviceAccountName`
# in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details.
serviceAccountName: ""
rayStartParams:
dashboard-host: '0.0.0.0'
Expand Down
3 changes: 3 additions & 0 deletions ray-operator/config/samples/ray-service.autoscaler.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ spec:
#pod template
template:
spec:
# Note: From KubeRay v0.6.0, users need to create the ServiceAccount by themselves if they specify the `serviceAccountName`
# in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details.
# serviceAccountName: my-sa
containers:
- name: ray-head
image: rayproject/ray:2.4.0
Expand Down
13 changes: 13 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,18 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(instance *rayio
return err
}

// If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves.
// However, if KubeRay creates a ServiceAccount for users, the autoscaler may encounter permission issues during
// zero-downtime rolling updates when RayService is performed. See https://github.com/ray-project/kuberay/issues/1123
// for more details.
if instance.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName == namespacedName.Name {
kevin85421 marked this conversation as resolved.
Show resolved Hide resolved
r.Log.Error(err, fmt.Sprintf(
"If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
"However, ServiceAccount %s is not found. Please create one. "+
"See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name), "ServiceAccount", namespacedName)
return err
}

// Create service account for autoscaler if there's no existing one in the cluster.
serviceAccount, err := common.BuildServiceAccount(instance)
if err != nil {
Expand All @@ -1034,6 +1046,7 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(instance *rayio

// making sure the name is valid
serviceAccount.Name = utils.CheckName(serviceAccount.Name)

// Set controller reference
if err := controllerutil.SetControllerReference(instance, serviceAccount, r.Scheme); err != nil {
return err
Expand Down
55 changes: 53 additions & 2 deletions ray-operator/controllers/ray/raycluster_controller_fake_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"testing"

"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"

. "github.com/onsi/ginkgo"
rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
Expand Down Expand Up @@ -264,7 +265,6 @@ func setupTest(t *testing.T) {
},
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
ServiceAccountName: headGroupServiceAccount,
Containers: []corev1.Container{
{
Name: "ray-head",
Expand Down Expand Up @@ -878,7 +878,7 @@ func TestReconcile_AutoscalerServiceAccount(t *testing.T) {
fakeClient := clientFake.NewClientBuilder().WithRuntimeObjects(testPods...).Build()

saNamespacedName := types.NamespacedName{
Name: headGroupServiceAccount,
Name: utils.GetHeadGroupServiceAccountName(testRayCluster),
Namespace: namespaceStr,
}
sa := corev1.ServiceAccount{}
Expand All @@ -901,6 +901,57 @@ func TestReconcile_AutoscalerServiceAccount(t *testing.T) {
assert.Nil(t, err, "Fail to get head group ServiceAccount after reconciliation")
}

func TestReconcile_Autoscaler_ServiceAccountName(t *testing.T) {
setupTest(t)
defer tearDown(t)

// Specify a ServiceAccountName for the head Pod
myServiceAccount := corev1.ServiceAccount{
ObjectMeta: metav1.ObjectMeta{
Name: "my-sa",
Namespace: namespaceStr,
},
}
cluster := testRayCluster.DeepCopy()
cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = myServiceAccount.Name

// Case 1: There is no ServiceAccount "my-sa" in the Kubernetes cluster
runtimeObjects := []runtime.Object{}
fakeClient := clientFake.NewClientBuilder().WithRuntimeObjects(runtimeObjects...).Build()

// Initialize the reconciler
testRayClusterReconciler := &RayClusterReconciler{
Client: fakeClient,
Recorder: &record.FakeRecorder{},
Scheme: scheme.Scheme,
Log: ctrl.Log.WithName("controllers").WithName("RayCluster"),
}

// If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves.
// However, if KubeRay creates a ServiceAccount for users, the autoscaler may encounter permission issues during
// zero-downtime rolling updates when RayService is performed. See https://github.com/ray-project/kuberay/pull/1128
// for more details.
err := testRayClusterReconciler.reconcileAutoscalerServiceAccount(cluster)
assert.NotNil(t, err,
"When users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
"If the ServiceAccount does not exist, the reconciler should return an error. However, err is nil.")

// Case 2: There is a ServiceAccount "my-sa" in the Kubernetes cluster
runtimeObjects = []runtime.Object{&myServiceAccount}
fakeClient = clientFake.NewClientBuilder().WithRuntimeObjects(runtimeObjects...).Build()

// Initialize the reconciler
testRayClusterReconciler = &RayClusterReconciler{
Client: fakeClient,
Recorder: &record.FakeRecorder{},
Scheme: scheme.Scheme,
Log: ctrl.Log.WithName("controllers").WithName("RayCluster"),
}

err = testRayClusterReconciler.reconcileAutoscalerServiceAccount(cluster)
assert.Nil(t, err)
}

func TestReconcile_AutoscalerRoleBinding(t *testing.T) {
setupTest(t)
defer tearDown(t)
Expand Down
9 changes: 0 additions & 9 deletions ray-operator/controllers/ray/raycluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ var _ = Context("Inside the default namespace", func() {
},
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
ServiceAccountName: "head-service-account",
Containers: []corev1.Container{
{
Name: "ray-head",
Expand Down Expand Up @@ -168,14 +167,6 @@ var _ = Context("Inside the default namespace", func() {
Expect(pod.Status.Phase).Should(Or(Equal(corev1.PodPending), Equal(corev1.PodRunning)))
})

It("should create the head group's specified K8s ServiceAccount if it doesn't exist", func() {
saName := utils.GetHeadGroupServiceAccountName(myRayCluster)
sa := &corev1.ServiceAccount{}
Eventually(
getResourceFunc(ctx, client.ObjectKey{Name: saName, Namespace: "default"}, sa),
time.Second*15, time.Millisecond*500).Should(BeNil(), "My head group ServiceAccount = %v", saName)
})

It("should create the autoscaler K8s RoleBinding if it doesn't exist", func() {
rbName := myRayCluster.Name
rb := &rbacv1.RoleBinding{}
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/rayservice_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ var _ = Context("Inside the default namespace", func() {
// Hence, all the ServeStatuses[i].Status should be updated to UNHEALTHY.
//
// Note: LastUpdateTime/HealthLastUpdateTime will be overwritten via metav1.Now() in rayservice_controller.go.
// Hence, we cannot use `newTime`` to check whether the status is updated or not.
// Hence, we cannot use `newTime` to check whether the status is updated or not.
Eventually(
checkAllServeStatusesUnhealthy(ctx, myRayService),
time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status)
Expand Down