Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ray job] support stop job after job cr is deleted in cluster selector mode #629

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ import (
"fmt"
"time"

"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/predicate"

"k8s.io/client-go/tools/record"

"github.com/go-logr/logr"
Expand Down Expand Up @@ -255,9 +258,39 @@ func (r *RayJobReconciler) SetupWithManager(mgr ctrl.Manager) error {
For(&rayv1alpha1.RayJob{}).
Owns(&rayv1alpha1.RayCluster{}).
Owns(&corev1.Service{}).
WithEventFilter(predicate.Funcs{
DeleteFunc: r.DeleteEventFilter,
}).
Complete(r)
}

func (r *RayJobReconciler) DeleteEventFilter(e event.DeleteEvent) bool {
r.Log.Info("event to delete", "event", e)

job, ok := e.Object.(*rayv1alpha1.RayJob)
if !ok {
r.Log.Info("failed to get job object from event", "event", e)
return false
}

if job.Status.JobStatus == rayv1alpha1.JobStatusRunning || job.Status.JobStatus == rayv1alpha1.JobStatusPending {
if job.Status.DashboardURL == "" || job.Status.JobId == "" {
r.Log.Info("dashboardURL or job_id is empty", "job", job)
return false
}
rayDashboardClient := utils.GetRayDashboardClientFunc()
rayDashboardClient.InitClient(job.Status.DashboardURL)
err := rayDashboardClient.StopJob(job.Status.JobId, &r.Log)
if err != nil {
r.Log.Info("failed to stop job", "error", err)
}
}
// The reconciler adds a finalizer so we perform clean-up
// when the delete timestamp is added
// Suppress Delete events to avoid filtering them out in the Reconcile function
return false
}

func (r *RayJobReconciler) getRayJobInstance(ctx context.Context, request ctrl.Request) (*rayv1alpha1.RayJob, error) {
rayJobInstance := &rayv1alpha1.RayJob{}
if err := r.Get(ctx, request.NamespacedName, rayJobInstance); err != nil {
Expand Down
33 changes: 33 additions & 0 deletions ray-operator/controllers/ray/utils/dashboard_httpclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ type RayDashboardClientInterface interface {
ConvertServeConfig(specs []rayv1alpha1.ServeConfigSpec) []ServeConfigSpec
GetJobInfo(jobId string) (*RayJobInfo, error)
SubmitJob(rayJob *rayv1alpha1.RayJob, log *logr.Logger) (jobId string, err error)
StopJob(jobName string, log *logr.Logger) (err error)
}

// GetRayDashboardClientFunc Used for unit tests.
Expand Down Expand Up @@ -315,6 +316,10 @@ type RayJobResponse struct {
JobId string `json:"job_id"`
}

type RayJobStopResponse struct {
Stopped bool `json:"stopped"`
}

func (r *RayDashboardClient) GetJobInfo(jobId string) (*RayJobInfo, error) {
req, err := http.NewRequest("GET", r.dashboardURL+JobPath+jobId, nil)
if err != nil {
Expand Down Expand Up @@ -377,6 +382,34 @@ func (r *RayDashboardClient) SubmitJob(rayJob *rayv1alpha1.RayJob, log *logr.Log
return jobResp.JobId, nil
}

func (r *RayDashboardClient) StopJob(jobName string, log *logr.Logger) (err error) {
log.Info("Stop a ray job", "rayJob", jobName)

req, err := http.NewRequest(http.MethodPost, r.dashboardURL+JobPath+jobName+"/stop", nil)
if err != nil {
return err
}

req.Header.Set("Content-Type", "application/json")
resp, err := r.client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()

body, _ := ioutil.ReadAll(resp.Body)

var jobStopResp RayJobStopResponse
if err = json.Unmarshal(body, &jobStopResp); err != nil {
return err
}

if !jobStopResp.Stopped {
return fmt.Errorf("failed to stopped job: %v", jobName)
}
return nil
}

func ConvertRayJobToReq(rayJob *rayv1alpha1.RayJob) (*RayJobRequest, error) {
req := &RayJobRequest{
Entrypoint: rayJob.Spec.Entrypoint,
Expand Down
19 changes: 18 additions & 1 deletion ray-operator/controllers/ray/utils/dashboard_httpclient_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ var _ = Describe("RayFrameworkGenerator", func() {
RuntimeEnv: encodedRuntimeEnv,
},
}
rayDashboardClient = &RayDashboardClient{}
rayDashboardClient.InitClient("127.0.0.1:8090")
})

Expand All @@ -62,7 +63,7 @@ var _ = Describe("RayFrameworkGenerator", func() {
bodyBytes, _ := json.Marshal(body)
return httpmock.NewBytesResponse(200, bodyBytes), nil
})
httpmock.RegisterResponder("Get", rayDashboardClient.dashboardURL+JobPath+expectJobId,
httpmock.RegisterResponder("GET", rayDashboardClient.dashboardURL+JobPath+expectJobId,
func(req *http.Request) (*http.Response, error) {
body := &RayJobInfo{
JobStatus: rayv1alpha1.JobStatusRunning,
Expand All @@ -82,4 +83,20 @@ var _ = Describe("RayFrameworkGenerator", func() {
Expect(rayJobInfo.Entrypoint).To(Equal(rayJob.Spec.Entrypoint))
Expect(rayJobInfo.JobStatus).To(Equal(rayv1alpha1.JobStatusRunning))
})

It("Test stop job", func() {
httpmock.Activate()
defer httpmock.DeactivateAndReset()
httpmock.RegisterResponder("POST", rayDashboardClient.dashboardURL+JobPath+"stop-job-1/stop",
func(req *http.Request) (*http.Response, error) {
body := &RayJobStopResponse{
Stopped: true,
}
bodyBytes, _ := json.Marshal(body)
return httpmock.NewBytesResponse(200, bodyBytes), nil
})

err := rayDashboardClient.StopJob("stop-job-1", &ctrl.Log)
Expect(err).To(BeNil())
})
})
4 changes: 4 additions & 0 deletions ray-operator/controllers/ray/utils/fake_serve_httpclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,7 @@ func (r *FakeRayDashboardClient) GetJobInfo(jobId string) (*RayJobInfo, error) {
func (r *FakeRayDashboardClient) SubmitJob(rayJob *rayv1alpha1.RayJob, log *logr.Logger) (jobId string, err error) {
return "", nil
}

func (r *FakeRayDashboardClient) StopJob(jobName string, log *logr.Logger) (err error) {
return nil
}
13 changes: 13 additions & 0 deletions ray-operator/controllers/ray/utils/utils_suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package utils_test

import (
"testing"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)

func TestUtils(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Utils Suite")
}