Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RayJob][Status][8/n] Only a RayJob with the status Running can transition to Complete at this moment #1774

Merged
merged 1 commit into from
Dec 27, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 14 additions & 12 deletions ray-operator/controllers/ray/rayjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,19 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
case rayv1.JobDeploymentStatusRunning:
// If the JobStatus is in the SUCCEEDED or FAILED, it is impossible for the Ray job to transition to any other status
// because both of them are terminal status. Additionally, RayJob does not currently support retries. Hence, we can
// mark the RayJob as "Complete" to avoid unnecessary reconciliation. Note that the definition of "Complete" does not
// include STOPPED which is also a terminal status because `suspend` requires to stop the Ray job gracefully before
// delete the RayCluster.
if isJobSucceedOrFail(rayJobInstance.Status.JobStatus) {
if err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusComplete); err != nil {
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
}
// TODO (kevin85421): Don't return here; we still need to move some of the code below into this block.
case rayv1.JobDeploymentStatusComplete:
// If this RayJob uses an existing RayCluster (i.e., ClusterSelector is set), we should not delete the RayCluster.
r.Log.Info("JobDeploymentStatusComplete", "RayJob", rayJobInstance.Name, "ShutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes, "ClusterSelector", rayJobInstance.Spec.ClusterSelector)
Expand All @@ -157,21 +170,10 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
}
}
}
// If the RayJob is completed, we should not requeue it.
return ctrl.Result{}, nil
}

// If the JobStatus is in the SUCCEEDED or FAILED, it is impossible for the Ray job to transition to any other status
// because both of them are terminal status. Additionally, RayJob does not currently support retries. Hence, we can
// mark the RayJob as "Complete" to avoid unnecessary reconciliation. Note that the definition of "Complete" does not
// include STOPPED which is also a terminal status because `suspend` requires to stop the Ray job gracefully before
// delete the RayCluster.
if isJobSucceedOrFail(rayJobInstance.Status.JobStatus) {
if err = r.updateState(ctx, rayJobInstance, nil, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusComplete); err != nil {
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil
}

var rayClusterInstance *rayv1.RayCluster
if rayClusterInstance, err = r.getOrCreateRayClusterInstance(ctx, rayJobInstance); err != nil {
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
Expand Down
Loading