Skip to content

Commit

Permalink
Support for 'PauseTimeInSeconds' so MCAD delays redispatching after a…
Browse files Browse the repository at this point in the history
… job has been requeued
  • Loading branch information
metalcycling committed Oct 24, 2023
1 parent 347b9c5 commit 55fbef9
Showing 1 changed file with 28 additions and 12 deletions.
40 changes: 28 additions & 12 deletions pkg/controller/queuejob/queuejob_controller_ex.go
Original file line number Diff line number Diff line change
Expand Up @@ -1898,8 +1898,9 @@ func (cc *XController) worker() {
}
klog.V(2).Infof("[worker] Delete resources for AppWrapper Job '%s/%s' due to preemption was sucessfull, status.CanRun=%t, status.State=%s", queuejob.Namespace, queuejob.Name, queuejob.Status.CanRun, queuejob.Status.State)

if queuejob.Spec.SchedSpec.Requeuing.ForceDeletionTimeInSeconds > 0 {
// Waiting for deletion of the AppWrapper to be complete before forcing the deletion of pods
if queuejob.Spec.SchedSpec.Requeuing.ForceDeletionTimeInSeconds > 0 || queuejob.Spec.SchedSpec.Requeuing.PauseTimeInSeconds > 0 {
// 1) Waiting for deletion of the AppWrapper to be complete before forcing the deletion of pods
// 2) Delaying redispatching with user specified wait time
var err error
newjob, err := cc.getAppWrapper(queuejob.Namespace, queuejob.Name, "[worker] get fresh AppWrapper")
if err != nil {
Expand All @@ -1920,17 +1921,20 @@ func (cc *XController) worker() {
return nil
}
} else if queuejob.Status.QueueJobState == arbv1.AppWrapperCondDeleted {
// The AppWrapper was preempted and its objects were deleted. In case the deletion was not successful for all the items
// MCAD will force delete any pods that remain in the system
if queuejob.Spec.SchedSpec.Requeuing.ForceDeletionTimeInSeconds > 0 {
index := getIndexOfMatchedCondition(queuejob, arbv1.AppWrapperCondDeleted, "AwaitingDeletion")
if index < 0 {
klog.V(4).Infof("WARNING: [worker] Forced deletion condition was not added after 'Cleanup'. Silently ignoring forced cleanup.")
} else {
deletionTime := queuejob.Status.Conditions[index].LastTransitionMicroTime.Add(time.Duration(queuejob.Spec.SchedSpec.Requeuing.ForceDeletionTimeInSeconds) * time.Second)
currentTime := time.Now()
// Checking if the 'AwaitingDeletion' condition exists
index := getIndexOfMatchedCondition(queuejob, arbv1.AppWrapperCondDeleted, "AwaitingDeletion")
if index < 0 {
klog.V(4).Infof("WARNING: [worker] Forced deletion condition was not added after 'Cleanup'. Silently ignoring forced cleanup.")
} else {
// Get current time to compare to
currentTime := time.Now()

if currentTime.After(deletionTime) {
// The AppWrapper was preempted and its objects were deleted. In case the deletion was not successful for all the items
// MCAD will force delete any pods that remain in the system
if queuejob.Spec.SchedSpec.Requeuing.ForceDeletionTimeInSeconds > 0 {
forceDeletionTime := queuejob.Status.Conditions[index].LastTransitionMicroTime.Add(time.Duration(queuejob.Spec.SchedSpec.Requeuing.ForceDeletionTimeInSeconds) * time.Second)

if currentTime.After(forceDeletionTime) {
if err := cc.ForcefulCleanup(ctx, queuejob); err != nil {
klog.V(5).Infof("[worker] Forced deletion of remaining live pods didn't work (Ending %s/%s). Retrying in the next cycle.", queuejob.Namespace, queuejob.Name)
return nil
Expand All @@ -1940,6 +1944,18 @@ func (cc *XController) worker() {
return nil
}
}

// When a job is ready to be redispatched after it has been requeued due to preemption, MCAD will wait 'pauseTimeInSeconds' before redispatching
if queuejob.Spec.SchedSpec.Requeuing.PauseTimeInSeconds > 0 {
redispatchingTime := queuejob.Status.Conditions[index].LastTransitionMicroTime.Add(time.Duration(queuejob.Spec.SchedSpec.Requeuing.PauseTimeInSeconds) * time.Second)

if currentTime.After(redispatchingTime) {
klog.V(5).Infof("[worker] Ready to redispatch the AppWrapper (Ending %s/%s).", queuejob.Namespace, queuejob.Name)
} else {
klog.V(8).Infof("[worker] Waiting for 'PauseTimeInSeconds' seconds before redispatching job '%s/%s'.", queuejob.Namespace, queuejob.Name)
return nil
}
}
}
}

Expand Down

0 comments on commit 55fbef9

Please sign in to comment.