Skip to content
This repository was archived by the owner on Feb 18, 2025. It is now read-only.

Commit 44ee2c9

Browse files
authored
Simplify timeouts (#30)
1 parent 50bfefd commit 44ee2c9

File tree

6 files changed

+13
-56
lines changed

6 files changed

+13
-56
lines changed

api/v1beta1/appwrapper_types.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,6 @@ type AppWrapperDispatcherStatus struct {
9393
// When last dispatched
9494
LastDispatchingTime metav1.Time `json:"lastDispatchingTime,omitempty"`
9595

96-
// When last requeued
97-
LastRequeuingTime metav1.Time `json:"lastRequeuingTime,omitempty"`
98-
9996
// How many times requeued
10097
Requeued int32 `json:"requeued,omitempty"`
10198

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 0 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/workload.codeflare.dev_appwrappers.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,6 @@ spec:
5252
description: When last dispatched
5353
format: date-time
5454
type: string
55-
lastRequeuingTime:
56-
description: When last requeued
57-
format: date-time
58-
type: string
5955
phase:
6056
description: Phase
6157
type: string

internal/controller/dispatcher.go

Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -78,35 +78,22 @@ func (r *Dispatcher) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
7878

7979
case mcadv1beta1.Dispatching:
8080
if len(appWrapper.Spec.DispatchingGates) > 0 {
81-
appWrapper.Spec.DispatcherStatus.LastRequeuingTime = metav1.Now()
8281
return r.update(ctx, appWrapper, mcadv1beta1.Requeuing, "requeued due to dispatching gate")
8382
}
8483
if appWrapper.Status.RunnerStatus.Phase == mcadv1beta1.Dispatching {
8584
// runner is ready to dispatch
8685
return r.update(ctx, appWrapper, mcadv1beta1.Running)
8786
}
88-
if isSlowDispatching(appWrapper) {
89-
// runner has not acknowledged the job
90-
// requeue or fail if max retries exhausted
91-
return r.requeueOrFail(ctx, appWrapper)
92-
} else {
93-
// requeue reconciliation
94-
return ctrl.Result{Requeue: true}, nil
95-
}
87+
// requeue reconciliation
88+
return ctrl.Result{Requeue: true}, nil
9689

9790
case mcadv1beta1.Requeuing:
9891
if appWrapper.Status.RunnerStatus.Phase == mcadv1beta1.Empty {
9992
// runner has deleted/never created the wrapped resources
10093
return r.update(ctx, appWrapper, mcadv1beta1.Queued)
10194
}
102-
if isSlowRequeuing(appWrapper) {
103-
// runner has not completed deletion
104-
// give up requeuing and fail instead
105-
return r.update(ctx, appWrapper, mcadv1beta1.Failed)
106-
} else {
107-
// requeue reconciliation
108-
return ctrl.Result{Requeue: true}, nil
109-
}
95+
// requeue reconciliation
96+
return ctrl.Result{Requeue: true}, nil
11097

11198
case mcadv1beta1.Running:
11299
if appWrapper.Status.RunnerStatus.Phase == mcadv1beta1.Succeeded {
@@ -122,7 +109,6 @@ func (r *Dispatcher) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
122109
return r.requeueOrFail(ctx, appWrapper)
123110
}
124111
if len(appWrapper.Spec.DispatchingGates) > 0 {
125-
appWrapper.Spec.DispatcherStatus.LastRequeuingTime = metav1.Now()
126112
return r.update(ctx, appWrapper, mcadv1beta1.Requeuing, "requeued due to dispatching gate")
127113
}
128114
// let the runner monitor the running job
@@ -190,7 +176,6 @@ func (r *Dispatcher) update(ctx context.Context, appWrapper *mcadv1beta1.AppWrap
190176
func (r *Dispatcher) requeueOrFail(ctx context.Context, appWrapper *mcadv1beta1.AppWrapper) (ctrl.Result, error) {
191177
if appWrapper.Spec.DispatcherStatus.Requeued < appWrapper.Spec.Scheduling.Requeuing.MaxNumRequeuings {
192178
appWrapper.Spec.DispatcherStatus.Requeued += 1
193-
appWrapper.Spec.DispatcherStatus.LastRequeuingTime = metav1.Now()
194179
return r.update(ctx, appWrapper, mcadv1beta1.Requeuing)
195180
}
196181
return r.update(ctx, appWrapper, mcadv1beta1.Failed, "maxNumRequeuings exceeded")
@@ -230,13 +215,3 @@ func (r *Dispatcher) dispatch(ctx context.Context) (ctrl.Result, error) {
230215
// requeue to continue to dispatch queued appWrappers
231216
return ctrl.Result{Requeue: true}, nil
232217
}
233-
234-
// Is requeuing too slow?
235-
func isSlowRequeuing(appWrapper *mcadv1beta1.AppWrapper) bool {
236-
return metav1.Now().After(appWrapper.Spec.DispatcherStatus.LastRequeuingTime.Add(requeuingTimeout))
237-
}
238-
239-
// Is dispatching too slow?
240-
func isSlowDispatching(appWrapper *mcadv1beta1.AppWrapper) bool {
241-
return metav1.Now().After(appWrapper.Spec.DispatcherStatus.LastDispatchingTime.Add(dispatchingTimeout))
242-
}

internal/controller/runner.go

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ func (r *Runner) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result,
8787
case mcadv1beta1.Requeuing:
8888
// delete wrapped resources
8989
if r.deleteResources(ctx, appWrapper) != 0 {
90-
if time.Now().After(appWrapper.Status.RunnerStatus.LastRequeuingTime.Add(time.Minute)) {
90+
if isSlowRequeuing(appWrapper) {
9191
r.forceDelete(ctx, appWrapper)
9292
}
9393
// requeue reconciliation
@@ -117,15 +117,6 @@ func (r *Runner) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result,
117117
if err != nil {
118118
return ctrl.Result{}, err
119119
}
120-
// check for failure conditions
121-
if counts.Failed > 0 {
122-
// set errored status
123-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Errored, "failed pod")
124-
}
125-
if isSlowRunning(appWrapper) && counts.Other > 0 {
126-
// set errored status
127-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Errored, "pod not ready")
128-
}
129120
if isSlowRunning(appWrapper) && counts.Running < int(appWrapper.Spec.Scheduling.MinAvailable) {
130121
// set errored status
131122
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Errored, "too few running pods")
@@ -199,3 +190,8 @@ func (r *Runner) updateStatus(ctx context.Context, appWrapper *mcadv1beta1.AppWr
199190
func isSlowRunning(appWrapper *mcadv1beta1.AppWrapper) bool {
200191
return metav1.Now().After(appWrapper.Status.RunnerStatus.LastRunningTime.Add(runningTimeout))
201192
}
193+
194+
// Is requeuing too slow?
195+
func isSlowRequeuing(appWrapper *mcadv1beta1.AppWrapper) bool {
196+
return metav1.Now().After(appWrapper.Status.RunnerStatus.LastRequeuingTime.Add(time.Minute))
197+
}

internal/controller/timings.go

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,11 @@ import (
2525
const (
2626
// Timeouts
2727
requeuingTimeout = 2 * time.Minute // minimum wait before aborting Requeuing
28-
dispatchingTimeout = 2 * time.Minute // minimum wait before aborting Dispatching
2928
runningTimeout = 5 * time.Minute // minimum wait before aborting Running
3029
cacheConflictTimeout = 5 * time.Minute // minimum wait before invalidating the cache
31-
clusterInfoTimeout = time.Minute // how long to cache cluster capacity
30+
clusterInfoTimeout = time.Minute // how often to refresh cluster capacity
3231

3332
// RequeueAfter delays
34-
runDelay = time.Minute // maximum delay before next reconciliation of a Running AppWrapper
35-
dispatchDelay = time.Minute // maximum delay before next "*/*" reconciliation (dispatch)
36-
37-
// The RequeueAfter delay is the maximum delay before the next reconciliation event.
38-
// Reconciliation may be triggered earlier due for instance to pod phase changes.
39-
// Reconciliation may be delayed due to the on-going reconciliation of other events.
33+
runDelay = time.Minute // how often to force check running AppWrapper health
34+
dispatchDelay = time.Minute // how often to force dispatch
4035
)

0 commit comments

Comments
 (0)