Skip to content
This repository was archived by the owner on Feb 18, 2025. It is now read-only.

Commit c443231

Browse files
authored
Fix inconsistencies between MCAD and MicroMCAD (#37)
* Separate status ""/pending/running/completed/failed and step ""/creating/created/deleting * Adjust MinAvailable/MaxNumRequeuings behavior * Tweak logging * Fix tests * Implement configurable requeuing delays
1 parent d79f860 commit c443231

File tree

11 files changed

+158
-98
lines changed

11 files changed

+158
-98
lines changed

api/v1beta1/appwrapper_types.go

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@ type SchedulingSpec struct {
4747
}
4848

4949
type RequeuingSpec struct {
50+
// Initial waiting time before requeuing conditions are checked
51+
// +kubebuilder:default=300
52+
TimeInSeconds int64 `json:"timeInSeconds,omitempty"`
53+
54+
// Wait time before trying to dispatch again after requeuing
55+
PauseTimeInSeconds int64 `json:"pauseTimeInSeconds,omitempty"`
56+
5057
// Max requeuings permitted
5158
MaxNumRequeuings int32 `json:"maxNumRequeuings,omitempty"`
5259
}
@@ -56,6 +63,9 @@ type AppWrapperStatus struct {
5663
// Phase
5764
Phase AppWrapperPhase `json:"state,omitempty"`
5865

66+
// Status of wrapped resources
67+
Step AppWrapperStep `json:"step,omitempty"`
68+
5969
// When last dispatched
6070
DispatchTimestamp metav1.Time `json:"dispatchTimestamp,omitempty"`
6171

@@ -72,27 +82,36 @@ type AppWrapperStatus struct {
7282
// AppWrapperPhase is the label for the AppWrapper status
7383
type AppWrapperPhase string
7484

85+
// AppWrapperState is the status of wrapped resources
86+
type AppWrapperStep string
87+
7588
const (
76-
// no resource reservation
89+
// Initial state upon creation of the AppWrapper object
7790
Empty AppWrapperPhase = ""
7891

79-
// no resource reservation
92+
// AppWrapper has not been dispatched yet or has been requeued
8093
Queued AppWrapperPhase = "Pending"
8194

82-
// resources are reserved
83-
Dispatching AppWrapperPhase = "Dispatching"
84-
85-
// resources are reserved
95+
// AppWrapper has been dispatched and not requeued
8696
Running AppWrapperPhase = "Running"
8797

88-
// no resource reservation
98+
// AppWrapper completed successfully
8999
Succeeded AppWrapperPhase = "Completed"
90100

91-
// resources are reserved (some pods may still be running)
101+
// AppWrapper failed and is not requeued
92102
Failed AppWrapperPhase = "Failed"
93103

94-
// resources are reserved (some pods may still be running)
95-
Requeuing AppWrapperPhase = "Requeuing"
104+
// Resources are not deployed
105+
Idle AppWrapperStep = ""
106+
107+
// MCAD is in the process of creating the wrapped resources
108+
Creating AppWrapperStep = "creating"
109+
110+
// The wrapped resources have been deployed successfully
111+
Created AppWrapperStep = "created"
112+
113+
// MCAD is in the process of deleting the wrapped resources
114+
Deleting AppWrapperStep = "deleting"
96115
)
97116

98117
// AppWrapper resources
@@ -137,6 +156,9 @@ type AppWrapperTransition struct {
137156

138157
// Phase entered
139158
Phase AppWrapperPhase `json:"state"`
159+
160+
// Status of wrapped resources
161+
Step AppWrapperStep `json:"step,omitempty"`
140162
}
141163

142164
//+kubebuilder:object:root=true

config/crd/bases/workload.codeflare.dev_appwrappers.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,17 @@ spec:
127127
description: Max requeuings permitted
128128
format: int32
129129
type: integer
130+
pauseTimeInSeconds:
131+
description: Wait time before trying to dispatch again after
132+
requeuing
133+
format: int64
134+
type: integer
135+
timeInSeconds:
136+
default: 300
137+
description: Initial waiting time before requeuing conditions
138+
are checked
139+
format: int64
140+
type: integer
130141
type: object
131142
type: object
132143
required:
@@ -150,6 +161,9 @@ spec:
150161
state:
151162
description: Phase
152163
type: string
164+
step:
165+
description: Status of wrapped resources
166+
type: string
153167
transitions:
154168
description: Transition log
155169
items:
@@ -161,6 +175,9 @@ spec:
161175
state:
162176
description: Phase entered
163177
type: string
178+
step:
179+
description: Status of wrapped resources
180+
type: string
164181
time:
165182
description: Timestamp
166183
format: date-time

internal/controller/appwrapper_controller.go

Lines changed: 81 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
9191
// handle deletion
9292
if !appWrapper.DeletionTimestamp.IsZero() {
9393
// delete wrapped resources
94-
if r.deleteResources(ctx, appWrapper) != 0 {
95-
// deletion is pending, requeue reconciliation after delay
94+
if !r.delete(ctx, appWrapper, *appWrapper.DeletionTimestamp) {
95+
// requeue reconciliation after delay
9696
return ctrl.Result{RequeueAfter: deletionDelay}, nil
9797
}
9898
// remove finalizer
@@ -103,7 +103,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
103103
}
104104
// remove AppWrapper from cache
105105
r.deleteCachedPhase(appWrapper)
106-
log.FromContext(ctx).Info("Deleted", "state", "Deleted")
106+
log.FromContext(ctx).Info("Deleted")
107107
return ctrl.Result{}, nil
108108
}
109109

@@ -116,63 +116,70 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
116116
return ctrl.Result{}, err
117117
}
118118
}
119-
// set queued status only after adding finalizer
120-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Queued)
119+
// set queued/idle status only after adding finalizer
120+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Queued, mcadv1beta1.Idle)
121121

122122
case mcadv1beta1.Queued, mcadv1beta1.Succeeded:
123123
r.triggerDispatch()
124124
return ctrl.Result{}, nil
125125

126-
case mcadv1beta1.Dispatching:
127-
// create wrapped resources
128-
err, fatal := r.createResources(ctx, appWrapper)
129-
if err != nil {
130-
if fatal {
131-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Failed, err.Error())
126+
case mcadv1beta1.Running:
127+
switch appWrapper.Status.Step {
128+
case mcadv1beta1.Creating:
129+
// create wrapped resources
130+
if err, fatal := r.createResources(ctx, appWrapper); err != nil {
131+
return r.requeueOrFail(ctx, appWrapper, fatal, err.Error())
132132
}
133-
return r.requeueOrFail(ctx, appWrapper, err.Error())
134-
}
135-
// set running status only after successfully requesting the creation of all resources
136-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Running)
133+
// set running/created status only after successfully requesting the creation of all resources
134+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Running, mcadv1beta1.Created)
137135

138-
case mcadv1beta1.Running:
139-
// count AppWrapper pods
140-
counts, err := r.countPods(ctx, appWrapper)
141-
if err != nil {
142-
return ctrl.Result{}, err
143-
}
144-
// check pod count if dispatched for a while
145-
if isSlowDispatching(appWrapper) && counts.Running+counts.Succeeded < int(appWrapper.Spec.Scheduling.MinAvailable) {
136+
case mcadv1beta1.Created:
137+
// count AppWrapper pods
138+
counts, err := r.countPods(ctx, appWrapper)
139+
if err != nil {
140+
return ctrl.Result{}, err
141+
}
142+
// check pod count if dispatched for a while
143+
if metav1.Now().After(appWrapper.Status.DispatchTimestamp.Add(time.Duration(appWrapper.Spec.Scheduling.Requeuing.TimeInSeconds)*time.Second)) &&
144+
counts.Running+counts.Succeeded < int(appWrapper.Spec.Scheduling.MinAvailable) {
145+
customMessage := "expected pods " + strconv.Itoa(int(appWrapper.Spec.Scheduling.MinAvailable)) + " but found pods " + strconv.Itoa(counts.Running+counts.Succeeded)
146+
// requeue or fail if max retries exhausted with custom error message
147+
return r.requeueOrFail(ctx, appWrapper, false, customMessage)
148+
}
149+
// check for successful completion by looking at pods and wrapped resources
150+
success, err := r.isSuccessful(ctx, appWrapper, counts)
151+
if err != nil {
152+
return ctrl.Result{}, err
153+
}
154+
// set succeeded/idle status if done
155+
if success {
156+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Succeeded, mcadv1beta1.Idle)
157+
}
158+
// AppWrapper is healthy, requeue reconciliation after delay
159+
return ctrl.Result{RequeueAfter: runDelay}, nil
146160

147-
customMessage := "expected pods " + strconv.Itoa(int(appWrapper.Spec.Scheduling.MinAvailable)) + " but found pods " + strconv.Itoa(counts.Running+counts.Succeeded)
148-
// requeue or fail if max retries exhausted with custom error message
149-
return r.requeueOrFail(ctx, appWrapper, customMessage)
150-
}
151-
// check for successful completion by looking at pods and wrapped resources
152-
success, err := r.isSuccessful(ctx, appWrapper, counts)
153-
if err != nil {
154-
return ctrl.Result{}, err
155-
}
156-
// set succeeded status if done
157-
if success {
158-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Succeeded)
161+
case mcadv1beta1.Deleting:
162+
// delete wrapped resources
163+
if !r.delete(ctx, appWrapper, appWrapper.Status.RequeueTimestamp) {
164+
// requeue reconciliation after delay
165+
return ctrl.Result{RequeueAfter: deletionDelay}, nil
166+
}
167+
// reset status to queued/idle
168+
appWrapper.Status.Restarts += 1
169+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Queued, mcadv1beta1.Idle)
159170
}
160-
// AppWrapper is healthy, requeue reconciliation after delay
161-
return ctrl.Result{RequeueAfter: runDelay}, nil
162171

163-
case mcadv1beta1.Requeuing:
164-
// delete wrapped resources
165-
if r.deleteResources(ctx, appWrapper) != 0 {
166-
if !isSlowRequeuing(appWrapper) {
172+
case mcadv1beta1.Failed:
173+
switch appWrapper.Status.Step {
174+
case mcadv1beta1.Deleting:
175+
// delete wrapped resources
176+
if !r.delete(ctx, appWrapper, appWrapper.Status.RequeueTimestamp) {
167177
// requeue reconciliation after delay
168178
return ctrl.Result{RequeueAfter: deletionDelay}, nil
169179
}
170-
// forcefully delete wrapped resources and pods
171-
r.forceDelete(ctx, appWrapper)
180+
// set status to failed/idle
181+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Failed, mcadv1beta1.Idle)
172182
}
173-
// reset status to queued
174-
appWrapper.Status.Restarts += 1
175-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Queued)
176183
}
177184
return ctrl.Result{}, nil
178185
}
@@ -208,32 +215,39 @@ func (r *AppWrapperReconciler) podMapFunc(ctx context.Context, obj client.Object
208215
}
209216

210217
// Update AppWrapper status
211-
func (r *AppWrapperReconciler) updateStatus(ctx context.Context, appWrapper *mcadv1beta1.AppWrapper, phase mcadv1beta1.AppWrapperPhase, reason ...string) (ctrl.Result, error) {
218+
func (r *AppWrapperReconciler) updateStatus(ctx context.Context, appWrapper *mcadv1beta1.AppWrapper, phase mcadv1beta1.AppWrapperPhase, step mcadv1beta1.AppWrapperStep, reason ...string) (ctrl.Result, error) {
212219
// log transition
213220
now := metav1.Now()
214-
transition := mcadv1beta1.AppWrapperTransition{Time: now, Phase: phase}
221+
transition := mcadv1beta1.AppWrapperTransition{Time: now, Phase: phase, Step: step}
215222
if len(reason) > 0 {
216223
transition.Reason = reason[0]
217224
}
218225
appWrapper.Status.Transitions = append(appWrapper.Status.Transitions, transition)
219226
appWrapper.Status.Phase = phase
227+
appWrapper.Status.Step = step
220228
// update AppWrapper status in etcd, requeue reconciliation on failure
221229
if err := r.Status().Update(ctx, appWrapper); err != nil {
222230
return ctrl.Result{}, err
223231
}
224232
// cache AppWrapper status
225233
r.addCachedPhase(appWrapper)
226-
log.FromContext(ctx).Info(string(phase), "state", string(phase))
234+
log.FromContext(ctx).Info(string(phase), "state", phase, "step", step)
227235
return ctrl.Result{}, nil
228236
}
229237

230-
// Set requeuing or failed status depending on restarts count
231-
func (r *AppWrapperReconciler) requeueOrFail(ctx context.Context, appWrapper *mcadv1beta1.AppWrapper, reason string) (ctrl.Result, error) {
232-
if appWrapper.Status.Restarts < appWrapper.Spec.Scheduling.Requeuing.MaxNumRequeuings {
238+
// Set requeuing or failed status depending on error, configuration, and restarts count
239+
func (r *AppWrapperReconciler) requeueOrFail(ctx context.Context, appWrapper *mcadv1beta1.AppWrapper, fatal bool, reason string) (ctrl.Result, error) {
240+
if appWrapper.Spec.Scheduling.MinAvailable == 0 {
241+
// set failed status and leave resources as is
242+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Failed, appWrapper.Status.Step, reason)
243+
} else if fatal || appWrapper.Spec.Scheduling.Requeuing.MaxNumRequeuings > 0 && appWrapper.Status.Restarts >= appWrapper.Spec.Scheduling.Requeuing.MaxNumRequeuings {
244+
// set failed/deleting status (request deletion of wrapped resources)
233245
appWrapper.Status.RequeueTimestamp = metav1.Now()
234-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Requeuing, reason)
246+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Failed, mcadv1beta1.Deleting, reason)
235247
}
236-
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Failed, reason)
248+
// requeue AppWrapper
249+
appWrapper.Status.RequeueTimestamp = metav1.Now()
250+
return r.updateStatus(ctx, appWrapper, mcadv1beta1.Running, mcadv1beta1.Deleting, reason)
237251
}
238252

239253
// Trigger dispatch by means of "*/*" request
@@ -270,18 +284,22 @@ func (r *AppWrapperReconciler) dispatch(ctx context.Context) (ctrl.Result, error
270284
}
271285
// set dispatching time and status
272286
appWrapper.Status.DispatchTimestamp = metav1.Now()
273-
if _, err := r.updateStatus(ctx, appWrapper, mcadv1beta1.Dispatching); err != nil {
287+
if _, err := r.updateStatus(ctx, appWrapper, mcadv1beta1.Running, mcadv1beta1.Creating); err != nil {
274288
return ctrl.Result{}, err
275289
}
276290
}
277291
}
278292

279-
// Is dispatching too slow?
280-
func isSlowDispatching(appWrapper *mcadv1beta1.AppWrapper) bool {
281-
return metav1.Now().After(appWrapper.Status.DispatchTimestamp.Add(runningTimeout))
282-
}
283-
284-
// Is requeuing too slow?
285-
func isSlowRequeuing(appWrapper *mcadv1beta1.AppWrapper) bool {
286-
return metav1.Now().After(appWrapper.Status.RequeueTimestamp.Add(requeuingTimeout))
293+
// Delete wrapped resources, forcing deletion after a delay
294+
func (r *AppWrapperReconciler) delete(ctx context.Context, appWrapper *mcadv1beta1.AppWrapper, time metav1.Time) bool {
295+
if r.deleteResources(ctx, appWrapper) != 0 {
296+
// resources still exist
297+
if !metav1.Now().After(time.Add(deletionTimeout)) {
298+
// there is still time
299+
return false
300+
}
301+
// force deletion
302+
r.forceDelete(ctx, appWrapper)
303+
}
304+
return true
287305
}

internal/controller/cache.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ type CachedAppWrapper struct {
4040
// AppWrapper phase
4141
Phase mcadv1beta1.AppWrapperPhase
4242

43+
// AppWrapper step
44+
Step mcadv1beta1.AppWrapperStep
45+
4346
// Number of transitions
4447
Transitions int
4548

@@ -49,7 +52,7 @@ type CachedAppWrapper struct {
4952

5053
// Add AppWrapper to cache
5154
func (r *AppWrapperReconciler) addCachedPhase(appWrapper *mcadv1beta1.AppWrapper) {
52-
r.Cache[appWrapper.UID] = &CachedAppWrapper{Phase: appWrapper.Status.Phase, Transitions: len(appWrapper.Status.Transitions)}
55+
r.Cache[appWrapper.UID] = &CachedAppWrapper{Phase: appWrapper.Status.Phase, Step: appWrapper.Status.Step, Transitions: len(appWrapper.Status.Transitions)}
5356
}
5457

5558
// Remove AppWrapper from cache
@@ -58,11 +61,11 @@ func (r *AppWrapperReconciler) deleteCachedPhase(appWrapper *mcadv1beta1.AppWrap
5861
}
5962

6063
// Get AppWrapper phase from cache if available or from AppWrapper if not
61-
func (r *AppWrapperReconciler) getCachedPhase(appWrapper *mcadv1beta1.AppWrapper) mcadv1beta1.AppWrapperPhase {
64+
func (r *AppWrapperReconciler) getCachedPhase(appWrapper *mcadv1beta1.AppWrapper) (mcadv1beta1.AppWrapperPhase, mcadv1beta1.AppWrapperStep) {
6265
if cached, ok := r.Cache[appWrapper.UID]; ok && cached.Transitions > len(appWrapper.Status.Transitions) {
63-
return cached.Phase // our cache is more up-to-date than the reconciler cache
66+
return cached.Phase, cached.Step // our cache is more up-to-date than the reconciler cache
6467
}
65-
return appWrapper.Status.Phase
68+
return appWrapper.Status.Phase, appWrapper.Status.Step
6669
}
6770

6871
// Check whether reconciler cache and our cache appear to be in sync
@@ -91,7 +94,7 @@ func (r *AppWrapperReconciler) isStale(ctx context.Context, appWrapper *mcadv1be
9194
}
9295
return true
9396
}
94-
if cached.Phase != status.Phase {
97+
if cached.Phase != status.Phase || cached.Step != status.Step {
9598
// assume something is wrong with our cache
9699
delete(r.Cache, appWrapper.UID)
97100
log.FromContext(ctx).Error(errors.New("cache conflict"), "Internal error")

0 commit comments

Comments
 (0)