@@ -43,9 +43,9 @@ type AppWrapperReconciler struct {
4343 client.Client
4444 Events chan event.GenericEvent
4545 Scheme * runtime.Scheme
46- Cache map [types.UID ]* mcadv1alpha1. AppWrapper // cache appWrapper updates to improve dispatch accuracy
47- ClusterCapacity Weights // cluster capacity available to mcad
48- NextSync time.Time // when to refresh cluster capacity
46+ Cache map [types.UID ]* CachedAppWrapper // cache appWrapper updates to improve dispatch accuracy
47+ ClusterCapacity Weights // cluster capacity available to mcad
48+ NextSync time.Time // when to refresh cluster capacity
4949}
5050
5151const (
@@ -65,6 +65,33 @@ type PodCounts struct {
6565 Succeeded int
6666}
6767
68+ // Cached AppWrapper status
69+ type CachedAppWrapper struct {
70+ // AppWrapper phase
71+ Phase mcadv1alpha1.AppWrapperPhase
72+
73+ // Number of condition (monotonically increasing, hence a good way to identify the most recent status)
74+ Conditions int
75+
76+ // First conflict detected between our cache and reconciler cache or nil
77+ Conflict * time.Time
78+ }
79+
80+ // We cache AppWrappers phases because the reconciler cache does not immediately reflect updates.
81+ // A Get or List call soon after an Update or Status.Update call may not reflect the latest object.
82+ // See: https://github.com/kubernetes-sigs/controller-runtime/issues/1622
83+ // Therefore we need to maintain our own cache to make sure new dispatching decisions accurately account
84+ // for recent dispatching decisions.
85+ // The cache is populated on phase updates.
86+ // The cache is only meant to be used for AppWrapper List calls when computing available resources.
87+ // We use the number of conditions to confirm our cached version is more recent than the reconciler cache.
88+ // We remove cache entries when removing finalizers. TODO: We should purge the cache from stale entries
89+ // periodically in case a finalizer is deleted outside of our control.
90+ // When reconciling an AppWrapper, we proactively detect and abort on conflicts as
91+ // there is no point working on a stale AppWrapper. We know etcd updates will fail.
92+ // To defend against bugs in the cache implementation and egregious AppWrapper edits,
93+ // we eventually give up on persistent conflicts and remove the AppWrapper phase from the cache.
94+
6895//+kubebuilder:rbac:groups=*,resources=*,verbs=*
6996
7097// Reconcile one AppWrapper or dispatch next AppWrapper
@@ -87,11 +114,12 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
87114 if appWrapper == nil { // no appWrapper eligible for dispatch
88115 return ctrl.Result {RequeueAfter : dispatchDelay }, nil // retry to dispatch later
89116 }
90- // abort and requeue reconciliation if reconciler cache is not updated
117+ // abort and requeue reconciliation if reconciler cache is stale
91118 if err := r .checkCache (appWrapper ); err != nil {
92119 return ctrl.Result {}, err
93120 }
94121 if appWrapper .Status .Phase != mcadv1alpha1 .Queued {
122+ // this check should be redundant but better be defensive
95123 return ctrl.Result {}, errors .New ("not queued" )
96124 }
97125 // set dispatching timestamp and status
@@ -115,7 +143,7 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
115143 return ctrl.Result {}, nil
116144 }
117145
118- // abort and requeue reconciliation if reconciler cache is not updated
146+ // abort and requeue reconciliation if reconciler cache is stale
119147 if err := r .checkCache (appWrapper ); err != nil {
120148 return ctrl.Result {}, err
121149 }
@@ -263,9 +291,11 @@ func (r *AppWrapperReconciler) updateStatus(ctx context.Context, appWrapper *mca
263291 return ctrl.Result {}, err // etcd update failed, abort and requeue reconciliation
264292 }
265293 log .Info (string (phase ))
266- r .Cache [appWrapper .UID ] = appWrapper // cache updated appWrapper
267- // this appWrapper is a deep copy of the reconciler cache already (obtained with r.Get)
268- // this appWrapper should not be mutated beyond this point
294+ // cache AppWrapper status
295+ r .Cache [appWrapper .UID ] = & CachedAppWrapper {
296+ Phase : appWrapper .Status .Phase ,
297+ ResourceVersion : appWrapper .ResourceVersion ,
298+ Conditions : len (appWrapper .Status .Conditions )}
269299 activeAfter := isActivePhase (phase )
270300 if activeBefore && ! activeAfter {
271301 r .triggerDispatchNext () // cluster may have more available capacity
@@ -293,11 +323,29 @@ func (r *AppWrapperReconciler) triggerDispatchNext() {
293323
294324// Check whether our cache and reconciler cache appear to be in sync
295325func (r * AppWrapperReconciler ) checkCache (appWrapper * mcadv1alpha1.AppWrapper ) error {
296- // check number of conditions is the same
297- if cached , ok := r .Cache [appWrapper .UID ]; ok && len (cached .Status .Conditions ) != len (appWrapper .Status .Conditions ) {
298- // reconciler cache and our cache are out of sync
299- delete (r .Cache , appWrapper .UID )
300- return errors .New ("cache conflict" ) // force redo
326+ if cached , ok := r .Cache [appWrapper .UID ]; ok {
327+ // check number of conditions
328+ if cached .Conditions > len (appWrapper .Status .Conditions ) {
329+ // reconciler cache appears to be behind
330+ if cached .Conflict != nil {
331+ if time .Now ().After (cached .Conflict .Add (cacheConflictTimeout )) {
332+ // this has been going on for a while, assume something is wrong with our cache
333+ delete (r .Cache , appWrapper .UID )
334+ return errors .New ("persistent cache conflict" ) // force redo
335+ }
336+ } else {
337+ now := time .Now ()
338+ cached .Conflict = & now // remember when conflict started
339+ }
340+ return errors .New ("stale reconciler cache" ) // force redo
341+ }
342+ if cached .Conditions < len (appWrapper .Status .Conditions ) || cached .Phase != appWrapper .Status .Phase {
343+ // something is wrong with our cache
344+ delete (r .Cache , appWrapper .UID )
345+ return errors .New ("stale phase cache" ) // force redo
346+ }
347+ // caches appear to be in sync
348+ cached .Conflict = nil // clear conflict timestamp
301349 }
302350 return nil
303351}
0 commit comments