From c59c47d7b8c906cbc6561cfcf31b013d4c191b9c Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Thu, 12 Nov 2020 17:23:58 -0500
Subject: [PATCH 01/28] Added conditions to AppWrapper CRD.

Signed-off-by: Diana Arroyo
---
 CONTROLLER_VERSION                         |  2 +-
 pkg/apis/controller/v1alpha1/appwrapper.go | 40 +++++++---
 .../v1alpha1/zz_generated.deepcopy.go      | 25 +++++++
 .../queuejob/queuejob_controller_ex.go     | 73 +++++++++++++++----
 pkg/controller/queuejob/utils.go           | 13 +++-
 5 files changed, 125 insertions(+), 28 deletions(-)

diff --git a/CONTROLLER_VERSION b/CONTROLLER_VERSION
index 83cf0d951..412114252 100644
--- a/CONTROLLER_VERSION
+++ b/CONTROLLER_VERSION
@@ -1 +1 @@
-1.29.1
+1.29.2
diff --git a/pkg/apis/controller/v1alpha1/appwrapper.go b/pkg/apis/controller/v1alpha1/appwrapper.go
index 7798d9269..aa03bf0ba 100644
--- a/pkg/apis/controller/v1alpha1/appwrapper.go
+++ b/pkg/apis/controller/v1alpha1/appwrapper.go
@@ -188,7 +188,7 @@ type AppWrapperStatus struct {
   SystemPriority float64 `json:"systempriority,omitempty"`

   // State of QueueJob - Init, Queueing, HeadOfLine, Rejoining, ...
-  QueueJobState QueueJobState `json:"queuejobstate,omitempty"`
+  QueueJobState AppWrapperConditionType `json:"queuejobstate,omitempty"`

   // Microsecond level timestamp when controller first sees QueueJob (by Informer)
   ControllerFirstTimestamp metav1.MicroTime `json:"controllerfirsttimestamp,omitempty"`
@@ -201,6 +201,10 @@ type AppWrapperStatus struct {

   // Indicate if message is a duplicate (for Informer to recognize duplicate messages)
   Local bool `json:"local,omitempty"`
+
+  // Represents the latest available observations of an AppWrapper's current condition.
+  Conditions []AppWrapperCondition `json:"conditions,omitempty"`
+
 }

 type AppWrapperState string
@@ -213,15 +217,31 @@ const (
   AppWrapperStateFailed AppWrapperState = "Failed"
 )

-type QueueJobState string
+type AppWrapperConditionType string

 const (
-  QueueJobStateInit       QueueJobState = "Init"
-  QueueJobStateQueueing   QueueJobState = "Queueing"
-  QueueJobStateHeadOfLine QueueJobState = "HeadOfLine"
-  QueueJobStateRejoining  QueueJobState = "Rejoining"
-  QueueJobStateDispatched QueueJobState = "Dispatched"
-  QueueJobStateRunning    QueueJobState = "Running"
-  QueueJobStateDeleted    QueueJobState = "Deleted"
-  QueueJobStateFailed     QueueJobState = "Failed"
+  AppWrapperCondInit       AppWrapperConditionType = "Init"
+  AppWrapperCondQueueing   AppWrapperConditionType = "Queueing"
+  AppWrapperCondHeadOfLine AppWrapperConditionType = "HeadOfLine"
+  AppWrapperCondBackoff    AppWrapperConditionType = "Backoff"
+  AppWrapperCondDispatched AppWrapperConditionType = "Dispatched"
+  AppWrapperCondRunning    AppWrapperConditionType = "Running"
+  AppWrapperCondDeleted    AppWrapperConditionType = "Deleted"
+  AppWrapperCondFailed     AppWrapperConditionType = "Failed"
 )
+
+// AppWrapperCondition describes the state of an AppWrapper at a certain point.
+type AppWrapperCondition struct {
+  // Type of appwrapper condition.
+  Type AppWrapperConditionType `json:"type"`
+  // Status of the condition, one of True, False, Unknown.
+  Status v1.ConditionStatus `json:"status"`
+  // The last time this condition was updated.
+  LastUpdateMicroTime metav1.MicroTime `json:"lastUpdateMicroTime,omitempty"`
+  // Last time the condition transitioned from one status to another.
+  LastTransitionMicroTime metav1.MicroTime `json:"lastTransitionMicroTime,omitempty"`
+  // The reason for the condition's last transition.
+  Reason string `json:"reason,omitempty"`
+  // A human readable message indicating details about the transition.
+  Message string `json:"message,omitempty"`
+}
diff --git a/pkg/apis/controller/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/controller/v1alpha1/zz_generated.deepcopy.go
index 6f10bc1da..337925c66 100644
--- a/pkg/apis/controller/v1alpha1/zz_generated.deepcopy.go
+++ b/pkg/apis/controller/v1alpha1/zz_generated.deepcopy.go
@@ -54,6 +54,24 @@ func (in *AppWrapper) DeepCopyObject() runtime.Object {
   return nil
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *AppWrapperCondition) DeepCopyInto(out *AppWrapperCondition) {
+  *out = *in
+  in.LastUpdateMicroTime.DeepCopyInto(&out.LastUpdateMicroTime)
+  in.LastTransitionMicroTime.DeepCopyInto(&out.LastTransitionMicroTime)
+  return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppWrapperCondition.
+func (in *AppWrapperCondition) DeepCopy() *AppWrapperCondition {
+  if in == nil {
+    return nil
+  }
+  out := new(AppWrapperCondition)
+  in.DeepCopyInto(out)
+  return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *AppWrapperGenericResource) DeepCopyInto(out *AppWrapperGenericResource) {
   *out = *in
@@ -219,6 +237,13 @@ func (in *AppWrapperSpec) DeepCopy() *AppWrapperSpec {
 func (in *AppWrapperStatus) DeepCopyInto(out *AppWrapperStatus) {
   *out = *in
   in.ControllerFirstTimestamp.DeepCopyInto(&out.ControllerFirstTimestamp)
+  if in.Conditions != nil {
+    in, out := &in.Conditions, &out.Conditions
+    *out = make([]AppWrapperCondition, len(*in))
+    for i := range *in {
+      (*in)[i].DeepCopyInto(&(*out)[i])
+    }
+  }
   return
 }
diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 6f526318c..f62c9a19e 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -643,7 +643,10 @@ func (qjm *XController) ScheduleNext() {
     }
   }

-  qj.Status.QueueJobState = arbv1.QueueJobStateHeadOfLine
+  qj.Status.QueueJobState = arbv1.AppWrapperCondHeadOfLine
+  cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondHeadOfLine, v1.ConditionTrue, "Front of queue.", "")
+  qj.Status.Conditions = append(qj.Status.Conditions, cond)
+
   qj.Status.FilterIgnore = true // update QueueJobState only
   qjm.updateEtcd(qj, "ScheduleNext - setHOL")
   qjm.qjqueue.AddUnschedulableIfNotPresent(qj) // working on qj, avoid other threads putting it back to activeQ
@@ -658,6 +661,8 @@ func (qjm *XController) ScheduleNext() {
     return
   }

+  dispatchFailedReason := "AppWrapper not runnable."
+  dispatchFailedMessage := ""
   if qjm.isDispatcher { // Dispatcher Mode
     agentId:=qjm.chooseAgent(qj)
     if agentId != "" { // A proper agent is found.
@@ -679,8 +684,9 @@ func (qjm *XController) ScheduleNext() {
       glog.V(10).Infof("[TTime] %s, %s: ScheduleNextAfterEtcd", qj.Name, time.Now().Sub(qj.CreationTimestamp.Time))
       return
     } else {
-      glog.V(2).Infof("[Controller: Dispatcher Mode] Cannot find an Agent with enough Resources\n")
-      go qjm.backoff(qj)
+      dispatchFailedMessage = "Cannot find a cluster with enough resources to dispatch AppWrapper."
+ glog.V(2).Infof("[Controller: Dispatcher Mode] %s %s\n", dispatchFailedReason, dispatchFailedMessage) + go qjm.backoff(qj, dispatchFailedReason, dispatchFailedMessage) } } else { // Agent Mode aggqj := qjm.GetAggregatedResources(qj) @@ -732,6 +738,7 @@ func (qjm *XController) ScheduleNext() { } } } else { // Not enough free resources to dispatch HOL + dispatchFailedMessage = "Insufficient resources to dispatch AppWrapper." glog.V(3).Infof("[ScheduleNext] HOL Blocking by %s for %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(HOLStartTime), qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) } // stop trying to dispatch after HeadOfLineHoldingTime @@ -743,7 +750,7 @@ func (qjm *XController) ScheduleNext() { } if !forwarded { // start thread to backoff glog.V(3).Infof("[ScheduleNext] HOL backoff %s after waiting for %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(HOLStartTime), qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) - go qjm.backoff(qj) + go qjm.backoff(qj, dispatchFailedReason, dispatchFailedMessage) } } } @@ -765,15 +772,19 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error { return nil } -func (qjm *XController) backoff(q *arbv1.AppWrapper) { - q.Status.QueueJobState = arbv1.QueueJobStateRejoining +func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message string) { + q.Status.QueueJobState = arbv1.AppWrapperCondBackoff + cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message) + q.Status.Conditions = append(q.Status.Conditions, cond) q.Status.FilterIgnore = true // update QueueJobState only, no work needed qjm.updateEtcd(q, "[backoff]Rejoining") qjm.qjqueue.AddUnschedulableIfNotPresent(q) glog.V(3).Infof("[backoff] %s move to unschedulableQ before sleep for %d seconds. activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", q.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((q)), qjm.qjqueue.IfExistUnschedulableQ((q)), q, q.ResourceVersion, q.Status) time.Sleep(time.Duration(qjm.serverOption.BackoffTime) * time.Second) qjm.qjqueue.MoveToActiveQueueIfExists(q) - q.Status.QueueJobState = arbv1.QueueJobStateQueueing + q.Status.QueueJobState = arbv1.AppWrapperCondQueueing + returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "Returned from backoff.", "") + q.Status.Conditions = append(q.Status.Conditions, returnCond) q.Status.FilterIgnore = true // update QueueJobState only, no work needed qjm.updateEtcd(q, "[backoff] Queueing") glog.V(3).Infof("[backoff] %s activeQ.Add after sleep for %d seconds. 
activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", q.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((q)), qjm.qjqueue.IfExistUnschedulableQ((q)), q, q.ResourceVersion, q.Status) @@ -845,7 +856,15 @@ func (qjm *XController) UpdateQueueJobs() { if newjob.Status.QueueJobState == "" { newjob.Status.ControllerFirstTimestamp = firstTime newjob.Status.SystemPriority = newjob.Spec.Priority - newjob.Status.QueueJobState = arbv1.QueueJobStateInit + newjob.Status.QueueJobState = arbv1.AppWrapperCondInit + newjob.Status.Conditions = []arbv1.AppWrapperCondition{ + arbv1.AppWrapperCondition{ + Type: arbv1.AppWrapperCondInit, + Status: v1.ConditionTrue, + LastUpdateMicroTime: metav1.NowMicro(), + LastTransitionMicroTime: metav1.NowMicro(), + }, + } glog.V(3).Infof("[UpdateQueueJobs] %s 0Delay=%.6f seconds CreationTimestamp=%s ControllerFirstTimestamp=%s", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob.CreationTimestamp, newjob.Status.ControllerFirstTimestamp) } @@ -873,7 +892,15 @@ func (cc *XController) addQueueJob(obj interface{}) { if qj.Status.QueueJobState == "" { qj.Status.ControllerFirstTimestamp = firstTime qj.Status.SystemPriority = qj.Spec.Priority - qj.Status.QueueJobState = arbv1.QueueJobStateInit + qj.Status.QueueJobState = arbv1.AppWrapperCondInit + qj.Status.Conditions = []arbv1.AppWrapperCondition{ + arbv1.AppWrapperCondition{ + Type: arbv1.AppWrapperCondInit, + Status: v1.ConditionTrue, + LastUpdateMicroTime: metav1.NowMicro(), + LastTransitionMicroTime: metav1.NowMicro(), + }, + } } else { glog.Warningf("[Informer-addQJ] Received and add by the informer for AppWrapper job %s which already has been seen and initialized current state %s with timestamp: %s, elapsed time of %.6f", qj.Name, qj.Status.State, qj.Status.ControllerFirstTimestamp, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds()) @@ -1061,9 +1088,11 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { // update pods running, pending,... 
     cc.qjobResControls[arbv1.ResourceTypePod].UpdateQueueJobStatus(qj)
-    if (qj.Status.Running > 0) { // set QueueJobStateRunning if at least one resource running
-      qj.Status.QueueJobState = arbv1.QueueJobStateRunning
-      qj.Status.FilterIgnore = true // Update QueueJobStateRunning
+    if (qj.Status.Running > 0) { // set AppWrapperCondRunning if at least one resource running
+      qj.Status.QueueJobState = arbv1.AppWrapperCondRunning
+      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondRunning, v1.ConditionTrue, "", "")
+      qj.Status.Conditions = append(qj.Status.Conditions, cond)
+      qj.Status.FilterIgnore = true // Update AppWrapperCondRunning
       cc.updateEtcd(qj, "[syncQueueJob]setRunning")
     }
   }
@@ -1123,7 +1152,10 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
       glog.V(10).Infof("[worker-manageQJ] leaving %s to qjqueue.UnschedulableQ activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status)
     } else {
       glog.V(10).Infof("[worker-manageQJ] before add to activeQ %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status)
-      qj.Status.QueueJobState = arbv1.QueueJobStateQueueing
+      qj.Status.QueueJobState = arbv1.AppWrapperCondQueueing
+      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "", "")
+      qj.Status.Conditions = append(qj.Status.Conditions, cond)
+
       qj.Status.FilterIgnore = true // Update Queueing status, add to qjqueue for ScheduleNext
       cc.updateEtcd(qj, "[manageQueueJob]setQueueing")
       if err = cc.qjqueue.AddIfNotPresent(qj); err != nil {
@@ -1156,11 +1188,14 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
       glog.V(3).Infof("[worker-manageQJ] %s 3Delay=%.6f seconds BeforeDispatchingToEtcd Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj.ResourceVersion, qj.Status)
       dispatched := true
+      dispatchFailureReason := "Item creation failure."
+ dispatchFailureMessage := "" for _, ar := range qj.Spec.AggrResources.Items { glog.V(10).Infof("[worker-manageQJ] before dispatch [%v].SyncQueueJob %s &qj=%p Version=%s Status=%+v", ar.Type, qj.Name, qj, qj.ResourceVersion, qj.Status) // Call Resource Controller of ar.Type to issue REST call to Etcd for resource creation err00 := cc.qjobResControls[ar.Type].SyncQueueJob(qj, &ar) if err00 != nil { + dispatchFailureMessage = fmt.Sprintf("Failed to create item: %s/%s", qj.Namespace, qj.Name) glog.V(3).Infof("[worker-manageQJ] Error dispatching job=%s type=%v Status=%+v err=%+v", qj.Name, ar.Type, qj.Status, err00) dispatched = false break @@ -1171,18 +1206,24 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error { glog.V(10).Infof("[worker-manageQJ] before dispatch Generic.SyncQueueJob %s &qj=%p Version=%s Status=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status) _, err00 := cc.genericresources.SyncQueueJob(qj, &ar) if err00 != nil { + dispatchFailureMessage = fmt.Sprintf("Failed to create generic item: %s/%s", qj.Namespace, qj.Name) glog.Errorf("[worker-manageQJ] Error dispatching job=%s Status=%+v err=%+v", qj.Name, qj.Status, err00) dispatched = false } } - if dispatched { // set QueueJobStateRunning if all resources are successfully dispatched - qj.Status.QueueJobState = arbv1.QueueJobStateDispatched + if dispatched { // set AppWrapperCondRunning if all resources are successfully dispatched + qj.Status.QueueJobState = arbv1.AppWrapperCondDispatched + cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondDispatched, v1.ConditionTrue, "", "") + qj.Status.Conditions = append(qj.Status.Conditions, cond) + glog.V(3).Infof("[worker-manageQJ] %s 4Delay=%.6f seconds AllResourceDispatchedToEtcd Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj.ResourceVersion, qj.Status) } else { qj.Status.State = arbv1.AppWrapperStateFailed - qj.Status.QueueJobState = arbv1.QueueJobStateFailed + qj.Status.QueueJobState = arbv1.AppWrapperCondFailed + cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondFailed, v1.ConditionTrue, dispatchFailureReason, dispatchFailureMessage) + qj.Status.Conditions = append(qj.Status.Conditions, cond) cc.Cleanup(qj) } diff --git a/pkg/controller/queuejob/utils.go b/pkg/controller/queuejob/utils.go index d47f4ec8f..12bf4da46 100644 --- a/pkg/controller/queuejob/utils.go +++ b/pkg/controller/queuejob/utils.go @@ -8,7 +8,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/uuid" - "k8s.io/client-go/rest" + "k8s.io/client-go/rest" arbv1 "github.com/IBM/multi-cluster-app-dispatcher/pkg/apis/controller/v1alpha1" "github.com/IBM/multi-cluster-app-dispatcher/pkg/client/clientset/controller-versioned/clients" @@ -146,4 +146,15 @@ func createAppWrapperKind(config *rest.Config) error { return nil } +// AppWrapperCondition returns condition of a AppWrapper condition. 
+func GenerateAppWrapperCondition(condType arbv1.AppWrapperConditionType, condStatus corev1.ConditionStatus, condReason string, condMsg string) arbv1.AppWrapperCondition {
+  return arbv1.AppWrapperCondition{
+    Type:                    condType,
+    Status:                  condStatus,
+    LastUpdateMicroTime:     metav1.NowMicro(),
+    LastTransitionMicroTime: metav1.NowMicro(),
+    Reason:                  condReason,
+    Message:                 condMsg,
+  }
+}

From 9de4206175811c7c5e9ce02043d1a24fe9761725 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Fri, 13 Nov 2020 07:50:43 -0500
Subject: [PATCH 02/28] Reduced repeating conditions and coded reason for
 condition instead of generic string.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index f62c9a19e..36a9045de 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -644,7 +644,7 @@ func (qjm *XController) ScheduleNext() {
   }

   qj.Status.QueueJobState = arbv1.AppWrapperCondHeadOfLine
-  cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondHeadOfLine, v1.ConditionTrue, "Front of queue.", "")
+  cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondHeadOfLine, v1.ConditionTrue, "FrontOfQueue.", "")
   qj.Status.Conditions = append(qj.Status.Conditions, cond)

   qj.Status.FilterIgnore = true // update QueueJobState only
@@ -661,7 +661,7 @@ func (qjm *XController) ScheduleNext() {
     return
   }

-  dispatchFailedReason := "AppWrapper not runnable."
+  dispatchFailedReason := "AppWrapperNotRunnable."
   dispatchFailedMessage := ""
   if qjm.isDispatcher { // Dispatcher Mode
     agentId:=qjm.chooseAgent(qj)
@@ -783,7 +783,7 @@ func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message stri
   time.Sleep(time.Duration(qjm.serverOption.BackoffTime) * time.Second)
   qjm.qjqueue.MoveToActiveQueueIfExists(q)
   q.Status.QueueJobState = arbv1.AppWrapperCondQueueing
-  returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "Returned from backoff.", "")
+  returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "BackoffTimerExpired.", "")
   q.Status.Conditions = append(q.Status.Conditions, returnCond)
   q.Status.FilterIgnore = true // update QueueJobState only, no work needed
   qjm.updateEtcd(q, "[backoff] Queueing")
@@ -1088,12 +1088,13 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error {
     // update pods running, pending,...
     cc.qjobResControls[arbv1.ResourceTypePod].UpdateQueueJobStatus(qj)

-    if (qj.Status.Running > 0) { // set AppWrapperCondRunning if at least one resource running
+    // Update etcd conditions if AppWrapper Job has at least 1 running pod and transitioning from dispatched to running.
+    if (qj.Status.QueueJobState != arbv1.AppWrapperCondRunning ) && (qj.Status.Running > 0) {
       qj.Status.QueueJobState = arbv1.AppWrapperCondRunning
-      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondRunning, v1.ConditionTrue, "", "")
+      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondRunning, v1.ConditionTrue, "PodsRunning", "")
       qj.Status.Conditions = append(qj.Status.Conditions, cond)
       qj.Status.FilterIgnore = true // Update AppWrapperCondRunning
-      cc.updateEtcd(qj, "[syncQueueJob]setRunning")
+      cc.updateEtcd(qj, "[syncQueueJob] setRunning")
     }
   }
@@ -1153,7 +1154,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
     } else {
       glog.V(10).Infof("[worker-manageQJ] before add to activeQ %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status)
       qj.Status.QueueJobState = arbv1.AppWrapperCondQueueing
-      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "", "")
+      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "AwaitingDispatch", "")
       qj.Status.Conditions = append(qj.Status.Conditions, cond)

       qj.Status.FilterIgnore = true // Update Queueing status, add to qjqueue for ScheduleNext
@@ -1189,7 +1189,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
       glog.V(3).Infof("[worker-manageQJ] %s 3Delay=%.6f seconds BeforeDispatchingToEtcd Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj.ResourceVersion, qj.Status)
       dispatched := true
-      dispatchFailureReason := "Item creation failure."
+      dispatchFailureReason := "ItemCreationFailure."
       dispatchFailureMessage := ""
       for _, ar := range qj.Spec.AggrResources.Items {
         glog.V(10).Infof("[worker-manageQJ] before dispatch [%v].SyncQueueJob %s &qj=%p Version=%s Status=%+v", ar.Type, qj.Name, qj, qj.ResourceVersion, qj.Status)
@@ -1215,7 +1215,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
       if dispatched { // set AppWrapperCondRunning if all resources are successfully dispatched
         qj.Status.QueueJobState = arbv1.AppWrapperCondDispatched
-        cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondDispatched, v1.ConditionTrue, "", "")
+        cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondDispatched, v1.ConditionTrue, "AppWrapperRunnable", "")
         qj.Status.Conditions = append(qj.Status.Conditions, cond)

         glog.V(3).Infof("[worker-manageQJ] %s 4Delay=%.6f seconds AllResourceDispatchedToEtcd Version=%s Status=%+v",

From 61744739c34580195da0137ab2fffe579098c80a Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Fri, 13 Nov 2020 12:46:01 -0500
Subject: [PATCH 03/28] Debugging statements.

Signed-off-by: Diana Arroyo
---
 pkg/controller/queuejob/queuejob_controller_ex.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 36a9045de..d3b8d1d97 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -1079,6 +1079,8 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error {
   if larger(queueJob.ResourceVersion, qj.ResourceVersion) {
     glog.V(10).Infof("[worker-syncQJ] %s found more recent copy from cache &qj=%p qj=%+v", qj.Name, qj, qj)
     glog.V(10).Infof("[worker-syncQJ] %s found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob)
+    glog.V(3).Infof("[worker-syncQJ] %s found more recent copy from event queue &qj=%p qj=%+v", qj.Name, qj, qj)
+    glog.V(3).Infof("[worker-syncQJ] %s found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob)
     queueJob.DeepCopyInto(qj)
   }
@@ -1154,7 +1156,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
     } else {
       glog.V(10).Infof("[worker-manageQJ] before add to activeQ %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status)
       qj.Status.QueueJobState = arbv1.AppWrapperCondQueueing
-      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "AwaitingDispatch", "")
+      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "AwaitingHeadOfLine", "")
       qj.Status.Conditions = append(qj.Status.Conditions, cond)

       qj.Status.FilterIgnore = true // Update Queueing status, add to qjqueue for ScheduleNext

From d9450fa389b6ca07666913dfcfe83f53a628ed3d Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Fri, 13 Nov 2020 15:36:02 -0500
Subject: [PATCH 04/28] Debugging statements.

Signed-off-by: Diana Arroyo
---
 pkg/controller/queuejob/queuejob_controller_ex.go | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index d3b8d1d97..1fdc4885b 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -928,8 +928,10 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) {
   }
   // AppWrappers may come out of order. Ignore old ones.
   if (oldQJ.Name == newQJ.Name) && (larger(oldQJ.ResourceVersion, newQJ.ResourceVersion)) {
-    glog.V(10).Infof("[Informer-updateQJ] ignore OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ, oldQJ)
-    glog.V(10).Infof("[Informer-updateQJ] ignore OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ, newQJ)
+    glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
+    glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ)
+    glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
+    glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ)
     return
   }
   glog.V(3).Infof("[Informer-updateQJ] %s *Delay=%.6f seconds normal enqueue &newQJ=%p Version=%s Status=%+v", newQJ.Name, time.Now().Sub(newQJ.Status.ControllerFirstTimestamp.Time).Seconds(), newQJ, newQJ.ResourceVersion, newQJ.Status)

From 1bd9095d0290f5b9e505db0af6a956ecc89c14f7 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Fri, 13 Nov 2020 15:42:15 -0500
Subject: [PATCH 05/28] Debugging statements.

Signed-off-by: Diana Arroyo
---
 pkg/controller/queuejob/queuejob_controller_ex.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 1fdc4885b..df055cb0c 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -1084,6 +1084,8 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error {
     glog.V(3).Infof("[worker-syncQJ] %s found more recent copy from event queue &qj=%p qj=%+v", qj.Name, qj, qj)
     glog.V(3).Infof("[worker-syncQJ] %s found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob)
     queueJob.DeepCopyInto(qj)
+    glog.V(3).Infof("[worker-syncQJ] %s AFTER found more recent copy from event queue &qj=%p qj=%+v", qj.Name, qj, qj)
+    glog.V(3).Infof("[worker-syncQJ] %s AFTER found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob)
   }

   // If it is Agent (not a dispatcher), update pod information

From 0380c5c5baf27bede79d1cb65928587f16d4ef7c Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Fri, 13 Nov 2020 16:34:44 -0500
Subject: [PATCH 06/28] Debugging statements.

Signed-off-by: Diana Arroyo
---
 pkg/controller/queuejob/queuejob_controller_ex.go | 15 +++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index df055cb0c..5e2981b31 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -920,6 +920,9 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) {
     return
   }
   oldQJ, ok := oldObj.(*arbv1.AppWrapper)
+  if (newQJ.Name == "aw-deployment-2-900cpu") {
+    glog.V(3).Infof("[Informer-updateQJ] %s arrival", newQJ.Name)
+  }
   if !ok {
     glog.Errorf("[Informer-updateQJ] old object is not AppWrapper. enqueue(newQJ). oldObj=%+v", oldObj)
     glog.V(4).Infof("[Informer-updateQJ] %s *Delay=%.6f seconds BadOldObject enqueue &newQJ=%p Version=%s Status=%+v", newQJ.Name, time.Now().Sub(newQJ.Status.ControllerFirstTimestamp.Time).Seconds(), newQJ, newQJ.ResourceVersion, newQJ.Status)
@@ -930,8 +933,10 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) {
   if (oldQJ.Name == newQJ.Name) && (larger(oldQJ.ResourceVersion, newQJ.ResourceVersion)) {
     glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
     glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ)
-    glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
-    glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ)
+    if (newQJ.Name == "aw-deployment-2-900cpu") {
+      glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
+      glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ)
+    }
     return
   }
   glog.V(3).Infof("[Informer-updateQJ] %s *Delay=%.6f seconds normal enqueue &newQJ=%p Version=%s Status=%+v", newQJ.Name, time.Now().Sub(newQJ.Status.ControllerFirstTimestamp.Time).Seconds(), newQJ, newQJ.ResourceVersion, newQJ.Status)
@@ -962,6 +967,9 @@ func (cc *XController) enqueue(obj interface{}) {
     return
   }

+  if (qj.Name == "aw-deployment-2-900cpu") {
+    glog.V(3).Infof("[enqueue] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj)
+  }
   err := cc.eventQueue.Add(qj) // add to FIFO queue if not in, update object & keep position if already in FIFO queue
   if err != nil {
     glog.Errorf("[enqueue] Fail to enqueue %s to eventQueue, ignore. *Delay=%.6f seconds &qj=%p Version=%s Status=%+v err=%#v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj, qj.ResourceVersion, qj.Status, err)
@@ -1168,6 +1176,9 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
       if err = cc.qjqueue.AddIfNotPresent(qj); err != nil {
         glog.Errorf("[worker-manageQJ] Fail to add %s to activeQueue. Back to eventQueue activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v err=%#v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status, err)
+        if (qj.Name == "aw-deployment-2-900cpu") {
+          glog.V(3).Infof("[worker-manageQJ] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj)
+        }
         cc.enqueue(qj)

From a474ab538f824431f682de82b37cae3ea7de8479 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 09:11:07 -0500
Subject: [PATCH 07/28] Added additional debugging.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 43 +++++++++++++------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 5e2981b31..14c3232eb 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -872,7 +872,7 @@ func (qjm *XController) UpdateQueueJobs() {
   // check eventQueue, qjqueue in program sequence to make sure job is not in qjqueue
   if _, exists, _ := qjm.eventQueue.Get(newjob); exists { continue } // do not enqueue if already in eventQueue
   if qjm.qjqueue.IfExist(newjob) { continue } // do not enqueue if already in qjqueue
-  err := qjm.eventQueue.AddIfNotPresent(newjob) // add to eventQueue if not in, otherwise, keep position without updating object, as object in eventQueue should be more recent
+  err = qjm.enqueueIfNotPresent(newjob)
   if err != nil {
     glog.Errorf("[UpdateQueueJobs] Fail to enqueue %s to eventQueue, ignore. *Delay=%.6f seconds &qj=%p Version=%s Status=%+v err=%#v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status, err)
   } else {
@@ -920,7 +920,7 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) {
     return
   }
   oldQJ, ok := oldObj.(*arbv1.AppWrapper)
-  if (newQJ.Name == "aw-deployment-2-900cpu") {
+  if (newQJ.Name == "aw-generic-deployment-2-030") {
     glog.V(3).Infof("[Informer-updateQJ] %s arrival", newQJ.Name)
   }
   if !ok {
@@ -933,7 +933,7 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) {
     glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
     glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ)
-    if (newQJ.Name == "aw-deployment-2-900cpu") {
+    if (newQJ.Name == "aw-generic-deployment-2-030") {
       glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ)
@@ -967,7 +967,7 @@ func (cc *XController) enqueue(obj interface{}) {
     return
   }

-  if (qj.Name == "aw-deployment-2-900cpu") {
+  if (qj.Name == "aw-generic-deployment-2-030") {
     glog.V(3).Infof("[enqueue] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj)
   }
   err := cc.eventQueue.Add(qj) // add to FIFO queue if not in, update object & keep position if already in FIFO queue
@@ -978,6 +978,19 @@ func (cc *XController) enqueue(obj interface{}) {
 }

+func (cc *XController) enqueueIfNotPresent(obj interface{}) error {
+  aw, ok := obj.(*arbv1.AppWrapper)
+  if !ok {
+    return fmt.Errorf("[enqueueIfNotPresent] obj is not AppWrapper. obj=%+v", obj)
+  }
+
+  if (aw.Name == "aw-generic-deployment-2-030") {
+    glog.V(3).Infof("[enqueue] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", aw.Name, aw, aw.ResourceVersion, aw.Status, aw)
+  }
+  err := cc.eventQueue.AddIfNotPresent(aw) // add to FIFO queue if not in, update object & keep position if already in FIFO queue
+  return err
+}
+
 func (cc *XController) agentEventQueueWorker() {
   if _, err := cc.agentEventQueue.Pop(func(obj interface{}) error {
     var queuejob *arbv1.AppWrapper
@@ -1100,15 +1113,17 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error {
   if(!cc.isDispatcher){ // we call sync for each controller
     // update pods running, pending,...
-    cc.qjobResControls[arbv1.ResourceTypePod].UpdateQueueJobStatus(qj)
-
-    // Update etcd conditions if AppWrapper Job has at least 1 running pod and transitioning from dispatched to running.
-    if (qj.Status.QueueJobState != arbv1.AppWrapperCondRunning ) && (qj.Status.Running > 0) {
-      qj.Status.QueueJobState = arbv1.AppWrapperCondRunning
-      cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondRunning, v1.ConditionTrue, "PodsRunning", "")
-      qj.Status.Conditions = append(qj.Status.Conditions, cond)
-      qj.Status.FilterIgnore = true // Update AppWrapperCondRunning
-      cc.updateEtcd(qj, "[syncQueueJob] setRunning")
+    if (qj.Status.State == arbv1.AppWrapperStateActive) {
+      cc.qjobResControls[arbv1.ResourceTypePod].UpdateQueueJobStatus(qj)
+
+      // Update etcd conditions if AppWrapper Job has at least 1 running pod and transitioning from dispatched to running.
+      if (qj.Status.QueueJobState != arbv1.AppWrapperCondRunning ) && (qj.Status.Running > 0) {
+        qj.Status.QueueJobState = arbv1.AppWrapperCondRunning
+        cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondRunning, v1.ConditionTrue, "PodsRunning", "")
+        qj.Status.Conditions = append(qj.Status.Conditions, cond)
+        qj.Status.FilterIgnore = true // Update AppWrapperCondRunning
+        cc.updateEtcd(qj, "[syncQueueJob] setRunning")
+      }
     }
   }
@@ -1191,7 +1191,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
       if err = cc.qjqueue.AddIfNotPresent(qj); err != nil {
         glog.Errorf("[worker-manageQJ] Fail to add %s to activeQueue. Back to eventQueue activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v err=%#v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status, err)
-        if (qj.Name == "aw-deployment-2-900cpu") {
+        if (qj.Name == "aw-generic-deployment-2-030") {
           glog.V(3).Infof("[worker-manageQJ] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj)
         }
         cc.enqueue(qj)

From 2d54f9b70d180f585d485f15240fa59d78c9428a Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 12:06:55 -0500
Subject: [PATCH 08/28] Added additional debugging.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 54 +++++++++++++----
 1 file changed, 45 insertions(+), 9 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 14c3232eb..e4409d5a0 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -728,7 +728,7 @@ func (qjm *XController) ScheduleNext() {
   // Handle k8s watch race condition
   if err := qjm.updateEtcd(qj, "ScheduleNext - setCanRun"); err == nil {
     // add to eventQueue for dispatching to Etcd
-    if err = qjm.eventQueue.Add(qj); err != nil { // unsuccessful add to eventQueue, add back to activeQ
+    if err = qjm.enqueue(qj); err != nil { // unsuccessful add to eventQueue, add back to activeQ
       glog.Errorf("[ScheduleNext] Fail to add %s to eventQueue, activeQ.Add_toSchedulingQueue &qj=%p Version=%s Status=%+v err=%#v", qj.Name, qj, qj.ResourceVersion, qj.Status, err)
       qjm.qjqueue.MoveToActiveQueueIfExists(qj)
     } else { // successful add to eventQueue, remove from qjqueue
@@ -960,14 +960,26 @@ func (cc *XController) deleteQueueJob(obj interface{}) {
   cc.enqueue(qj)
 }

-func (cc *XController) enqueue(obj interface{}) {
+func (cc *XController) enqueue(obj interface{}) error {
   qj, ok := obj.(*arbv1.AppWrapper)
   if !ok {
-    glog.Errorf("[enqueue] obj is not AppWrapper. obj=%+v", obj)
-    return
+    return fmt.Errorf("[enqueue] obj is not AppWrapper. obj=%+v", obj)
   }

   if (qj.Name == "aw-generic-deployment-2-030") {
+    objKey, err := GetQueueJobKey(obj)
+    if (err != nil) {
+      glog.V(3).Infof("[enqueue] Enqueue failed for %s. Error getting key from aw object: err=%+v", qj.Name, err)
+    } else {
+      _, exists, err := cc.eventQueue.GetByKey(objKey)
+      if (err != nil) {
+        glog.V(3).Infof("[enqueue] Enqueue failed for %s. Error getting aw from event queue: err=%+v", qj.Name, err)
+      } else {
+        if exists {
+          glog.V(3).Infof("[enqueue] %s found in event queue. This will add a duplicate in the following call to eventQueue.Add().", qj.Name)
+        }
+      }
+    }
     glog.V(3).Infof("[enqueue] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj)
   }
   err := cc.eventQueue.Add(qj) // add to FIFO queue if not in, update object & keep position if already in FIFO queue
@@ -976,6 +988,7 @@ func (cc *XController) enqueue(obj interface{}) {
   } else {
     glog.V(10).Infof("[enqueue] %s *Delay=%.6f seconds eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj, qj.ResourceVersion, qj.Status)
   }
+  return err
 }

@@ -985,8 +998,22 @@ func (cc *XController) enqueueIfNotPresent(obj interface{}) error {
   if (aw.Name == "aw-generic-deployment-2-030") {
-    glog.V(3).Infof("[enqueue] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", aw.Name, aw, aw.ResourceVersion, aw.Status, aw)
+    objKey, err := GetQueueJobKey(obj)
+    if (err != nil) {
+      glog.V(3).Infof("[enqueueIfNotPresent] Enqueue failed for %s. Error getting key from aw object: err=%+v", aw.Name, err)
+    } else {
+      _, exists, err := cc.eventQueue.GetByKey(objKey)
+      if (err != nil) {
+        glog.V(3).Infof("[enqueueIfNotPresent] Enqueue failed for %s. Error getting aw from event queue: err=%+v", aw.Name, err)
+      } else {
+        if exists {
+          glog.V(3).Infof("[enqueueIfNotPresent] %s found in event queue. Should not be added on the following call to AddIfNotPresent().", aw.Name)
+        }
+      }
+    }
+    glog.V(3).Infof("[enqueueIfNotPresent] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", aw.Name, aw, aw.ResourceVersion, aw.Status, aw)
   }
+
   err := cc.eventQueue.AddIfNotPresent(aw) // add to FIFO queue if not in, update object & keep position if already in FIFO queue
   return err
 }

From e5f3f76ec97e59732dfbb6a974169ab3078aca54 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 12:50:48 -0500
Subject: [PATCH 09/28] Added additional debugging.

Signed-off-by: Diana Arroyo
---
 pkg/controller/queuejob/queuejob_controller_ex.go | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index e4409d5a0..132212450 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -872,11 +872,14 @@ func (qjm *XController) UpdateQueueJobs() {
   // check eventQueue, qjqueue in program sequence to make sure job is not in qjqueue
   if _, exists, _ := qjm.eventQueue.Get(newjob); exists { continue } // do not enqueue if already in eventQueue
   if qjm.qjqueue.IfExist(newjob) { continue } // do not enqueue if already in qjqueue
+  if (newjob.Name == "aw-generic-deployment-2-030") {
+    glog.V(3).Infof("[UpdateQueueJobs] %s *Delay=%.6f seconds eventQueue.Add_byUpdateQueueJobs &qj=%p Version=%s Status=%+v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status)
+  }
   err = qjm.enqueueIfNotPresent(newjob)
   if err != nil {
     glog.Errorf("[UpdateQueueJobs] Fail to enqueue %s to eventQueue, ignore. *Delay=%.6f seconds &qj=%p Version=%s Status=%+v err=%#v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status, err)
   } else {
-    glog.V(3).Infof("[UpdateQueueJobs] %s *Delay=%.6f seconds eventQueue.Add_byUpdateQueueJobs &qj=%p Version=%s Status=%+v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status)
+    glog.V(4).Infof("[UpdateQueueJobs] %s *Delay=%.6f seconds eventQueue.Add_byUpdateQueueJobs &qj=%p Version=%s Status=%+v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status)
   }
 }
}
@@ -910,6 +913,9 @@ func (cc *XController) addQueueJob(obj interface{}) {
   qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj.CreationTimestamp, qj.Status.ControllerFirstTimestamp)

   glog.V(4).Infof("[Informer-addQJ] enqueue %s &qj=%p Version=%s Status=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status)
+  if (qj.Name == "aw-generic-deployment-2-030") {
+    glog.V(3).Infof("[Informer-addQJ] enqueue %s &qj=%p Version=%s Status=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status)
+  }
   cc.enqueue(qj)
 }

From be348f06c598a11deb516223a35e5fca4ed88666 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 14:33:52 -0500
Subject: [PATCH 10/28] Added additional debugging.

Signed-off-by: Diana Arroyo
---
 pkg/apis/controller/v1alpha1/appwrapper.go | 18 ++++++++++--------
 .../queuejob/queuejob_controller_ex.go     |  8 ++++++++
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/pkg/apis/controller/v1alpha1/appwrapper.go b/pkg/apis/controller/v1alpha1/appwrapper.go
index aa03bf0ba..0cd322a58 100644
--- a/pkg/apis/controller/v1alpha1/appwrapper.go
+++ b/pkg/apis/controller/v1alpha1/appwrapper.go
@@ -220,14 +220,16 @@ type AppWrapperConditionType string

 const (
-  AppWrapperCondInit       AppWrapperConditionType = "Init"
-  AppWrapperCondQueueing   AppWrapperConditionType = "Queueing"
-  AppWrapperCondHeadOfLine AppWrapperConditionType = "HeadOfLine"
-  AppWrapperCondBackoff    AppWrapperConditionType = "Backoff"
-  AppWrapperCondDispatched AppWrapperConditionType = "Dispatched"
-  AppWrapperCondRunning    AppWrapperConditionType = "Running"
-  AppWrapperCondDeleted    AppWrapperConditionType = "Deleted"
-  AppWrapperCondFailed     AppWrapperConditionType = "Failed"
+  AppWrapperCondInit             AppWrapperConditionType = "Init"
+  AppWrapperCondQueueing         AppWrapperConditionType = "Queueing"
+  AppWrapperCondHeadOfLine       AppWrapperConditionType = "HeadOfLine"
+  AppWrapperCondBackoff          AppWrapperConditionType = "Backoff"
+  AppWrapperCondDispatched       AppWrapperConditionType = "Dispatched"
+  AppWrapperCondRunning          AppWrapperConditionType = "Running"
+  AppWrapperCondPreemptCandidate AppWrapperConditionType = "PreemptCandidate"
+  AppWrapperCondPreempted        AppWrapperConditionType = "Preempted"
+  AppWrapperCondDeleted          AppWrapperConditionType = "Deleted"
+  AppWrapperCondFailed           AppWrapperConditionType = "Failed"
 )

 // AppWrapperCondition describes the state of an AppWrapper at a certain point.
diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 132212450..dd9333d14 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -389,6 +389,14 @@ func (qjm *XController) PreemptQueueJobs() {
     continue
   }
   newjob.Status.CanRun = false
+  if (q.Name == "aw-generic-deployment-2-030") {
+    glog.V(3).Infof("[PreemptQueueJobs] %s PreemptQueue &qj=%p Version=%s Status=%+v aw=%v", q.Name, q, q.ResourceVersion, q.Status, q)
+    glog.V(3).Infof("[PreemptQueueJobs] %s from cache &qj=%p Version=%s Status=%+v aw=%v", newjob.Name, newjob, newjob.ResourceVersion, newjob.Status, newjob)
+  }
+  message := fmt.Sprintf("Insufficient number of Running pods, minimum=%s, running=%s", string(newjob.Spec.SchedSpec.MinAvailable), string(newjob.Status.Running))
+  cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "MinPodsNotRunning", message)
+  newjob.Status.Conditions = append(newjob.Status.Conditions, cond)
+
   if _, err := qjm.arbclients.ArbV1().AppWrappers(q.Namespace).Update(newjob); err != nil {
     glog.Errorf("Failed to update status of AppWrapper %v/%v: %v",
       q.Namespace, q.Name, err)

From dc865d6a1fbe6ef4c25522d6610122fa8e0eef97 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 18:59:53 -0500
Subject: [PATCH 11/28] Added additional debugging.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 31 ++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index dd9333d14..e37e76971 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -393,7 +393,7 @@ func (qjm *XController) PreemptQueueJobs() {
     glog.V(3).Infof("[PreemptQueueJobs] %s PreemptQueue &qj=%p Version=%s Status=%+v aw=%v", q.Name, q, q.ResourceVersion, q.Status, q)
     glog.V(3).Infof("[PreemptQueueJobs] %s from cache &qj=%p Version=%s Status=%+v aw=%v", newjob.Name, newjob, newjob.ResourceVersion, newjob.Status, newjob)
   }
-  message := fmt.Sprintf("Insufficient number of Running pods, minimum=%s, running=%s", string(newjob.Spec.SchedSpec.MinAvailable), string(newjob.Status.Running))
+  message := fmt.Sprintf("Insufficient number of Running pods, minimum=%d, running=%v.", q.Spec.SchedSpec.MinAvailable, q.Status.Running)
   cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "MinPodsNotRunning", message)
   newjob.Status.Conditions = append(newjob.Status.Conditions, cond)
@@ -423,13 +423,36 @@ func (qjm *XController) GetQueueJobsEligibleForPreemption() []*arbv1.AppWrapper {
       continue
     }
   }
-  if value.Status.State == arbv1.AppWrapperStateEnqueued {
+
+  // Skip if AW Pending or just entering the system and does not have a state yet.
+  if (value.Status.State == arbv1.AppWrapperStateEnqueued) || (value.Status.State == ""){
+    continue
+  }
+
+  //Check to see if this AW job has been dispatched for a time window before preempting
+  conditionsLen := len(value.Status.Conditions)
+  var dispatchConditionExists bool
+  dispatchConditionExists = false
+  var condition arbv1.AppWrapperCondition
+  // Get the last time the AppWrapper was dispatched
+  for i := (conditionsLen - 1); i > 0; i-- {
+    condition = value.Status.Conditions[i]
+    if (condition.Type != arbv1.AppWrapperCondDispatched) {
+      continue
+    }
+    dispatchConditionExists = true
+    break
+  }
+  // Now check for the minimum age and skip preempt if current time is not beyond minimum age
+  minAge := condition.LastTransitionMicroTime.Add(60 * time.Second)
+  if dispatchConditionExists && (time.Now().Before(minAge)) {
     continue
   }

   if int(value.Status.Running) < replicas {
-    if (replicas>0) {
-      glog.V(3).Infof("XQJ %s is eligible for preemption %v - %v , %v !!! \n", value.Name, value.Status.Running, replicas, value.Status.Succeeded)
+    //Check to see if this AW job has been dispatched for a time window before preempting
+    if (replicas > 0) {
+      glog.V(3).Infof("AppWrapper %s is eligible for preemption %v - %v , %v !!! \n", value.Name, value.Status.Running, replicas, value.Status.Succeeded)
       qjobs = append(qjobs, value)
     }
   }

From 4365053c4eb3793dcb792a16cd4234ca0781ec29 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 20:02:39 -0500
Subject: [PATCH 12/28] Added additional debugging.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 40 ++++++++++---------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index e37e76971..b219ccfe7 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -429,28 +429,30 @@ func (qjm *XController) GetQueueJobsEligibleForPreemption() []*arbv1.AppWrapper {
     continue
   }

-  //Check to see if this AW job has been dispatched for a time window before preempting
-  conditionsLen := len(value.Status.Conditions)
-  var dispatchConditionExists bool
-  dispatchConditionExists = false
-  var condition arbv1.AppWrapperCondition
-  // Get the last time the AppWrapper was dispatched
-  for i := (conditionsLen - 1); i > 0; i-- {
-    condition = value.Status.Conditions[i]
-    if (condition.Type != arbv1.AppWrapperCondDispatched) {
+  if int(value.Status.Running) < replicas {
+
+    //Check to see if this AW job has been dispatched for a time window before preempting
+    conditionsLen := len(value.Status.Conditions)
+    var dispatchConditionExists bool
+    dispatchConditionExists = false
+    var condition arbv1.AppWrapperCondition
+    // Get the last time the AppWrapper was dispatched
+    for i := (conditionsLen - 1); i > 0; i-- {
+      condition = value.Status.Conditions[i]
+      if (condition.Type != arbv1.AppWrapperCondDispatched) {
       continue
     }
-    dispatchConditionExists = true
-    break
-  }
+      dispatchConditionExists = true
+      break
+    }

-  // Now check for the minimum age and skip preempt if current time is not beyond minimum age
-  minAge := condition.LastTransitionMicroTime.Add(60 * time.Second)
-  if dispatchConditionExists && (time.Now().Before(minAge)) {
-    continue
-  }
+    // Now check for 0 running pods and for the minimum age and then
+    // skip preempt if current time is not beyond minimum age
+    minAge := condition.LastTransitionMicroTime.Add(60 * time.Second)
+    if (value.Status.Running <= 0) && (dispatchConditionExists && (time.Now().Before(minAge))) {
       continue
     }

-  if int(value.Status.Running) < replicas {
-    //Check to see if this AW job has been dispatched for a time window before preempting
     if (replicas > 0) {
       glog.V(3).Infof("AppWrapper %s is eligible for preemption %v - %v , %v !!! \n", value.Name, value.Status.Running, replicas, value.Status.Succeeded)
       qjobs = append(qjobs, value)
     }
   }

From 810e06e2f0547db95fce369d6c981f00311a3453 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sat, 14 Nov 2020 20:41:38 -0500
Subject: [PATCH 13/28] Removed additional debugging.
Signed-off-by: Diana Arroyo --- .../queuejob/queuejob_controller_ex.go | 68 +------------------ 1 file changed, 3 insertions(+), 65 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index b219ccfe7..5f719a2c9 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -389,10 +389,7 @@ func (qjm *XController) PreemptQueueJobs() { continue } newjob.Status.CanRun = false - if (q.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[PreemptQueueJobs] %s PreemptQueue &qj=%p Version=%s Status=%+v aw=%v", q.Name, q, q.ResourceVersion, q.Status, q) - glog.V(3).Infof("[PreemptQueueJobs] %s from cache &qj=%p Version=%s Status=%+v aw=%v", newjob.Name, newjob, newjob.ResourceVersion, newjob.Status, newjob) - } + message := fmt.Sprintf("Insufficient number of Running pods, minimum=%d, running=%v.", q.Spec.SchedSpec.MinAvailable, q.Status.Running) cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondPreemptCandidate, v1.ConditionTrue, "MinPodsNotRunning", message) newjob.Status.Conditions = append(newjob.Status.Conditions, cond) @@ -905,10 +902,8 @@ func (qjm *XController) UpdateQueueJobs() { // check eventQueue, qjqueue in program sequence to make sure job is not in qjqueue if _, exists, _ := qjm.eventQueue.Get(newjob); exists { continue } // do not enqueue if already in eventQueue if qjm.qjqueue.IfExist(newjob) { continue } // do not enqueue if already in qjqueue - if (newjob.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[UpdateQueueJobs] %s *Delay=%.6f seconds eventQueue.Add_byUpdateQueueJobs &qj=%p Version=%s Status=%+v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status) - } - err = qjm.enqueueIfNotPresent(newjob) + + err = qjm.enqueueIfNotPresent(newjob) if err != nil { glog.Errorf("[UpdateQueueJobs] Fail to enqueue %s to eventQueue, ignore. *Delay=%.6f seconds &qj=%p Version=%s Status=%+v err=%#v", newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob, newjob.ResourceVersion, newjob.Status, err) } else { @@ -946,9 +941,6 @@ func (cc *XController) addQueueJob(obj interface{}) { qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj.CreationTimestamp, qj.Status.ControllerFirstTimestamp) glog.V(4).Infof("[Informer-addQJ] enqueue %s &qj=%p Version=%s Status=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status) - if (qj.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[Informer-addQJ] enqueue %s &qj=%p Version=%s Status=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status) - } cc.enqueue(qj) } @@ -959,9 +951,6 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) { return } oldQJ, ok := oldObj.(*arbv1.AppWrapper) - if (newQJ.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[Informer-updateQJ] %s arrival", newQJ.Name) - } if !ok { glog.Errorf("[Informer-updateQJ] old object is not AppWrapper. enqueue(newQJ). 
oldObj=%+v", oldObj) glog.V(4).Infof("[Informer-updateQJ] %s *Delay=%.6f seconds BadOldObject enqueue &newQJ=%p Version=%s Status=%+v", newQJ.Name, time.Now().Sub(newQJ.Status.ControllerFirstTimestamp.Time).Seconds(), newQJ, newQJ.ResourceVersion, newQJ.Status) @@ -972,10 +961,6 @@ func (cc *XController) updateQueueJob(oldObj, newObj interface{}) { if (oldQJ.Name == newQJ.Name) && (larger(oldQJ.ResourceVersion, newQJ.ResourceVersion)) { glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ) glog.V(10).Infof("[Informer-updateQJ] %s ignored OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ) - if (newQJ.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[Informer-updateQJ] %s ignore OutOfOrder arrival &oldQJ=%p oldQJ=%+v", oldQJ.Name, oldQJ, oldQJ) - glog.V(3).Infof("[Informer-updateQJ] i%s gnore OutOfOrder arrival &newQJ=%p newQJ=%+v", newQJ.Name, newQJ, newQJ) - } return } glog.V(3).Infof("[Informer-updateQJ] %s *Delay=%.6f seconds normal enqueue &newQJ=%p Version=%s Status=%+v", newQJ.Name, time.Now().Sub(newQJ.Status.ControllerFirstTimestamp.Time).Seconds(), newQJ, newQJ.ResourceVersion, newQJ.Status) @@ -1005,22 +990,6 @@ func (cc *XController) enqueue(obj interface{}) error { return fmt.Errorf("[enqueue] obj is not AppWrapper. obj=%+v", obj) } - if (qj.Name == "aw-generic-deployment-2-030") { - objKey, err := GetQueueJobKey(obj) - if (err != nil) { - glog.V(3).Infof("[enqueue] Enqueue failed for %s. Error getting key from aw object: err=%+v", qj.Name, err) - } else { - _, exists, err := cc.eventQueue.GetByKey(objKey) - if (err != nil) { - glog.V(3).Infof("[enqueue] Enqueue failed for %s. Error getting aw from event queue: err=%+v", qj.Name, err) - } else { - if exists { - glog.V(3).Infof("[enqueue] %s found in event queue. This will add a duplicate in the following call to eventQueue.Add().", qj.Name) - } - } - } - glog.V(3).Infof("[enqueue] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj) - } err := cc.eventQueue.Add(qj) // add to FIFO queue if not in, update object & keep position if already in FIFO queue if err != nil { glog.Errorf("[enqueue] Fail to enqueue %s to eventQueue, ignore. *Delay=%.6f seconds &qj=%p Version=%s Status=%+v err=%#v", qj.Name, time.Now().Sub(qj.Status.ControllerFirstTimestamp.Time).Seconds(), qj, qj.ResourceVersion, qj.Status, err) @@ -1036,23 +1005,6 @@ func (cc *XController) enqueueIfNotPresent(obj interface{}) error { return fmt.Errorf("[enqueueIfNotPresent] obj is not AppWrapper. obj=%+v", obj) } - if (aw.Name == "aw-generic-deployment-2-030") { - objKey, err := GetQueueJobKey(obj) - if (err != nil) { - glog.V(3).Infof("[enqueueIfNotPresent] Enqueue failed for %s. Error getting key from aw object: err=%+v", aw.Name, err) - } else { - _, exists, err := cc.eventQueue.GetByKey(objKey) - if (err != nil) { - glog.V(3).Infof("[enqueueIfNotPresent] Enqueue failed for %s. Error getting aw from event queue: err=%+v", aw.Name, err) - } else { - if exists { - glog.V(3).Infof("[enqueueIfNotPresent] %s found in event queue. 
Should not be added on the following call to AddIfNotPresent().", aw.Name) - } - } - } - glog.V(3).Infof("[enqueueIfNotPresent] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%v", aw.Name, aw, aw.ResourceVersion, aw.Status, aw) - } - err := cc.eventQueue.AddIfNotPresent(aw) // add to FIFO queue if not in, update object & keep position if already in FIFO queue return err } @@ -1136,9 +1088,6 @@ func (cc *XController) worker() { return nil } - if (queuejob.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[worker] Popped %s from event queue &queuejob=%p queuejob=%+v", queuejob.Name, queuejob, queuejob) - } // sync AppWrapper if err := cc.syncQueueJob(queuejob); err != nil { @@ -1172,16 +1121,8 @@ func (cc *XController) syncQueueJob(qj *arbv1.AppWrapper) error { if larger(queueJob.ResourceVersion, qj.ResourceVersion) { glog.V(10).Infof("[worker-syncQJ] %s found more recent copy from cache &qj=%p qj=%+v", qj.Name, qj, qj) glog.V(10).Infof("[worker-syncQJ] %s found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob) - if (qj.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[worker-syncQJ] %s found more recent copy from event queue &qj=%p qj=%+v", qj.Name, qj, qj) - glog.V(3).Infof("[worker-syncQJ] %s found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob) - } queueJob.DeepCopyInto(qj) - if (qj.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[worker-syncQJ] %s AFTER found more recent copy from event queue &qj=%p qj=%+v", qj.Name, qj, qj) - glog.V(3).Infof("[worker-syncQJ] %s AFTER found more recent copy from cache &queueJob=%p queueJob=%+v", queueJob.Name, queueJob, queueJob) - } } // If it is Agent (not a dispatcher), update pod information @@ -1266,9 +1207,6 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error { if err = cc.qjqueue.AddIfNotPresent(qj); err != nil { glog.Errorf("[worker-manageQJ] Fail to add %s to activeQueue. Back to eventQueue activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v err=%#v", qj.Name, cc.qjqueue.IfExistActiveQ(qj), cc.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status, err) - if (qj.Name == "aw-generic-deployment-2-030") { - glog.V(3).Infof("[worker-manageQJ] %s eventQueue.Add_byEnqueue &qj=%p Version=%s Status=%+v aw=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status, qj) - } cc.enqueue(qj) } else { glog.V(3).Infof("[worker-manageQJ] %s 1Delay=%.6f seconds activeQ.Add_success activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", From 6a8bccadd295060889fa637f1112db65943ebee6 Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Sun, 15 Nov 2020 15:41:56 -0500 Subject: [PATCH 14/28] Removed duplicate failure conditions. 
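
Previously every failed dispatch attempt appended a fresh Failed condition, so
repeated retries of the same failure grew Status.Conditions without bound.
Failure-condition appends are now guarded by a small helper that compares the
candidate against the last recorded condition; a sketch of the guard pattern
introduced in utils.go by this patch:

    // Only append the Failed condition if it differs from the last one recorded.
    if !isLastConditionDuplicate(qj, arbv1.AppWrapperCondFailed, v1.ConditionTrue,
            dispatchFailureReason, dispatchFailureMessage) {
        cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondFailed, v1.ConditionTrue,
            dispatchFailureReason, dispatchFailureMessage)
        qj.Status.Conditions = append(qj.Status.Conditions, cond)
    }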
Signed-off-by: Diana Arroyo --- deployment/aw-01.yaml | 41 ++ deployment/aw-02.yaml | 36 ++ deployment/kube-arbitrator/.helmignore | 21 + deployment/kube-arbitrator/Chart.yaml | 4 + .../kube-arbitrator/templates/NOTES.txt | 1 + .../kube-arbitrator/templates/_helpers.tpl | 23 + .../kube-arbitrator/templates/configmap.yaml | 11 + .../kube-arbitrator/templates/deployment.yaml | 398 ++++++++++++++++++ .../templates/imageSecret.yaml | 10 + deployment/kube-arbitrator/values.yaml | 54 +++ deployment/operator.py | 57 +++ deployment/pod.yaml | 17 + .../queuejob/queuejob_controller_ex.go | 10 +- pkg/controller/queuejob/utils.go | 22 + 14 files changed, 701 insertions(+), 4 deletions(-) create mode 100644 deployment/aw-01.yaml create mode 100644 deployment/aw-02.yaml create mode 100644 deployment/kube-arbitrator/.helmignore create mode 100644 deployment/kube-arbitrator/Chart.yaml create mode 100644 deployment/kube-arbitrator/templates/NOTES.txt create mode 100644 deployment/kube-arbitrator/templates/_helpers.tpl create mode 100644 deployment/kube-arbitrator/templates/configmap.yaml create mode 100644 deployment/kube-arbitrator/templates/deployment.yaml create mode 100644 deployment/kube-arbitrator/templates/imageSecret.yaml create mode 100644 deployment/kube-arbitrator/values.yaml create mode 100644 deployment/operator.py create mode 100644 deployment/pod.yaml diff --git a/deployment/aw-01.yaml b/deployment/aw-01.yaml new file mode 100644 index 000000000..0f6d55904 --- /dev/null +++ b/deployment/aw-01.yaml @@ -0,0 +1,41 @@ +apiVersion: mcad.ibm.com/v1alpha1 +kind: AppWrapper +metadata: + name: stateful-set-2-replicas + labels: + quota_context: "M" +spec: + schedSpec: + minAvailable: 2 + resources: + Items: + - replicas: 1 + type: StatefulSet + template: + apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 + kind: StatefulSet + metadata: + name: stateful-set-2-replicas + labels: + app: stateful-set-2-replicas + spec: + selector: + matchLabels: + app: stateful-set-2-replicas + replicas: 2 + template: + metadata: + labels: + app: stateful-set-2-replicas + size: "2" + spec: + containers: + - name: stateful-set-2-replicas + image: k8s.gcr.io/echoserver:1.4 + resources: + requests: + cpu: "0.1" + memory: "200Mi" + limits: + cpu: "0.1" + memory: "200Mi" diff --git a/deployment/aw-02.yaml b/deployment/aw-02.yaml new file mode 100644 index 000000000..8fe12e57a --- /dev/null +++ b/deployment/aw-02.yaml @@ -0,0 +1,36 @@ +apiVersion: mcad.ibm.com/v1alpha1 +kind: AppWrapper +metadata: + name: deployment-2-replicas +spec: + schedSpec: + minAvailable: 2 + resources: + Items: + - replicas: 1 + type: Deployment + template: + apiVersion: apps/v1beta1 + kind: Deployment + metadata: + name: deployment-2-replicas + labels: + app: deployment-2-replicas + spec: + selector: + matchLabels: + app: deployment-2-replicas + replicas: 2 + template: + metadata: + labels: + app: deployment-2-replicas + spec: + containers: + - name: deployment-2-replicas + image: k8s.gcr.io/echoserver:1.4 + resources: + limits: + memory: 150Mi + requests: + memory: 150Mi diff --git a/deployment/kube-arbitrator/.helmignore b/deployment/kube-arbitrator/.helmignore new file mode 100644 index 000000000..f0c131944 --- /dev/null +++ b/deployment/kube-arbitrator/.helmignore @@ -0,0 +1,21 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj diff --git a/deployment/kube-arbitrator/Chart.yaml b/deployment/kube-arbitrator/Chart.yaml new file mode 100644 index 000000000..51fd40162 --- /dev/null +++ b/deployment/kube-arbitrator/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +description: The batch system of Kubernetes +name: kube-arbitrator +version: 0.1.0 diff --git a/deployment/kube-arbitrator/templates/NOTES.txt b/deployment/kube-arbitrator/templates/NOTES.txt new file mode 100644 index 000000000..56c65942c --- /dev/null +++ b/deployment/kube-arbitrator/templates/NOTES.txt @@ -0,0 +1 @@ +The batch system of Kubernetes. diff --git a/deployment/kube-arbitrator/templates/_helpers.tpl b/deployment/kube-arbitrator/templates/_helpers.tpl new file mode 100644 index 000000000..934a74f55 --- /dev/null +++ b/deployment/kube-arbitrator/templates/_helpers.tpl @@ -0,0 +1,23 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +*/}} +{{- define "fullname" -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +create image secret +*/}} +{{- define "imagePullSecret" }} +{{- printf "{\"auths\": {\"%s\": {\"auth\": \"%s\"}}}" .Values.imagePullSecret.registry (printf "%s:%s" .Values.imagePullSecret.username .Values.imagePullSecret.password | b64enc) | b64enc }} +{{- end }} diff --git a/deployment/kube-arbitrator/templates/configmap.yaml b/deployment/kube-arbitrator/templates/configmap.yaml new file mode 100644 index 000000000..f8e184ea2 --- /dev/null +++ b/deployment/kube-arbitrator/templates/configmap.yaml @@ -0,0 +1,11 @@ +#{{ if .Values.configMap.name }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.configMap.name }} + namespace: kube-system +data: + DISPATCHER_MODE: {{ .Values.configMap.dispatcherMode }} + DISPATCHER_AGENT_CONFIGS: {{ .Values.configMap.agentConfigs }} + QUOTA_REST_URL: {{ .Values.configMap.quotaRestUrl }} +#{{ end }} diff --git a/deployment/kube-arbitrator/templates/deployment.yaml b/deployment/kube-arbitrator/templates/deployment.yaml new file mode 100644 index 000000000..f880dca8e --- /dev/null +++ b/deployment/kube-arbitrator/templates/deployment.yaml @@ -0,0 +1,398 @@ +apiVersion: v1 +kind: Service +metadata: + name: custom-metrics-apiserver + namespace: kube-system +spec: + ports: + - name: https + port: 443 + targetPort: 6443 + - name: http + port: 80 + targetPort: 8080 + selector: + app: custom-metrics-apiserver +--- +#{{ if .Values.configMap.quotaRestUrl }} +apiVersion: v1 +kind: Service +metadata: + name: quota-http-server +spec: + type: NodePort + selector: + app: custom-metrics-apiserver + ports: + # By default and for convenience, the `targetPort` is set to the same value as the `port` field. 
+ - port: 8082 + targetPort: 80 + # Optional field + # By default and for convenience, the Kubernetes control plane will allocate a port from a range (default: 30000-32767) + nodePort: 30082 +#{{ end }} +--- +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.external.metrics.k8s.io +spec: + service: + name: custom-metrics-apiserver + namespace: kube-system + group: external.metrics.k8s.io + version: v1beta1 + insecureSkipTLSVerify: true + groupPriorityMinimum: 100 + versionPriority: 100 +--- +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.custom.metrics.k8s.io +spec: + service: + name: custom-metrics-apiserver + namespace: kube-system + group: custom.metrics.k8s.io + version: v1beta1 + insecureSkipTLSVerify: true + groupPriorityMinimum: 100 + versionPriority: 100 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: custom-metrics-server-resources +rules: +- apiGroups: + - custom.metrics.k8s.io + resources: ["*"] + verbs: ["*"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: custom-metrics-resource-reader +rules: +- apiGroups: + - "" + resources: + - namespaces + - pods + - services + verbs: + - get + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: hpa-controller-custom-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: custom-metrics-server-resources +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: schedulingspecs.arbitrator.incubator.k8s.io +spec: + group: arbitrator.incubator.k8s.io + names: + kind: SchedulingSpec + listKind: SchedulingSpecList + plural: schedulingspecs + singular: schedulingspec + scope: Namespaced + version: v1alpha1 +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: queuejobs.arbitrator.incubator.k8s.io +spec: + group: arbitrator.incubator.k8s.io + names: + kind: QueueJob + listKind: QueueJobList + plural: queuejobs + singular: queuejob + scope: Namespaced + version: v1alpha1 +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: xqueuejobs.arbitrator.incubator.k8s.io +spec: + group: arbitrator.incubator.k8s.io + names: + kind: XQueueJob + listKind: XQueueJobList + plural: xqueuejobs + singular: xqueuejob + scope: Namespaced + version: v1alpha1 +--- +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: appwrappers.arbitrator.incubator.k8s.io +spec: + group: arbitrator.incubator.k8s.io + names: + kind: AppWrapper + listKind: AppWrapperList + plural: appwrappers + singular: appwrapper + scope: Namespaced + version: v1alpha1 +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + annotations: + rbac.authorization.kubernetes.io/autoupdate: "true" + name: system:controller:xqueuejob-controller + labels: + kubernetes.io/bootstrapping: rbac-defaults +rules: +- apiGroups: + - arbitrator.incubator.k8s.io + resources: + - xqueuejobs + - queuejobs + - schedulingspecs + - appwrappers + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - persistentvolumes + - namespaces + verbs: + - create + - delete + - deletecollection + - get + - list + - patch + - update + - watch +#{{ if 
.Values.quotaManagement.rbac.apiGroup }} +#{{ if .Values.quotaManagement.rbac.resource }} +- apiGroups: + - {{ .Values.quotaManagement.rbac.apiGroup }} + resources: + - {{ .Values.quotaManagement.rbac.resource }} + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +#{{ end }} +#{{ end }} +--- +#{{ if .Values.serviceAccount }} +apiVersion: v1 +#{{ if .Values.imagePullSecret.name }} +imagePullSecrets: +- name: {{ .Values.imagePullSecret.name }} +#{{ end }} +kind: ServiceAccount +metadata: + labels: + wdc.ibm.com/ownership: admin + name: {{ .Values.serviceAccount }} + namespace: kube-system +#{{ end }} +--- +#{{ if .Values.serviceAccount }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: custom-metrics:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount }} + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: custom-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount }} + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: custom-metrics-resource-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: custom-metrics-resource-reader +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount }} + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + rbac.authorization.kubernetes.io/autoupdate: "true" + labels: + kubernetes.io/bootstrapping: rbac-defaults + name: system:controller:xqueuejob-controller +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:controller:xqueuejob-controller +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount }} + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + rbac.authorization.kubernetes.io/autoupdate: "true" + labels: + kubernetes.io/bootstrapping: rbac-defaults + name: system:controller:xqueuejob-controller-edit +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: edit +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount }} + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + annotations: + rbac.authorization.kubernetes.io/autoupdate: "true" + labels: + kubernetes.io/bootstrapping: rbac-defaults + name: system:controller:xqueuejob-controller-kube-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:kube-scheduler +subjects: +- kind: ServiceAccount + name: {{ .Values.serviceAccount }} + namespace: kube-system +#{{ end }} +--- +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: {{ .Values.deploymentName }} + namespace: kube-system + labels: + chart: "{{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}" + app: custom-metrics-apiserver +spec: + replicas: {{ .Values.replicaCount }} + matchLabels: + app: custom-metrics-apiserver + template: + metadata: + labels: + app: custom-metrics-apiserver + name: {{ .Values.deploymentName }} + spec: +#{{ if .Values.serviceAccount }} + serviceAccountName: {{ 
.Values.serviceAccount }} +#{{ end }} +#{{ if .Values.nodeSelector.hostname }} + nodeSelector: + kubernetes.io/hostname: {{ .Values.nodeSelector.hostname }} +#{{ end }} + volumes: + - name: temp-vol + emptyDir: {} +#{{ if .Values.volumes.hostPath }} + - name: agent-config-vol + hostPath: + path: {{ .Values.volumes.hostPath }} +#{{ end }} + containers: +#{{ if .Values.configMap.quotaRestUrl }} + - name: "quota-httpserver" + image: "{{ .Values.httpServerImage.repository }}:{{ .Values.httpServerImage.tag }}" + imagePullPolicy: {{ .Values.httpServerImage.pullPolicy }} + ports: + - containerPort: 80 + - name: "quota-management" + image: "{{ .Values.httpImage.repository }}:{{ .Values.httpImage.tag }}" + imagePullPolicy: {{ .Values.httpImage.pullPolicy }} + ports: + - containerPort: 8081 + volumeMounts: + - mountPath: /tmp + name: temp-vol +#{{ if .Values.volumes.hostPath }} + - name: agent-config-vol + mountPath: /root/kubernetes +#{{ end }} +#{{ end }} + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["mcad-controller"] + args: ["--v", "{{ .Values.loglevel }}", "--logtostderr"] +# args: ["--v", "10", "--logtostderr", "--secure-port=6443"] + ports: + - containerPort: 6443 + name: https + - containerPort: 8080 + name: http + volumeMounts: + - mountPath: /tmp + name: temp-vol +#{{ if .Values.volumes.hostPath }} + - name: agent-config-vol + mountPath: /root/kubernetes +#{{ end }} +#{{ if .Values.configMap.name }} + envFrom: + - configMapRef: + name: {{ .Values.configMap.name }} +#{{ end }} + resources: +{{ toYaml .Values.resources | indent 10 }} + diff --git a/deployment/kube-arbitrator/templates/imageSecret.yaml b/deployment/kube-arbitrator/templates/imageSecret.yaml new file mode 100644 index 000000000..55d308d02 --- /dev/null +++ b/deployment/kube-arbitrator/templates/imageSecret.yaml @@ -0,0 +1,10 @@ +#{{ if .Values.imagePullSecret.name }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.imagePullSecret.name }} + namespace: kube-system +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: {{ template "imagePullSecret" . }} +#{{ end }} diff --git a/deployment/kube-arbitrator/values.yaml b/deployment/kube-arbitrator/values.yaml new file mode 100644 index 000000000..343c84274 --- /dev/null +++ b/deployment/kube-arbitrator/values.yaml @@ -0,0 +1,54 @@ +# Default values for kube-arbitrator. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+deploymentName: xqueuejob-controller +namespace: kube-system +replicaCount: 1 +loglevel: 4 +image: + repository: mcad-controller + tag: latest + pullPolicy: Always + +httpImage: + repository: restserver + tag: latest + pullPolicy: Always + +httpServerImage: + repository: httpserver + tag: latest + pullPolicy: Always + +resources: + limits: + cpu: 2000m + memory: 2048Mi + requests: + cpu: 2000m + memory: 2048Mi + +imagePullSecret: + name: + username: iamapikey + registry: registry.stage1.ng.bluemix.net + password: dummyvalue + +serviceAccount: xqueuejob-controller + +nodeSelector: + hostname: + +configMap: + name: + dispatcherMode: '"false"' + agentConfigs: + quoteRestUrl: + +volumes: + hostPath: + +quotaManagement: + rbac: + apiGroup: + resource: \ No newline at end of file diff --git a/deployment/operator.py b/deployment/operator.py new file mode 100644 index 000000000..aed1a4ef2 --- /dev/null +++ b/deployment/operator.py @@ -0,0 +1,57 @@ +import kopf +import kubernetes +import yaml + +@kopf.on.create('ibm.com', 'v1beta1', 'resourceplans') +def create_fn(body, spec, **kwargs): + # Get info from Database object + name = body['metadata']['name'] + namespace = body['metadata']['namespace'] + type = spec['type'] + + # Make sure type is provided + #if not type: + # raise kopf.HandlerFatalError(f"Type must be set. Got {type}.") + + # Pod template + pod = {'apiVersion': 'v1', 'metadata': {'name' : name, 'labels': {'app': 'db'}}} + + # Service template + svc = {'apiVersion': 'v1', 'metadata': {'name' : name}, 'spec': { 'selector': {'app': 'db'}, 'type': 'NodePort'}} + + # Update templates based on Database specification + + #if type == 'mongo': + # image = 'mongo:4.0' + # port = 27017 + # pod['spec'] = { 'containers': [ { 'image': image, 'name': type } ]} + # svc['spec']['ports'] = [{ 'port': port, 'targetPort': port}] + #if type == 'mysql': + image = 'mysql:8.0' + port = 3306 + pod['spec'] = { 'containers': [ { 'image': image, 'name': type, 'env': [ { 'name': 'MYSQL_ROOT_PASSWORD', 'value': 'my_passwd' } ] } ]} + svc['spec']['ports'] = [{ 'port': port, 'targetPort': port}] + + # Make the Pod and Service the children of the Database object + kopf.adopt(pod, owner=body) + kopf.adopt(svc, owner=body) + + # Object used to communicate with the API Server + api = kubernetes.client.CoreV1Api() + + # Create Pod + obj = api.create_namespaced_pod(namespace, pod) + print(f"Pod {obj.metadata.name} created") + + # Create Service + obj = api.create_namespaced_service(namespace, svc) + print(f"NodePort Service {obj.metadata.name} created, exposing on port {obj.spec.ports[0].node_port}") + + # Update status + msg = f"Pod and Service created by Resource Plan {name}" + return {'message': msg} + +@kopf.on.delete('ibm.com', 'v1beta1', 'resourceplans') +def delete(body, **kwargs): + msg = f"Resource Plan {body['metadata']['name']} and its Pod / Service children deleted" + return {'message': msg} diff --git a/deployment/pod.yaml b/deployment/pod.yaml new file mode 100644 index 000000000..270805755 --- /dev/null +++ b/deployment/pod.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Pod +metadata: + name: static-web + labels: + role: myrole +spec: + containers: + - name: web + image: nginx + resources: + requests: + cpu: 1045m + ports: + - name: web + containerPort: 80 + protocol: TCP diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 5f719a2c9..f44b45442 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ 
b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -1241,7 +1241,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
 				// Call Resource Controller of ar.Type to issue REST call to Etcd for resource creation
 				err00 := cc.qjobResControls[ar.Type].SyncQueueJob(qj, &ar)
 				if err00 != nil {
-					dispatchFailureMessage = fmt.Sprintf("Failed to create item: %s/%s", qj.Namespace, qj.Name)
+					dispatchFailureMessage = fmt.Sprintf("%s/%s creation failure: %+v", qj.Namespace, qj.Name, err00)
 					glog.V(3).Infof("[worker-manageQJ] Error dispatching job=%s type=%v Status=%+v err=%+v", qj.Name, ar.Type, qj.Status, err00)
 					dispatched = false
 					break
@@ -1252,7 +1252,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
 				glog.V(10).Infof("[worker-manageQJ] before dispatch Generic.SyncQueueJob %s &qj=%p Version=%s Status=%+v", qj.Name, qj, qj.ResourceVersion, qj.Status)
 				_, err00 := cc.genericresources.SyncQueueJob(qj, &ar)
 				if err00 != nil {
-					dispatchFailureMessage = fmt.Sprintf("Failed to create generic item: %s/%s", qj.Namespace, qj.Name)
+					dispatchFailureMessage = fmt.Sprintf("%s/%s creation failure: %+v", qj.Namespace, qj.Name, err00)
 					glog.Errorf("[worker-manageQJ] Error dispatching job=%s Status=%+v err=%+v", qj.Name, qj.Status, err00)
 					dispatched = false
 				}
@@ -1268,8 +1268,10 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper) error {
 			} else {
 				qj.Status.State = arbv1.AppWrapperStateFailed
 				qj.Status.QueueJobState = arbv1.AppWrapperCondFailed
-				cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondFailed, v1.ConditionTrue, dispatchFailureReason, dispatchFailureMessage)
-				qj.Status.Conditions = append(qj.Status.Conditions, cond)
+				if ( !isLastConditionDuplicate(qj, arbv1.AppWrapperCondFailed, v1.ConditionTrue, dispatchFailureReason, dispatchFailureMessage) ) {
+					cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondFailed, v1.ConditionTrue, dispatchFailureReason, dispatchFailureMessage)
+					qj.Status.Conditions = append(qj.Status.Conditions, cond)
+				}
 				cc.Cleanup(qj)
 			}

diff --git a/pkg/controller/queuejob/utils.go b/pkg/controller/queuejob/utils.go
index 12bf4da46..07b3892cc 100644
--- a/pkg/controller/queuejob/utils.go
+++ b/pkg/controller/queuejob/utils.go
@@ -158,3 +158,25 @@ func GenerateAppWrapperCondition(condType arbv1.AppWrapperConditionType, condStatus
 	}
 }
 
+// isLastConditionDuplicate reports whether the last recorded condition of an AppWrapper matches the given type, status, reason, and message.
+func isLastConditionDuplicate(aw *arbv1.AppWrapper, condType arbv1.AppWrapperConditionType, condStatus corev1.ConditionStatus, condReason string, condMsg string) bool {
+	if (aw.Status.Conditions == nil) {
+		return false
+	}
+
+	lastIndex := len(aw.Status.Conditions) - 1
+
+	if (lastIndex < 0) {
+		return false
+	}
+
+	lastCond := aw.Status.Conditions[lastIndex]
+	if (lastCond.Type == condType) &&
+		(lastCond.Status == condStatus) &&
+		(lastCond.Reason == condReason) &&
+		(lastCond.Message == condMsg) {
+		return true
+	} else {
+		return false
+	}
+}
\ No newline at end of file

From e9a75bfcfd3a87101169759eacce7420bf3722a9 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Sun, 15 Nov 2020 15:45:29 -0500
Subject: [PATCH 15/28] Removed extraneous files.
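
The deleted files (sample AppWrapper manifests, the kube-arbitrator Helm
chart, a sample kopf operator, and a standalone pod spec under deployment/)
were unrelated to the duplicate-condition fix in the previous patch and are
removed again here.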
Signed-off-by: Diana Arroyo --- deployment/aw-01.yaml | 41 -- deployment/aw-02.yaml | 36 -- deployment/kube-arbitrator/.helmignore | 21 - deployment/kube-arbitrator/Chart.yaml | 4 - .../kube-arbitrator/templates/NOTES.txt | 1 - .../kube-arbitrator/templates/_helpers.tpl | 23 - .../kube-arbitrator/templates/configmap.yaml | 11 - .../kube-arbitrator/templates/deployment.yaml | 398 ------------------ .../templates/imageSecret.yaml | 10 - deployment/kube-arbitrator/values.yaml | 54 --- deployment/operator.py | 57 --- deployment/pod.yaml | 17 - 12 files changed, 673 deletions(-) delete mode 100644 deployment/aw-01.yaml delete mode 100644 deployment/aw-02.yaml delete mode 100644 deployment/kube-arbitrator/.helmignore delete mode 100644 deployment/kube-arbitrator/Chart.yaml delete mode 100644 deployment/kube-arbitrator/templates/NOTES.txt delete mode 100644 deployment/kube-arbitrator/templates/_helpers.tpl delete mode 100644 deployment/kube-arbitrator/templates/configmap.yaml delete mode 100644 deployment/kube-arbitrator/templates/deployment.yaml delete mode 100644 deployment/kube-arbitrator/templates/imageSecret.yaml delete mode 100644 deployment/kube-arbitrator/values.yaml delete mode 100644 deployment/operator.py delete mode 100644 deployment/pod.yaml diff --git a/deployment/aw-01.yaml b/deployment/aw-01.yaml deleted file mode 100644 index 0f6d55904..000000000 --- a/deployment/aw-01.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: mcad.ibm.com/v1alpha1 -kind: AppWrapper -metadata: - name: stateful-set-2-replicas - labels: - quota_context: "M" -spec: - schedSpec: - minAvailable: 2 - resources: - Items: - - replicas: 1 - type: StatefulSet - template: - apiVersion: apps/v1 # for versions before 1.9.0 use apps/v1beta2 - kind: StatefulSet - metadata: - name: stateful-set-2-replicas - labels: - app: stateful-set-2-replicas - spec: - selector: - matchLabels: - app: stateful-set-2-replicas - replicas: 2 - template: - metadata: - labels: - app: stateful-set-2-replicas - size: "2" - spec: - containers: - - name: stateful-set-2-replicas - image: k8s.gcr.io/echoserver:1.4 - resources: - requests: - cpu: "0.1" - memory: "200Mi" - limits: - cpu: "0.1" - memory: "200Mi" diff --git a/deployment/aw-02.yaml b/deployment/aw-02.yaml deleted file mode 100644 index 8fe12e57a..000000000 --- a/deployment/aw-02.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: mcad.ibm.com/v1alpha1 -kind: AppWrapper -metadata: - name: deployment-2-replicas -spec: - schedSpec: - minAvailable: 2 - resources: - Items: - - replicas: 1 - type: Deployment - template: - apiVersion: apps/v1beta1 - kind: Deployment - metadata: - name: deployment-2-replicas - labels: - app: deployment-2-replicas - spec: - selector: - matchLabels: - app: deployment-2-replicas - replicas: 2 - template: - metadata: - labels: - app: deployment-2-replicas - spec: - containers: - - name: deployment-2-replicas - image: k8s.gcr.io/echoserver:1.4 - resources: - limits: - memory: 150Mi - requests: - memory: 150Mi diff --git a/deployment/kube-arbitrator/.helmignore b/deployment/kube-arbitrator/.helmignore deleted file mode 100644 index f0c131944..000000000 --- a/deployment/kube-arbitrator/.helmignore +++ /dev/null @@ -1,21 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. 
-.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj diff --git a/deployment/kube-arbitrator/Chart.yaml b/deployment/kube-arbitrator/Chart.yaml deleted file mode 100644 index 51fd40162..000000000 --- a/deployment/kube-arbitrator/Chart.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -description: The batch system of Kubernetes -name: kube-arbitrator -version: 0.1.0 diff --git a/deployment/kube-arbitrator/templates/NOTES.txt b/deployment/kube-arbitrator/templates/NOTES.txt deleted file mode 100644 index 56c65942c..000000000 --- a/deployment/kube-arbitrator/templates/NOTES.txt +++ /dev/null @@ -1 +0,0 @@ -The batch system of Kubernetes. diff --git a/deployment/kube-arbitrator/templates/_helpers.tpl b/deployment/kube-arbitrator/templates/_helpers.tpl deleted file mode 100644 index 934a74f55..000000000 --- a/deployment/kube-arbitrator/templates/_helpers.tpl +++ /dev/null @@ -1,23 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "fullname" -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -create image secret -*/}} -{{- define "imagePullSecret" }} -{{- printf "{\"auths\": {\"%s\": {\"auth\": \"%s\"}}}" .Values.imagePullSecret.registry (printf "%s:%s" .Values.imagePullSecret.username .Values.imagePullSecret.password | b64enc) | b64enc }} -{{- end }} diff --git a/deployment/kube-arbitrator/templates/configmap.yaml b/deployment/kube-arbitrator/templates/configmap.yaml deleted file mode 100644 index f8e184ea2..000000000 --- a/deployment/kube-arbitrator/templates/configmap.yaml +++ /dev/null @@ -1,11 +0,0 @@ -#{{ if .Values.configMap.name }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ .Values.configMap.name }} - namespace: kube-system -data: - DISPATCHER_MODE: {{ .Values.configMap.dispatcherMode }} - DISPATCHER_AGENT_CONFIGS: {{ .Values.configMap.agentConfigs }} - QUOTA_REST_URL: {{ .Values.configMap.quotaRestUrl }} -#{{ end }} diff --git a/deployment/kube-arbitrator/templates/deployment.yaml b/deployment/kube-arbitrator/templates/deployment.yaml deleted file mode 100644 index f880dca8e..000000000 --- a/deployment/kube-arbitrator/templates/deployment.yaml +++ /dev/null @@ -1,398 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: custom-metrics-apiserver - namespace: kube-system -spec: - ports: - - name: https - port: 443 - targetPort: 6443 - - name: http - port: 80 - targetPort: 8080 - selector: - app: custom-metrics-apiserver ---- -#{{ if .Values.configMap.quotaRestUrl }} -apiVersion: v1 -kind: Service -metadata: - name: quota-http-server -spec: - type: NodePort - selector: - app: custom-metrics-apiserver - ports: - # By default and for convenience, the `targetPort` is set to the same value as the `port` field. 
- - port: 8082 - targetPort: 80 - # Optional field - # By default and for convenience, the Kubernetes control plane will allocate a port from a range (default: 30000-32767) - nodePort: 30082 -#{{ end }} ---- -apiVersion: apiregistration.k8s.io/v1beta1 -kind: APIService -metadata: - name: v1beta1.external.metrics.k8s.io -spec: - service: - name: custom-metrics-apiserver - namespace: kube-system - group: external.metrics.k8s.io - version: v1beta1 - insecureSkipTLSVerify: true - groupPriorityMinimum: 100 - versionPriority: 100 ---- -apiVersion: apiregistration.k8s.io/v1beta1 -kind: APIService -metadata: - name: v1beta1.custom.metrics.k8s.io -spec: - service: - name: custom-metrics-apiserver - namespace: kube-system - group: custom.metrics.k8s.io - version: v1beta1 - insecureSkipTLSVerify: true - groupPriorityMinimum: 100 - versionPriority: 100 ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: custom-metrics-server-resources -rules: -- apiGroups: - - custom.metrics.k8s.io - resources: ["*"] - verbs: ["*"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: custom-metrics-resource-reader -rules: -- apiGroups: - - "" - resources: - - namespaces - - pods - - services - verbs: - - get - - list ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: hpa-controller-custom-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: custom-metrics-server-resources -subjects: -- kind: ServiceAccount - name: horizontal-pod-autoscaler - namespace: kube-system ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: schedulingspecs.arbitrator.incubator.k8s.io -spec: - group: arbitrator.incubator.k8s.io - names: - kind: SchedulingSpec - listKind: SchedulingSpecList - plural: schedulingspecs - singular: schedulingspec - scope: Namespaced - version: v1alpha1 ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: queuejobs.arbitrator.incubator.k8s.io -spec: - group: arbitrator.incubator.k8s.io - names: - kind: QueueJob - listKind: QueueJobList - plural: queuejobs - singular: queuejob - scope: Namespaced - version: v1alpha1 ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: xqueuejobs.arbitrator.incubator.k8s.io -spec: - group: arbitrator.incubator.k8s.io - names: - kind: XQueueJob - listKind: XQueueJobList - plural: xqueuejobs - singular: xqueuejob - scope: Namespaced - version: v1alpha1 ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: appwrappers.arbitrator.incubator.k8s.io -spec: - group: arbitrator.incubator.k8s.io - names: - kind: AppWrapper - listKind: AppWrapperList - plural: appwrappers - singular: appwrapper - scope: Namespaced - version: v1alpha1 ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - annotations: - rbac.authorization.kubernetes.io/autoupdate: "true" - name: system:controller:xqueuejob-controller - labels: - kubernetes.io/bootstrapping: rbac-defaults -rules: -- apiGroups: - - arbitrator.incubator.k8s.io - resources: - - xqueuejobs - - queuejobs - - schedulingspecs - - appwrappers - verbs: - - create - - delete - - deletecollection - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - persistentvolumes - - namespaces - verbs: - - create - - delete - - deletecollection - - get - - list - - patch - - update - - watch -#{{ if 
.Values.quotaManagement.rbac.apiGroup }} -#{{ if .Values.quotaManagement.rbac.resource }} -- apiGroups: - - {{ .Values.quotaManagement.rbac.apiGroup }} - resources: - - {{ .Values.quotaManagement.rbac.resource }} - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -#{{ end }} -#{{ end }} ---- -#{{ if .Values.serviceAccount }} -apiVersion: v1 -#{{ if .Values.imagePullSecret.name }} -imagePullSecrets: -- name: {{ .Values.imagePullSecret.name }} -#{{ end }} -kind: ServiceAccount -metadata: - labels: - wdc.ibm.com/ownership: admin - name: {{ .Values.serviceAccount }} - namespace: kube-system -#{{ end }} ---- -#{{ if .Values.serviceAccount }} -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: custom-metrics:system:auth-delegator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:auth-delegator -subjects: -- kind: ServiceAccount - name: {{ .Values.serviceAccount }} - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: custom-metrics-auth-reader - namespace: kube-system -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: extension-apiserver-authentication-reader -subjects: -- kind: ServiceAccount - name: {{ .Values.serviceAccount }} - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: custom-metrics-resource-reader -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: custom-metrics-resource-reader -subjects: -- kind: ServiceAccount - name: {{ .Values.serviceAccount }} - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - annotations: - rbac.authorization.kubernetes.io/autoupdate: "true" - labels: - kubernetes.io/bootstrapping: rbac-defaults - name: system:controller:xqueuejob-controller -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:controller:xqueuejob-controller -subjects: -- kind: ServiceAccount - name: {{ .Values.serviceAccount }} - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - annotations: - rbac.authorization.kubernetes.io/autoupdate: "true" - labels: - kubernetes.io/bootstrapping: rbac-defaults - name: system:controller:xqueuejob-controller-edit -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: edit -subjects: -- kind: ServiceAccount - name: {{ .Values.serviceAccount }} - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - annotations: - rbac.authorization.kubernetes.io/autoupdate: "true" - labels: - kubernetes.io/bootstrapping: rbac-defaults - name: system:controller:xqueuejob-controller-kube-scheduler -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:kube-scheduler -subjects: -- kind: ServiceAccount - name: {{ .Values.serviceAccount }} - namespace: kube-system -#{{ end }} ---- -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - name: {{ .Values.deploymentName }} - namespace: kube-system - labels: - chart: "{{ .Chart.Name }}-{{ .Chart.Version | replace "+" "_" }}" - app: custom-metrics-apiserver -spec: - replicas: {{ .Values.replicaCount }} - matchLabels: - app: custom-metrics-apiserver - template: - metadata: - labels: - app: custom-metrics-apiserver - name: {{ .Values.deploymentName }} - spec: -#{{ if .Values.serviceAccount }} - serviceAccountName: {{ 
.Values.serviceAccount }} -#{{ end }} -#{{ if .Values.nodeSelector.hostname }} - nodeSelector: - kubernetes.io/hostname: {{ .Values.nodeSelector.hostname }} -#{{ end }} - volumes: - - name: temp-vol - emptyDir: {} -#{{ if .Values.volumes.hostPath }} - - name: agent-config-vol - hostPath: - path: {{ .Values.volumes.hostPath }} -#{{ end }} - containers: -#{{ if .Values.configMap.quotaRestUrl }} - - name: "quota-httpserver" - image: "{{ .Values.httpServerImage.repository }}:{{ .Values.httpServerImage.tag }}" - imagePullPolicy: {{ .Values.httpServerImage.pullPolicy }} - ports: - - containerPort: 80 - - name: "quota-management" - image: "{{ .Values.httpImage.repository }}:{{ .Values.httpImage.tag }}" - imagePullPolicy: {{ .Values.httpImage.pullPolicy }} - ports: - - containerPort: 8081 - volumeMounts: - - mountPath: /tmp - name: temp-vol -#{{ if .Values.volumes.hostPath }} - - name: agent-config-vol - mountPath: /root/kubernetes -#{{ end }} -#{{ end }} - - name: {{ .Chart.Name }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["mcad-controller"] - args: ["--v", "{{ .Values.loglevel }}", "--logtostderr"] -# args: ["--v", "10", "--logtostderr", "--secure-port=6443"] - ports: - - containerPort: 6443 - name: https - - containerPort: 8080 - name: http - volumeMounts: - - mountPath: /tmp - name: temp-vol -#{{ if .Values.volumes.hostPath }} - - name: agent-config-vol - mountPath: /root/kubernetes -#{{ end }} -#{{ if .Values.configMap.name }} - envFrom: - - configMapRef: - name: {{ .Values.configMap.name }} -#{{ end }} - resources: -{{ toYaml .Values.resources | indent 10 }} - diff --git a/deployment/kube-arbitrator/templates/imageSecret.yaml b/deployment/kube-arbitrator/templates/imageSecret.yaml deleted file mode 100644 index 55d308d02..000000000 --- a/deployment/kube-arbitrator/templates/imageSecret.yaml +++ /dev/null @@ -1,10 +0,0 @@ -#{{ if .Values.imagePullSecret.name }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ .Values.imagePullSecret.name }} - namespace: kube-system -type: kubernetes.io/dockerconfigjson -data: - .dockerconfigjson: {{ template "imagePullSecret" . }} -#{{ end }} diff --git a/deployment/kube-arbitrator/values.yaml b/deployment/kube-arbitrator/values.yaml deleted file mode 100644 index 343c84274..000000000 --- a/deployment/kube-arbitrator/values.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# Default values for kube-arbitrator. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. 
-deploymentName: xqueuejob-controller -namespace: kube-system -replicaCount: 1 -loglevel: 4 -image: - repository: mcad-controller - tag: latest - pullPolicy: Always - -httpImage: - repository: restserver - tag: latest - pullPolicy: Always - -httpServerImage: - repository: httpserver - tag: latest - pullPolicy: Always - -resources: - limits: - cpu: 2000m - memory: 2048Mi - requests: - cpu: 2000m - memory: 2048Mi - -imagePullSecret: - name: - username: iamapikey - registry: registry.stage1.ng.bluemix.net - password: dummyvalue - -serviceAccount: xqueuejob-controller - -nodeSelector: - hostname: - -configMap: - name: - dispatcherMode: '"false"' - agentConfigs: - quoteRestUrl: - -volumes: - hostPath: - -quotaManagement: - rbac: - apiGroup: - resource: \ No newline at end of file diff --git a/deployment/operator.py b/deployment/operator.py deleted file mode 100644 index aed1a4ef2..000000000 --- a/deployment/operator.py +++ /dev/null @@ -1,57 +0,0 @@ -import kopf -import kubernetes -import yaml - -@kopf.on.create('ibm.com', 'v1beta1', 'resourceplans') -def create_fn(body, spec, **kwargs): - # Get info from Database object - name = body['metadata']['name'] - namespace = body['metadata']['namespace'] - type = spec['type'] - - # Make sure type is provided - #if not type: - # raise kopf.HandlerFatalError(f"Type must be set. Got {type}.") - - # Pod template - pod = {'apiVersion': 'v1', 'metadata': {'name' : name, 'labels': {'app': 'db'}}} - - # Service template - svc = {'apiVersion': 'v1', 'metadata': {'name' : name}, 'spec': { 'selector': {'app': 'db'}, 'type': 'NodePort'}} - - # Update templates based on Database specification - - #if type == 'mongo': - # image = 'mongo:4.0' - # port = 27017 - # pod['spec'] = { 'containers': [ { 'image': image, 'name': type } ]} - # svc['spec']['ports'] = [{ 'port': port, 'targetPort': port}] - #if type == 'mysql': - image = 'mysql:8.0' - port = 3306 - pod['spec'] = { 'containers': [ { 'image': image, 'name': type, 'env': [ { 'name': 'MYSQL_ROOT_PASSWORD', 'value': 'my_passwd' } ] } ]} - svc['spec']['ports'] = [{ 'port': port, 'targetPort': port}] - - # Make the Pod and Service the children of the Database object - kopf.adopt(pod, owner=body) - kopf.adopt(svc, owner=body) - - # Object used to communicate with the API Server - api = kubernetes.client.CoreV1Api() - - # Create Pod - obj = api.create_namespaced_pod(namespace, pod) - print(f"Pod {obj.metadata.name} created") - - # Create Service - obj = api.create_namespaced_service(namespace, svc) - print(f"NodePort Service {obj.metadata.name} created, exposing on port {obj.spec.ports[0].node_port}") - - # Update status - msg = f"Pod and Service created by Resource Plan {name}" - return {'message': msg} - -@kopf.on.delete('ibm.com', 'v1beta1', 'resourceplans') -def delete(body, **kwargs): - msg = f"Resource Plan {body['metadata']['name']} and its Pod / Service children deleted" - return {'message': msg} diff --git a/deployment/pod.yaml b/deployment/pod.yaml deleted file mode 100644 index 270805755..000000000 --- a/deployment/pod.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: static-web - labels: - role: myrole -spec: - containers: - - name: web - image: nginx - resources: - requests: - cpu: 1045m - ports: - - name: web - containerPort: 80 - protocol: TCP From fc7313bd823bd51ae240b030b5ab960407258ac7 Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Sun, 15 Nov 2020 19:42:59 -0500 Subject: [PATCH 16/28] Added quota failure message to conditions. 
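
When the head-of-line AppWrapper cannot be dispatched because the quota check
fails, dispatchFailedMessage is now populated so the subsequent backoff path
records the cause in the AppWrapper's conditions instead of only in the
controller log. An illustrative sketch of the resulting condition (the reason
string is an assumption; see ScheduleNext for the actual flow):

    // Appended by backoff() when quota blocks dispatch of the head-of-line job.
    cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue,
        "AppWrapper not runnable.", "Insufficient quota to dispatch AppWrapper.")
    qj.Status.Conditions = append(qj.Status.Conditions, cond)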
Signed-off-by: Diana Arroyo --- pkg/controller/queuejob/queuejob_controller_ex.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 97d958d0a..3a95dd4d3 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -928,6 +928,7 @@ func (qjm *XController) ScheduleNext() { } } } else { // Not enough free resources to dispatch HOL + dispatchFailedMessage = "Insufficient quota to dispatch AppWrapper." glog.V(3).Infof("[ScheduleNext] HOL Blocking by %s for %s activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v due to quota limits", qj.Name, time.Now().Sub(HOLStartTime), qjm.qjqueue.IfExistActiveQ(qj), qjm.qjqueue.IfExistUnschedulableQ(qj), qj, qj.ResourceVersion, qj.Status) } } else { // Not enough free resources to dispatch HOL From a2f7e3f65d01850b9fb6c87d8f99fffac9e3a3d6 Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 02:41:55 -0500 Subject: [PATCH 17/28] Fix to refresh cache before updating backoff conditions. Signed-off-by: Diana Arroyo --- .../queuejob/queuejob_controller_ex.go | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 3a95dd4d3..8c028420f 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -967,21 +967,39 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error { } func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message string) { - q.Status.QueueJobState = arbv1.AppWrapperCondBackoff - cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message) - q.Status.Conditions = append(q.Status.Conditions, cond) - q.Status.FilterIgnore = true // update QueueJobState only, no work needed - qjm.updateEtcd(q, "backoff - Rejoining") - qjm.qjqueue.AddUnschedulableIfNotPresent(q) - glog.V(3).Infof("[backoff] %s move to unschedulableQ before sleep for %d seconds. activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", q.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((q)), qjm.qjqueue.IfExistUnschedulableQ((q)), q, q.ResourceVersion, q.Status) + var workingAW *arbv1.AppWrapper + apiCacheAWJob, e := qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name) + // Update condition + if (e == nil) { + workingAW = apiCacheAWJob + apiCacheAWJob.Status.QueueJobState = arbv1.AppWrapperCondBackoff + cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message) + workingAW.Status.Conditions = append(workingAW.Status.Conditions, cond) + workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed + qjm.updateEtcd(workingAW, "backoff - Rejoining") + } else { + workingAW = q + glog.Errorf("[backoff] Failed to retrieve cached object for %s/%s. Continuing with possible stale object without updating conditions.", workingAW.Namespace,workingAW.Name) + + } + qjm.qjqueue.AddUnschedulableIfNotPresent(workingAW) + glog.V(3).Infof("[backoff] %s move to unschedulableQ before sleep for %d seconds. 
activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", workingAW.Name, + qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status) time.Sleep(time.Duration(qjm.serverOption.BackoffTime) * time.Second) - qjm.qjqueue.MoveToActiveQueueIfExists(q) - q.Status.QueueJobState = arbv1.AppWrapperCondQueueing - returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "BackoffTimerExpired.", "") - q.Status.Conditions = append(q.Status.Conditions, returnCond) - q.Status.FilterIgnore = true // update QueueJobState only, no work needed - qjm.updateEtcd(q, "backoff - Queueing") - glog.V(3).Infof("[backoff] %s activeQ.Add after sleep for %d seconds. activeQ=%t Unsched=%t &aw=%p Version=%s Status=%+v", q.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((q)), qjm.qjqueue.IfExistUnschedulableQ((q)), q, q.ResourceVersion, q.Status) + qjm.qjqueue.MoveToActiveQueueIfExists(workingAW) + + // Update condition after backoff + apiCacheAWJob, e = qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name) + if (e == nil) { + workingAW = apiCacheAWJob + workingAW.Status.QueueJobState = arbv1.AppWrapperCondQueueing + returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "BackoffTimerExpired.", "") + workingAW.Status.Conditions = append(workingAW.Status.Conditions, returnCond) + workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed + qjm.updateEtcd(workingAW, "backoff - Queueing") + } + glog.V(3).Infof("[backoff] %s activeQ.Add after sleep for %d seconds. activeQ=%t Unsched=%t &aw=%p Version=%s Status=%+v", workingAW.Name, + qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status) } // Run start AppWrapper Controller From bd5e584dca81d5a48b5b3ebeeb2f6e1f0674735a Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 03:45:51 -0500 Subject: [PATCH 18/28] Fixed etcd update to get latest object from cache before update. Signed-off-by: Diana Arroyo --- .../queuejob/queuejob_controller_ex.go | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 8c028420f..f5a6e672b 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -952,15 +952,25 @@ func (qjm *XController) ScheduleNext() { // Update AppWrappers in etcd // todo: This is a current workaround for duplicate message bug. 
func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error {
-	qj.Status.Sender = "before "+ at // set Sender string to indicate code location
-	qj.Status.Local = false // for Informer FilterFunc to pickup
-	if _, err := cc.arbclients.ArbV1().AppWrappers(qj.Namespace).Update(qj); err != nil {
-		glog.Errorf("[updateEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v", qj.Name, qj.Namespace, at, err)
+	apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name)
+
+	if (e != nil) {
+		glog.Errorf("[updateEtcd] Failed to retrieve cached AppWrapper %s, namespace: %s at %s err=%v",
+			qj.Name, qj.Namespace, at, e)
+		return e
+	}
+
+	apiCacheAWJob.Status.Sender = "before "+ at // set Sender string to indicate code location
+	apiCacheAWJob.Status.Local = false // for Informer FilterFunc to pickup
+	if _, err := cc.arbclients.ArbV1().AppWrappers(apiCacheAWJob.Namespace).Update(apiCacheAWJob); err != nil {
+		glog.Errorf("[updateEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
+			apiCacheAWJob.Name, apiCacheAWJob.Namespace, at, err)
 		return err
 	// } else {  // qjj should be the same as qj except with newer ResourceVersion
 	// 	qj.ResourceVersion = qjj.ResourceVersion // update new ResourceVersion from etcd
 	}
-	glog.V(10).Infof("[updateEtcd] AppWrapperUpdate success %s at %s &qj=%p qj=%+v", qj.Name, at, qj, qj)
+	glog.V(10).Infof("[updateEtcd] AppWrapperUpdate success %s at %s &qj=%p qj=%+v",
+		apiCacheAWJob.Name, at, apiCacheAWJob, apiCacheAWJob)
 	//qj.Status.Local = true // for Informer FilterFunc to ignore duplicate
 	//qj.Status.Sender = "after "+ at // set Sender string to indicate code location
 	return nil
@@ -1077,7 +1087,7 @@ func (qjm *XController) UpdateQueueJobs() {
 					LastTransitionMicroTime: metav1.NowMicro(),
 				},
 			}
-			glog.V(3).Infof("[UpdateQueueJobs] %s 0Delay=%.6f seconds CreationTimestamp=%s ControllerFirstTimestamp=%s",
+			glog.V(4).Infof("[UpdateQueueJobs] %s 0Delay=%.6f seconds CreationTimestamp=%s ControllerFirstTimestamp=%s",
 				newjob.Name, time.Now().Sub(newjob.Status.ControllerFirstTimestamp.Time).Seconds(), newjob.CreationTimestamp, newjob.Status.ControllerFirstTimestamp)
 		}
 		glog.V(10).Infof("[UpdateQueueJobs] %s: qjqueue=%t &qj=%p Version=%s Status=%+v", newjob.Name, qjm.qjqueue.IfExist(newjob), newjob, newjob.ResourceVersion, newjob.Status)
@@ -1607,7 +1617,7 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool
 //Cleanup function
 func (cc *XController) Cleanup(queuejob *arbv1.AppWrapper) error {
-	glog.V(3).Infof("[Cleanup] begin AppWrapper %s Version=%s Status=%+v\n", queuejob.Name, queuejob.ResourceVersion, queuejob.Status)
+	glog.V(4).Infof("[Cleanup] begin AppWrapper %s Version=%s Status=%+v\n", queuejob.Name, queuejob.ResourceVersion, queuejob.Status)
 	if !cc.isDispatcher {
 		if queuejob.Spec.AggrResources.Items != nil {
@@ -1632,7 +1642,7 @@ func (cc *XController) Cleanup(queuejob *arbv1.AppWrapper) error {
 	queuejob.Status.Running = 0
 	queuejob.Status.Succeeded = 0
 	queuejob.Status.Failed = 0
-	glog.V(3).Infof("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n", queuejob.Name, queuejob.ResourceVersion, queuejob.Status)
+	glog.V(4).Infof("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n", queuejob.Name, queuejob.ResourceVersion, queuejob.Status)
 	return nil
 }

From 2a4ae7e86ba8e9869e6fc93dcdd526e01d79c940 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Mon, 16 Nov 2020 04:29:40 -0500
Subject: [PATCH 19/28] Adjusted logging levels in cleanup to debugging
level.

Signed-off-by: Diana Arroyo
---
 pkg/controller/queuejob/queuejob_controller_ex.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index f5a6e672b..0f1cf8229 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -1642,7 +1642,7 @@ func (cc *XController) Cleanup(queuejob *arbv1.AppWrapper) error {
 queuejob.Status.Running = 0
 queuejob.Status.Succeeded = 0
 queuejob.Status.Failed = 0
- glog.V(4).Infof("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n", queuejob.Name, queuejob.ResourceVersion, queuejob.Status)
+ glog.V(10).Infof("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n", queuejob.Name, queuejob.ResourceVersion, queuejob.Status)
 return nil
 }

From e03d4c12f748d317c9b27910650d443fd7b1a148 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Mon, 16 Nov 2020 05:06:40 -0500
Subject: [PATCH 20/28] Added additional time for initial test case to account
 for new conditions field updates.

Signed-off-by: Diana Arroyo
---
 test/e2e/queue.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/e2e/queue.go b/test/e2e/queue.go
index 902cca15f..f04a8c8e0 100644
--- a/test/e2e/queue.go
+++ b/test/e2e/queue.go
@@ -29,7 +29,7 @@ var _ = Describe("AppWrapper E2E Test", func() {
 It("Create AppWrapper - Generic 100 Deployment Only - 2 pods each", func() {
 context := initTestContext()
- defer cleanupTestContextExtendedTime(context, (240 * time.Second))
+ defer cleanupTestContextExtendedTime(context, (300 * time.Second))

 const (
 awCount = 100
@@ -54,7 +54,7 @@ var _ = Describe("AppWrapper E2E Test", func() {
 }

 // Give the deployments time to create pods
- time.Sleep(2 * time.Minute)
+ time.Sleep(3 * time.Minute)
 for i := 0; i < awCount; i++ {
 if ((i+1) % modDivisor) == 0 || i ==0 {
 fmt.Fprintf(os.Stdout, "[e2e] Checking for %d replicas running for AW %s.\n", replicas, aws[i].Name)

From 3ef8fc75161608bbbb12152359590758b48d4194 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Mon, 16 Nov 2020 08:04:53 -0500
Subject: [PATCH 21/28] Add logic to remove initial deletion of AW resources.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 0f1cf8229..f30cf13e5 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -1412,12 +1412,17 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool
 if !qj.Status.CanRun && (qj.Status.State != arbv1.AppWrapperStateEnqueued && qj.Status.State != arbv1.AppWrapperStateDeleted) {
 // if there are running resources for this job then delete them because the job was put in
 // pending state...
- glog.V(2).Infof("[manageQueueJob] Deleting resources for AppWrapper Job %s because it was preempted (newjob) status=%+v\n", qj.Name, qj.Status)
- err = cc.Cleanup(qj)
- glog.V(8).Infof("[manageQueueJob] Validation after deleting resources for AppWrapper Job %s because it was be preempted (newjob) status=%+v\n", qj.Name, qj.Status)
- if err != nil {
- glog.Errorf("[manageQueueJob] Fail to delete resources for AppWrapper Job %s, err=%#v", qj.Name, err)
- return err
+
+ // If this is the first time seeing this AW, no need to delete.
+ stateLen := len(qj.Status.State)
+ if (stateLen > 0) {
+ glog.V(2).Infof("[manageQueueJob] Deleting resources for AppWrapper Job %s because it was preempted, status=%+v\n", qj.Name, qj.Status)
+ err = cc.Cleanup(qj)
+ glog.V(8).Infof("[manageQueueJob] Validation after deleting resources for AppWrapper Job %s because it was preempted, status=%+v\n", qj.Name, qj.Status)
+ if err != nil {
+ glog.Errorf("[manageQueueJob] Failed to delete resources for AppWrapper Job %s, err=%#v", qj.Name, err)
+ return err
+ }
 }

 qj.Status.State = arbv1.AppWrapperStateEnqueued

From 61026b3a19c6cc180576d0319d717fb1a678321b Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Mon, 16 Nov 2020 09:01:06 -0500
Subject: [PATCH 22/28] Back out changes of added conditions for backoff.

Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 50 ++++++++++++++-----
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index f30cf13e5..7fb26ffe1 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -960,6 +960,8 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error {
 return e
 }

+ //TODO: Remove next line
+ apiCacheAWJob = qj
 apiCacheAWJob.Status.Sender = "before "+ at // set Sender string to indicate code location
 apiCacheAWJob.Status.Local = false // for Informer FilterFunc to pickup
 if _, err := cc.arbclients.ArbV1().AppWrappers(apiCacheAWJob.Namespace).Update(apiCacheAWJob); err != nil {
@@ -976,38 +978,60 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error {
 return nil
 }

+func (cc *XController) updateStatusInEtcd(qj *arbv1.AppWrapper, at string) error {
+ apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name)
+
+ if (e != nil) {
+ glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
+ qj.Name, qj.Namespace, at, e)
+ return e
+ }
+ if _, err := cc.arbclients.ArbV1().AppWrappers(apiCacheAWJob.Namespace).UpdateStatus(apiCacheAWJob); err != nil {
+ glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
+ apiCacheAWJob.Name, apiCacheAWJob.Namespace, at, err)
+ return err
+ }
+ glog.V(10).Infof("[updateStatusInEtcd] AppWrapperUpdate success %s at %s &qj=%p qj=%+v",
+ apiCacheAWJob.Name, at, apiCacheAWJob, apiCacheAWJob)
+ return nil
+}
+
 func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message string) {
 var workingAW *arbv1.AppWrapper
- apiCacheAWJob, e := qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name)
- // Update condition
- if (e == nil) {
- workingAW = apiCacheAWJob
- apiCacheAWJob.Status.QueueJobState = arbv1.AppWrapperCondBackoff
+ //TODO: Remove next line
+ workingAW = q
+ //apiCacheAWJob, e := qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name)
+ //// Update condition
+ //if (e == nil) {
+ // workingAW = apiCacheAWJob
+ // apiCacheAWJob.Status.QueueJobState = arbv1.AppWrapperCondBackoff
 cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message)
 workingAW.Status.Conditions = append(workingAW.Status.Conditions, cond)
 workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed
 qjm.updateEtcd(workingAW, "backoff - Rejoining")
+ //qjm.updateStatusInEtcd(workingAW, "backoff - Rejoining")
 //} else {
 // workingAW = q
 glog.Errorf("[backoff] Failed 
to retrieve cached object for %s/%s. Continuing with possible stale object without updating conditions.", workingAW.Namespace,workingAW.Name) - } + //} qjm.qjqueue.AddUnschedulableIfNotPresent(workingAW) glog.V(3).Infof("[backoff] %s move to unschedulableQ before sleep for %d seconds. activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", workingAW.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status) time.Sleep(time.Duration(qjm.serverOption.BackoffTime) * time.Second) qjm.qjqueue.MoveToActiveQueueIfExists(workingAW) - // Update condition after backoff - apiCacheAWJob, e = qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name) - if (e == nil) { - workingAW = apiCacheAWJob + //// Update condition after backoff + //apiCacheAWJob, e = qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name) + //if (e == nil) { + // workingAW = apiCacheAWJob workingAW.Status.QueueJobState = arbv1.AppWrapperCondQueueing returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "BackoffTimerExpired.", "") workingAW.Status.Conditions = append(workingAW.Status.Conditions, returnCond) workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed qjm.updateEtcd(workingAW, "backoff - Queueing") - } + //qjm.updateStatusInEtcd(workingAW, "backoff - Queueing") + //} glog.V(3).Infof("[backoff] %s activeQ.Add after sleep for %d seconds. activeQ=%t Unsched=%t &aw=%p Version=%s Status=%+v", workingAW.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status) } From b4007a3fdc4db96cf54f409ac849c86fe74fc16b Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 09:42:38 -0500 Subject: [PATCH 23/28] Revert changes to test for 100 AWs. Signed-off-by: Diana Arroyo --- test/e2e/queue.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/queue.go b/test/e2e/queue.go index f04a8c8e0..902cca15f 100644 --- a/test/e2e/queue.go +++ b/test/e2e/queue.go @@ -29,7 +29,7 @@ var _ = Describe("AppWrapper E2E Test", func() { It("Create AppWrapper - Generic 100 Deployment Only - 2 pods each", func() { context := initTestContext() - defer cleanupTestContextExtendedTime(context, (300 * time.Second)) + defer cleanupTestContextExtendedTime(context, (240 * time.Second)) const ( awCount = 100 @@ -54,7 +54,7 @@ var _ = Describe("AppWrapper E2E Test", func() { } // Give the deployments time to create pods - time.Sleep(3 * time.Minute) + time.Sleep(2 * time.Minute) for i := 0; i < awCount; i++ { if ((i+1) % modDivisor) == 0 || i ==0 { fmt.Fprintf(os.Stdout, "[e2e] Checking for %d replicas running for AW %s.\n", replicas, aws[i].Name) From d8b333e78900494a30408f7e665374879e2ac6e1 Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 09:47:39 -0500 Subject: [PATCH 24/28] Removed additional query to cache for updates to etcd. 
Signed-off-by: Diana Arroyo --- .../queuejob/queuejob_controller_ex.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 7fb26ffe1..b1392a584 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -952,14 +952,16 @@ func (qjm *XController) ScheduleNext() { // Update AppWrappers in etcd // todo: This is a current workaround for duplicate message bug. func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error { - apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name) - - if (e != nil) { - glog.Errorf("[updateEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v", - apiCacheAWJob.Name, apiCacheAWJob.Namespace, at, e) - return e - } + //apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name) + // + //if (e != nil) { + // glog.Errorf("[updateEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v", + // apiCacheAWJob.Name, apiCacheAWJob.Namespace, at, e) + // return e + //} + //TODO: Remove next line + var apiCacheAWJob*arbv1.AppWrapper //TODO: Remove next line apiCacheAWJob = qj apiCacheAWJob.Status.Sender = "before "+ at // set Sender string to indicate code location From cd2a1fc0613e26d22826a235621722ecf454267e Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 10:28:43 -0500 Subject: [PATCH 25/28] Pad more time before cleanup of test case 100 AWs. Signed-off-by: Diana Arroyo --- test/e2e/queue.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/queue.go b/test/e2e/queue.go index 902cca15f..03b8bfa29 100644 --- a/test/e2e/queue.go +++ b/test/e2e/queue.go @@ -29,7 +29,7 @@ var _ = Describe("AppWrapper E2E Test", func() { It("Create AppWrapper - Generic 100 Deployment Only - 2 pods each", func() { context := initTestContext() - defer cleanupTestContextExtendedTime(context, (240 * time.Second)) + defer cleanupTestContextExtendedTime(context, (300 * time.Second)) const ( awCount = 100 From f8a26bc0dc6e9256bc1899b73bb315cafe52082c Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 11:20:29 -0500 Subject: [PATCH 26/28] Added non-cache status update to conditions. 
Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index b1392a584..492c4a2c0 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -981,13 +981,17 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error {
 }

 func (cc *XController) updateStatusInEtcd(qj *arbv1.AppWrapper, at string) error {
- apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name)
-
- if (e != nil) {
- glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
- qj.Name, qj.Namespace, at, e)
- return e
- }
+ //apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name)
+ //
+ //if (e != nil) {
+ // glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
+ // qj.Name, qj.Namespace, at, e)
+ // return e
+ //}
+ //TODO: Remove next line
+ var apiCacheAWJob *arbv1.AppWrapper
+ //TODO: Remove next line
+ apiCacheAWJob = qj
 if _, err := cc.arbclients.ArbV1().AppWrappers(apiCacheAWJob.Namespace).UpdateStatus(apiCacheAWJob); err != nil {
 glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
 apiCacheAWJob.Name, apiCacheAWJob.Namespace, at, err)
@@ -1010,8 +1014,8 @@ func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message stri
 cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message)
 workingAW.Status.Conditions = append(workingAW.Status.Conditions, cond)
 workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed
- qjm.updateEtcd(workingAW, "backoff - Rejoining")
- //qjm.updateStatusInEtcd(workingAW, "backoff - Rejoining")
+ //qjm.updateEtcd(workingAW, "backoff - Rejoining")
+ qjm.updateStatusInEtcd(workingAW, "backoff - Rejoining")
 //} else {
 // workingAW = q
 glog.Errorf("[backoff] Failed to retrieve cached object for %s/%s. Continuing with possible stale object without updating conditions.", workingAW.Namespace,workingAW.Name)
@@ -1031,8 +1035,8 @@ func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message stri
 returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "BackoffTimerExpired.", "")
 workingAW.Status.Conditions = append(workingAW.Status.Conditions, returnCond)
 workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed
- qjm.updateEtcd(workingAW, "backoff - Queueing")
- //qjm.updateStatusInEtcd(workingAW, "backoff - Queueing")
+ //qjm.updateEtcd(workingAW, "backoff - Queueing")
+ qjm.updateStatusInEtcd(workingAW, "backoff - Queueing")
 //}
 glog.V(3).Infof("[backoff] %s activeQ.Add after sleep for %d seconds. activeQ=%t Unsched=%t &aw=%p Version=%s Status=%+v", workingAW.Name,
 qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status)

From 66802c0a4f587e8778b40e2ad62981423ece34d3 Mon Sep 17 00:00:00 2001
From: Diana Arroyo
Date: Mon, 16 Nov 2020 11:53:07 -0500
Subject: [PATCH 27/28] Added refresh of AW from cache before updating initial
 backoff condition.
Signed-off-by: Diana Arroyo --- .../queuejob/queuejob_controller_ex.go | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 492c4a2c0..0d7b5b1d5 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -1004,23 +1004,21 @@ func (cc *XController) updateStatusInEtcd(qj *arbv1.AppWrapper, at string) error func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message string) { var workingAW *arbv1.AppWrapper - //TODO: Remove next line - workingAW = q - //apiCacheAWJob, e := qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name) - //// Update condition - //if (e == nil) { - // workingAW = apiCacheAWJob - // apiCacheAWJob.Status.QueueJobState = arbv1.AppWrapperCondBackoff + apiCacheAWJob, e := qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name) + // Update condition + if (e == nil) { + workingAW = apiCacheAWJob + apiCacheAWJob.Status.QueueJobState = arbv1.AppWrapperCondBackoff cond := GenerateAppWrapperCondition(arbv1.AppWrapperCondBackoff, v1.ConditionTrue, reason, message) workingAW.Status.Conditions = append(workingAW.Status.Conditions, cond) workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed //qjm.updateEtcd(workingAW, "backoff - Rejoining") qjm.updateStatusInEtcd(workingAW, "backoff - Rejoining") - //} else { - // workingAW = q + } else { + workingAW = q glog.Errorf("[backoff] Failed to retrieve cached object for %s/%s. Continuing with possible stale object without updating conditions.", workingAW.Namespace,workingAW.Name) - //} + } qjm.qjqueue.AddUnschedulableIfNotPresent(workingAW) glog.V(3).Infof("[backoff] %s move to unschedulableQ before sleep for %d seconds. activeQ=%t Unsched=%t &qj=%p Version=%s Status=%+v", workingAW.Name, qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status) From c9379d00a813f4500f71574a348c3a1907174c5e Mon Sep 17 00:00:00 2001 From: Diana Arroyo Date: Mon, 16 Nov 2020 12:35:56 -0500 Subject: [PATCH 28/28] Cleanup for add conditions branch. 
Signed-off-by: Diana Arroyo
---
 .../queuejob/queuejob_controller_ex.go | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go
index 0d7b5b1d5..371ea4e8c 100644
--- a/pkg/controller/queuejob/queuejob_controller_ex.go
+++ b/pkg/controller/queuejob/queuejob_controller_ex.go
@@ -981,16 +981,7 @@ func (cc *XController) updateEtcd(qj *arbv1.AppWrapper, at string) error {
 }

 func (cc *XController) updateStatusInEtcd(qj *arbv1.AppWrapper, at string) error {
- //apiCacheAWJob, e := cc.queueJobLister.AppWrappers(qj.Namespace).Get(qj.Name)
- //
- //if (e != nil) {
- // glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
- // qj.Name, qj.Namespace, at, e)
- // return e
- //}
- //TODO: Remove next line
 var apiCacheAWJob *arbv1.AppWrapper
- //TODO: Remove next line
 apiCacheAWJob = qj
 if _, err := cc.arbclients.ArbV1().AppWrappers(apiCacheAWJob.Namespace).UpdateStatus(apiCacheAWJob); err != nil {
 glog.Errorf("[updateStatusInEtcd] Failed to update status of AppWrapper %s, namespace: %s at %s err=%v",
@@ -1025,17 +1016,6 @@ func (qjm *XController) backoff(q *arbv1.AppWrapper, reason string, message stri
 time.Sleep(time.Duration(qjm.serverOption.BackoffTime) * time.Second)
 qjm.qjqueue.MoveToActiveQueueIfExists(workingAW)

- //// Update condition after backoff
- //apiCacheAWJob, e = qjm.queueJobLister.AppWrappers(q.Namespace).Get(q.Name)
- //if (e == nil) {
- // workingAW = apiCacheAWJob
- workingAW.Status.QueueJobState = arbv1.AppWrapperCondQueueing
- returnCond := GenerateAppWrapperCondition(arbv1.AppWrapperCondQueueing, v1.ConditionTrue, "BackoffTimerExpired.", "")
- workingAW.Status.Conditions = append(workingAW.Status.Conditions, returnCond)
- workingAW.Status.FilterIgnore = true // update QueueJobState only, no work needed
- //qjm.updateEtcd(workingAW, "backoff - Queueing")
- qjm.updateStatusInEtcd(workingAW, "backoff - Queueing")
- //}
 glog.V(3).Infof("[backoff] %s activeQ.Add after sleep for %d seconds. activeQ=%t Unsched=%t &aw=%p Version=%s Status=%+v", workingAW.Name,
 qjm.serverOption.BackoffTime, qjm.qjqueue.IfExistActiveQ((workingAW)), qjm.qjqueue.IfExistUnschedulableQ((workingAW)), workingAW, workingAW.ResourceVersion, workingAW.Status)
 }
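A note on the two write paths this series settles on: Update persists the whole object, while UpdateStatus goes through the status subresource, so only .Status is written and a stale spec held by the controller cannot clobber concurrent spec edits (assuming the AppWrapper CRD enables the status subresource). A minimal sketch of the resulting condition-update flow, using the generated ArbV1 client and the GenerateAppWrapperCondition helper seen above; the appendCondAndSync name is illustrative and not part of these patches:

func (cc *XController) appendCondAndSync(aw *arbv1.AppWrapper, condType arbv1.AppWrapperConditionType, reason string, message string) error {
	awCopy := aw.DeepCopy() // work on a copy; objects from the informer cache must not be mutated
	cond := GenerateAppWrapperCondition(condType, v1.ConditionTrue, reason, message)
	awCopy.Status.Conditions = append(awCopy.Status.Conditions, cond)
	// UpdateStatus targets the /status subresource: the API server ignores any
	// spec carried in awCopy, so this write can only change status fields.
	if _, err := cc.arbclients.ArbV1().AppWrappers(awCopy.Namespace).UpdateStatus(awCopy); err != nil {
		glog.Errorf("[appendCondAndSync] Failed to update status of AppWrapper %s/%s: %v", awCopy.Namespace, awCopy.Name, err)
		return err
	}
	return nil
}

With a helper of this shape, the backoff condition writes above would collapse to single calls such as appendCondAndSync(workingAW, arbv1.AppWrapperCondBackoff, reason, message).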