-
Notifications
You must be signed in to change notification settings - Fork 45
/
reserving_handler.go
170 lines (149 loc) · 7.41 KB
/
reserving_handler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
// Copyright 2023 Ant Group Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package handler
import (
"fmt"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/kubernetes"
listers "k8s.io/client-go/listers/core/v1"
"github.com/secretflow/kuscia/pkg/common"
kusciaapisv1alpha1 "github.com/secretflow/kuscia/pkg/crd/apis/kuscia/v1alpha1"
kusciaclientset "github.com/secretflow/kuscia/pkg/crd/clientset/versioned"
kuscialistersv1alpha1 "github.com/secretflow/kuscia/pkg/crd/listers/kuscia/v1alpha1"
"github.com/secretflow/kuscia/pkg/utils/nlog"
utilsres "github.com/secretflow/kuscia/pkg/utils/resources"
)
// ReservingHandler handles a task resource group whose phase is reserving.
type ReservingHandler struct {
	// kubeClient is handed to the package-level updatePodAnnotations helper.
	kubeClient kubernetes.Interface
	// kusciaClient is handed to patchTaskResourceStatus when task resources are marked failed.
	kusciaClient kusciaclientset.Interface

	// namespaceLister is used to decide whether this cluster acts as the initiator.
	namespaceLister listers.NamespaceLister
	// podLister is handed to the package-level updatePodAnnotations helper.
	podLister listers.PodLister
	// trLister lists the task resources of each party of the group.
	trLister kuscialistersv1alpha1.TaskResourceLister
}
// NewReservingHandler builds a ReservingHandler from the clients and listers
// carried by deps.
func NewReservingHandler(deps *Dependencies) *ReservingHandler {
	handler := new(ReservingHandler)

	// Clients.
	handler.kubeClient = deps.KubeClient
	handler.kusciaClient = deps.KusciaClient

	// Listers.
	handler.namespaceLister = deps.NamespaceLister
	handler.podLister = deps.PodLister
	handler.trLister = deps.TrLister

	return handler
}
// Handle performs the reserving-phase logic for trg.
//
// It first refreshes the pod annotations of the group. If that fails, the error is
// logged and returned with needUpdate set to true. When this cluster is not the
// initiator of the group, nothing else is done. Otherwise the task resources of all
// parties are summarized, which may move trg to another phase.
//
// needUpdate is true when any step modified trg.Status; err is the error of the step
// that failed, if any.
func (h *ReservingHandler) Handle(trg *kusciaapisv1alpha1.TaskResourceGroup) (needUpdate bool, err error) {
	now := metav1.Now().Rfc3339Copy()

	needUpdate, err = h.updatePodAnnotations(now, trg)
	if err != nil {
		nlog.Error(err)
		return true, err
	}

	if !utilsres.SelfClusterAsInitiator(h.namespaceLister, trg.Spec.Initiator, trg.Annotations) {
		// Still report a condition change made by updatePodAnnotations; returning a
		// hard-coded false here would drop it.
		return needUpdate, nil
	}

	summaryUpdated, err := h.summarizeTaskResourcesInfo(now, trg)
	// Either step may have touched the status, so combine both flags.
	return needUpdate || summaryUpdated, err
}
// updatePodAnnotations runs the package-level updatePodAnnotations helper for trg and
// records the outcome in the PodAnnotationUpdated condition of trg.Status.
//
// On failure the condition is set to false with the error text as reason and the error
// is returned. On success the condition is set back to true only if it currently
// exists with status false. needUpdate is whatever SetTaskResourceGroupCondition
// reports for the condition that was written.
func (h *ReservingHandler) updatePodAnnotations(now metav1.Time, trg *kusciaapisv1alpha1.TaskResourceGroup) (needUpdate bool, err error) {
	const condType = kusciaapisv1alpha1.PodAnnotationUpdated

	err = updatePodAnnotations(string(trg.UID), h.podLister, h.kubeClient)
	if err != nil {
		failedCond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, condType)
		reason := fmt.Sprintf("Update pod annotation failed, %v", err.Error())
		changed := utilsres.SetTaskResourceGroupCondition(&now, failedCond, v1.ConditionFalse, reason)
		return changed, err
	}

	// Nothing to repair unless an earlier failure left the condition at false.
	if !utilsres.IsExistingTaskResourceGroupCondition(&trg.Status, condType, v1.ConditionFalse) {
		return false, nil
	}

	recoveredCond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, condType)
	changed := utilsres.SetTaskResourceGroupCondition(&now, recoveredCond, v1.ConditionTrue, "")
	return changed, nil
}
// summarizeTaskResourcesInfo lists the task resources labelled with trg's UID in the
// namespace of every party (controlled and out-of-controlled), counts how many are
// reserved or failed, and decides the next phase of trg.
//
// Decisions, checked in this order:
//  1. MinReservedMembers exceeds the number of parties: phase becomes Failed.
//  2. MinReservedMembers exceeds the number of listed task resources: phase becomes Failed.
//  3. The reserved count reaches MinReservedMembers: phase becomes Reserved.
//  4. MinReservedMembers exceeds parties minus failed task resources: all task
//     resources are patched to failed and the phase becomes ReserveFailed (Failed when
//     the group carries the BFIA interconn protocol label).
//  5. Otherwise the phase is left untouched.
//
// needUpdate is true when trg.Status was modified. err is non-nil when listing task
// resources or patching their status failed; the matching condition is set to false
// in that case.
func (h *ReservingHandler) summarizeTaskResourcesInfo(now metav1.Time, trg *kusciaapisv1alpha1.TaskResourceGroup) (needUpdate bool, err error) {
	allParties := make([]kusciaapisv1alpha1.TaskResourceGroupParty, 0, len(trg.Spec.Parties)+len(trg.Spec.OutOfControlledParties))
	allParties = append(allParties, trg.Spec.Parties...)
	allParties = append(allParties, trg.Spec.OutOfControlledParties...)

	// The selector only depends on the group UID, so build it once for all parties.
	selector := labels.SelectorFromSet(labels.Set{common.LabelTaskResourceGroupUID: string(trg.UID)})

	var trsCount, reservedCount, failedCount int
	partySet := make(map[string]struct{}, len(allParties))
	for _, party := range allParties {
		// A domain may appear more than once in the party list; list its namespace only once.
		if _, exist := partySet[party.DomainID]; exist {
			continue
		}
		partySet[party.DomainID] = struct{}{}

		var trs []*kusciaapisv1alpha1.TaskResource
		trs, err = h.trLister.TaskResources(party.DomainID).List(selector)
		if err != nil {
			cond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesListed)
			needUpdate = utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionFalse, fmt.Sprintf("List task resources failed, %v", err.Error()))
			return needUpdate, err
		}

		trsCount += len(trs)
		for _, tr := range trs {
			switch tr.Status.Phase {
			case kusciaapisv1alpha1.TaskResourcePhaseReserved:
				reservedCount++
			case kusciaapisv1alpha1.TaskResourcePhaseFailed:
				failedCount++
			}
		}
	}

	// Listing succeeded; clear a failure recorded by an earlier attempt.
	if utilsres.IsExistingTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesListed, v1.ConditionFalse) {
		cond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesListed)
		needUpdate = utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionTrue, "")
	}

	minReserved := trg.Spec.MinReservedMembers
	totalParty := len(allParties)

	// Every branch below that changes the phase returns needUpdate=true: a phase
	// change must be reported to the caller even if the condition setter reports
	// that the condition itself did not change.
	if minReserved > totalParty {
		trg.Status.Phase = kusciaapisv1alpha1.TaskResourceGroupPhaseFailed
		trg.Status.LastTransitionTime = &now
		cond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesReserved)
		utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionFalse,
			fmt.Sprintf("Task resource group min reserved member %v is greater than total parties count %v", minReserved, totalParty))
		return true, nil
	}

	if minReserved > trsCount {
		trg.Status.Phase = kusciaapisv1alpha1.TaskResourceGroupPhaseFailed
		trg.Status.LastTransitionTime = &now
		cond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesReserved)
		utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionFalse,
			fmt.Sprintf("Task resource group min reserved member %v is greater than task resources count %v", minReserved, trsCount))
		return true, nil
	}

	if reservedCount >= minReserved {
		trg.Status.Phase = kusciaapisv1alpha1.TaskResourceGroupPhaseReserved
		trg.Status.LastTransitionTime = &now
		cond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesReserved)
		utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionTrue, "")
		return true, nil
	}

	if remaining := totalParty - failedCount; minReserved > remaining {
		cond, _ := utilsres.GetTaskResourceGroupCondition(&trg.Status, kusciaapisv1alpha1.TaskResourcesReserved)
		// patch all party status phase to failed.
		trCondReason := "Task resource group state changed to reserve-failed, so set the task resource status to failed"
		if err = patchTaskResourceStatus(trg, kusciaapisv1alpha1.TaskResourcePhaseFailed, kusciaapisv1alpha1.TaskResourceCondFailed,
			trCondReason, h.kusciaClient, h.trLister); err != nil {
			// Keep an earlier condition change (e.g. TaskResourcesListed) in the result.
			needUpdate = utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionFalse,
				fmt.Sprintf("Patch task resources status failed, %v", err.Error())) || needUpdate
			return needUpdate, err
		}

		trg.Status.Phase = kusciaapisv1alpha1.TaskResourceGroupPhaseReserveFailed
		// Indexing a nil label map is safe and yields the empty string.
		if trg.Labels[common.LabelInterConnProtocolType] == string(kusciaapisv1alpha1.InterConnBFIA) {
			trg.Status.Phase = kusciaapisv1alpha1.TaskResourceGroupPhaseFailed
		}
		trg.Status.LastTransitionTime = &now
		utilsres.SetTaskResourceGroupCondition(&now, cond, v1.ConditionFalse,
			fmt.Sprintf("The remaining no-failed parties count %v is less than the schedulable threshold %v",
				remaining, minReserved))
		return true, nil
	}

	// Still waiting for more task resources to be reserved.
	return needUpdate, nil
}