run_daemonset.go
package collect

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/pkg/errors"
	troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	v1 "k8s.io/client-go/kubernetes/typed/core/v1"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
)

const (
	defaultTimeout = 60 * time.Second
)

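// CollectRunDaemonSet runs a short-lived DaemonSet built from the collector's
// pod spec and gathers the logs its pods produce on every node they run on.
//
// A minimal usage sketch (the surrounding variables are assumptions for
// illustration, not part of this file):
//
//	c := &CollectRunDaemonSet{
//		Collector:    runDaemonSetSpec, // *troubleshootv1beta2.RunDaemonSet
//		BundlePath:   bundlePath,
//		ClientConfig: restConfig, // *rest.Config
//		Context:      context.Background(),
//	}
//	result, err := c.Collect(progressChan)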
type CollectRunDaemonSet struct {
	Collector    *troubleshootv1beta2.RunDaemonSet
	BundlePath   string
	Namespace    string
	ClientConfig *rest.Config
	Client       kubernetes.Interface
	Context      context.Context
	RBACErrors
}

func (c *CollectRunDaemonSet) Title() string {
	return getCollectorName(c)
}

func (c *CollectRunDaemonSet) IsExcluded() (bool, error) {
	return isExcluded(c.Collector.Exclude)
}

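// Collect creates the DaemonSet (plus an optional image pull secret), waits for
// the desired number of pods to be scheduled, collects each pod's logs keyed by
// node name (or an error message if a pod's logs could not be read), saves the
// results into the bundle under the collector name, and deletes the created
// resources before returning.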
func (c *CollectRunDaemonSet) Collect(progressChan chan<- interface{}) (CollectorResult, error) {
	ctx := context.Background()

	client, err := kubernetes.NewForConfig(c.ClientConfig)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create client from config")
	}

	// create DaemonSet spec
	dsSpec, err := createDaemonSetSpec(c.Collector)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create DaemonSet spec")
	}

	// create ImagePullSecret
	var secretName string
	if c.Collector.ImagePullSecret != nil && c.Collector.ImagePullSecret.Data != nil {
		secretName, err = createSecret(ctx, client, dsSpec.ObjectMeta.Namespace, c.Collector.ImagePullSecret)
		if err != nil {
			return nil, errors.Wrap(err, "failed to create ImagePullSecret")
		}
		dsSpec.Spec.Template.Spec.ImagePullSecrets = append(dsSpec.Spec.Template.Spec.ImagePullSecrets, corev1.LocalObjectReference{Name: secretName})
	}

ds, err := client.AppsV1().DaemonSets(dsSpec.ObjectMeta.Namespace).Create(ctx, dsSpec, metav1.CreateOptions{})
klog.V(2).Infof("DaemonSet %s has been created", ds.Name)
if err != nil {
return nil, errors.Wrap(err, "failed to create DaemonSet")
}
	defer func() {
		// delete DaemonSet
		err := client.AppsV1().DaemonSets(ds.ObjectMeta.Namespace).Delete(ctx, ds.ObjectMeta.Name, metav1.DeleteOptions{})
		if err != nil {
			klog.Errorf("Failed to delete DaemonSet %s: %v", ds.Name, err)
		}

		// delete ImagePullSecret
		if secretName != "" {
			err := client.CoreV1().Secrets(ds.ObjectMeta.Namespace).Delete(ctx, secretName, metav1.DeleteOptions{})
			if err != nil {
				klog.Errorf("Failed to delete Secret %s: %v", secretName, err)
			}
		}
	}()

	// set custom timeout if any
	var (
		timeout            time.Duration
		errInvalidDuration error
	)
	if c.Collector.Timeout != "" {
		timeout, errInvalidDuration = time.ParseDuration(c.Collector.Timeout)
		if errInvalidDuration != nil {
			return nil, errors.Wrapf(errInvalidDuration, "failed to parse timeout %q", c.Collector.Timeout)
		}
	}
	if timeout <= 0 {
		timeout = defaultTimeout
	}

	timeoutCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// block till DaemonSet has the right number of scheduled Pods
	err = waitForDaemonSetPods(timeoutCtx, client, ds)
	if err != nil {
		return nil, errors.Wrap(err, "failed to wait for DaemonSet pods")
	}
	klog.V(2).Infof("DaemonSet %s has desired number of pods", ds.Name)

	// get all Pods in the DaemonSet
	pods, err := client.CoreV1().Pods(ds.ObjectMeta.Namespace).List(ctx, metav1.ListOptions{
		LabelSelector: getLabelSelector(ds),
	})
	if err != nil {
		return nil, errors.Wrap(err, "failed to list pods")
	}

	results := NewResult()

	// collect logs from all Pods,
	// or save the error message if getting logs failed
	wg := &sync.WaitGroup{}
	mtx := &sync.Mutex{}
	for _, pod := range pods.Items {
		wg.Add(1)
		go func(pod corev1.Pod) {
			defer wg.Done()

			select {
			case <-timeoutCtx.Done():
				klog.Errorf("Timeout reached while waiting for pod %s", pod.Name)
				return
			default:
			}

			var logs []byte
			nodeName, err := getPodNodeAtCompletion(timeoutCtx, client.CoreV1(), pod)
			if err != nil {
				nodeName = fmt.Sprintf("unknown-node-%s", pod.Name)
				errString := fmt.Sprintf("Failed to get node name/wait for pod %s to complete: %v", pod.Name, err)
				klog.Error(errString)
				logs = []byte(errString)
			} else {
				logs, err = getPodLog(timeoutCtx, client.CoreV1(), pod)
				if err != nil {
					errString := fmt.Sprintf("Failed to get log from pod %s: %v", pod.Name, err)
					klog.Error(errString)
					logs = []byte(errString)
				}
			}

			mtx.Lock()
			defer mtx.Unlock()
			results[nodeName] = logs
			klog.V(2).Infof("Collected logs for pod %s", pod.Name)
		}(pod)
	}
	wg.Wait()

	output := NewResult()
	for k, v := range results {
		filename := k + ".log"
		err := output.SaveResult(c.BundlePath, filepath.Join(c.Collector.Name, filename), bytes.NewBuffer(v))
		if err != nil {
			return nil, err
		}
	}

	return output, nil
}

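// createDaemonSetSpec builds the DaemonSet object for the collector: a generated
// name derived from the collector name, the collector's namespace (falling back
// to "default"), a fixed troubleshoot-role label used as both selector and pod
// label, and the user-supplied pod spec as the pod template.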
func createDaemonSetSpec(c *troubleshootv1beta2.RunDaemonSet) (*appsv1.DaemonSet, error) {
	ds := &appsv1.DaemonSet{}

	labels := make(map[string]string)
	labels["troubleshoot-role"] = "run-daemonset-collector"

	namespace := "default"
	if c.Namespace != "" {
		namespace = c.Namespace
	}

	ds.ObjectMeta = metav1.ObjectMeta{
		GenerateName: fmt.Sprintf("run-daemonset-%s-", c.Name),
		Namespace:    namespace,
		Labels:       labels,
	}

	ds.Spec = appsv1.DaemonSetSpec{
		Selector: &metav1.LabelSelector{
			MatchLabels: labels,
		},
		Template: corev1.PodTemplateSpec{
			ObjectMeta: metav1.ObjectMeta{
				Namespace: namespace,
				Labels:    labels,
			},
			Spec: c.PodSpec,
		},
	}

	return ds, nil
}

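// getLabelSelector joins the pod template labels into a comma-separated selector
// string, e.g. "troubleshoot-role=run-daemonset-collector" for the label set
// created in createDaemonSetSpec.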
func getLabelSelector(ds *appsv1.DaemonSet) string {
	labelSelector := ""
	for k, v := range ds.Spec.Template.ObjectMeta.Labels {
		labelSelector += k + "=" + v + ","
	}
	return strings.TrimSuffix(labelSelector, ",")
}

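// getPodLog streams and returns the full log of the pod's first container.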
func getPodLog(ctx context.Context, client v1.CoreV1Interface, pod corev1.Pod) ([]byte, error) {
	podLogOpts := corev1.PodLogOptions{
		Container: pod.Spec.Containers[0].Name,
	}
	req := client.Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts)
	logs, err := req.Stream(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "failed to get log stream")
	}
	defer logs.Close()
	return io.ReadAll(logs)
}

// getPodNodeAtCompletion waits for the Pod to complete and returns the node name
func getPodNodeAtCompletion(ctx context.Context, client v1.CoreV1Interface, pod corev1.Pod) (string, error) {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-ticker.C:
			pod, err := client.Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
			if err != nil {
				return "", errors.Wrap(err, "failed to get pod")
			}

			if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded {
				return pod.Spec.NodeName, nil
			}

			// we assume a container restart means the Pod has completed before
			if len(pod.Status.ContainerStatuses) > 0 && pod.Status.ContainerStatuses[0].RestartCount > 0 {
				return pod.Spec.NodeName, nil
			}

			if pod.Status.Phase == corev1.PodPending {
				for _, v := range pod.Status.ContainerStatuses {
					if v.State.Waiting != nil && v.State.Waiting.Reason == "ImagePullBackOff" {
						return "", errors.New("wait for pod aborted after getting pod status 'ImagePullBackOff'")
					}
				}
			}
		}
	}
}

// waitForDaemonSetPods waits for the DaemonSet to have the desired number of pods scheduled
func waitForDaemonSetPods(ctx context.Context, client kubernetes.Interface, ds *appsv1.DaemonSet) error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			ds, err := client.AppsV1().DaemonSets(ds.ObjectMeta.Namespace).Get(ctx, ds.ObjectMeta.Name, metav1.GetOptions{})
			if err != nil {
				return errors.Wrap(err, "failed to get DaemonSet")
			}

			// we return as soon as the desired number of pods are scheduled
			if ds.Status.DesiredNumberScheduled == ds.Status.CurrentNumberScheduled {
				return nil
			}
		}
	}
}