fix(collector): solving pod stuck in terminating with CNI issue (#1196)
* feat(collector): add force delete for pods once they are done in a collector
DexterYan committed Jun 15, 2023
1 parent 03c53ca commit 5b1e482
Showing 4 changed files with 138 additions and 9 deletions.
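The pattern this commit introduces, condensed: request a normal delete, poll until the pod is gone, and fall back to a zero-grace-period delete once the wait times out. The sketch below is illustrative only, assuming a configured client-go kubernetes.Interface; the package name, function name, and maxWait parameter are hypothetical and not part of the repository's API.

// podcleanup_sketch.go — a minimal, illustrative sketch of the poll-then-force-delete
// pattern; names here are hypothetical, not taken from the repository.
package podcleanup

import (
	"context"
	"time"

	kerrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// DeleteOrForceDeletePod requests a normal delete, waits up to maxWait for the pod
// to disappear, and falls back to a zero-grace-period delete if it is still present
// (for example, stuck in Terminating because the CNI cannot tear down the sandbox).
func DeleteOrForceDeletePod(ctx context.Context, client kubernetes.Interface, namespace, name string, maxWait time.Duration) error {
	if err := client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{}); err != nil {
		return err
	}

	waitErr := wait.PollUntilContextTimeout(ctx, time.Second, maxWait, true, func(ctx context.Context) (bool, error) {
		_, err := client.CoreV1().Pods(namespace).Get(ctx, name, metav1.GetOptions{})
		if kerrors.IsNotFound(err) {
			return true, nil // pod is gone, stop polling
		}
		return false, nil // still present (or transient error), keep polling
	})
	if waitErr == nil {
		return nil
	}

	// Pod is still around after maxWait: force delete with a zero grace period.
	zero := int64(0)
	return client.CoreV1().Pods(namespace).Delete(context.Background(), name, metav1.DeleteOptions{
		GracePeriodSeconds: &zero,
	})
}

The commit applies the same idea in two places: deletePod in run_pod.go handles a single collector pod, and deleteDaemonSet in copy_from_host.go force-deletes any pods still matching the DaemonSet's labels after the wait.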
21 changes: 19 additions & 2 deletions examples/preflight/e2e.yaml
@@ -8,7 +8,15 @@ spec:
name: config/replicas.txt
data: "5"
- runPod:
collectorName: "static-hi"
collectorName: "static-hi-1"
podSpec:
containers:
- name: static-hi
image: alpine:3
command: ["echo", "hi static!"]
## This collector is intentionally duplicated to verify that a pod is terminated after it completes successfully
- runPod:
collectorName: "static-hi-2"
podSpec:
containers:
- name: static-hi
@@ -47,7 +55,16 @@ spec:
message: You have at least 5 replicas
- textAnalyze:
checkName: Said hi!
fileName: /static-hi.log
fileName: /static-hi-1.log
regex: 'hi static'
outcomes:
- fail:
message: Didn't say hi.
- pass:
message: Said hi!
- textAnalyze:
checkName: Said hi!
fileName: /static-hi-2.log
regex: 'hi static'
outcomes:
- fail:
64 changes: 61 additions & 3 deletions pkg/collect/copy_from_host.go
@@ -4,13 +4,16 @@ import (
"archive/tar"
"bytes"
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"time"

"github.com/pkg/errors"
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
"github.com/replicatedhq/troubleshoot/pkg/constants"
"github.com/replicatedhq/troubleshoot/pkg/k8sutil"
"github.com/segmentio/ksuid"
appsv1 "k8s.io/api/apps/v1"
@@ -19,6 +22,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
restclient "k8s.io/client-go/rest"
@@ -210,13 +214,12 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
}

createdDS, err := client.AppsV1().DaemonSets(namespace).Create(ctx, &ds, metav1.CreateOptions{})

if err != nil {
return "", cleanup, errors.Wrap(err, "create daemonset")
}
cleanupFuncs = append(cleanupFuncs, func() {
if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
}
deleteDaemonSet(ctx, client, createdDS, namespace, labels)
})

// This timeout is different from collector timeout.
@@ -227,6 +230,7 @@ func copyFromHostCreateDaemonSet(ctx context.Context, client kubernetes.Interfac
select {
case <-time.After(1 * time.Second):
case <-childCtx.Done():
klog.V(2).Infof("Timed out waiting for daemonset %s to be ready", createdDS.Name)
return createdDS.Name, cleanup, errors.Wrap(ctx.Err(), "wait for daemonset")
}

@@ -366,3 +370,57 @@ func copyFilesFromHost(ctx context.Context, dstPath string, clientConfig *restcl

return result, stderr.Bytes(), nil
}

func deleteDaemonSet(ctx context.Context, client kubernetes.Interface, createdDS *appsv1.DaemonSet, namespace string, labels map[string]string) {
klog.V(2).Infof("Daemonset %s has been scheduled for deletion", createdDS.Name)
if err := client.AppsV1().DaemonSets(namespace).Delete(context.Background(), createdDS.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("Failed to delete daemonset %s: %v", createdDS.Name, err)
return
}

var labelSelector []string
for k, v := range labels {
labelSelector = append(labelSelector, fmt.Sprintf("%s=%s", k, v))
}

dsPods := &corev1.PodList{}
klog.V(2).Infof("Continuously poll each second for Pod deletion of DaemontSet %s for maximum %d seconds", createdDS.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)

err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
pods, listErr := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
LabelSelector: strings.Join(labelSelector, ","),
})

if listErr != nil {
klog.Errorf("Failed to list pods created by %s daemonset: %v", createdDS.Name, listErr)
// Keep polling rather than treating a failed list as "no pods remaining"
return false, nil
}
// If there are no pods remaining, return true to stop the polling
if len(pods.Items) == 0 {
return true, nil
}
// If there is an error from context (e.g., context deadline exceeded), return the error.
if ctx.Err() != nil {
return false, ctx.Err()
}
// If there are still pods remaining and there was no context error, save the list of pods and keep polling.
dsPods = pods
return false, nil
})

// If there was an error from the polling (e.g., the context deadline was exceeded before all pods were deleted),
// delete each remaining pod with a zero-second grace period
if err != nil {
zeroGracePeriod := int64(0)
for _, pod := range dsPods.Items {
klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
err := client.CoreV1().Pods(namespace).Delete(context.TODO(), pod.Name, metav1.DeleteOptions{
GracePeriodSeconds: &zeroGracePeriod,
})
if err != nil {
klog.Errorf("Failed to wait for pod %s deletion: %v", pod.Name, err)
return
}
klog.V(2).Infof("Daemonset pod %s in %s namespace has been deleted", pod.Name, pod.Namespace)
}
}
}
59 changes: 56 additions & 3 deletions pkg/collect/run_pod.go
@@ -13,6 +13,7 @@ import (

"github.com/pkg/errors"
troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2"
"github.com/replicatedhq/troubleshoot/pkg/constants"
"github.com/replicatedhq/troubleshoot/pkg/k8sutil"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
@@ -22,6 +23,7 @@ import (

kuberneteserrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
)

type CollectRunPod struct {
@@ -55,9 +57,7 @@ func (c *CollectRunPod) Collect(progressChan chan<- interface{}) (CollectorResul
return nil, errors.Wrap(err, "failed to run pod")
}
defer func() {
if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
}
deletePod(ctx, client, pod)
}()

if c.Collector.ImagePullSecret != nil && c.Collector.ImagePullSecret.Data != nil {
@@ -152,6 +152,8 @@ func runPodWithSpec(ctx context.Context, client *kubernetes.Clientset, runPodCol
}

created, err := client.CoreV1().Pods(namespace).Create(ctx, &pod, metav1.CreateOptions{})
if err != nil {
return nil, errors.Wrap(err, "failed to create pod")
}
klog.V(2).Infof("Pod %s has been created", pod.Name)
@@ -180,8 +182,23 @@ func runWithoutTimeout(ctx context.Context, bundlePath string, clientConfig *res
if v.State.Waiting != nil && v.State.Waiting.Reason == "ImagePullBackOff" {
return nil, errors.Errorf("run pod aborted after getting pod status 'ImagePullBackOff'")
}

if v.State.Waiting != nil && v.State.Waiting.Reason == "ContainerCreating" {
podEvents, err := client.CoreV1().Events(pod.Namespace).List(ctx, metav1.ListOptions{FieldSelector: (fmt.Sprintf("involvedObject.name=%s", pod.Name)), TypeMeta: metav1.TypeMeta{Kind: "Pod"}})
if err != nil {
return nil, errors.Wrap(err, "failed to get pod events")
}

for _, podEvent := range podEvents.Items {
if podEvent.Reason == "FailedCreatePodSandBox" {
klog.V(2).Infof("Pod %s failed to setup network for sandbox", pod.Name)
return nil, errors.Errorf("run pod aborted after getting pod status 'FailedCreatePodSandBox'")
}
}
}
}
}

time.Sleep(time.Second * 1)
}

@@ -424,3 +441,39 @@ func savePodDetails(ctx context.Context, client *kubernetes.Clientset, output Co
}
return output, nil
}

func deletePod(ctx context.Context, client *kubernetes.Clientset, pod *corev1.Pod) {
if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{}); err != nil {
klog.Errorf("Failed to delete pod %s: %v", pod.Name, err)
return
}
klog.V(2).Infof("Pod %s has been scheduled for deletion", pod.Name)

// Wait until the pod is deleted
// Poll every second to check if the Pod has been deleted.
klog.V(2).Infof("Continuously poll each second for Pod %s deletion for maximum %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
err := wait.PollUntilContextTimeout(ctx, time.Second, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION, true, func(ctx context.Context) (bool, error) {
_, getErr := client.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
// If the Pod is not found, it has been deleted.
if kuberneteserrors.IsNotFound(getErr) {
return true, nil
}
// If there is an error from context (e.g., context deadline exceeded), return the error.
if ctx.Err() != nil {
return false, ctx.Err()
}
// Otherwise, the Pod has not yet been deleted. Keep polling.
return false, nil
})
if err != nil {
zeroGracePeriod := int64(0)
klog.V(2).Infof("Pod %s forcefully deleted after reaching the maximum wait time of %d seconds", pod.Name, constants.MAX_TIME_TO_WAIT_FOR_POD_DELETION/time.Second)
if err := client.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{
GracePeriodSeconds: &zeroGracePeriod,
}); err != nil {
klog.Errorf("Failed to wait for pod %s deletion: %v", pod.Name, err)
return
}
klog.V(2).Infof("Pod %s in %s namespace has been deleted", pod.Name, pod.Namespace)
}
}
3 changes: 2 additions & 1 deletion pkg/constants/constants.go
@@ -13,7 +13,8 @@ const (
VERSION_FILENAME = "version.yaml"
// DEFAULT_LOGS_COLLECTOR_TIMEOUT is the default timeout for logs collector.
DEFAULT_LOGS_COLLECTOR_TIMEOUT = 60 * time.Second

// MAX_TIME_TO_WAIT_FOR_POD_DELETION is the maximum time to wait for pod deletion.
MAX_TIME_TO_WAIT_FOR_POD_DELETION = 60 * time.Second
// Tracing constants
LIB_TRACER_NAME = "github.com/replicatedhq/troubleshoot"
TROUBLESHOOT_ROOT_SPAN_NAME = "ReplicatedTroubleshootRootSpan"
