agent: dump stack on stale probes
Most of the time, when we see a stale probe, it's due to a deadlock. So,
write a stack dump to disk (since we're probably going to be restarted
soon due to a liveness probe).

To prevent excessive resource consumption, only dump the stack once
every 5 minutes, and always write to the same file. Also, let's make
the check lock-free while we're at it.

Also, make sure we capture this file in bugtool.

Signed-off-by: Casey Callendrello <cdc@isovalent.com>
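As an illustrative sketch (not part of this commit; the names here are hypothetical), the lock-free rate limit described above boils down to a compare-and-swap on an atomic timestamp, so at most one goroutine per window performs the expensive work:

package main

import (
	"sync/atomic"
	"time"
)

// lastRun holds the unix timestamp (in seconds) of the last run.
var lastRun atomic.Int64

// shouldRun reports whether interval has elapsed since the last run.
// Concurrent callers race on the CompareAndSwap; exactly one wins per
// window and the losers simply skip the work. No mutex is needed.
func shouldRun(interval time.Duration) bool {
	now := time.Now().Unix()
	before := lastRun.Load()
	if now-before < int64(interval/time.Second) {
		return false
	}
	return lastRun.CompareAndSwap(before, now)
}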
squeed committed Feb 22, 2023
1 parent e8e5952 commit 87f7a11
Showing 2 changed files with 49 additions and 1 deletion.
2 changes: 1 addition & 1 deletion daemon/cmd/status.go
@@ -1102,7 +1102,7 @@ func (d *Daemon) startStatusCollector(cleaner *daemonCleanup) {
d.statusResponse.CniChaining = d.getCNIChainingStatus()
d.statusResponse.IdentityRange = d.getIdentityRange()

-	d.statusCollector = status.NewCollector(probes, status.Config{})
+	d.statusCollector = status.NewCollector(probes, status.Config{StackdumpPath: "/run/cilium/state/agent.stack.gz"})

// Set up a signal handler function which prints out logs related to daemon status.
cleaner.cleanupFuncs.Add(func() {
48 changes: 48 additions & 0 deletions pkg/status/status.go
@@ -4,8 +4,12 @@
package status

import (
"compress/gzip"
"context"
"fmt"
"os"
"runtime/pprof"
"sync/atomic"
"time"

"github.com/sirupsen/logrus"
@@ -66,13 +70,18 @@ type Collector struct {
stop chan struct{}
staleProbes map[string]struct{}
probeStartTime map[string]time.Time

+	// lastStackdumpTime is the last time we dumped stack; only do it
+	// every 5 minutes so we don't waste resources.
+	lastStackdumpTime atomic.Int64
}

// Config is the collector configuration
type Config struct {
WarningThreshold time.Duration
FailureThreshold time.Duration
Interval time.Duration
+	StackdumpPath string
}

// NewCollector creates a collector and starts the given probes.
@@ -246,8 +255,47 @@ func (c *Collector) updateProbeStatus(p *Probe, data interface{}, stale bool, err error) {
logfields.StartTime: startTime,
logfields.Probe: p.Name,
}).Warn("Timeout while waiting probe")

+		// We just had a probe time out. This is commonly caused by a deadlock.
+		// So, capture a stack dump to aid in debugging.
+		go c.maybeDumpStack()
}

// Notify the probe about status update
p.OnStatusUpdate(Status{Err: err, Data: data, StaleWarning: stale})
}

+// maybeDumpStack dumps the goroutine stack to a file on disk (usually in /run/cilium/state)
+// if one hasn't been written in the past 5 minutes.
+// This is triggered if a collector is stale, which can be caused by deadlocks.
+func (c *Collector) maybeDumpStack() {
+	if c.config.StackdumpPath == "" {
+		return
+	}
+
+	now := time.Now().Unix()
+	before := c.lastStackdumpTime.Load()
+	if now-before < 5*60 {
+		return
+	}
+	swapped := c.lastStackdumpTime.CompareAndSwap(before, now)
+	if !swapped {
+		return
+	}
+
+	profile := pprof.Lookup("goroutine")
+	if profile == nil {
+		return
+	}
+
+	out, err := os.Create(c.config.StackdumpPath)
+	if err != nil {
+		log.WithError(err).WithField("path", c.config.StackdumpPath).Warn("Failed to write stack dump")
+		return
+	}
+	defer out.Close()
+	gzout := gzip.NewWriter(out)
+	defer gzout.Close()
+
+	profile.WriteTo(gzout, 2) // 2: print same stack format as panic
+	log.Infof("Wrote stack dump to %s", c.config.StackdumpPath)
+}
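The dump written above is a gzip-compressed goroutine profile in the same format as a panic trace, so it can be read with zcat or any gzip reader. A minimal sketch of a reader, assuming the StackdumpPath configured in daemon/cmd/status.go above:

package main

import (
	"compress/gzip"
	"io"
	"log"
	"os"
)

func main() {
	// Open the stack dump written by the status collector.
	f, err := os.Open("/run/cilium/state/agent.stack.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// Decompress and print the goroutine stacks to stdout.
	zr, err := gzip.NewReader(f)
	if err != nil {
		log.Fatal(err)
	}
	defer zr.Close()

	if _, err := io.Copy(os.Stdout, zr); err != nil {
		log.Fatal(err)
	}
}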
