From a0b18e121ba706d0161d09bbe4043624f99f06a3 Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith
Date: Fri, 27 Mar 2026 11:29:16 +0200
Subject: [PATCH] search: use structured logging for crashed shards

We recently had some crashes in production while experimenting with
ZOEKT_RE2_THRESHOLD_BYTES and found it hard to consume the stack traces
since they were over multiple lines.
---
 search/shards.go      | 25 +++++++++++++++++++++++--
 search/shards_test.go |  5 +++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/search/shards.go b/search/shards.go
index bee903f99..64668ba41 100644
--- a/search/shards.go
+++ b/search/shards.go
@@ -30,6 +30,7 @@ import (
 
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
+	sglog "github.com/sourcegraph/log"
 	"go.uber.org/atomic"
 	"golang.org/x/sync/semaphore"
 
@@ -41,6 +42,10 @@ import (
 )
 
 var (
+	shardRecoveryLogger = sync.OnceValue(func() sglog.Logger {
+		return sglog.Scoped("searchShards")
+	})
+
 	metricShardsLoaded = promauto.NewGauge(prometheus.GaugeOpts{
 		Name: "zoekt_shards_loaded",
 		Help: "The number of shards currently loaded",
@@ -927,12 +932,28 @@ func copyFiles(sr *zoekt.SearchResult) {
 	}
 }
 
+func logShardCrash(operation string, s zoekt.Searcher, recovered any, stack []byte) {
+	fields := []sglog.Field{
+		sglog.String("operation", operation),
+		sglog.String("shard", s.String()),
+		sglog.String("stacktrace", string(stack)),
+	}
+
+	if err, ok := recovered.(error); ok {
+		fields = append(fields, sglog.Error(err))
+	} else {
+		fields = append(fields, sglog.String("panic", fmt.Sprint(recovered)))
+	}
+
+	shardRecoveryLogger().Error("crashed shard", fields...)
+}
+
 func searchOneShard(ctx context.Context, s zoekt.Searcher, q query.Q, opts *zoekt.SearchOptions) (sr *zoekt.SearchResult, err error) {
 	metricSearchShardRunning.Inc()
 	defer func() {
 		metricSearchShardRunning.Dec()
 		if e := recover(); e != nil {
-			log.Printf("[ERROR] crashed shard: %s: %#v, %s", s, e, debug.Stack())
+			logShardCrash("search", s, e, debug.Stack())
 
 			if sr == nil {
 				sr = &zoekt.SearchResult{}
@@ -954,7 +975,7 @@ func listOneShard(ctx context.Context, s zoekt.Searcher, q query.Q, opts *zoekt.
 	defer func() {
 		metricListShardRunning.Dec()
 		if r := recover(); r != nil {
-			log.Printf("[ERROR] crashed shard: %s: %s, %s", s.String(), r, debug.Stack())
+			logShardCrash("list", s, r, debug.Stack())
 			sink <- shardListResult{
 				&zoekt.RepoList{Crashes: 1}, nil,
 			}
diff --git a/search/shards_test.go b/search/shards_test.go
index 668075b1f..ed1b7a733 100644
--- a/search/shards_test.go
+++ b/search/shards_test.go
@@ -36,6 +36,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/grafana/regexp"
+	sglog "github.com/sourcegraph/log"
 
 	"github.com/sourcegraph/zoekt/index"
 
@@ -75,6 +76,10 @@ func TestCrashResilience(t *testing.T) {
 	log.SetOutput(out)
 	defer log.SetOutput(oldOut)
 
+	oldShardRecoveryLogger := shardRecoveryLogger
+	shardRecoveryLogger = sglog.NoOp
+	defer func() { shardRecoveryLogger = oldShardRecoveryLogger }()
+
 	ss := newShardedSearcher(2)
 	ss.ranked.Store([]*rankedShard{{Searcher: &crashSearcher{}}})
 