From 9358161b692024d52852e3737a4dfe3e5476eb3b Mon Sep 17 00:00:00 2001 From: thomas Date: Thu, 6 Mar 2025 13:45:16 -0800 Subject: [PATCH] ranking: downweight binary files --- cmd/zoekt-index/main.go | 2 +- index/builder.go | 34 ++++---------------- index/builder_test.go | 6 ++-- index/document.go | 50 ++++++++++++++++++++++++++++++ index/file_category.go | 18 ++++++++--- index/index_test.go | 18 +++++------ index/shard_builder.go | 27 ++++++++-------- index/shard_builder_test.go | 4 +-- internal/e2e/examples/example.bin | Bin 0 -> 14 bytes internal/e2e/scoring_test.go | 18 +++++++++-- internal/gitindex/index.go | 8 ++--- 11 files changed, 118 insertions(+), 67 deletions(-) create mode 100644 index/document.go create mode 100644 internal/e2e/examples/example.bin diff --git a/cmd/zoekt-index/main.go b/cmd/zoekt-index/main.go index bd7c4ddca..74186753e 100644 --- a/cmd/zoekt-index/main.go +++ b/cmd/zoekt-index/main.go @@ -137,7 +137,7 @@ func indexArg(arg string, opts index.Options, ignore map[string]struct{}) error if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) { if err := builder.Add(index.Document{ Name: displayName, - SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax), + SkipReason: index.SkipReasonTooLarge, }); err != nil { return err } diff --git a/index/builder.go b/index/builder.go index 6c93d1008..415740f68 100644 --- a/index/builder.go +++ b/index/builder.go @@ -601,17 +601,17 @@ func (b *Builder) Add(doc Document) error { // we pass through a part of the source tree with binary/large // files, the corresponding shard would be mostly empty, so // insert a reason here too. - doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax) - } else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil { - doc.SkipReason = err.Error() + doc.SkipReason = SkipReasonTooLarge + } else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone { + doc.SkipReason = skip } b.todo = append(b.todo, &doc) - if doc.SkipReason == "" { + if doc.SkipReason == SkipReasonNone { b.size += len(doc.Name) + len(doc.Content) } else { - b.size += len(doc.Name) + len(doc.SkipReason) + b.size += len(doc.Name) // Drop the content if we are skipping the document. Skipped content is not counted towards the // shard size limit, so otherwise we might buffer too much data in memory before flushing. doc.Content = nil @@ -865,7 +865,7 @@ type rankedDoc struct { // have a higher chance of being searched before limits kick in. func rank(d *Document, origIdx int) []float64 { skipped := 0.0 - if d.SkipReason != "" { + if d.SkipReason != SkipReasonNone { skipped = 1.0 } @@ -1074,28 +1074,6 @@ func (e *deltaIndexOptionsMismatchError) Error() string { return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions) } -// Document holds a document (file) to index. -type Document struct { - Name string - Content []byte - Branches []string - SubRepositoryPath string - Language string - Category FileCategory - - // If set, something is wrong with the file contents, and this - // is the reason it wasn't indexed. - SkipReason string - - // Document sections for symbols. Offsets should use bytes. - Symbols []DocumentSection - SymbolsMetaData []*zoekt.Symbol -} - -type DocumentSection struct { - Start, End uint32 -} - // umask holds the Umask of the current process var umask os.FileMode diff --git a/index/builder_test.go b/index/builder_test.go index ce4fcb8b0..ec36001eb 100644 --- a/index/builder_test.go +++ b/index/builder_test.go @@ -246,7 +246,7 @@ func TestDontCountContentOfSkippedFiles(t *testing.T) { if err != nil { t.Fatal(err) } - if len(b.todo) != 1 || b.todo[0].SkipReason == "" { + if len(b.todo) != 1 || b.todo[0].SkipReason == SkipReasonNone { t.Fatalf("document should have been skipped") } if b.todo[0].Content != nil { @@ -1143,7 +1143,7 @@ func TestFileRank(t *testing.T) { docs: []*Document{ { Name: "binary_file", - SkipReason: "binary file", + SkipReason: SkipReasonBinary, }, { Name: "some_test.go", @@ -1151,7 +1151,7 @@ func TestFileRank(t *testing.T) { }, { Name: "large_file.go", - SkipReason: "too large", + SkipReason: SkipReasonTooLarge, }, { Name: "file.go", diff --git a/index/document.go b/index/document.go new file mode 100644 index 000000000..68a5b1e98 --- /dev/null +++ b/index/document.go @@ -0,0 +1,50 @@ +package index + +import "github.com/sourcegraph/zoekt" + +// Document holds a document (file) to index. +type Document struct { + Name string + Content []byte + Branches []string + SubRepositoryPath string + Language string + Category FileCategory + + SkipReason SkipReason + + // Document sections for symbols. Offsets should use bytes. + Symbols []DocumentSection + SymbolsMetaData []*zoekt.Symbol +} + +type SkipReason int + +const ( + SkipReasonNone SkipReason = iota + SkipReasonTooLarge + SkipReasonTooSmall + SkipReasonBinary + SkipReasonTooManyTrigrams +) + +func (s SkipReason) explanation() string { + switch s { + case SkipReasonNone: + return "" + case SkipReasonTooLarge: + return "exceeds the maximum size limit" + case SkipReasonTooSmall: + return "contains too few trigrams" + case SkipReasonBinary: + return "contains binary content" + case SkipReasonTooManyTrigrams: + return "contains too many trigrams" + default: + return "unknown skip reason" + } +} + +type DocumentSection struct { + Start, End uint32 +} diff --git a/index/file_category.go b/index/file_category.go index 60561535d..6ca4800af 100644 --- a/index/file_category.go +++ b/index/file_category.go @@ -22,16 +22,22 @@ const ( FileCategoryGenerated FileCategoryConfig FileCategoryDotFile + FileCategoryBinary FileCategoryDocumentation ) func DetermineFileCategory(doc *Document) { + if doc.SkipReason == SkipReasonBinary { + doc.Category = FileCategoryBinary + return + } + name := doc.Name content := doc.Content - // If this document has been skipped, it's likely very large. In this case, we just guess the category based - // on the filename to avoid examining the contents. Note: passing nil content is allowed by the go-enry contract. - if doc.SkipReason != "" { + // If this document was skipped because it was too large, just guess the category based on the filename to avoid + // examining the contents. Note: passing nil content is allowed by the go-enry contract. + if doc.SkipReason == SkipReasonTooLarge || doc.SkipReason == SkipReasonBinary { content = nil } @@ -56,7 +62,7 @@ func DetermineFileCategory(doc *Document) { // lowPriority returns true if this file category is considered 'low priority'. This is used // in search scoring to down-weight matches in these files. func (c FileCategory) lowPriority() bool { - return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated + return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated || c == FileCategoryBinary } func (c FileCategory) encode() (byte, error) { @@ -77,6 +83,8 @@ func (c FileCategory) encode() (byte, error) { return 6, nil case FileCategoryDocumentation: return 7, nil + case FileCategoryBinary: + return 8, nil default: return 0, errors.New("unrecognized file category") } @@ -98,6 +106,8 @@ func decodeCategory(c byte) (FileCategory, error) { return FileCategoryDotFile, nil case 7: return FileCategoryDocumentation, nil + case 8: + return FileCategoryBinary, nil default: return FileCategoryMissing, errors.New("unrecognized file category") } diff --git a/index/index_test.go b/index/index_test.go index 0621e53d7..1cb36d615 100644 --- a/index/index_test.go +++ b/index/index_test.go @@ -2403,7 +2403,7 @@ func TestUnicodeVariableLength(t *testing.T) { t.Run("LineMatches", func(t *testing.T) { b := testShardBuilder(t, nil, - Document{Name: "f1", Content: []byte(corpus)}) + Document{Name: "f1", Content: corpus}) res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}) if len(res.Files) != 1 { @@ -2413,7 +2413,7 @@ func TestUnicodeVariableLength(t *testing.T) { t.Run("ChunkMatches", func(t *testing.T) { b := testShardBuilder(t, nil, - Document{Name: "f1", Content: []byte(corpus)}) + Document{Name: "f1", Content: corpus}) res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}, chunkOpts) if len(res.Files) != 1 { @@ -3191,7 +3191,7 @@ func TestSymbolRegexpAll(t *testing.T) { } for j, sec := range want.Symbols { - if sec.Start != uint32(got[j].Start.ByteOffset) { + if sec.Start != got[j].Start.ByteOffset { t.Fatalf("got offset %d, want %d in doc %s", got[j].Start.ByteOffset, sec.Start, want.Name) } } @@ -3372,24 +3372,24 @@ func TestDocChecker(t *testing.T) { // Test valid and invalid text for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} { - if err := docChecker.Check([]byte(text), 20000, false); err != nil { - t.Errorf("Check(%q): %v", text, err) + if skip := docChecker.Check([]byte(text), 20000, false); skip != SkipReasonNone { + t.Errorf("Check(%q): %v", text, skip) } } for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} { - if err := docChecker.Check([]byte(text), 15, false); err == nil { + if skip := docChecker.Check([]byte(text), 15, false); skip == SkipReasonNone { t.Errorf("Check(%q) succeeded", text) } } // Test valid and invalid text with an allowed large file for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} { - if err := docChecker.Check([]byte(text), 15, true); err != nil { - t.Errorf("Check(%q): %v", text, err) + if skip := docChecker.Check([]byte(text), 15, true); skip != SkipReasonNone { + t.Errorf("Check(%q): %v", text, skip) } } for _, text := range []string{"zero\x00byte", "xx"} { - if err := docChecker.Check([]byte(text), 15, true); err == nil { + if skip := docChecker.Check([]byte(text), 15, true); skip == SkipReasonNone { t.Errorf("Check(%q) succeeded", text) } } diff --git a/index/shard_builder.go b/index/shard_builder.go index 2d80b6412..a7139856e 100644 --- a/index/shard_builder.go +++ b/index/shard_builder.go @@ -402,7 +402,7 @@ func DetermineLanguageIfUnknown(doc *Document) { return } - if doc.SkipReason != "" { + if doc.SkipReason != SkipReasonNone { // If this document has been skipped, it's likely very large, or it's a non-code file like binary. // In this case, we just guess the language based on file name to avoid examining the contents. // Note: passing nil content is allowed by the go-enry contract (the underlying library we use here). @@ -414,14 +414,12 @@ func DetermineLanguageIfUnknown(doc *Document) { // Add a file which only occurs in certain branches. func (b *ShardBuilder) Add(doc Document) error { - hasher := crc64.New(crc64.MakeTable(crc64.ISO)) - - if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 { - doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx) + if index := bytes.IndexByte(doc.Content, 0); index > 0 { + doc.SkipReason = SkipReasonBinary } - if doc.SkipReason != "" { - doc.Content = []byte(notIndexedMarker + doc.SkipReason) + if doc.SkipReason != SkipReasonNone { + doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation()) doc.Symbols = nil doc.SymbolsMetaData = nil } @@ -481,6 +479,7 @@ func (b *ShardBuilder) Add(doc Document) error { b.subRepos = append(b.subRepos, subRepoIdx) b.repos = append(b.repos, uint16(repoIdx)) + hasher := crc64.New(crc64.MakeTable(crc64.ISO)) hasher.Write(doc.Content) b.contentStrings = append(b.contentStrings, docStr) @@ -543,23 +542,23 @@ type DocChecker struct { } // Check returns a reason why the given contents are probably not source texts. -func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error { +func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason { if len(content) == 0 { - return nil + return SkipReasonNone } if len(content) < ngramSize { - return fmt.Errorf("file size smaller than %d", ngramSize) + return SkipReasonTooSmall } if index := bytes.IndexByte(content, 0); index > 0 { - return fmt.Errorf("binary data at byte offset %d", index) + return SkipReasonBinary } // PERF: we only need to do the trigram check if the upperbound on content is greater than // our threshold. Also skip the trigram check if the file is explicitly marked as allowed. if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile { - return nil + return SkipReasonNone } var cur [3]rune @@ -580,10 +579,10 @@ func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile b t.trigrams[runesToNGram(cur)] = struct{}{} if len(t.trigrams) > maxTrigramCount { // probably not text. - return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount) + return SkipReasonTooManyTrigrams } } - return nil + return SkipReasonNone } func (t *DocChecker) clearTrigrams(maxTrigramCount int) { diff --git a/index/shard_builder_test.go b/index/shard_builder_test.go index 662ac8cd6..f3bed5bc1 100644 --- a/index/shard_builder_test.go +++ b/index/shard_builder_test.go @@ -68,7 +68,7 @@ func TestDetermineLanguageIfUnknown(t *testing.T) { name: "skipped file", doc: Document{ Name: "large.js", - SkipReason: "too large", + SkipReason: SkipReasonTooLarge, Content: []byte(notIndexedMarker + "too large"), }, wantLang: "JavaScript", @@ -77,7 +77,7 @@ func TestDetermineLanguageIfUnknown(t *testing.T) { name: "skipped file with unknown extension", doc: Document{ Name: "deadb33f", - SkipReason: "binary", + SkipReason: SkipReasonBinary, Content: []byte(notIndexedMarker + "binary"), }, wantLang: "", diff --git a/internal/e2e/examples/example.bin b/internal/e2e/examples/example.bin new file mode 100644 index 0000000000000000000000000000000000000000..e9feaef2378cf4976474d1cc6e62975558f82bb7 GIT binary patch literal 14 VcmYdHN>)foO;cb{NK8uR0stTR1Lgn# literal 0 HcmV?d00001 diff --git a/internal/e2e/scoring_test.go b/internal/e2e/scoring_test.go index d2f8d32c3..7fbb93bee 100644 --- a/internal/e2e/scoring_test.go +++ b/internal/e2e/scoring_test.go @@ -72,6 +72,11 @@ func TestBM25(t *testing.T) { t.Fatal(err) } + exampleBin, err := os.ReadFile("./examples/example.bin") + if err != nil { + t.Fatal(err) + } + cases := []scoreCase{ { // Matches on both filename and content @@ -153,13 +158,22 @@ func TestBM25(t *testing.T) { wantScore: 2.07, }, { - // Match on test should be scored lower than regular files + // Match on test should be downweighted fileName: "test_example.py", query: &query.Substring{Pattern: "example"}, language: "Python", - // sum-termFrequencyScore: 1.0, length-ratio: 0.00 + // sum-termFrequencyScore: 1.00, length-ratio: 0.00 wantScore: 1.69, }, + { + // Match on binary should be downweighted + fileName: "example.bin", + query: &query.Substring{Pattern: "example"}, + language: "", + content: exampleBin, + // sum-termFrequencyScore: 1.00, length-ratio: 1.00 + wantScore: 1.00, + }, } for _, c := range cases { diff --git a/internal/gitindex/index.go b/internal/gitindex/index.go index 432784252..ebe267b24 100644 --- a/internal/gitindex/index.go +++ b/internal/gitindex/index.go @@ -882,7 +882,7 @@ func createDocument(key fileKey, // We filter out large documents when fetching the repo. So if an object is too large, it will not be found. if errors.Is(err, plumbing.ErrObjectNotFound) { - return skippedLargeDoc(key, branches, opts), nil + return skippedLargeDoc(key, branches), nil } if err != nil { @@ -891,7 +891,7 @@ func createDocument(key fileKey, keyFullPath := key.FullPath() if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) { - return skippedLargeDoc(key, branches, opts), nil + return skippedLargeDoc(key, branches), nil } contents, err := blobContents(blob) @@ -907,9 +907,9 @@ func createDocument(key fileKey, }, nil } -func skippedLargeDoc(key fileKey, branches []string, opts index.Options) index.Document { +func skippedLargeDoc(key fileKey, branches []string) index.Document { return index.Document{ - SkipReason: fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax), + SkipReason: index.SkipReasonTooLarge, Name: key.FullPath(), Branches: branches, SubRepositoryPath: key.SubRepoPath,