From 9358161b692024d52852e3737a4dfe3e5476eb3b Mon Sep 17 00:00:00 2001
From: thomas <julietibs@apache.org>
Date: Thu, 6 Mar 2025 13:45:16 -0800
Subject: [PATCH] ranking: downweight binary files

---
 cmd/zoekt-index/main.go           |   2 +-
 index/builder.go                  |  34 ++++----------------
 index/builder_test.go             |   6 ++--
 index/document.go                 |  50 ++++++++++++++++++++++++++++++
 index/file_category.go            |  18 ++++++++---
 index/index_test.go               |  18 +++++------
 index/shard_builder.go            |  27 ++++++++--------
 index/shard_builder_test.go       |   4 +--
 internal/e2e/examples/example.bin | Bin 0 -> 14 bytes
 internal/e2e/scoring_test.go      |  18 +++++++++--
 internal/gitindex/index.go        |   8 ++---
 11 files changed, 118 insertions(+), 67 deletions(-)
 create mode 100644 index/document.go
 create mode 100644 internal/e2e/examples/example.bin

diff --git a/cmd/zoekt-index/main.go b/cmd/zoekt-index/main.go
index bd7c4ddca..74186753e 100644
--- a/cmd/zoekt-index/main.go
+++ b/cmd/zoekt-index/main.go
@@ -137,7 +137,7 @@ func indexArg(arg string, opts index.Options, ignore map[string]struct{}) error
 		if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) {
 			if err := builder.Add(index.Document{
 				Name:       displayName,
-				SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax),
+				SkipReason: index.SkipReasonTooLarge,
 			}); err != nil {
 				return err
 			}
diff --git a/index/builder.go b/index/builder.go
index 6c93d1008..415740f68 100644
--- a/index/builder.go
+++ b/index/builder.go
@@ -601,17 +601,17 @@ func (b *Builder) Add(doc Document) error {
 		// we pass through a part of the source tree with binary/large
 		// files, the corresponding shard would be mostly empty, so
 		// insert a reason here too.
-		doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
-	} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
-		doc.SkipReason = err.Error()
+		doc.SkipReason = SkipReasonTooLarge
+	} else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
+		doc.SkipReason = skip
 	}
 
 	b.todo = append(b.todo, &doc)
 
-	if doc.SkipReason == "" {
+	if doc.SkipReason == SkipReasonNone {
 		b.size += len(doc.Name) + len(doc.Content)
 	} else {
-		b.size += len(doc.Name) + len(doc.SkipReason)
+		b.size += len(doc.Name)
 		// Drop the content if we are skipping the document. Skipped content is not counted towards the
 		// shard size limit, so otherwise we might buffer too much data in memory before flushing.
 		doc.Content = nil
@@ -865,7 +865,7 @@ type rankedDoc struct {
 // have a higher chance of being searched before limits kick in.
 func rank(d *Document, origIdx int) []float64 {
 	skipped := 0.0
-	if d.SkipReason != "" {
+	if d.SkipReason != SkipReasonNone {
 		skipped = 1.0
 	}
 
@@ -1074,28 +1074,6 @@ func (e *deltaIndexOptionsMismatchError) Error() string {
 	return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
 }
 
-// Document holds a document (file) to index.
-type Document struct {
-	Name              string
-	Content           []byte
-	Branches          []string
-	SubRepositoryPath string
-	Language          string
-	Category          FileCategory
-
-	// If set, something is wrong with the file contents, and this
-	// is the reason it wasn't indexed.
-	SkipReason string
-
-	// Document sections for symbols. Offsets should use bytes.
-	Symbols         []DocumentSection
-	SymbolsMetaData []*zoekt.Symbol
-}
-
-type DocumentSection struct {
-	Start, End uint32
-}
-
 // umask holds the Umask of the current process
 var umask os.FileMode
 
diff --git a/index/builder_test.go b/index/builder_test.go
index ce4fcb8b0..ec36001eb 100644
--- a/index/builder_test.go
+++ b/index/builder_test.go
@@ -246,7 +246,7 @@ func TestDontCountContentOfSkippedFiles(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	if len(b.todo) != 1 || b.todo[0].SkipReason == "" {
+	if len(b.todo) != 1 || b.todo[0].SkipReason == SkipReasonNone {
 		t.Fatalf("document should have been skipped")
 	}
 	if b.todo[0].Content != nil {
@@ -1143,7 +1143,7 @@ func TestFileRank(t *testing.T) {
 		docs: []*Document{
 			{
 				Name:       "binary_file",
-				SkipReason: "binary file",
+				SkipReason: SkipReasonBinary,
 			},
 			{
 				Name:    "some_test.go",
@@ -1151,7 +1151,7 @@ func TestFileRank(t *testing.T) {
 			},
 			{
 				Name:       "large_file.go",
-				SkipReason: "too large",
+				SkipReason: SkipReasonTooLarge,
 			},
 			{
 				Name:    "file.go",
diff --git a/index/document.go b/index/document.go
new file mode 100644
index 000000000..68a5b1e98
--- /dev/null
+++ b/index/document.go
@@ -0,0 +1,50 @@
+package index
+
+import "github.com/sourcegraph/zoekt"
+
+// Document holds a document (file) to index.
+type Document struct {
+	Name              string
+	Content           []byte
+	Branches          []string
+	SubRepositoryPath string
+	Language          string
+	Category          FileCategory
+
+	SkipReason SkipReason
+
+	// Document sections for symbols. Offsets should use bytes.
+	Symbols         []DocumentSection
+	SymbolsMetaData []*zoekt.Symbol
+}
+
+type SkipReason int
+
+const (
+	SkipReasonNone SkipReason = iota
+	SkipReasonTooLarge
+	SkipReasonTooSmall
+	SkipReasonBinary
+	SkipReasonTooManyTrigrams
+)
+
+func (s SkipReason) explanation() string {
+	switch s {
+	case SkipReasonNone:
+		return ""
+	case SkipReasonTooLarge:
+		return "exceeds the maximum size limit"
+	case SkipReasonTooSmall:
+		return "contains too few trigrams"
+	case SkipReasonBinary:
+		return "contains binary content"
+	case SkipReasonTooManyTrigrams:
+		return "contains too many trigrams"
+	default:
+		return "unknown skip reason"
+	}
+}
+
+type DocumentSection struct {
+	Start, End uint32
+}
diff --git a/index/file_category.go b/index/file_category.go
index 60561535d..6ca4800af 100644
--- a/index/file_category.go
+++ b/index/file_category.go
@@ -22,16 +22,22 @@ const (
 	FileCategoryGenerated
 	FileCategoryConfig
 	FileCategoryDotFile
+	FileCategoryBinary
 	FileCategoryDocumentation
 )
 
 func DetermineFileCategory(doc *Document) {
+	if doc.SkipReason == SkipReasonBinary {
+		doc.Category = FileCategoryBinary
+		return
+	}
+
 	name := doc.Name
 	content := doc.Content
 
-	// If this document has been skipped, it's likely very large. In this case, we just guess the category based
-	// on the filename to avoid examining the contents. Note: passing nil content is allowed by the go-enry contract.
-	if doc.SkipReason != "" {
+	// If this document was skipped because it was too large, just guess the category based on the filename to avoid
+	// examining the contents. Note: passing nil content is allowed by the go-enry contract.
+	if doc.SkipReason == SkipReasonTooLarge || doc.SkipReason == SkipReasonBinary {
 		content = nil
 	}
 
@@ -56,7 +62,7 @@ func DetermineFileCategory(doc *Document) {
 // lowPriority returns true if this file category is considered 'low priority'. This is used
 // in search scoring to down-weight matches in these files.
 func (c FileCategory) lowPriority() bool {
-	return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated
+	return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated || c == FileCategoryBinary
 }
 
 func (c FileCategory) encode() (byte, error) {
@@ -77,6 +83,8 @@ func (c FileCategory) encode() (byte, error) {
 		return 6, nil
 	case FileCategoryDocumentation:
 		return 7, nil
+	case FileCategoryBinary:
+		return 8, nil
 	default:
 		return 0, errors.New("unrecognized file category")
 	}
@@ -98,6 +106,8 @@ func decodeCategory(c byte) (FileCategory, error) {
 		return FileCategoryDotFile, nil
 	case 7:
 		return FileCategoryDocumentation, nil
+	case 8:
+		return FileCategoryBinary, nil
 	default:
 		return FileCategoryMissing, errors.New("unrecognized file category")
 	}
diff --git a/index/index_test.go b/index/index_test.go
index 0621e53d7..1cb36d615 100644
--- a/index/index_test.go
+++ b/index/index_test.go
@@ -2403,7 +2403,7 @@ func TestUnicodeVariableLength(t *testing.T) {
 
 	t.Run("LineMatches", func(t *testing.T) {
 		b := testShardBuilder(t, nil,
-			Document{Name: "f1", Content: []byte(corpus)})
+			Document{Name: "f1", Content: corpus})
 
 		res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true})
 		if len(res.Files) != 1 {
@@ -2413,7 +2413,7 @@ func TestUnicodeVariableLength(t *testing.T) {
 
 	t.Run("ChunkMatches", func(t *testing.T) {
 		b := testShardBuilder(t, nil,
-			Document{Name: "f1", Content: []byte(corpus)})
+			Document{Name: "f1", Content: corpus})
 
 		res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}, chunkOpts)
 		if len(res.Files) != 1 {
@@ -3191,7 +3191,7 @@ func TestSymbolRegexpAll(t *testing.T) {
 			}
 
 			for j, sec := range want.Symbols {
-				if sec.Start != uint32(got[j].Start.ByteOffset) {
+				if sec.Start != got[j].Start.ByteOffset {
 					t.Fatalf("got offset %d, want %d in doc %s", got[j].Start.ByteOffset, sec.Start, want.Name)
 				}
 			}
@@ -3372,24 +3372,24 @@ func TestDocChecker(t *testing.T) {
 
 	// Test valid and invalid text
 	for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} {
-		if err := docChecker.Check([]byte(text), 20000, false); err != nil {
-			t.Errorf("Check(%q): %v", text, err)
+		if skip := docChecker.Check([]byte(text), 20000, false); skip != SkipReasonNone {
+			t.Errorf("Check(%q): %v", text, skip)
 		}
 	}
 	for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} {
-		if err := docChecker.Check([]byte(text), 15, false); err == nil {
+		if skip := docChecker.Check([]byte(text), 15, false); skip == SkipReasonNone {
 			t.Errorf("Check(%q) succeeded", text)
 		}
 	}
 
 	// Test valid and invalid text with an allowed large file
 	for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} {
-		if err := docChecker.Check([]byte(text), 15, true); err != nil {
-			t.Errorf("Check(%q): %v", text, err)
+		if skip := docChecker.Check([]byte(text), 15, true); skip != SkipReasonNone {
+			t.Errorf("Check(%q): %v", text, skip)
 		}
 	}
 	for _, text := range []string{"zero\x00byte", "xx"} {
-		if err := docChecker.Check([]byte(text), 15, true); err == nil {
+		if skip := docChecker.Check([]byte(text), 15, true); skip == SkipReasonNone {
 			t.Errorf("Check(%q) succeeded", text)
 		}
 	}
diff --git a/index/shard_builder.go b/index/shard_builder.go
index 2d80b6412..a7139856e 100644
--- a/index/shard_builder.go
+++ b/index/shard_builder.go
@@ -402,7 +402,7 @@ func DetermineLanguageIfUnknown(doc *Document) {
 		return
 	}
 
-	if doc.SkipReason != "" {
+	if doc.SkipReason != SkipReasonNone {
 		// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
 		// In this case, we just guess the language based on file name to avoid examining the contents.
 		// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
@@ -414,14 +414,12 @@ func DetermineLanguageIfUnknown(doc *Document) {
 
 // Add a file which only occurs in certain branches.
 func (b *ShardBuilder) Add(doc Document) error {
-	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
-
-	if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
-		doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
+	if index := bytes.IndexByte(doc.Content, 0); index > 0 {
+		doc.SkipReason = SkipReasonBinary
 	}
 
-	if doc.SkipReason != "" {
-		doc.Content = []byte(notIndexedMarker + doc.SkipReason)
+	if doc.SkipReason != SkipReasonNone {
+		doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
 		doc.Symbols = nil
 		doc.SymbolsMetaData = nil
 	}
@@ -481,6 +479,7 @@ func (b *ShardBuilder) Add(doc Document) error {
 	b.subRepos = append(b.subRepos, subRepoIdx)
 	b.repos = append(b.repos, uint16(repoIdx))
 
+	hasher := crc64.New(crc64.MakeTable(crc64.ISO))
 	hasher.Write(doc.Content)
 
 	b.contentStrings = append(b.contentStrings, docStr)
@@ -543,23 +542,23 @@ type DocChecker struct {
 }
 
 // Check returns a reason why the given contents are probably not source texts.
-func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
+func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
 	if len(content) == 0 {
-		return nil
+		return SkipReasonNone
 	}
 
 	if len(content) < ngramSize {
-		return fmt.Errorf("file size smaller than %d", ngramSize)
+		return SkipReasonTooSmall
 	}
 
 	if index := bytes.IndexByte(content, 0); index > 0 {
-		return fmt.Errorf("binary data at byte offset %d", index)
+		return SkipReasonBinary
 	}
 
 	// PERF: we only need to do the trigram check if the upperbound on content is greater than
 	// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
 	if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
-		return nil
+		return SkipReasonNone
 	}
 
 	var cur [3]rune
@@ -580,10 +579,10 @@ func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile b
 		t.trigrams[runesToNGram(cur)] = struct{}{}
 		if len(t.trigrams) > maxTrigramCount {
 			// probably not text.
-			return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
+			return SkipReasonTooManyTrigrams
 		}
 	}
-	return nil
+	return SkipReasonNone
 }
 
 func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
diff --git a/index/shard_builder_test.go b/index/shard_builder_test.go
index 662ac8cd6..f3bed5bc1 100644
--- a/index/shard_builder_test.go
+++ b/index/shard_builder_test.go
@@ -68,7 +68,7 @@ func TestDetermineLanguageIfUnknown(t *testing.T) {
 			name: "skipped file",
 			doc: Document{
 				Name:       "large.js",
-				SkipReason: "too large",
+				SkipReason: SkipReasonTooLarge,
 				Content:    []byte(notIndexedMarker + "too large"),
 			},
 			wantLang: "JavaScript",
@@ -77,7 +77,7 @@ func TestDetermineLanguageIfUnknown(t *testing.T) {
 			name: "skipped file with unknown extension",
 			doc: Document{
 				Name:       "deadb33f",
-				SkipReason: "binary",
+				SkipReason: SkipReasonBinary,
 				Content:    []byte(notIndexedMarker + "binary"),
 			},
 			wantLang: "",
diff --git a/internal/e2e/examples/example.bin b/internal/e2e/examples/example.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e9feaef2378cf4976474d1cc6e62975558f82bb7
GIT binary patch
literal 14
VcmYdHN>)foO;cb{NK8uR0stTR1Lgn#

literal 0
HcmV?d00001

diff --git a/internal/e2e/scoring_test.go b/internal/e2e/scoring_test.go
index d2f8d32c3..7fbb93bee 100644
--- a/internal/e2e/scoring_test.go
+++ b/internal/e2e/scoring_test.go
@@ -72,6 +72,11 @@ func TestBM25(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	exampleBin, err := os.ReadFile("./examples/example.bin")
+	if err != nil {
+		t.Fatal(err)
+	}
+
 	cases := []scoreCase{
 		{
 			// Matches on both filename and content
@@ -153,13 +158,22 @@ func TestBM25(t *testing.T) {
 			wantScore: 2.07,
 		},
 		{
-			// Match on test should be scored lower than regular files
+			// Match on test should be downweighted
 			fileName: "test_example.py",
 			query:    &query.Substring{Pattern: "example"},
 			language: "Python",
-			// sum-termFrequencyScore: 1.0, length-ratio: 0.00
+			// sum-termFrequencyScore: 1.00, length-ratio: 0.00
 			wantScore: 1.69,
 		},
+		{
+			// Match on binary should be downweighted
+			fileName: "example.bin",
+			query:    &query.Substring{Pattern: "example"},
+			language: "",
+			content:  exampleBin,
+			// sum-termFrequencyScore: 1.00, length-ratio: 1.00
+			wantScore: 1.00,
+		},
 	}
 
 	for _, c := range cases {
diff --git a/internal/gitindex/index.go b/internal/gitindex/index.go
index 432784252..ebe267b24 100644
--- a/internal/gitindex/index.go
+++ b/internal/gitindex/index.go
@@ -882,7 +882,7 @@ func createDocument(key fileKey,
 
 	// We filter out large documents when fetching the repo. So if an object is too large, it will not be found.
 	if errors.Is(err, plumbing.ErrObjectNotFound) {
-		return skippedLargeDoc(key, branches, opts), nil
+		return skippedLargeDoc(key, branches), nil
 	}
 
 	if err != nil {
@@ -891,7 +891,7 @@ func createDocument(key fileKey,
 
 	keyFullPath := key.FullPath()
 	if blob.Size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(keyFullPath) {
-		return skippedLargeDoc(key, branches, opts), nil
+		return skippedLargeDoc(key, branches), nil
 	}
 
 	contents, err := blobContents(blob)
@@ -907,9 +907,9 @@ func createDocument(key fileKey,
 	}, nil
 }
 
-func skippedLargeDoc(key fileKey, branches []string, opts index.Options) index.Document {
+func skippedLargeDoc(key fileKey, branches []string) index.Document {
 	return index.Document{
-		SkipReason:        fmt.Sprintf("file size exceeds maximum size %d", opts.SizeMax),
+		SkipReason:        index.SkipReasonTooLarge,
 		Name:              key.FullPath(),
 		Branches:          branches,
 		SubRepositoryPath: key.SubRepoPath,