Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/zoekt-index/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ func indexArg(arg string, opts index.Options, ignore map[string]struct{}) error
if f.size > int64(opts.SizeMax) && !opts.IgnoreSizeMax(displayName) {
if err := builder.Add(index.Document{
Name: displayName,
SkipReason: fmt.Sprintf("document size %d larger than limit %d", f.size, opts.SizeMax),
SkipReason: index.SkipReasonTooLarge,
}); err != nil {
return err
}
Expand Down
34 changes: 6 additions & 28 deletions index/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -601,17 +601,17 @@ func (b *Builder) Add(doc Document) error {
// we pass through a part of the source tree with binary/large
// files, the corresponding shard would be mostly empty, so
// insert a reason here too.
doc.SkipReason = fmt.Sprintf("document size %d larger than limit %d", len(doc.Content), b.opts.SizeMax)
} else if err := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); err != nil {
doc.SkipReason = err.Error()
doc.SkipReason = SkipReasonTooLarge
} else if skip := b.docChecker.Check(doc.Content, b.opts.TrigramMax, allowLargeFile); skip != SkipReasonNone {
doc.SkipReason = skip
}

b.todo = append(b.todo, &doc)

if doc.SkipReason == "" {
if doc.SkipReason == SkipReasonNone {
b.size += len(doc.Name) + len(doc.Content)
} else {
b.size += len(doc.Name) + len(doc.SkipReason)
b.size += len(doc.Name)
// Drop the content if we are skipping the document. Skipped content is not counted towards the
// shard size limit, so otherwise we might buffer too much data in memory before flushing.
doc.Content = nil
Expand Down Expand Up @@ -865,7 +865,7 @@ type rankedDoc struct {
// have a higher chance of being searched before limits kick in.
func rank(d *Document, origIdx int) []float64 {
skipped := 0.0
if d.SkipReason != "" {
if d.SkipReason != SkipReasonNone {
skipped = 1.0
}

Expand Down Expand Up @@ -1074,28 +1074,6 @@ func (e *deltaIndexOptionsMismatchError) Error() string {
return fmt.Sprintf("one or more index options for shard %q do not match Builder's index options. These index option updates are incompatible with delta build. New index options: %+v", e.shardName, e.newOptions)
}

// Document holds a document (file) to index.
type Document struct {
Name string
Content []byte
Branches []string
SubRepositoryPath string
Language string
Category FileCategory

// If set, something is wrong with the file contents, and this
// is the reason it wasn't indexed.
SkipReason string

// Document sections for symbols. Offsets should use bytes.
Symbols []DocumentSection
SymbolsMetaData []*zoekt.Symbol
}

type DocumentSection struct {
Start, End uint32
}

// umask holds the Umask of the current process
var umask os.FileMode

Expand Down
6 changes: 3 additions & 3 deletions index/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ func TestDontCountContentOfSkippedFiles(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if len(b.todo) != 1 || b.todo[0].SkipReason == "" {
if len(b.todo) != 1 || b.todo[0].SkipReason == SkipReasonNone {
t.Fatalf("document should have been skipped")
}
if b.todo[0].Content != nil {
Expand Down Expand Up @@ -1143,15 +1143,15 @@ func TestFileRank(t *testing.T) {
docs: []*Document{
{
Name: "binary_file",
SkipReason: "binary file",
SkipReason: SkipReasonBinary,
},
{
Name: "some_test.go",
Content: []byte("bla"),
},
{
Name: "large_file.go",
SkipReason: "too large",
SkipReason: SkipReasonTooLarge,
},
{
Name: "file.go",
Expand Down
50 changes: 50 additions & 0 deletions index/document.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package index

import "github.com/sourcegraph/zoekt"

// Document holds a document (file) to index.
type Document struct {
Name string
Content []byte
Branches []string
SubRepositoryPath string
Language string
Category FileCategory

SkipReason SkipReason

// Document sections for symbols. Offsets should use bytes.
Symbols []DocumentSection
SymbolsMetaData []*zoekt.Symbol
}

type SkipReason int

const (
SkipReasonNone SkipReason = iota
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're doing an int enum, it's considered good practice to start at 1. It helps catch uninitialized cases.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a special situation, in that the first enum value purposefully represents the "not set" case. That's why it's called SkipReasonNone. I want the flexibility to not have to set it explicitly, and have it default to SkipReasonNone.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, that makes sense.

SkipReasonTooLarge
SkipReasonTooSmall
SkipReasonBinary
SkipReasonTooManyTrigrams
)

func (s SkipReason) explanation() string {
switch s {
case SkipReasonNone:
return ""
case SkipReasonTooLarge:
return "exceeds the maximum size limit"
case SkipReasonTooSmall:
return "contains too few trigrams"
case SkipReasonBinary:
return "contains binary content"
case SkipReasonTooManyTrigrams:
return "contains too many trigrams"
default:
return "unknown skip reason"
}
}

type DocumentSection struct {
Start, End uint32
}
18 changes: 14 additions & 4 deletions index/file_category.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,22 @@ const (
FileCategoryGenerated
FileCategoryConfig
FileCategoryDotFile
FileCategoryBinary
FileCategoryDocumentation
)

func DetermineFileCategory(doc *Document) {
if doc.SkipReason == SkipReasonBinary {
doc.Category = FileCategoryBinary
return
}

name := doc.Name
content := doc.Content

// If this document has been skipped, it's likely very large. In this case, we just guess the category based
// on the filename to avoid examining the contents. Note: passing nil content is allowed by the go-enry contract.
if doc.SkipReason != "" {
// If this document was skipped because it was too large, just guess the category based on the filename to avoid
// examining the contents. Note: passing nil content is allowed by the go-enry contract.
if doc.SkipReason == SkipReasonTooLarge || doc.SkipReason == SkipReasonBinary {
content = nil
}

Expand All @@ -56,7 +62,7 @@ func DetermineFileCategory(doc *Document) {
// lowPriority returns true if this file category is considered 'low priority'. This is used
// in search scoring to down-weight matches in these files.
func (c FileCategory) lowPriority() bool {
return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated
return c == FileCategoryTest || c == FileCategoryVendored || c == FileCategoryGenerated || c == FileCategoryBinary
}

func (c FileCategory) encode() (byte, error) {
Expand All @@ -77,6 +83,8 @@ func (c FileCategory) encode() (byte, error) {
return 6, nil
case FileCategoryDocumentation:
return 7, nil
case FileCategoryBinary:
return 8, nil
default:
return 0, errors.New("unrecognized file category")
}
Expand All @@ -98,6 +106,8 @@ func decodeCategory(c byte) (FileCategory, error) {
return FileCategoryDotFile, nil
case 7:
return FileCategoryDocumentation, nil
case 8:
return FileCategoryBinary, nil
default:
return FileCategoryMissing, errors.New("unrecognized file category")
}
Expand Down
18 changes: 9 additions & 9 deletions index/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2403,7 +2403,7 @@ func TestUnicodeVariableLength(t *testing.T) {

t.Run("LineMatches", func(t *testing.T) {
b := testShardBuilder(t, nil,
Document{Name: "f1", Content: []byte(corpus)})
Document{Name: "f1", Content: corpus})

res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true})
if len(res.Files) != 1 {
Expand All @@ -2413,7 +2413,7 @@ func TestUnicodeVariableLength(t *testing.T) {

t.Run("ChunkMatches", func(t *testing.T) {
b := testShardBuilder(t, nil,
Document{Name: "f1", Content: []byte(corpus)})
Document{Name: "f1", Content: corpus})

res := searchForTest(t, b, &query.Substring{Pattern: needle, Content: true}, chunkOpts)
if len(res.Files) != 1 {
Expand Down Expand Up @@ -3191,7 +3191,7 @@ func TestSymbolRegexpAll(t *testing.T) {
}

for j, sec := range want.Symbols {
if sec.Start != uint32(got[j].Start.ByteOffset) {
if sec.Start != got[j].Start.ByteOffset {
t.Fatalf("got offset %d, want %d in doc %s", got[j].Start.ByteOffset, sec.Start, want.Name)
}
}
Expand Down Expand Up @@ -3372,24 +3372,24 @@ func TestDocChecker(t *testing.T) {

// Test valid and invalid text
for _, text := range []string{"", "simple ascii", "símplé unicödé", "\uFEFFwith utf8 'bom'", "with \uFFFD unicode replacement char"} {
if err := docChecker.Check([]byte(text), 20000, false); err != nil {
t.Errorf("Check(%q): %v", text, err)
if skip := docChecker.Check([]byte(text), 20000, false); skip != SkipReasonNone {
t.Errorf("Check(%q): %v", text, skip)
}
}
for _, text := range []string{"zero\x00byte", "xx", "0123456789abcdefghi"} {
if err := docChecker.Check([]byte(text), 15, false); err == nil {
if skip := docChecker.Check([]byte(text), 15, false); skip == SkipReasonNone {
t.Errorf("Check(%q) succeeded", text)
}
}

// Test valid and invalid text with an allowed large file
for _, text := range []string{"0123456789abcdefghi", "qwertyuiopasdfghjklzxcvbnm"} {
if err := docChecker.Check([]byte(text), 15, true); err != nil {
t.Errorf("Check(%q): %v", text, err)
if skip := docChecker.Check([]byte(text), 15, true); skip != SkipReasonNone {
t.Errorf("Check(%q): %v", text, skip)
}
}
for _, text := range []string{"zero\x00byte", "xx"} {
if err := docChecker.Check([]byte(text), 15, true); err == nil {
if skip := docChecker.Check([]byte(text), 15, true); skip == SkipReasonNone {
t.Errorf("Check(%q) succeeded", text)
}
}
Expand Down
27 changes: 13 additions & 14 deletions index/shard_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ func DetermineLanguageIfUnknown(doc *Document) {
return
}

if doc.SkipReason != "" {
if doc.SkipReason != SkipReasonNone {
// If this document has been skipped, it's likely very large, or it's a non-code file like binary.
// In this case, we just guess the language based on file name to avoid examining the contents.
// Note: passing nil content is allowed by the go-enry contract (the underlying library we use here).
Expand All @@ -414,14 +414,12 @@ func DetermineLanguageIfUnknown(doc *Document) {

// Add a file which only occurs in certain branches.
func (b *ShardBuilder) Add(doc Document) error {
hasher := crc64.New(crc64.MakeTable(crc64.ISO))

if idx := bytes.IndexByte(doc.Content, 0); idx >= 0 {
doc.SkipReason = fmt.Sprintf("binary content at byte offset %d", idx)
if index := bytes.IndexByte(doc.Content, 0); index > 0 {
doc.SkipReason = SkipReasonBinary
}

if doc.SkipReason != "" {
doc.Content = []byte(notIndexedMarker + doc.SkipReason)
if doc.SkipReason != SkipReasonNone {
doc.Content = []byte(notIndexedMarker + doc.SkipReason.explanation())
doc.Symbols = nil
doc.SymbolsMetaData = nil
}
Expand Down Expand Up @@ -481,6 +479,7 @@ func (b *ShardBuilder) Add(doc Document) error {
b.subRepos = append(b.subRepos, subRepoIdx)
b.repos = append(b.repos, uint16(repoIdx))

hasher := crc64.New(crc64.MakeTable(crc64.ISO))
hasher.Write(doc.Content)

b.contentStrings = append(b.contentStrings, docStr)
Expand Down Expand Up @@ -543,23 +542,23 @@ type DocChecker struct {
}

// Check returns a reason why the given contents are probably not source texts.
func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) error {
func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile bool) SkipReason {
if len(content) == 0 {
return nil
return SkipReasonNone
}

if len(content) < ngramSize {
return fmt.Errorf("file size smaller than %d", ngramSize)
return SkipReasonTooSmall
}

if index := bytes.IndexByte(content, 0); index > 0 {
return fmt.Errorf("binary data at byte offset %d", index)
return SkipReasonBinary
}

// PERF: we only need to do the trigram check if the upperbound on content is greater than
// our threshold. Also skip the trigram check if the file is explicitly marked as allowed.
if trigramsUpperBound := len(content) - ngramSize + 1; trigramsUpperBound <= maxTrigramCount || allowLargeFile {
return nil
return SkipReasonNone
}

var cur [3]rune
Expand All @@ -580,10 +579,10 @@ func (t *DocChecker) Check(content []byte, maxTrigramCount int, allowLargeFile b
t.trigrams[runesToNGram(cur)] = struct{}{}
if len(t.trigrams) > maxTrigramCount {
// probably not text.
return fmt.Errorf("number of trigrams exceeds %d", maxTrigramCount)
return SkipReasonTooManyTrigrams
}
}
return nil
return SkipReasonNone
}

func (t *DocChecker) clearTrigrams(maxTrigramCount int) {
Expand Down
4 changes: 2 additions & 2 deletions index/shard_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func TestDetermineLanguageIfUnknown(t *testing.T) {
name: "skipped file",
doc: Document{
Name: "large.js",
SkipReason: "too large",
SkipReason: SkipReasonTooLarge,
Content: []byte(notIndexedMarker + "too large"),
},
wantLang: "JavaScript",
Expand All @@ -77,7 +77,7 @@ func TestDetermineLanguageIfUnknown(t *testing.T) {
name: "skipped file with unknown extension",
doc: Document{
Name: "deadb33f",
SkipReason: "binary",
SkipReason: SkipReasonBinary,
Content: []byte(notIndexedMarker + "binary"),
},
wantLang: "",
Expand Down
Binary file added internal/e2e/examples/example.bin
Binary file not shown.
18 changes: 16 additions & 2 deletions internal/e2e/scoring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ func TestBM25(t *testing.T) {
t.Fatal(err)
}

exampleBin, err := os.ReadFile("./examples/example.bin")
if err != nil {
t.Fatal(err)
}

cases := []scoreCase{
{
// Matches on both filename and content
Expand Down Expand Up @@ -153,13 +158,22 @@ func TestBM25(t *testing.T) {
wantScore: 2.07,
},
{
// Match on test should be scored lower than regular files
// Match on test should be downweighted
fileName: "test_example.py",
query: &query.Substring{Pattern: "example"},
language: "Python",
// sum-termFrequencyScore: 1.0, length-ratio: 0.00
// sum-termFrequencyScore: 1.00, length-ratio: 0.00
wantScore: 1.69,
},
{
// Match on binary should be downweighted
fileName: "example.bin",
query: &query.Substring{Pattern: "example"},
language: "",
content: exampleBin,
// sum-termFrequencyScore: 1.00, length-ratio: 1.00
wantScore: 1.00,
},
}

for _, c := range cases {
Expand Down
Loading