From 4fbe1b5b00d87bc16a75a4eae1bb19464bbeff50 Mon Sep 17 00:00:00 2001 From: John Mason Date: Fri, 3 Oct 2025 14:41:28 -0400 Subject: [PATCH] Add shard filtering for meta queries --- search/shards.go | 18 ++++- search/shards_test.go | 176 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) diff --git a/search/shards.go b/search/shards.go index e7c23f1ca..bee903f99 100644 --- a/search/shards.go +++ b/search/shards.go @@ -436,6 +436,22 @@ func doSelectRepoSet(shards []*rankedShard, and *query.And) ([]*rankedShard, que } return false }) + case *query.Meta: + // Meta queries filter repositories based on metadata fields. + // By checking this at the shard level, we can skip entire shards + // that don't contain any matching repositories, avoiding expensive + // I/O operations. + setSize = 0 // Unknown size, we'll filter based on metadata + hasRepos = hasReposForPredicate(func(repo *zoekt.Repository) bool { + if repo.Metadata == nil { + return false + } + v, ok := repo.Metadata[setQuery.Field] + if !ok { + return false + } + return setQuery.Value.MatchString(v) + }) default: continue } @@ -486,7 +502,7 @@ func doSelectRepoSet(shards []*rankedShard, and *query.And) ([]*rankedShard, que // shard indexData.simplify will simplify to (and true (content baz)) -> // (content baz). This work can be done now once, rather than per shard. switch c := c.(type) { - case *query.RepoSet, *query.RepoIDs, *query.Repo: + case *query.RepoSet, *query.RepoIDs, *query.Repo, *query.Meta: and.Children[i] = &query.Const{Value: true} return filtered, query.Simplify(and) diff --git a/search/shards_test.go b/search/shards_test.go index b4eb5974d..668075b1f 100644 --- a/search/shards_test.go +++ b/search/shards_test.go @@ -387,6 +387,182 @@ func TestFilteringShardsByRepoSetOrBranchesReposOrRepoIDs(t *testing.T) { } } +func TestFilteringShardsByMeta(t *testing.T) { + ss := newShardedSearcher(1) + + // Create repos with different metadata values + // We'll create 30 repos total: + // - 10 with nickname="project-A" + // - 10 with nickname="project-B" + // - 10 with no metadata + n := 30 + projectARepos := []string{} + projectBRepos := []string{} + + // Common document that will be in all repos + doc := index.Document{ + Name: "common.go", + Content: []byte("needle haystack"), + } + + for i := range n { + shardName := fmt.Sprintf("shard%d", i) + repoName := fmt.Sprintf("repository%.3d", i) + + var metadata map[string]string + if i < 10 { + // First 10 repos have project-A + metadata = map[string]string{"nickname": "project-A", "visibility": "public"} + projectARepos = append(projectARepos, repoName) + } else if i < 20 { + // Next 10 repos have project-B + metadata = map[string]string{"nickname": "project-B", "visibility": "private"} + projectBRepos = append(projectBRepos, repoName) + } + // Last 10 repos have no metadata + + repo := &zoekt.Repository{ + ID: uint32(i + 1), + Name: repoName, + Metadata: metadata, + } + + ss.replace(map[string]zoekt.Searcher{ + shardName: searcherForTest(t, testShardBuilder(t, repo, doc)), + }) + } + + // Test 1: Search without Meta filter - should search all shards + res, err := ss.Search(context.Background(), &query.Substring{Pattern: "needle"}, &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search without filter: %v", err) + } + if len(res.Files) != n { + t.Fatalf("no meta filter: got %d results, want %d", len(res.Files), n) + } + + sub := &query.Substring{Pattern: "needle"} + + // Helper function to extract unique repo names from search results + getRepoNames := func(files []zoekt.FileMatch) []string { + repoSet := make(map[string]struct{}) + for _, f := range files { + repoSet[f.Repository] = struct{}{} + } + repos := make([]string, 0, len(repoSet)) + for repo := range repoSet { + repos = append(repos, repo) + } + sort.Strings(repos) + return repos + } + + // Test 2: Filter by nickname="project-A" - should only search 10 shards + metaQueryA := &query.Meta{ + Field: "nickname", + Value: regexp.MustCompile("^project-A$"), + } + res, err = ss.Search(context.Background(), query.NewAnd(metaQueryA, sub), &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search with Meta filter A: %v", err) + } + gotRepos := getRepoNames(res.Files) + wantRepos := append([]string{}, projectARepos...) + sort.Strings(wantRepos) + if !reflect.DeepEqual(gotRepos, wantRepos) { + t.Fatalf("Meta(nickname=project-A):\ngot repos: %v\nwant repos: %v", gotRepos, wantRepos) + } + + // Test 3: Filter by nickname="project-B" - should only search 10 shards + metaQueryB := &query.Meta{ + Field: "nickname", + Value: regexp.MustCompile("^project-B$"), + } + res, err = ss.Search(context.Background(), query.NewAnd(metaQueryB, sub), &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search with Meta filter B: %v", err) + } + gotRepos = getRepoNames(res.Files) + wantRepos = append([]string{}, projectBRepos...) + sort.Strings(wantRepos) + if !reflect.DeepEqual(gotRepos, wantRepos) { + t.Fatalf("Meta(nickname=project-B):\ngot repos: %v\nwant repos: %v", gotRepos, wantRepos) + } + + // Test 4: Filter by visibility="public" - should only search 10 shards (project-A repos) + metaQueryPublic := &query.Meta{ + Field: "visibility", + Value: regexp.MustCompile("^public$"), + } + res, err = ss.Search(context.Background(), query.NewAnd(metaQueryPublic, sub), &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search with Meta filter public: %v", err) + } + gotRepos = getRepoNames(res.Files) + wantRepos = append([]string{}, projectARepos...) + sort.Strings(wantRepos) + if !reflect.DeepEqual(gotRepos, wantRepos) { + t.Fatalf("Meta(visibility=public):\ngot repos: %v\nwant repos: %v", gotRepos, wantRepos) + } + + // Test 5: Filter by non-existent field - should return 0 results + metaQueryNonExistent := &query.Meta{ + Field: "nonexistent_field", + Value: regexp.MustCompile(".*"), + } + res, err = ss.Search(context.Background(), query.NewAnd(metaQueryNonExistent, sub), &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search with Meta filter non-existent: %v", err) + } + if len(res.Files) != 0 { + t.Fatalf("Meta(nonexistent_field): got %d results, want 0", len(res.Files)) + } + + // Test 6: Filter by regex pattern matching multiple values + metaQueryRegex := &query.Meta{ + Field: "nickname", + Value: regexp.MustCompile("project-.*"), // Matches both project-A and project-B + } + res, err = ss.Search(context.Background(), query.NewAnd(metaQueryRegex, sub), &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search with Meta regex filter: %v", err) + } + gotRepos = getRepoNames(res.Files) + wantRepos = append(append([]string{}, projectARepos...), projectBRepos...) + sort.Strings(wantRepos) + if !reflect.DeepEqual(gotRepos, wantRepos) { + t.Fatalf("Meta(nickname=project-.*):\ngot repos: %v\nwant repos: %v", gotRepos, wantRepos) + } + + // Test 7: Test that Meta query alone (without content search) works + res, err = ss.Search(context.Background(), metaQueryA, &zoekt.SearchOptions{}) + if err != nil { + t.Fatalf("Search with Meta query alone: %v", err) + } + gotRepos = getRepoNames(res.Files) + wantRepos = append([]string{}, projectARepos...) + sort.Strings(wantRepos) + if !reflect.DeepEqual(gotRepos, wantRepos) { + t.Fatalf("Meta query alone:\ngot repos: %v\nwant repos: %v", gotRepos, wantRepos) + } + + // Test 8: Test with List operation (not just Search) + listRes, err := ss.List(context.Background(), metaQueryA, nil) + if err != nil { + t.Fatalf("List with Meta filter: %v", err) + } + gotListRepos := make([]string, len(listRes.Repos)) + for i, r := range listRes.Repos { + gotListRepos[i] = r.Repository.Name + } + sort.Strings(gotListRepos) + wantRepos = append([]string{}, projectARepos...) + sort.Strings(wantRepos) + if !reflect.DeepEqual(gotListRepos, wantRepos) { + t.Fatalf("List with Meta(nickname=project-A):\ngot repos: %v\nwant repos: %v", gotListRepos, wantRepos) + } +} + func hash(name string) uint32 { h := fnv.New32() h.Write([]byte(name))