Skip to content

Commit

Permalink
fix(core): various fixes and improvements to studio search [v2] (#4704)
Browse files Browse the repository at this point in the history
* fix(core): improve search weighting

* fix(core): filter out hits with no score on the client, don't exclude _id from whole word score generation

* fix(core): ensure that we use optimised search specs for both selections and weighting

* fix(core): allow search traversal depth to be set with SANITY_STUDIO_UNSTABLE_SEARCH_DEPTH environment variable

* fix(core): expose matched indices on array fields in search hit stories

* fix(core): add a cap on total search paths extracted per root-level object

* fix(core): naive first pass of limiting attributes pre-search query generation

* fixup! fix(core): add a cap on total search paths extracted per root-level object

* fix(core): always include user generated experimental search paths, display skipped paths in generated search specs

* fix(core): export all weighted search functions and types

* fixup! fix(core): add a cap on total search paths extracted per root-level object

* fix(core): reduce unnecessary whitespace from search query payloads

* fix(core): don't include duplicate _id and _type values in type selections

* fix(core): ignore duplicate paths when calculating maxAttributes

* docs(core): update docs for applyWeights/createSearchQuery

* fixup! fix(core): export all weighted search functions and types

* fix(core): only add paths to rolling list if not previously added, add support for defining MAX_UNIQUE_ATTRIBUTES via env var

* refactor(core): update consts to better match corresponding env vars

* fixup! fix(core): filter out hits with no score on the client, don't exclude _id from whole word score generation

* fixup! docs(core): update docs for applyWeights/createSearchQuery

* refactor(core): rename parentType to rootType, add clarifying comment
  • Loading branch information
robinpyon committed Jul 20, 2023
1 parent 9a82ec5 commit 3eb6fd3
Show file tree
Hide file tree
Showing 17 changed files with 661 additions and 186 deletions.
2 changes: 1 addition & 1 deletion packages/@sanity/base/src/_exports/_internal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export type {DocumentAvailability} from '../preview/types'
export {AvailabilityReason} from '../preview/types'

export {getSearchableTypes} from '../search/common/utils'
export {createWeightedSearch} from '../search/weighted/createWeightedSearch'
export {createSearchQuery, createWeightedSearch} from '../search/weighted'
export type {WeightedHit} from '../search/weighted/types'

export {createHookFromObservableFactory} from '../util/createHookFromObservableFactory'
Expand Down
10 changes: 2 additions & 8 deletions packages/@sanity/base/src/search/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,6 @@ import {versionedClient} from '../client/versionedClient'
import {getSearchableTypes} from './common/utils'
import {createWeightedSearch} from './weighted/createWeightedSearch'

export type {
SearchOptions,
SearchSort,
SearchTerms,
SearchableType,
WeightedHit,
} from './weighted/types'

// Use >= 2021-03-25 for pt::text() support
const searchClient = versionedClient.withConfig({
apiVersion: '2021-03-25',
Expand All @@ -23,3 +15,5 @@ export default createWeightedSearch(getSearchableTypes(schema), searchClient, {
unique: true,
tag: 'search.global',
})

export * from './weighted'
33 changes: 23 additions & 10 deletions packages/@sanity/base/src/search/weighted/applyWeights.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
calculatePhraseScore,
calculateWordScore,
partitionAndSanitizeSearchTerms,
calculateCharacterScore,
} from './applyWeights'

describe('calculatePhraseScore', () => {
Expand All @@ -11,7 +12,11 @@ describe('calculatePhraseScore', () => {
it('should handle partial matches', () => {
expect(calculatePhraseScore(['the fox'], 'the fox of foo')).toEqual([
0.25,
'[Phrase] Matched 7 of 14 characters',
'[Phrase] 7/14 chars',
])
expect(calculatePhraseScore(['the fox', 'fox of'], 'the fox of foo')).toEqual([
0.4642857142857143,
'[Phrase] 13/14 chars',
])
})
})
Expand All @@ -25,17 +30,25 @@ describe('calculateWordScore', () => {
expect(calculateWordScore(['foo', 'bar'], 'bar & foo')).toEqual([1, '[Word] Exact match'])
})
it('should handle partial matches', () => {
expect(calculateWordScore(['foo'], 'bar foo')).toEqual([
0.25,
'[Word] Matched 1 of 2 terms: [foo]',
])
expect(calculateWordScore(['foo', 'bar'], 'foo')).toEqual([
0.25,
`[Word] Matched 1 of 2 terms: [foo]`,
])
expect(calculateWordScore(['foo'], 'bar foo')).toEqual([0.25, '[Word] 1/2 terms: [foo]'])
expect(calculateWordScore(['foo', 'bar'], 'foo')).toEqual([0.25, `[Word] 1/2 terms: [foo]`])
expect(calculateWordScore(['foo', 'bar', 'baz'], 'foo foo bar')).toEqual([
1 / 3,
`[Word] Matched 2 of 3 terms: [foo, bar]`,
`[Word] 2/3 terms: [foo, bar]`,
])
})
})

describe('calculateCharacterScore', () => {
it('should handle exact matches', () => {
expect(calculateCharacterScore(['bar', 'foo'], 'bar foo')).toEqual([1, '[Char] Contains all'])
})

it('should handle partial matches', () => {
expect(calculateCharacterScore(['foo'], 'bar foo')).toEqual([0.25, '[Char] 3/6 chars'])
expect(calculateCharacterScore(['fo', 'ba'], 'bar foo')).toEqual([
0.3333333333333333,
'[Char] 4/6 chars',
])
})
})
Expand Down
141 changes: 126 additions & 15 deletions packages/@sanity/base/src/search/weighted/applyWeights.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,98 @@ import {SearchHit, WeightedHit, SearchSpec} from './types'

type SearchScore = [number, string]

// takes a set of terms and a value and returns a [score, story] pair where score is a value between 0, 1 and story is the explanation
/**
* Calculates a score (between 0 and 1) indicating general search relevance of an array of
* search tokens within a specific string.
*
* @param searchTerms - All search terms
* @param value - The string to match against
* @returns A [score, story] pair containing the search score as well as a human readable explanation
* @internal
*/
export const calculateScore = (searchTerms: string[], value: string): SearchScore => {
// Separate search terms by phrases (wrapped with quotes) and words.
const {phrases: uniqueSearchPhrases, words: uniqueSearchWords} = partitionAndSanitizeSearchTerms(
searchTerms
)

// Calculate an aggregated score of both phrase and word matches.
// Calculate an aggregated score of words (partial + whole) and phrase matches.
const [charScore, charWhy] = calculateCharacterScore(uniqueSearchWords, value)
const [phraseScore, phraseWhy] = calculatePhraseScore(uniqueSearchPhrases, value)
const [wordScore, wordWhy] = calculateWordScore(uniqueSearchWords, value)
return [phraseScore + wordScore, [wordWhy, phraseWhy].join(', ')]
return [charScore + wordScore + phraseScore, [charWhy, wordWhy, phraseWhy].flat().join(', ')]
}

const stringify = (value: unknown): string =>
typeof value === 'string' ? value : JSON.stringify(value)

/**
* Applies path weights from a supplied SearchSpec to existing search hits to create _weighted_ hits
* augmented with search ranking and human readable explanations.
*
* @param searchSpec - SearchSpec containing path weighting
* @param hits - SearchHit objects to augment
* @param terms - All search terms
* @returns WeightedHit array containing search scores and ranking explanations
* @internal
*/
export function applyWeights(
searchSpec: SearchSpec[],
hits: SearchHit[],
terms: string[] = []
): WeightedHit[] {
const specByType = keyBy(searchSpec, (spec) => spec.typeName)
return hits.map((hit, index) => {

return hits.reduce((allHits, hit, index) => {
const typeSpec = specByType[hit._type]
const stories = typeSpec.paths.map((pathSpec, idx) => {
const value = stringify(hit[`w${idx}`])
const pathHit = ['_id', '_type'].includes(pathSpec.path) ? hit[pathSpec.path] : hit[idx]
const indices = Array.isArray(pathHit) ? findMatchingIndices(terms, pathHit) : null
// Only stringify non-falsy values so null values don't pollute search
const value = pathHit ? stringify(pathHit) : null
if (!value) {
return {path: pathSpec.path, score: 0, why: 'No match'}
}
const [score, why] = calculateScore(terms, value)
return {
indices,
path: pathSpec.path,
score: score * pathSpec.weight,
why: `${why} (*${pathSpec.weight})`,
}
})

const totalScore = stories.reduce((acc, rank) => acc + rank.score, 0)

return {hit, resultIndex: hits.length - index, score: totalScore, stories: stories}
})
/*
* Filter out hits with no score.
* (only if search terms are present, otherwise we always show results)
*
* Due to how we generate search queries, in some cases it's possible to have returned search hits
* which shouldn't be displayed. This can happen when searching on multiple document types and
* user-configured `__experimental_search` paths are in play.
*
* Since search generates a GROQ query with filters that may refer to field names shared across
* multiple document types, it's possible that one document type searches on a field path
* that is hidden by another via `__experimental_search`.
*/
if (terms.length === 0 || totalScore > 0) {
allHits.push({hit, resultIndex: hits.length - index, score: totalScore, stories: stories})
}
return allHits
}, [])
}

/**
* For phrases: score on the total number of matching characters.
* E.g. given the phrases ["the fox", "of london"] for the target value "the wily fox of london"
*
* - "the fox" isn't included in the target value (score: 0)
* - "of london" is included in the target value, and 9 out of 22 characters match (score: 9/22 = ~0.408)
* - non-exact matches have their score divided in half (final score: ~0.204)
*
* @param uniqueSearchPhrases - All search phrases
* @param value - The string to match against
* @returns SearchScore containing the search score as well as a human readable explanation
* @internal
*/
export function calculatePhraseScore(uniqueSearchPhrases: string[], value: string): SearchScore {
const sanitizedValue = value.toLowerCase().trim()
Expand All @@ -67,30 +110,70 @@ export function calculatePhraseScore(uniqueSearchPhrases: string[], value: strin

return fieldScore === 1
? [1, '[Phrase] Exact match']
: [fieldScore / 2, `[Phrase] Matched ${matchCount} of ${sanitizedValue.length} characters`]
: [fieldScore / 2, `[Phrase] ${matchCount}/${sanitizedValue.length} chars`]
}

/**
 * Scores on the total number of characters covered by matching search terms.
 * E.g. given the terms ["bar", "fo"] for the target value "food bar":
 *
 * - "fo" is included in the target value, and 2 out of 7 non-whitespace characters match (score: 2/7)
 * - "bar" is included in the target value, and 3 out of 7 non-whitespace characters match (score: 3/7)
 * - scores are accumulated and the total is divided in half (final score: ~0.357)
 *
 * @param uniqueSearchTerms - A string array of search terms
 * @param value - The string to match against
 * @returns SearchScore containing the search score as well as a human readable explanation
 * @internal
 */
export function calculateCharacterScore(uniqueSearchTerms: string[], value: string): SearchScore {
  const target = value.toLowerCase().trim()
  // Character counts are measured against the value with all spaces removed.
  const targetCompact = target.replace(/ /g, '')

  let score = 0
  let matchedChars = 0
  for (const term of uniqueSearchTerms) {
    if (target.includes(term)) {
      score += term.length / targetCompact.length
      matchedChars += term.length
    }
  }

  return score === 1
    ? [score, `[Char] Contains all`]
    : [score / 2, `[Char] ${matchedChars}/${targetCompact.length} chars`]
}

/**
* Generate a score on the total number of matching _whole_ words.
* E.g. given the words ["the", "fox", "of", "london"] for the target value "the wily fox of london"
*
* - 4 out of 5 words match (score: 4/5 = 0.8)
* - non-exact matches have their score divided in half (final score: 0.4)
*
* @param uniqueSearchTerms - All search terms
* @param value - The string to match against
* @returns SearchScore containing the search score as well as a human readable explanation
* @internal
*/
export function calculateWordScore(uniqueSearchTerms: string[], value: string): SearchScore {
const uniqueValueTerms = uniq(compact(words(toLower(value))))

const matches = intersection(uniqueSearchTerms, uniqueValueTerms)
const all = union(uniqueValueTerms, uniqueSearchTerms)
const fieldScore = matches.length / all.length
const fieldScore = matches.length / all.length || 0
return fieldScore === 1
? [1, '[Word] Exact match']
: [
fieldScore / 2,
`[Word] Matched ${matches.length} of ${all.length} terms: [${matches.join(', ')}]`,
]
: [fieldScore / 2, `[Word] ${matches.length}/${all.length} terms: [${matches.join(', ')}]`]
}

/**
* Partition search terms by phrases (wrapped with quotes) and words.
*
* @param searchTerms - All search terms
* @returns Partitioned phrases and words
* @internal
*/
export function partitionAndSanitizeSearchTerms(
searchTerms: string[]
): {
Expand All @@ -106,6 +189,34 @@ export function partitionAndSanitizeSearchTerms(
}
}

/**
 * Returns matching array indices of `values` containing _any_ member of `uniqueSearchTerms`.
 * When comparing for matches, members of `values` are stringified, trimmed and lowercased.
 *
 * @param uniqueSearchTerms - All search terms
 * @param values - Values to match against (members are stringified)
 * @returns All matching indices in `values`
 * @internal
 */
export function findMatchingIndices(uniqueSearchTerms: string[], values: unknown[]): number[] {
  const {phrases: uniqueSearchPhrases, words: uniqueSearchWords} = partitionAndSanitizeSearchTerms(
    uniqueSearchTerms
  )
  // Hoisted out of the reduce callback: the combined term list is loop-invariant.
  const allTerms = [...uniqueSearchPhrases, ...uniqueSearchWords]

  return values.reduce<number[]>((acc, val, index) => {
    if (val) {
      // Stringify + normalize each value once, rather than once per term inside `.some`.
      const stringifiedValue = stringify(val).toLowerCase().trim()
      if (allTerms.some((term) => stringifiedValue.includes(term))) {
        acc.push(index)
      }
    }
    return acc
  }, [])
}

/**
 * Removes a single pair of surrounding double quotes from a string, if present.
 * E.g. `"foo bar"` → `foo bar`; strings without wrapping quotes are returned unchanged.
 */
function stripWrappingQuotes(str: string) {
  const wrapped = /^"(.*)"$/.exec(str)
  return wrapped ? wrapped[1] : str
}
Loading

0 comments on commit 3eb6fd3

Please sign in to comment.