Skip to content

Commit

Permalink
fix(core): various fixes and improvements to studio search [v2] (#4704)
Browse files Browse the repository at this point in the history
* fix(core): improve search weighting

* fix(core): filter out hits with no score on the client, don't exclude _id from whole word score generation

* fix(core): ensure that we use optimised search specs for both selections and weighting

* fix(core): allow search traversal depth to be set with SANITY_STUDIO_UNSTABLE_SEARCH_DEPTH environment variable

* fix(core): expose matched indices on array fields in search hit stories

* fix(core): add a cap on total search paths extracted per root-level object

* fix(core): naive first pass of limiting attributes pre-search query generation

* fixup! fix(core): add a cap on total search paths extracted per root-level object

* fix(core): always include user generated experimental search paths, display skipped paths in generated search specs

* fix(core): export all weighted search functions and types

* fixup! fix(core): add a cap on total search paths extracted per root-level object

* fix(core): reduce unnecessary whitespace from search query payloads

* fix(core): don't include duplicate _id and _type values in type selections

* fix(core): ignore duplicate paths when calculating maxAttributes

* docs(core): update docs for applyWeights/createSearchQuery

* fixup! fix(core): export all weighted search functions and types

* fix(core): only add paths to rolling list if not previously added, add support for defining MAX_UNIQUE_ATTRIBUTES via env var

* refactor(core): update consts to better match corresponding env vars

* fixup! fix(core): filter out hits with no score on the client, don't exclude _id from whole word score generation

* fixup! docs(core): update docs for applyWeights/createSearchQuery

* refactor(core): rename parentType to rootType, add clarifying comment
  • Loading branch information
robinpyon committed Jul 20, 2023
1 parent 9a82ec5 commit 3eb6fd3
Show file tree
Hide file tree
Showing 17 changed files with 661 additions and 186 deletions.
2 changes: 1 addition & 1 deletion packages/@sanity/base/src/_exports/_internal.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export type {DocumentAvailability} from '../preview/types'
export {AvailabilityReason} from '../preview/types'

export {getSearchableTypes} from '../search/common/utils'
export {createWeightedSearch} from '../search/weighted/createWeightedSearch'
export {createSearchQuery, createWeightedSearch} from '../search/weighted'
export type {WeightedHit} from '../search/weighted/types'

export {createHookFromObservableFactory} from '../util/createHookFromObservableFactory'
Expand Down
10 changes: 2 additions & 8 deletions packages/@sanity/base/src/search/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,6 @@ import {versionedClient} from '../client/versionedClient'
import {getSearchableTypes} from './common/utils'
import {createWeightedSearch} from './weighted/createWeightedSearch'

export type {
SearchOptions,
SearchSort,
SearchTerms,
SearchableType,
WeightedHit,
} from './weighted/types'

// Use >= 2021-03-25 for pt::text() support
const searchClient = versionedClient.withConfig({
apiVersion: '2021-03-25',
Expand All @@ -23,3 +15,5 @@ export default createWeightedSearch(getSearchableTypes(schema), searchClient, {
unique: true,
tag: 'search.global',
})

export * from './weighted'
33 changes: 23 additions & 10 deletions packages/@sanity/base/src/search/weighted/applyWeights.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
calculatePhraseScore,
calculateWordScore,
partitionAndSanitizeSearchTerms,
calculateCharacterScore,
} from './applyWeights'

describe('calculatePhraseScore', () => {
Expand All @@ -11,7 +12,11 @@ describe('calculatePhraseScore', () => {
it('should handle partial matches', () => {
expect(calculatePhraseScore(['the fox'], 'the fox of foo')).toEqual([
0.25,
'[Phrase] Matched 7 of 14 characters',
'[Phrase] 7/14 chars',
])
expect(calculatePhraseScore(['the fox', 'fox of'], 'the fox of foo')).toEqual([
0.4642857142857143,
'[Phrase] 13/14 chars',
])
})
})
Expand All @@ -25,17 +30,25 @@ describe('calculateWordScore', () => {
expect(calculateWordScore(['foo', 'bar'], 'bar & foo')).toEqual([1, '[Word] Exact match'])
})
it('should handle partial matches', () => {
expect(calculateWordScore(['foo'], 'bar foo')).toEqual([
0.25,
'[Word] Matched 1 of 2 terms: [foo]',
])
expect(calculateWordScore(['foo', 'bar'], 'foo')).toEqual([
0.25,
`[Word] Matched 1 of 2 terms: [foo]`,
])
expect(calculateWordScore(['foo'], 'bar foo')).toEqual([0.25, '[Word] 1/2 terms: [foo]'])
expect(calculateWordScore(['foo', 'bar'], 'foo')).toEqual([0.25, `[Word] 1/2 terms: [foo]`])
expect(calculateWordScore(['foo', 'bar', 'baz'], 'foo foo bar')).toEqual([
1 / 3,
`[Word] Matched 2 of 3 terms: [foo, bar]`,
`[Word] 2/3 terms: [foo, bar]`,
])
})
})

describe('calculateCharacterScore', () => {
it('should handle exact matches', () => {
expect(calculateCharacterScore(['bar', 'foo'], 'bar foo')).toEqual([1, '[Char] Contains all'])
})

it('should handle partial matches', () => {
expect(calculateCharacterScore(['foo'], 'bar foo')).toEqual([0.25, '[Char] 3/6 chars'])
expect(calculateCharacterScore(['fo', 'ba'], 'bar foo')).toEqual([
0.3333333333333333,
'[Char] 4/6 chars',
])
})
})
Expand Down
141 changes: 126 additions & 15 deletions packages/@sanity/base/src/search/weighted/applyWeights.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,55 +3,98 @@ import {SearchHit, WeightedHit, SearchSpec} from './types'

type SearchScore = [number, string]

// takes a set of terms and a value and returns a [score, story] pair where score is a value between 0, 1 and story is the explanation
/**
* Calculates a score (between 0 and 1) indicating general search relevance of an array of
* search tokens within a specific string.
*
* @param searchTerms - All search terms
* @param value - The string to match against
* @returns A [score, story] pair containing the search score as well as a human readable explanation
* @internal
*/
export const calculateScore = (searchTerms: string[], value: string): SearchScore => {
// Separate search terms by phrases (wrapped with quotes) and words.
const {phrases: uniqueSearchPhrases, words: uniqueSearchWords} = partitionAndSanitizeSearchTerms(
searchTerms
)

// Calculate an aggregated score of both phrase and word matches.
// Calculate an aggregated score of words (partial + whole) and phrase matches.
const [charScore, charWhy] = calculateCharacterScore(uniqueSearchWords, value)
const [phraseScore, phraseWhy] = calculatePhraseScore(uniqueSearchPhrases, value)
const [wordScore, wordWhy] = calculateWordScore(uniqueSearchWords, value)
return [phraseScore + wordScore, [wordWhy, phraseWhy].join(', ')]
return [charScore + wordScore + phraseScore, [charWhy, wordWhy, phraseWhy].flat().join(', ')]
}

const stringify = (value: unknown): string =>
typeof value === 'string' ? value : JSON.stringify(value)

/**
* Applies path weights from a supplied SearchSpec to existing search hits to create _weighted_ hits
* augmented with search ranking and human readable explanations.
*
* @param searchSpec - SearchSpec containing path weighting
* @param hits - SearchHit objects to augment
* @param terms - All search terms
* @returns WeightedHit array containing search scores and ranking explanations
* @internal
*/
export function applyWeights(
searchSpec: SearchSpec[],
hits: SearchHit[],
terms: string[] = []
): WeightedHit[] {
const specByType = keyBy(searchSpec, (spec) => spec.typeName)
return hits.map((hit, index) => {

return hits.reduce((allHits, hit, index) => {
const typeSpec = specByType[hit._type]
const stories = typeSpec.paths.map((pathSpec, idx) => {
const value = stringify(hit[`w${idx}`])
const pathHit = ['_id', '_type'].includes(pathSpec.path) ? hit[pathSpec.path] : hit[idx]
const indices = Array.isArray(pathHit) ? findMatchingIndices(terms, pathHit) : null
// Only stringify non-falsy values so null values don't pollute search
const value = pathHit ? stringify(pathHit) : null
if (!value) {
return {path: pathSpec.path, score: 0, why: 'No match'}
}
const [score, why] = calculateScore(terms, value)
return {
indices,
path: pathSpec.path,
score: score * pathSpec.weight,
why: `${why} (*${pathSpec.weight})`,
}
})

const totalScore = stories.reduce((acc, rank) => acc + rank.score, 0)

return {hit, resultIndex: hits.length - index, score: totalScore, stories: stories}
})
/*
* Filter out hits with no score.
* (only if search terms are present, otherwise we always show results)
*
* Due to how we generate search queries, in some cases it's possible to have returned search hits
* which shouldn't be displayed. This can happen when searching on multiple document types and
* user-configured `__experimental_search` paths are in play.
*
* Since search generates a GROQ query with filters that may refer to field names shared across
* multiple document types, it's possible that one document type searches on a field path
* that is hidden by another via `__experimental_search`.
*/
if (terms.length === 0 || totalScore > 0) {
allHits.push({hit, resultIndex: hits.length - index, score: totalScore, stories: stories})
}
return allHits
}, [])
}

/**
* For phrases: score on the total number of matching characters.
* E.g. given the phrases ["the fox", "of london"] for the target value "the wily fox of london"
*
* - "the fox" isn't included in the target value (score: 0)
* - "of london" is included in the target value, and 9 out of 22 characters match (score: 9/22 = ~0.408)
* - non-exact matches have their score divided in half (final score: ~0.204)
*
* @param uniqueSearchPhrases - All search phrases
* @param value - The string to match against
* @returns SearchScore containing the search score as well as a human readable explanation
* @internal
*/
export function calculatePhraseScore(uniqueSearchPhrases: string[], value: string): SearchScore {
const sanitizedValue = value.toLowerCase().trim()
Expand All @@ -67,30 +110,70 @@ export function calculatePhraseScore(uniqueSearchPhrases: string[], value: strin

return fieldScore === 1
? [1, '[Phrase] Exact match']
: [fieldScore / 2, `[Phrase] Matched ${matchCount} of ${sanitizedValue.length} characters`]
: [fieldScore / 2, `[Phrase] ${matchCount}/${sanitizedValue.length} chars`]
}

/**
 * Scores on the total number of characters covered by matching search terms.
 * E.g. given the terms ["bar", "fo"] for the target value "food bar":
 *
 * - "fo" is included in the target value, and 2 out of 7 non-whitespace characters match (score: 2/7)
 * - "bar" is included in the target value, and 3 out of 7 non-whitespace characters match (score: 3/7)
 * - scores are accumulated and the total is divided in half (final score: ~0.357)
 *
 * @param uniqueSearchTerms - A string array of search terms
 * @param value - The string to match against
 * @returns SearchScore containing the search score as well as a human readable explanation
 * @internal
 */
export function calculateCharacterScore(uniqueSearchTerms: string[], value: string): SearchScore {
  const target = value.toLowerCase().trim()
  // Character counts are measured against the value with all spaces removed.
  const targetCompact = target.replace(/ /g, '')

  let score = 0
  let matchedChars = 0
  for (const term of uniqueSearchTerms) {
    if (target.includes(term)) {
      score += term.length / targetCompact.length
      matchedChars += term.length
    }
  }

  return score === 1
    ? [score, `[Char] Contains all`]
    : [score / 2, `[Char] ${matchedChars}/${targetCompact.length} chars`]
}

/**
* Generate a score on the total number of matching _whole_ words.
* E.g. given the words ["the", "fox", "of", "london"] for the target value "the wily fox of london"
*
* - 4 out of 5 words match (score: 4/5 = 0.8)
* - non-exact matches have their score divided in half (final score: 0.4)
*
* @param uniqueSearchTerms - All search terms
* @param value - The string to match against
* @returns SearchScore containing the search score as well as a human readable explanation
* @internal
*/
export function calculateWordScore(uniqueSearchTerms: string[], value: string): SearchScore {
const uniqueValueTerms = uniq(compact(words(toLower(value))))

const matches = intersection(uniqueSearchTerms, uniqueValueTerms)
const all = union(uniqueValueTerms, uniqueSearchTerms)
const fieldScore = matches.length / all.length
const fieldScore = matches.length / all.length || 0
return fieldScore === 1
? [1, '[Word] Exact match']
: [
fieldScore / 2,
`[Word] Matched ${matches.length} of ${all.length} terms: [${matches.join(', ')}]`,
]
: [fieldScore / 2, `[Word] ${matches.length}/${all.length} terms: [${matches.join(', ')}]`]
}

/**
* Partition search terms by phrases (wrapped with quotes) and words.
*
* @param searchTerms - All search terms
* @returns Partitioned phrases and words
* @internal
*/
export function partitionAndSanitizeSearchTerms(
searchTerms: string[]
): {
Expand All @@ -106,6 +189,34 @@ export function partitionAndSanitizeSearchTerms(
}
}

/**
 * Returns matching array indices of `values` containing _any_ member of `uniqueSearchTerms`.
 * When comparing for matches, members of `values` are stringified, trimmed and lowercased.
 *
 * @param uniqueSearchTerms - All search terms
 * @param values - Values to match against (members are stringified)
 * @returns All matching indices in `values`
 * @internal
 */
export function findMatchingIndices(uniqueSearchTerms: string[], values: unknown[]): number[] {
  const {phrases: uniqueSearchPhrases, words: uniqueSearchWords} = partitionAndSanitizeSearchTerms(
    uniqueSearchTerms
  )
  // Hoisted out of the reduce callback: the combined term list is loop-invariant.
  const allTerms = [...uniqueSearchPhrases, ...uniqueSearchWords]

  return values.reduce<number[]>((acc, val, index) => {
    if (val) {
      // Stringify + normalize each value once, rather than once per term inside `.some`.
      const stringifiedValue = stringify(val).toLowerCase().trim()
      if (allTerms.some((term) => stringifiedValue.includes(term))) {
        acc.push(index)
      }
    }
    return acc
  }, [])
}

/**
 * Removes a single pair of surrounding double quotes from a string, if present.
 * E.g. `"foo bar"` → `foo bar`; strings without wrapping quotes are returned unchanged.
 */
function stripWrappingQuotes(str: string) {
  const wrapped = /^"(.*)"$/.exec(str)
  return wrapped ? wrapped[1] : str
}
Loading

0 comments on commit 3eb6fd3

Please sign in to comment.