From 80e18051cb9e4f4ed2e6c1927b598367788d5f2a Mon Sep 17 00:00:00 2001 From: Michael Howell Date: Tue, 26 Aug 2025 19:46:50 -0700 Subject: [PATCH] rustdoc-search: yet another stringdex optimization attempt This one's uses a different tactic. It shouldn't significantly increase the amount of downloaded index data, but still reduces the amount of disk usage. This one works by changing the suffix-only node representation to omit some data that's needed for checking. Since those nodes make up the bulk of the tree, it reduces the data they store, but also requires validating the match by fetching the name itself (but the names list is pretty small, and when I tried it with wordnet "indexing" it was about the same). --- Cargo.lock | 4 +- src/librustdoc/Cargo.toml | 2 +- src/librustdoc/html/static/js/search.js | 9 +- src/librustdoc/html/static/js/stringdex.d.ts | 13 +- src/librustdoc/html/static/js/stringdex.js | 783 +++++++++++++++---- 5 files changed, 647 insertions(+), 164 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index af7c24abd132c..6866c1359b8c5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5225,9 +5225,9 @@ dependencies = [ [[package]] name = "stringdex" -version = "0.0.1-alpha4" +version = "0.0.1-alpha9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2841fd43df5b1ff1b042e167068a1fe9b163dc93041eae56ab2296859013a9a0" +checksum = "7081029913fd7d591c0112182aba8c98ae886b4f12edb208130496cd17dc3c15" dependencies = [ "stacker", ] diff --git a/src/librustdoc/Cargo.toml b/src/librustdoc/Cargo.toml index 5d36ffc2d3a52..f37a8d85361f2 100644 --- a/src/librustdoc/Cargo.toml +++ b/src/librustdoc/Cargo.toml @@ -21,7 +21,7 @@ rustdoc-json-types = { path = "../rustdoc-json-types" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" smallvec = "1.8.1" -stringdex = { version = "0.0.1-alpha4" } +stringdex = { version = "0.0.1-alpha9" } tempfile = "3" threadpool = "1.8.1" tracing = "0.1" diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index 3fb4db3a89c5c..2c9004bc8ec1f 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -1211,7 +1211,7 @@ class DocSearch { * will never fulfill. */ async buildIndex() { - const nn = this.database.getIndex("normalizedName"); + const nn = this.database.getData("normalizedName"); if (!nn) { return; } @@ -3706,7 +3706,7 @@ class DocSearch { * @returns {AsyncGenerator} */ async function*(currentCrate) { - const index = this.database.getIndex("normalizedName"); + const index = this.database.getData("normalizedName"); if (!index) { return; } @@ -3835,8 +3835,7 @@ class DocSearch { }; if (elem.normalizedPathLast === "") { // faster full-table scan for this specific case. - const nameData = this.database.getData("name"); - const l = nameData ? nameData.length : 0; + const l = index.length; for (let id = 0; id < l; ++id) { if (!idDuplicates.has(id)) { idDuplicates.add(id); @@ -3938,7 +3937,7 @@ class DocSearch { * @returns {AsyncGenerator} */ async function*(inputs, output, typeInfo, currentCrate) { - const index = this.database.getIndex("normalizedName"); + const index = this.database.getData("normalizedName"); if (!index) { return; } diff --git a/src/librustdoc/html/static/js/stringdex.d.ts b/src/librustdoc/html/static/js/stringdex.d.ts index cf9a8b6b5648d..2eb1fdf95d845 100644 --- a/src/librustdoc/html/static/js/stringdex.d.ts +++ b/src/librustdoc/html/static/js/stringdex.d.ts @@ -5,17 +5,8 @@ declare namespace stringdex { * The client interface to Stringdex. */ interface Database { - getIndex(colname: string): SearchTree|undefined; getData(colname: string): DataColumn|undefined; } - /** - * A search index file. - */ - interface SearchTree { - trie(): Trie; - search(name: Uint8Array|string): Promise; - searchLev(name: Uint8Array|string): AsyncGenerator; - } /** * A compressed node in the search tree. * @@ -29,9 +20,7 @@ declare namespace stringdex { matches(): RoaringBitmap; substringMatches(): AsyncGenerator; prefixMatches(): AsyncGenerator; - keys(): Uint8Array; keysExcludeSuffixOnly(): Uint8Array; - children(): [number, Promise][]; childrenExcludeSuffixOnly(): [number, Promise][]; child(id: number): Promise?; } @@ -41,6 +30,8 @@ declare namespace stringdex { interface DataColumn { isEmpty(id: number): boolean; at(id: number): Promise; + search(name: Uint8Array|string): Promise; + searchLev(name: Uint8Array|string): AsyncGenerator; length: number, } /** diff --git a/src/librustdoc/html/static/js/stringdex.js b/src/librustdoc/html/static/js/stringdex.js index cb956d926db97..b7f605a10351d 100644 --- a/src/librustdoc/html/static/js/stringdex.js +++ b/src/librustdoc/html/static/js/stringdex.js @@ -1,3 +1,4 @@ +// ignore-tidy-filelength /** * @import * as stringdex from "./stringdex.d.ts" */ @@ -485,6 +486,63 @@ class RoaringBitmap { const mid = this.getContainerId(key); return mid === -1 ? false : this.containers[mid].contains(value); } + /** + * @param {number} keyvalue + * @returns {RoaringBitmap} + */ + remove(keyvalue) { + const key = keyvalue >> 16; + const value = keyvalue & 0xFFFF; + const mid = this.getContainerId(key); + if (mid === -1) { + return this; + } + const container = this.containers[mid]; + if (!container.contains(value)) { + return this; + } + const newCardinality = (this.keysAndCardinalities[(mid * 4) + 2] | + (this.keysAndCardinalities[(mid * 4) + 3] << 8)); + const l = this.containers.length; + const m = l - (newCardinality === 0 ? 1 : 0); + const result = new RoaringBitmap(null, 0); + result.keysAndCardinalities = new Uint8Array(m * 4); + let j = 0; + for (let i = 0; i < l; i += 1) { + if (i === mid) { + if (newCardinality !== 0) { + result.keysAndCardinalities[(j * 4) + 0] = key; + result.keysAndCardinalities[(j * 4) + 1] = key >> 8; + const card = newCardinality - 1; + result.keysAndCardinalities[(j * 4) + 2] = card; + result.keysAndCardinalities[(j * 4) + 3] = card >> 8; + const newContainer = new RoaringBitmapArray( + newCardinality, + new Uint8Array(newCardinality * 2), + ); + let newContainerSlot = 0; + for (const containerValue of container.values()) { + if (containerValue !== value) { + newContainer.array[newContainerSlot] = value & 0xFF; + newContainerSlot += 1; + newContainer.array[newContainerSlot] = value >> 8; + newContainerSlot += 1; + } + } + result.containers.push(newContainer); + j += 1; + } + } else { + result.keysAndCardinalities[(j * 4) + 0] = this.keysAndCardinalities[(i * 4) + 0]; + result.keysAndCardinalities[(j * 4) + 1] = this.keysAndCardinalities[(i * 4) + 1]; + result.keysAndCardinalities[(j * 4) + 2] = this.keysAndCardinalities[(i * 4) + 2]; + result.keysAndCardinalities[(j * 4) + 3] = this.keysAndCardinalities[(i * 4) + 3]; + result.containers.push(this.containers[i]); + j += 1; + } + } + return result; + } /** * @param {number} key * @returns {number} @@ -877,6 +935,46 @@ function bitCount(n) { } /*eslint-enable */ +/** + * https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm + */ +class Uint8ArraySearchPattern { + /** @param {Uint8Array} needle */ + constructor(needle) { + this.needle = needle; + this.skipTable = []; + const m = needle.length; + for (let i = 0; i < 256; i += 1) { + this.skipTable.push(m); + } + for (let i = 0; i < m - 1; i += 1) { + this.skipTable[needle[i]] = m - 1 - i; + } + } + /** + * @param {Uint8Array} haystack + * @returns {boolean} + */ + matches(haystack) { + const needle = this.needle; + const skipTable = this.skipTable; + const m = needle.length; + const n = haystack.length; + + let skip = 0; + search: while (n - skip >= m) { + for (let i = m - 1; i >= 0; i -= 1) { + if (haystack[skip + i] !== needle[i]) { + skip += skipTable[haystack[skip + m - 1]]; + continue search; + } + } + return true; + } + return false; + } +} + /** * @param {stringdex.Hooks} hooks * @returns {Promise} @@ -887,12 +985,6 @@ function loadDatabase(hooks) { rr_: function(data) { const dataObj = JSON.parse(data); for (const colName of Object.keys(dataObj)) { - if (Object.hasOwn(dataObj[colName], "I")) { - registry.searchTreeRoots.set( - colName, - makeSearchTreeFromBase64(dataObj[colName].I)[1], - ); - } if (Object.hasOwn(dataObj[colName], "N")) { const counts = []; const countsstring = dataObj[colName]["N"]; @@ -915,6 +1007,9 @@ function loadDatabase(hooks) { makeUint8ArrayFromBase64(dataObj[colName]["H"]), new RoaringBitmap(makeUint8ArrayFromBase64(dataObj[colName]["E"]), 0), colName, + Object.hasOwn(dataObj[colName], "I") ? + makeSearchTreeFromBase64(dataObj[colName].I)[1] : + null, )); } } @@ -978,7 +1073,7 @@ function loadDatabase(hooks) { * searchTreePromises: HashTable>; * dataColumnLoadPromiseCallbacks: HashTable; * dataColumns: Map; - * dataColumnsBuckets: Map>>; + * dataColumnsBuckets: HashTable>; * searchTreeLoadByNodeID: function(Uint8Array): Promise; * searchTreeRootCallback?: function(any, Database?): any; * dataLoadByNameAndHash: function(string, Uint8Array): Promise; @@ -990,7 +1085,7 @@ function loadDatabase(hooks) { searchTreePromises: new HashTable(), dataColumnLoadPromiseCallbacks: new HashTable(), dataColumns: new Map(), - dataColumnsBuckets: new Map(), + dataColumnsBuckets: new HashTable(), searchTreeLoadByNodeID: function(nodeid) { const existingPromise = registry.searchTreePromises.get(nodeid); if (existingPromise) { @@ -1022,7 +1117,7 @@ function loadDatabase(hooks) { const data = (nodeid[0] & 0x20) !== 0 ? Uint8Array.of(((nodeid[0] & 0x0f) << 4) | (nodeid[1] >> 4)) : EMPTY_UINT8; - newPromise = Promise.resolve(new SearchTree( + newPromise = Promise.resolve(new PrefixSearchTree( EMPTY_SEARCH_TREE_BRANCHES, EMPTY_SEARCH_TREE_BRANCHES, data, @@ -1058,12 +1153,7 @@ function loadDatabase(hooks) { return newPromise; }, dataLoadByNameAndHash: function(name, hash) { - let dataColumnBuckets = registry.dataColumnsBuckets.get(name); - if (dataColumnBuckets === undefined) { - dataColumnBuckets = new HashTable(); - registry.dataColumnsBuckets.set(name, dataColumnBuckets); - } - const existingBucket = dataColumnBuckets.get(hash); + const existingBucket = registry.dataColumnsBuckets.get(hash); if (existingBucket) { return existingBucket; } @@ -1091,16 +1181,17 @@ function loadDatabase(hooks) { hooks.loadDataByNameAndHash(name, hashHex); } }); - dataColumnBuckets.set(hash, newBucket); + registry.dataColumnsBuckets.set(hash, newBucket); return newBucket; }, }; /** * The set of child subtrees. + * @template ST * @type {{ * nodeids: Uint8Array, - * subtrees: Array|null>, + * subtrees: Array|null>, * }} */ class SearchTreeBranches { @@ -1128,7 +1219,7 @@ function loadDatabase(hooks) { ); } // https://github.com/microsoft/TypeScript/issues/17227 - /** @returns {Generator<[number, Promise|null]>} */ + /** @returns {Generator<[number, Promise|null]>} */ entries() { throw new Error(); } @@ -1157,10 +1248,12 @@ function loadDatabase(hooks) { /** * A sorted array of search tree branches. * + * @template ST + * @extends SearchTreeBranches * @type {{ * keys: Uint8Array, * nodeids: Uint8Array, - * subtrees: Array|null>, + * subtrees: Array|null>, * }} */ class SearchTreeBranchesArray extends SearchTreeBranches { @@ -1179,7 +1272,7 @@ function loadDatabase(hooks) { i += 1; } } - /** @returns {Generator<[number, Promise|null]>} */ + /** @returns {Generator<[number, Promise|null]>} */ * entries() { let i = 0; const l = this.keys.length; @@ -1246,12 +1339,16 @@ function loadDatabase(hooks) { } /** + * @template ST * @param {number[]} alphabitmap_chars * @param {number} width - * @return {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} + * @return {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} */ function makeSearchTreeBranchesAlphaBitmapClass(alphabitmap_chars, width) { const bitwidth = width * 8; + /** + * @extends SearchTreeBranches + */ const cls = class SearchTreeBranchesAlphaBitmap extends SearchTreeBranches { /** * @param {number} bitmap @@ -1265,7 +1362,7 @@ function loadDatabase(hooks) { this.bitmap = bitmap; this.nodeids = nodeids; } - /** @returns {Generator<[number, Promise|null]>} */ + /** @returns {Generator<[number, Promise|null]>} */ * entries() { let i = 0; let j = 0; @@ -1295,15 +1392,7 @@ function loadDatabase(hooks) { * @returns {number} */ getKey(branch_index) { - //return this.getKeys()[branch_index]; - let alpha_index = 0; - while (branch_index >= 0) { - if (this.bitmap & (1 << alpha_index)) { - branch_index -= 1; - } - alpha_index += 1; - } - return alphabitmap_chars[alpha_index]; + return this.getKeys()[branch_index]; } /** * @returns {Uint8Array} @@ -1326,20 +1415,35 @@ function loadDatabase(hooks) { return cls; } + /** + * @template ST + * @type {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} + */ const SearchTreeBranchesShortAlphaBitmap = makeSearchTreeBranchesAlphaBitmapClass(SHORT_ALPHABITMAP_CHARS, 3); + /** + * @template ST + * @type {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} + */ const SearchTreeBranchesLongAlphaBitmap = makeSearchTreeBranchesAlphaBitmapClass(LONG_ALPHABITMAP_CHARS, 4); /** - * A [suffix tree], used for name-based search. + * @typedef {PrefixSearchTree|SuffixSearchTree} SearchTree + * @typedef {PrefixTrie|SuffixTrie} Trie + */ + + /** + * An interleaved [prefix] and [suffix tree], + * used for name-based search. * - * This data structure is used to drive substring matches, + * This data structure is used to drive prefix matches, * such as matching the query "link" to `LinkedList`, * and Lev-distance matches, such as matching the * query "hahsmap" to `HashMap`. * + * [prefix tree]: https://en.wikipedia.org/wiki/Prefix_tree * [suffix tree]: https://en.wikipedia.org/wiki/Suffix_tree * * branches @@ -1358,17 +1462,17 @@ function loadDatabase(hooks) { * will include these. * * @type {{ - * might_have_prefix_branches: SearchTreeBranches, - * branches: SearchTreeBranches, + * might_have_prefix_branches: SearchTreeBranches, + * branches: SearchTreeBranches, * data: Uint8Array, * leaves_suffix: RoaringBitmap, * leaves_whole: RoaringBitmap, * }} */ - class SearchTree { + class PrefixSearchTree { /** - * @param {SearchTreeBranches} branches - * @param {SearchTreeBranches} might_have_prefix_branches + * @param {SearchTreeBranches} branches + * @param {SearchTreeBranches} might_have_prefix_branches * @param {Uint8Array} data * @param {RoaringBitmap} leaves_whole * @param {RoaringBitmap} leaves_suffix @@ -1392,25 +1496,31 @@ function loadDatabase(hooks) { * A Trie pointer refers to a single node in a logical decompressed search tree * (the real search tree is compressed). * - * @return {Trie} + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern + * @return {PrefixTrie} */ - trie() { - return new Trie(this, 0); + trie(dataColumn, searchPattern) { + return new PrefixTrie(this, 0, dataColumn, searchPattern); } /** * Return the trie representing `name` * @param {Uint8Array|string} name + * @param {DataColumn} dataColumn * @returns {Promise} */ - async search(name) { + async search(name, dataColumn) { if (typeof name === "string") { const utf8encoder = new TextEncoder(); name = utf8encoder.encode(name); } - let trie = this.trie(); + const searchPattern = new Uint8ArraySearchPattern(name); + /** @type {Trie} */ + let trie = this.trie(dataColumn, searchPattern); for (const datum of name) { // code point definitely exists + /** @type {Promise?} */ const newTrie = trie.child(datum); if (newTrie) { trie = await newTrie; @@ -1423,26 +1533,28 @@ function loadDatabase(hooks) { /** * @param {Uint8Array|string} name + * @param {DataColumn} dataColumn * @returns {AsyncGenerator} */ - async* searchLev(name) { + async* searchLev(name, dataColumn) { if (typeof name === "string") { const utf8encoder = new TextEncoder(); name = utf8encoder.encode(name); } const w = name.length; if (w < 3) { - const trie = await this.search(name); + const trie = await this.search(name, dataColumn); if (trie !== null) { yield trie; } return; } + const searchPattern = new Uint8ArraySearchPattern(name); const levParams = w >= 6 ? new Lev2TParametricDescription(w) : new Lev1TParametricDescription(w); /** @type {Array<[Promise, number]>} */ - const stack = [[Promise.resolve(this.trie()), 0]]; + const stack = [[Promise.resolve(this.trie(dataColumn, searchPattern)), 0]]; const n = levParams.n; while (stack.length !== 0) { // It's not empty @@ -1475,20 +1587,29 @@ function loadDatabase(hooks) { } } } + + /** @returns {RoaringBitmap} */ + getCurrentLeaves() { + return this.leaves_whole.union(this.leaves_suffix); + } } /** * A representation of a set of strings in the search index, * as a subset of the entire tree. */ - class Trie { + class PrefixTrie { /** - * @param {SearchTree} tree + * @param {PrefixSearchTree} tree * @param {number} offset + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern */ - constructor(tree, offset) { + constructor(tree, offset, dataColumn, searchPattern) { this.tree = tree; this.offset = offset; + this.dataColumn = dataColumn; + this.searchPattern = searchPattern; } /** @@ -1514,7 +1635,28 @@ function loadDatabase(hooks) { const current_layer = layer; layer = []; for await (const tree of current_layer) { - yield tree.leaves_whole.union(tree.leaves_suffix); + /** @type {number[]?} */ + let rejected = null; + let leaves = tree.getCurrentLeaves(); + for (const leaf of leaves.entries()) { + const haystack = await this.dataColumn.at(leaf); + if (haystack === undefined || !this.searchPattern.matches(haystack)) { + if (!rejected) { + rejected = []; + } + rejected.push(leaf); + } + } + if (rejected) { + if (leaves.cardinality() !== rejected.length) { + for (const rej of rejected) { + leaves = leaves.remove(rej); + } + yield leaves; + } + } else { + yield leaves; + } } /** @type {HashTable<[number, SearchTree][]>} */ const subnodes = new HashTable(); @@ -1548,12 +1690,14 @@ function loadDatabase(hooks) { const res = registry.searchTreeLoadByNodeID(newnode); for (const [byte, node] of subnode_list) { const branches = node.branches; - const might_have_prefix_branches = node.might_have_prefix_branches; const i = branches.getIndex(byte); branches.subtrees[i] = res; - const mhpI = might_have_prefix_branches.getIndex(byte); - if (mhpI !== -1) { - might_have_prefix_branches.subtrees[mhpI] = res; + if (node instanceof PrefixSearchTree) { + const might_have_prefix_branches = node.might_have_prefix_branches; + const mhpI = might_have_prefix_branches.getIndex(byte); + if (mhpI !== -1) { + might_have_prefix_branches.subtrees[mhpI] = res; + } } } layer.push(res); @@ -1581,6 +1725,9 @@ function loadDatabase(hooks) { // if we want to do them in order) for (const {node, len} of current_layer) { const tree = await node; + if (!(tree instanceof PrefixSearchTree)) { + continue; + } const length = len + tree.data.length; if (minLength === null || length < minLength) { minLength = length; @@ -1637,10 +1784,13 @@ function loadDatabase(hooks) { } } // if we still have more subtrees to walk, then keep going - /** @type {HashTable<{byte: number, tree: SearchTree, len: number}[]>} */ + /** @type {HashTable<{byte: number, tree: PrefixSearchTree, len: number}[]>} */ const subnodes = new HashTable(); for await (const {node, len} of current_layer) { const tree = await node; + if (!(tree instanceof PrefixSearchTree)) { + continue; + } const length = len + tree.data.length; const mhp_branches = tree.might_have_prefix_branches; const l = mhp_branches.subtrees.length; @@ -1663,8 +1813,6 @@ function loadDatabase(hooks) { subnode_list.push({byte, tree, len}); } } - } else { - throw new Error(`malformed tree; index ${i} does not exist`); } } } @@ -1728,14 +1876,21 @@ function loadDatabase(hooks) { this.tree.might_have_prefix_branches.subtrees[mhpI] = node; } } - nodes.push([k, node.then(node => node.trie())]); + nodes.push([k, node.then(node => { + return node.trie(this.dataColumn, this.searchPattern); + })]); i += 1; } return nodes; } else { /** @type {number} */ const codePoint = data[this.offset]; - const trie = new Trie(this.tree, this.offset + 1); + const trie = new PrefixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + ); return [[codePoint, Promise.resolve(trie)]]; } } @@ -1777,14 +1932,21 @@ function loadDatabase(hooks) { this.tree.might_have_prefix_branches.subtrees[i] = node; this.tree.branches.subtrees[this.tree.branches.getIndex(k)] = node; } - nodes.push([k, node.then(node => node.trie())]); + nodes.push([k, node.then(node => { + return node.trie(this.dataColumn, this.searchPattern); + })]); i += 1; } return nodes; } else { /** @type {number} */ const codePoint = data[this.offset]; - const trie = new Trie(this.tree, this.offset + 1); + const trie = new PrefixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + ); return [[codePoint, Promise.resolve(trie)]]; } } @@ -1811,10 +1973,275 @@ function loadDatabase(hooks) { this.tree.might_have_prefix_branches.subtrees[mhpI] = branch; } } - return branch.then(branch => branch.trie()); + return branch.then(branch => branch.trie(this.dataColumn, this.searchPattern)); } } else if (this.tree.data[this.offset] === byte) { - return Promise.resolve(new Trie(this.tree, this.offset + 1)); + return Promise.resolve(new PrefixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + )); + } + return null; + } + } + /** + * A [suffix tree], used for name-based search. + * + * This data structure is used to drive substring matches, + * such as matching the query "inked" to `LinkedList`. + * + * [suffix tree]: https://en.wikipedia.org/wiki/Suffix_tree + * + * Suffix trees do not actually carry the intermediate data + * between branches, so, in order to validate the results, + * they must go through a filtering step at the end. + * Suffix trees also cannot match lev and exact matches, + * so those just return empty sets. + * + * branches + * : A sorted-array map of subtrees. + * + * dataLen + * : The length of the substring used by this node. + * + * leaves_suffix + * : The IDs of every entry that matches. Levenshtein matches + * won't include these. + * + * @type {{ + * branches: SearchTreeBranches, + * dataLen: number, + * leaves_suffix: RoaringBitmap, + * }} + */ + class SuffixSearchTree { + /** + * @param {SearchTreeBranches} branches + * @param {number} dataLen + * @param {RoaringBitmap} leaves_suffix + */ + constructor( + branches, + dataLen, + leaves_suffix, + ) { + this.branches = branches; + this.dataLen = dataLen; + this.leaves_suffix = leaves_suffix; + } + /** + * Returns the Trie for the root node. + * + * A Trie pointer refers to a single node in a logical decompressed search tree + * (the real search tree is compressed). + * + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern + * @return {Trie} + */ + trie(dataColumn, searchPattern) { + return new SuffixTrie(this, 0, dataColumn, searchPattern); + } + + /** + * Return the trie representing `name` + * @param {Uint8Array|string} name + * @param {DataColumn} dataColumn + * @returns {Promise} + */ + async search(name, dataColumn) { + if (typeof name === "string") { + const utf8encoder = new TextEncoder(); + name = utf8encoder.encode(name); + } + const searchPattern = new Uint8ArraySearchPattern(name); + let trie = this.trie(dataColumn, searchPattern); + for (const datum of name) { + // code point definitely exists + const newTrie = trie.child(datum); + if (newTrie) { + trie = await newTrie; + } else { + return null; + } + } + return trie; + } + + /** + * @param {Uint8Array|string} _name + * @param {DataColumn} _dataColumn + * @returns {AsyncGenerator} + */ + async* searchLev(_name, _dataColumn) { + // this function only returns whole-string matches, + // which pure-suffix nodes don't have, so is + // intentionally blank + } + + /** @returns {RoaringBitmap} */ + getCurrentLeaves() { + return this.leaves_suffix; + } + } + + /** + * A representation of a set of strings in the search index, + * as a subset of the entire tree (suffix-only). + */ + class SuffixTrie { + /** + * @param {SuffixSearchTree} tree + * @param {number} offset + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern + */ + constructor(tree, offset, dataColumn, searchPattern) { + this.tree = tree; + this.offset = offset; + this.dataColumn = dataColumn; + this.searchPattern = searchPattern; + } + + /** + * All exact matches for the string represented by this node. + * Since pure-suffix nodes have no exactly-matching children, + * this function returns the empty bitmap. + * @returns {RoaringBitmap} + */ + matches() { + return EMPTY_BITMAP; + } + + /** + * All matches for strings that contain the string represented by this node. + * @returns {AsyncGenerator} + */ + async* substringMatches() { + /** @type {Promise[]} */ + let layer = [Promise.resolve(this.tree)]; + while (layer.length) { + const current_layer = layer; + layer = []; + for await (const tree of current_layer) { + /** @type {number[]?} */ + let rejected = null; + let leaves = tree.getCurrentLeaves(); + for (const leaf of leaves.entries()) { + const haystack = await this.dataColumn.at(leaf); + if (haystack === undefined || !this.searchPattern.matches(haystack)) { + if (!rejected) { + rejected = []; + } + rejected.push(leaf); + } + } + if (rejected) { + if (leaves.cardinality() !== rejected.length) { + for (const rej of rejected) { + leaves = leaves.remove(rej); + } + yield leaves; + } + } else { + yield leaves; + } + } + /** @type {HashTable<[number, SearchTree][]>} */ + const subnodes = new HashTable(); + for await (const node of current_layer) { + const branches = node.branches; + const l = branches.subtrees.length; + for (let i = 0; i < l; ++i) { + const subtree = branches.subtrees[i]; + if (subtree) { + layer.push(subtree); + } else if (subtree === null) { + const newnode = branches.getNodeID(i); + if (!newnode) { + throw new Error(`malformed tree; no node for index ${i}`); + } else { + let subnode_list = subnodes.get(newnode); + if (!subnode_list) { + subnode_list = [[i, node]]; + subnodes.set(newnode, subnode_list); + } else { + subnode_list.push([i, node]); + } + } + } else { + throw new Error(`malformed tree; index ${i} does not exist`); + } + } + } + for (const [newnode, subnode_list] of subnodes.entries()) { + const res = registry.searchTreeLoadByNodeID(newnode); + for (const [i, node] of subnode_list) { + const branches = node.branches; + branches.subtrees[i] = res; + } + layer.push(res); + } + } + } + + /** + * All matches for strings that start with the string represented by this node. + * Since this is a pure-suffix node, there aren't any. + * @returns {AsyncGenerator} + */ + async* prefixMatches() { + // this function only returns prefix matches, + // which pure-suffix nodes don't have, so is + // intentionally blank + } + + /** + * Returns all keys that are children of this node. + * @returns {Uint8Array} + */ + keysExcludeSuffixOnly() { + return EMPTY_UINT8; + } + + /** + * Returns all nodes that are direct children of this node. + * @returns {[number, Promise][]} + */ + childrenExcludeSuffixOnly() { + return []; + } + + /** + * Returns a single node that is a direct child of this node. + * @param {number} byte + * @returns {Promise?} + */ + child(byte) { + if (this.offset === this.tree.dataLen) { + const i = this.tree.branches.getIndex(byte); + if (i !== -1) { + /** @type {Promise?} */ + let branch = this.tree.branches.subtrees[i]; + if (branch === null) { + const newnode = this.tree.branches.getNodeID(i); + if (!newnode) { + throw new Error(`malformed tree; no node for key ${byte}`); + } + branch = registry.searchTreeLoadByNodeID(newnode); + this.tree.branches.subtrees[i] = branch; + } + return branch.then(branch => branch.trie(this.dataColumn, this.searchPattern)); + } + } else { + return Promise.resolve(new SuffixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + )); } return null; } @@ -1827,8 +2254,10 @@ function loadDatabase(hooks) { * @param {Uint8Array} hashes * @param {RoaringBitmap} emptyset * @param {string} name + * @param {SearchTree?} searchTree */ - constructor(counts, hashes, emptyset, name) { + constructor(counts, hashes, emptyset, name, searchTree) { + this.searchTree = searchTree; this.hashes = hashes; this.emptyset = emptyset; this.name = name; @@ -1883,7 +2312,7 @@ function loadDatabase(hooks) { const {hash, end} = this.buckets[idx]; let data = this.buckets[idx].data; if (data === null) { - const dataSansEmptyset = await registry.dataLoadByNameAndHash( + const dataSansEmptysetOrig = await registry.dataLoadByNameAndHash( this.name, hash, ); @@ -1893,6 +2322,7 @@ function loadDatabase(hooks) { if (data !== null) { return (await data)[id - start]; } + const dataSansEmptyset = [...dataSansEmptysetOrig]; /** @type {(Uint8Array[])|null} */ let dataWithEmptyset = null; let pos = start; @@ -1924,6 +2354,24 @@ function loadDatabase(hooks) { } } } + /** + * Search by exact substring + * @param {Uint8Array|string} name + * @returns {Promise} + */ + async search(name) { + return await (this.searchTree ? this.searchTree.search(name, this) : null); + } + /** + * Search by whole, inexact match + * @param {Uint8Array|string} name + * @returns {AsyncGenerator} + */ + async *searchLev(name) { + if (this.searchTree) { + yield* this.searchTree.searchLev(name, this); + } + } } class Database { @@ -2015,7 +2463,7 @@ function loadDatabase(hooks) { const data_history = []; let canonical = EMPTY_UINT8; /** @type {SearchTree} */ - let tree = new SearchTree( + let tree = new PrefixSearchTree( EMPTY_SEARCH_TREE_BRANCHES, EMPTY_SEARCH_TREE_BRANCHES, EMPTY_UINT8, @@ -2029,8 +2477,8 @@ function loadDatabase(hooks) { * @returns {{ * "cpbranches": Uint8Array, * "csbranches": Uint8Array, - * "might_have_prefix_branches": SearchTreeBranches, - * "branches": SearchTreeBranches, + * "might_have_prefix_branches": SearchTreeBranches, + * "branches": SearchTreeBranches, * "cpnodes": Uint8Array, * "csnodes": Uint8Array, * "consumed_len_bytes": number, @@ -2051,6 +2499,12 @@ function loadDatabase(hooks) { const start_point = i; let cplen; let cslen; + /** + * @type {( + * typeof SearchTreeBranches & + * {"ALPHABITMAP_CHARS": number[], "width": number} + * )?} + */ let alphabitmap = null; if (is_pure_suffixes_only_node) { cplen = 0; @@ -2299,6 +2753,8 @@ function loadDatabase(hooks) { if (is_data_compressed) { data = data_history[data_history.length - dlen - 1]; dlen = data.length; + } else if (is_pure_suffixes_only_node) { + data = EMPTY_UINT8; } else { data = dlen === 0 ? EMPTY_UINT8 : @@ -2319,80 +2775,109 @@ function loadDatabase(hooks) { let whole; let suffix; if (is_pure_suffixes_only_node) { - whole = EMPTY_BITMAP; suffix = input[i] === 0 ? EMPTY_BITMAP1 : new RoaringBitmap(input, i); i += suffix.consumed_len_bytes; - } else if (input[i] === 0xff) { - whole = EMPTY_BITMAP; - suffix = EMPTY_BITMAP1; - i += 1; + tree = new SuffixSearchTree( + branches, + dlen, + suffix, + ); + const clen = ( + 3 + // lengths of children and data + csnodes.length + + csbranches.length + + suffix.consumed_len_bytes + ); + if (canonical.length < clen) { + canonical = new Uint8Array(clen); + } + let ci = 0; + canonical[ci] = 1; + ci += 1; + canonical[ci] = dlen; + ci += 1; + canonical[ci] = input[coffset]; // suffix child count + ci += 1; + canonical.set(csnodes, ci); + ci += csnodes.length; + canonical.set(csbranches, ci); + ci += csbranches.length; + const leavesOffset = i - suffix.consumed_len_bytes; + for (let j = leavesOffset; j < i; j += 1) { + canonical[ci + j - leavesOffset] = input[j]; + } + siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash); } else { - whole = input[i] === 0 ? - EMPTY_BITMAP1 : - new RoaringBitmap(input, i); - i += whole.consumed_len_bytes; - suffix = input[i] === 0 ? - EMPTY_BITMAP1 : - new RoaringBitmap(input, i); - i += suffix.consumed_len_bytes; - } - tree = new SearchTree( - branches, - might_have_prefix_branches, - data, - whole, - suffix, - ); - const clen = ( - (is_pure_suffixes_only_node ? 3 : 4) + // lengths of children and data - dlen + - cpnodes.length + csnodes.length + - cpbranches.length + csbranches.length + - whole.consumed_len_bytes + - suffix.consumed_len_bytes - ); - if (canonical.length < clen) { - canonical = new Uint8Array(clen); - } - let ci = 0; - canonical[ci] = is_pure_suffixes_only_node ? 1 : 0; - ci += 1; - canonical[ci] = dlen; - ci += 1; - canonical.set(data, ci); - ci += dlen; - canonical[ci] = input[coffset]; - ci += 1; - if (!is_pure_suffixes_only_node) { - canonical[ci] = input[coffset + 1]; + if (input[i] === 0xff) { + whole = EMPTY_BITMAP; + suffix = EMPTY_BITMAP1; + i += 1; + } else { + whole = input[i] === 0 ? + EMPTY_BITMAP1 : + new RoaringBitmap(input, i); + i += whole.consumed_len_bytes; + suffix = input[i] === 0 ? + EMPTY_BITMAP1 : + new RoaringBitmap(input, i); + i += suffix.consumed_len_bytes; + } + tree = new PrefixSearchTree( + branches, + might_have_prefix_branches, + data, + whole, + suffix, + ); + const clen = ( + 4 + // lengths of children and data + dlen + + cpnodes.length + csnodes.length + + cpbranches.length + csbranches.length + + whole.consumed_len_bytes + + suffix.consumed_len_bytes + ); + if (canonical.length < clen) { + canonical = new Uint8Array(clen); + } + let ci = 0; + canonical[ci] = 0; ci += 1; + canonical[ci] = dlen; + ci += 1; + canonical.set(data, ci); + ci += data.length; + canonical[ci] = input[coffset]; // prefix child count + ci += 1; + canonical[ci] = input[coffset + 1]; // suffix child count + ci += 1; + canonical.set(cpnodes, ci); + ci += cpnodes.length; + canonical.set(csnodes, ci); + ci += csnodes.length; + canonical.set(cpbranches, ci); + ci += cpbranches.length; + canonical.set(csbranches, ci); + ci += csbranches.length; + const leavesOffset = i - whole.consumed_len_bytes - suffix.consumed_len_bytes; + for (let j = leavesOffset; j < i; j += 1) { + canonical[ci + j - leavesOffset] = input[j]; + } + siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash); } - canonical.set(cpnodes, ci); - ci += cpnodes.length; - canonical.set(csnodes, ci); - ci += csnodes.length; - canonical.set(cpbranches, ci); - ci += cpbranches.length; - canonical.set(csbranches, ci); - ci += csbranches.length; - const leavesOffset = i - whole.consumed_len_bytes - suffix.consumed_len_bytes; - for (let j = leavesOffset; j < i; j += 1) { - canonical[ci + j - leavesOffset] = input[j]; - } - siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash); hash[2] &= 0x7f; } else { // uncompressed node const dlen = input [i + 1]; i += 2; - if (dlen === 0) { + if (dlen === 0 || is_pure_suffixes_only_node) { data = EMPTY_UINT8; } else { data = new Uint8Array(input.buffer, i + input.byteOffset, dlen); + i += dlen; } - i += dlen; const { consumed_len_bytes: branches_consumed_len_bytes, branches, @@ -2427,13 +2912,19 @@ function loadDatabase(hooks) { i - start, ), 0, 0, 0, 0, hash); hash[2] &= 0x7f; - tree = new SearchTree( - branches, - might_have_prefix_branches, - data, - whole, - suffix, - ); + tree = is_pure_suffixes_only_node ? + new SuffixSearchTree( + branches, + dlen, + suffix, + ) : + new PrefixSearchTree( + branches, + might_have_prefix_branches, + data, + whole, + suffix, + ); } hash_history.push({hash: truncatedHash.slice(), used: false}); if (data.length !== 0) { @@ -2454,20 +2945,22 @@ function loadDatabase(hooks) { } j += 1; } - const tree_mhp_branch_nodeids = tree.might_have_prefix_branches.nodeids; - const tree_mhp_branch_subtrees = tree.might_have_prefix_branches.subtrees; - j = 0; - lb = tree.might_have_prefix_branches.subtrees.length; - while (j < lb) { - // node id with a 1 in its most significant bit is inlined, and, so - // it won't be in the stash - if ((tree_mhp_branch_nodeids[j * 6] & 0x80) === 0) { - const subtree = stash.getWithOffsetKey(tree_mhp_branch_nodeids, j * 6); - if (subtree !== undefined) { - tree_mhp_branch_subtrees[j] = Promise.resolve(subtree); + if (tree instanceof PrefixSearchTree) { + const tree_mhp_branch_nodeids = tree.might_have_prefix_branches.nodeids; + const tree_mhp_branch_subtrees = tree.might_have_prefix_branches.subtrees; + j = 0; + lb = tree.might_have_prefix_branches.subtrees.length; + while (j < lb) { + // node id with a 1 in its most significant bit is inlined, and, so + // it won't be in the stash + if ((tree_mhp_branch_nodeids[j * 6] & 0x80) === 0) { + const subtree = stash.getWithOffsetKey(tree_mhp_branch_nodeids, j * 6); + if (subtree !== undefined) { + tree_mhp_branch_subtrees[j] = Promise.resolve(subtree); + } } + j += 1; } - j += 1; } if (i !== l) { stash.set(truncatedHash, tree);