Skip to content

Commit

Permalink
fix: work on Trie performance (#4480)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S committed May 12, 2023
1 parent 8407f4b commit 2bd7fa8
Show file tree
Hide file tree
Showing 19 changed files with 564 additions and 180 deletions.
17 changes: 6 additions & 11 deletions packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.en.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { opSkip, opTake, pipe } from '@cspell/cspell-pipe/sync';
import { describe, expect, test } from 'vitest';

import { readTrie } from '../../test/dictionaries.test.helper.js';
import { FastTrieBlob } from './FastTrieBlob.js';
import { measure } from './test/perf.js';

function getTrie() {
return readTrie('@cspell/dict-en_us/cspell-ext.json');
Expand All @@ -11,26 +11,21 @@ function getTrie() {
describe('Validate English FastTrieBlob', async () => {
const pTrie = getTrie();
const sampleTrie = await pTrie;
const sampleWordsLarge = [...sampleTrie.words()];
const fastTrieBlob = FastTrieBlob.fromWordList(sampleWordsLarge);
const sampleWordsLarge = [...pipe(sampleTrie.words(), opSkip(1000), opTake(6000))];
const fastTrieBlob = FastTrieBlob.fromTrieRoot(sampleTrie.root);

test('insert', () => {
const words = sampleWordsLarge.slice(1000, 6000);
const words = sampleWordsLarge;
const ft = new FastTrieBlob();
measure('FastTrieBlob', () => ft.insert(words));
ft.insert(words);
const result = [...ft.words()];
expect(result).toEqual(words);
});

test('has', () => {
const words = sampleWordsLarge.slice(1000, 6000);
const words = sampleWordsLarge;
for (const word of words) {
expect(fastTrieBlob.has(word)).toBe(true);
}
});

test('fromTrieRoot', () => {
const ft = FastTrieBlob.fromTrieRoot(sampleTrie.root);
expect(ft.has('hello')).toBe(true);
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ describe('FastTrieBlob', () => {
expect(words.findIndex((word) => !ft.has(word))).toBe(-1);
});

test('', () => {
test('createTriFromList', () => {
const root = createTriFromList(words);
const ft = FastTrieBlob.fromTrieRoot(root);
expect(ft.has('walk')).toBe(true);
Expand Down
11 changes: 2 additions & 9 deletions packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { TrieNode, TrieRoot } from '../TrieNode/TrieNode.js';
import { resolveMap } from './resolveMap.js';
import { TrieBlob } from './TrieBlob.js';

type FastTrieBlobNode = number[];
Expand Down Expand Up @@ -165,7 +166,7 @@ export class FastTrieBlob {
for (let i = 0; i < nodes.length; ++i) {
const node = nodes[i];
// assert(offset === nodeToIndex[i]);
binNodes[offset++] = (node.length << lenShift) | node[0];
binNodes[offset++] = ((node.length - 1) << lenShift) | node[0];
for (let j = 1; j < node.length; ++j) {
const v = node[j];
const nodeRef = v >>> NodeChildRefShift;
Expand Down Expand Up @@ -225,11 +226,3 @@ export class FastTrieBlob {
return tf.freeze();
}
}

/**
 * Memoizing lookup on a `Map`.
 *
 * Returns the value stored under `key`. When the lookup yields `undefined`,
 * `resolve(key)` is called, its result is stored under `key`, and that result is returned.
 *
 * @param map - the map used as the cache; it is mutated on a miss.
 * @param key - the key to look up.
 * @param resolve - produces the value for a key that has no defined value yet.
 * @returns the cached or freshly resolved value.
 */
function resolveMap<K, V>(map: Map<K, V>, key: K, resolve: (key: K) => V): V {
    const cached = map.get(key);
    if (cached === undefined) {
        // Miss (or a stored `undefined`): compute, remember, and hand back the new value.
        const fresh = resolve(key);
        map.set(key, fresh);
        return fresh;
    }
    return cached;
}
11 changes: 9 additions & 2 deletions packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.test.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import { describe, expect, test } from 'vitest';

import { createTrieBlob } from './createTrieBlob.js';
import { createTriFromList } from '../TrieNode/trie-util.js';
import { createTrieBlob, createTrieBlobFromTrieRoot } from './createTrieBlob.js';
import { TrieBlob } from './TrieBlob.js';

describe('TrieBlob', () => {
const sampleWords = ['one', 'two', 'three', 'four', 'walk', 'walking', 'walks', 'wall', 'walls', 'walled'];
const sampleWords = ['one', 'two', 'three', 'four', 'walk', 'walking', 'walks', 'wall', 'walls', 'walled'].sort();

test('Constructor', () => {
const tb = createTrieBlob(['one', 'two']);
Expand All @@ -30,4 +31,10 @@ describe('TrieBlob', () => {
expect(r).toEqual(tb);
expect([...r.words()]).toEqual(sampleWords);
});

test('createTrieBlobFromTrieRoot', () => {
const root = createTriFromList(sampleWords);
const trieBlob = createTrieBlobFromTrieRoot(root);
expect([...trieBlob.words()]).toEqual(sampleWords);
});
});
10 changes: 3 additions & 7 deletions packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ export class TrieBlob {
for (let p = 0; p < len; ++p, node = nodes[nodeIdx]) {
const letterIdx = charToIndexMap[word[p]];
const count = node & NodeMaskNumChildren;
let i = count - 1;
let i = count;
for (; i > 0; --i) {
if ((nodes[i + nodeIdx] & NodeMaskChildCharIndex) === letterIdx) {
break;
Expand Down Expand Up @@ -81,12 +81,12 @@ export class TrieBlob {
while (depth >= 0) {
const { nodeIdx, pos, word } = stack[depth];
const node = nodes[nodeIdx];

// pos is 0 when first entering a node
if (!pos && node & NodeMaskEOW) {
yield word;
}
const len = node & NodeMaskNumChildren;
if (pos >= len - 1) {
if (pos >= len) {
--depth;
continue;
}
Expand All @@ -103,10 +103,6 @@ export class TrieBlob {
}
}

private lookUpCharIndex(char: string): number {
return this.charToIndexMap[char] ?? -1;
}

toJSON() {
return {
charIndex: this.charIndex,
Expand Down
60 changes: 59 additions & 1 deletion packages/cspell-trie-lib/src/lib/TrieBlob/createTrieBlob.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,65 @@
import type { TrieNode, TrieRoot } from '../TrieNode/TrieNode.js';
import { FastTrieBlob } from './FastTrieBlob.js';
import type { TrieBlob } from './TrieBlob.js';
import { resolveMap } from './resolveMap.js';
import { TrieBlob } from './TrieBlob.js';

/**
 * Build a `TrieBlob` from a list of words.
 *
 * The words are loaded into a `FastTrieBlob` via `FastTrieBlob.fromWordList`, and the
 * result is converted with `toTrieBlob()`.
 *
 * @param words - the words to store.
 * @returns the resulting `TrieBlob`.
 */
export function createTrieBlob(words: string[]): TrieBlob {
    return FastTrieBlob.fromWordList(words).toTrieBlob();
}

/**
 * Convert a `TrieRoot` directly into a `TrieBlob`.
 *
 * Layout of the generated `nodes` array: every trie node becomes one header entry
 * (the end-of-word flag OR'ed with the number of children) immediately followed by one
 * entry per child. A child entry starts out holding only the character index of the
 * child's key; `walk` later ORs in the index of the child node, shifted left by
 * `TrieBlob.NodeChildRefShift`.
 *
 * @param root - the root of the trie to convert.
 * @returns a `TrieBlob` constructed from the collected nodes and the character index table.
 */
export function createTrieBlobFromTrieRoot(root: TrieRoot): TrieBlob {
    const NodeMaskEOW = TrieBlob.NodeMaskEOW;
    const NodeChildRefShift = TrieBlob.NodeChildRefShift;
    const NodeMaskNumChildren = TrieBlob.NodeMaskNumChildren;
    const nodes: number[] = [];
    // Slot 0 is taken by the empty string, so every real character gets a non-zero index.
    // `getCharIndex` relies on that: a falsy lookup result means "not assigned yet".
    const charIndex: string[] = [''];
    const charMap: Record<string, number> = Object.create(null);
    const known = new Map<TrieNode, number>();

    // `nodes` is still empty here, so the root always lands at index 0.
    known.set(root, appendNode(root));
    // One shared node for every end-of-word node that has no children (see `resolveNode`).
    const IdxEOW = appendNode({ f: 1 });

    /**
     * Return the character index for `char`, assigning a new one on first use.
     */
    function getCharIndex(char: string): number {
        const idx = charMap[char];
        if (idx) return idx;
        const newIdx = charIndex.push(char) - 1;
        // Register the key exactly as it appears in the trie. Some code points
        // (e.g. U+212B ANGSTROM SIGN) equal neither their NFC nor their NFD form; without
        // this entry the lookup above would miss on every occurrence and a duplicate
        // `charIndex` entry would be pushed each time.
        charMap[char] = newIdx;
        charMap[char.normalize('NFC')] = newIdx;
        charMap[char.normalize('NFD')] = newIdx;
        return newIdx;
    }

    /**
     * Append the header and child slots for `n` and return the index of the header.
     * The child slots only contain character indexes at this point.
     */
    function appendNode(n: TrieNode): number {
        const idx = nodes.push(n.f ? NodeMaskEOW : 0) - 1;
        if (n.c) {
            const keys = Object.keys(n.c).map((key) => getCharIndex(key));
            // NOTE(review): the count is masked, so a node with more children than
            // `NodeMaskNumChildren` can hold would be recorded with a truncated count.
            nodes[idx] = nodes[idx] | (keys.length & NodeMaskNumChildren);
            nodes.push(...keys);
        }
        return idx;
    }

    /**
     * Pick the blob index for a node that has not been seen before: childless
     * end-of-word nodes all share `IdxEOW`, everything else gets its own entry.
     */
    function resolveNode(n: TrieNode): number {
        if (n.f && !n.c) return IdxEOW;
        return appendNode(n);
    }

    /**
     * Register `n` (if needed), recurse into its children, and link each child slot to
     * the index of the corresponding child node.
     */
    function walk(n: TrieNode): number {
        const found = known.get(n);
        // Every node except the root has a non-zero index, so a truthy hit means the node
        // was already registered. The root is pre-registered at index 0 (falsy), which
        // deliberately falls through so that its children get walked; `resolveMap` then
        // returns the stored 0 without appending the root a second time.
        if (found) return found;
        const nodeIdx = resolveMap(known, n, resolveNode);
        if (!n.c) return nodeIdx;
        // `Object.values` yields the children in the same order `Object.keys` produced the
        // child slots in `appendNode`, so slot `p + 1` belongs to child `p`.
        const children = Object.values(n.c);
        for (let p = 0; p < children.length; ++p) {
            const childNode = children[p];
            const childIdx = walk(childNode);
            // Nodes already have the letters, just OR in the child index.
            nodes[nodeIdx + p + 1] |= childIdx << NodeChildRefShift;
        }
        return nodeIdx;
    }

    walk(root);

    return new TrieBlob(Uint32Array.from(nodes), charIndex);
}
7 changes: 7 additions & 0 deletions packages/cspell-trie-lib/src/lib/TrieBlob/resolveMap.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/**
 * Get the value for `key` from `map`, creating and caching it when it is missing.
 *
 * A lookup result of `undefined` counts as missing: `resolve(key)` is invoked, the
 * result is stored in `map` under `key`, and then returned.
 *
 * @param map - the backing map; updated in place when a value has to be resolved.
 * @param key - the key to look up.
 * @param resolve - factory used to create the value for a missing key.
 * @returns the existing value, or the newly resolved one.
 */
export function resolveMap<K, V>(map: Map<K, V>, key: K, resolve: (key: K) => V): V {
    let value = map.get(key);
    if (value !== undefined) {
        return value;
    }
    value = resolve(key);
    map.set(key, value);
    return value;
}
8 changes: 0 additions & 8 deletions packages/cspell-trie-lib/src/lib/TrieBlob/test/perf.ts

This file was deleted.

61 changes: 46 additions & 15 deletions packages/cspell-trie-lib/src/lib/TrieBlob/test/perfFastTrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import { readFileSync, writeFileSync } from 'fs';
import type { TrieNode } from '../../../index.js';
import { createTrieRoot, insert, Trie } from '../../../index.js';
import { readTrie } from '../../../test/dictionaries.test.helper.js';
import { getGlobalPerfTimer } from '../../utils/timer.js';
import { createTrieBlobFromTrieRoot } from '../createTrieBlob.js';
import { FastTrieBlob } from '../FastTrieBlob.js';
import { TrieBlob } from '../TrieBlob.js';
import { measure } from './perf.js';

function getTrie() {
return readTrie('@cspell/dict-en_us/cspell-ext.json');
Expand All @@ -23,58 +24,88 @@ function hasWords(words: string[], method: (word: string) => boolean): boolean {
}

export async function measureFastBlob(which: string | undefined, method: string | undefined) {
const trie = await getTrie();
const timer = getGlobalPerfTimer();
timer.start('measureFastBlob');
const trie = await timer.measureAsyncFn('getTrie', getTrie);
timer.start('words');
const words = [...trie.words()];
timer.stop('words');

timer.mark('done with setup');

timer.start('blob');
if (filterTest(which, 'blob')) {
const ft = measure('blob.FastTrieBlob.fromTrieRoot \t', () => FastTrieBlob.fromTrieRoot(trie.root));
const trieBlob = measure('blob.FastTrieBlob.toTrieBlob \t', () => ft.toTrieBlob());
{
const ft = timer.measureFn('blob.FastTrieBlob.fromTrieRoot \t', () => FastTrieBlob.fromTrieRoot(trie.root));
timer.measureFn('blob.FastTrieBlob.toTrieBlob \t', () => ft.toTrieBlob());
}
const trieBlob = timer.measureFn('blob.createTrieBlobFromTrieRoot\t', () =>
createTrieBlobFromTrieRoot(trie.root)
);

switch (method) {
case 'has':
measure('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => trieBlob.has(word)));
measure('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => trieBlob.has(word)));
timer.measureFn('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => trieBlob.has(word)));
timer.measureFn('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => trieBlob.has(word)));
break;
case 'words':
timer.start('blob.words');
[...trieBlob.words()];
timer.stop('blob.words');
break;
case 'dump':
writeFileSync('./TrieBlob.en.json', JSON.stringify(trieBlob, null, 2), 'utf8');
writeFileSync('./TrieBlob.en.trieb', trieBlob.encodeBin());
break;
case 'decode':
{
const tb = measure('blob.TrieBlob.decodeBin \t', () => {
const tb = timer.measureFn('blob.TrieBlob.decodeBin \t', () => {
return TrieBlob.decodeBin(readFileSync('./TrieBlob.en.trieb'));
});
measure('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => tb.has(word)));
measure('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => tb.has(word)));
timer.measureFn('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => tb.has(word)));
timer.measureFn('blob.TrieBlob.has \t\t', () => hasWords(words, (word) => tb.has(word)));
}
break;
}
}
timer.stop('blob');

timer.start('fast');
if (filterTest(which, 'fast')) {
const ft = measure('fast.FastTrieBlob.fromWordList \t', () => FastTrieBlob.fromWordList(words));
const ft = timer.measureFn('fast.FastTrieBlob.fromWordList \t', () => FastTrieBlob.fromWordList(words));

switch (method) {
case 'has':
measure('fast.FastTrieBlob.has \t\t', () => hasWords(words, (word) => ft.has(word)));
measure('fast.FastTrieBlob.has \t\t', () => hasWords(words, (word) => ft.has(word)));
timer.measureFn('fast.FastTrieBlob.has \t\t', () => hasWords(words, (word) => ft.has(word)));
timer.measureFn('fast.FastTrieBlob.has \t\t', () => hasWords(words, (word) => ft.has(word)));
break;
case 'words':
timer.start('blob.words');
[...ft.words()];
timer.stop('blob.words');
break;
}
}
timer.stop('fast');

timer.start('trie');
if (filterTest(which, 'trie')) {
const root = createTrieRoot({});

measure('trie.createTriFromList \t\t', () => insertWords(root, words));
timer.measureFn('trie.createTriFromList \t\t', () => insertWords(root, words));
const trie = new Trie(root);

switch (method) {
case 'has':
measure('trie.Trie.has \t\t\t', () => hasWords(words, (word) => trie.hasWord(word, true)));
measure('trie.Trie.has \t\t\t', () => hasWords(words, (word) => trie.hasWord(word, true)));
timer.measureFn('trie.Trie.has \t\t\t', () => hasWords(words, (word) => trie.hasWord(word, true)));
timer.measureFn('trie.Trie.has \t\t\t', () => hasWords(words, (word) => trie.hasWord(word, true)));
break;
}
}
timer.stop('trie');
timer.stop('measureFastBlob');
timer.stop();
timer.report();
}

function filterTest(value: string | undefined, expected: string): boolean {
Expand Down
10 changes: 2 additions & 8 deletions packages/cspell-trie-lib/src/lib/TrieNode/trie-util.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import { opFilter, opMap, pipe } from '@cspell/cspell-pipe/sync';

import { mergeOptionalWithDefaults } from '../utils/mergeOptionalWithDefaults.js';
import { walker } from '../walker/walker.js';
import { walker, walkerWords } from '../walker/walker.js';
import type { YieldResult } from '../walker/walkerTypes.js';
import type { PartialTrieOptions, TrieNode, TrieRoot } from './TrieNode.js';
import { FLAG_WORD } from './TrieNode.js';
Expand Down Expand Up @@ -49,11 +47,7 @@ export const iterateTrie = walk;
* Generate an Iterator that can walk a Trie and yield the words.
*/
export function iteratorTrieWords(node: TrieNode): Iterable<string> {
return pipe(
walk(node),
opFilter((r) => isWordTerminationNode(r.node)),
opMap((r) => r.text)
);
return walkerWords(node);
}

export function createTrieRoot(options: PartialTrieOptions): TrieRoot {
Expand Down
Loading

0 comments on commit 2bd7fa8

Please sign in to comment.