Merge 7d232bb into 5afe9ee

streetsidesoftware · Jan 4, 2020 · 44f320d
2 parents 5afe9ee + 7d232bb
commit 44f320d
Show file tree

Hide file tree

Showing 11 changed files with 367 additions and 58 deletions.
diff --git a/packages/cspell-trie-lib/src/lib/IterableLike.ts b/packages/cspell-trie-lib/src/lib/IterableLike.ts
diff --git a/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.test.ts b/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.test.ts
@@ -0,0 +1,57 @@
+import { parseDictionary, parseDictionaryLines } from './SimpleDictionaryParser';
+
+describe('Validate SimpleDictionaryParser', () => {
+    // Each dictionary entry is expected as its plain form followed by a `~` form;
+    // the `!` entry is expected without a `~` form.
+    test('test parsing lines', () => {
+        const expected = [
+            'Begin',
+            '~begin',
+            'Begin+',
+            '~begin+',
+            'End',
+            '~end',
+            '+End',
+            '~+end',
+            '+Middle+',
+            '~+middle+',
+            'Café',
+            '~cafe',
+            '!forbid',
+        ];
+        const plainLines = dictionary().split('\n');
+        const decomposedLines = dictionary().normalize('NFD').split('\n');
+        // Basic test
+        expect(Array.from(parseDictionaryLines(plainLines))).toEqual(expected);
+        // Use expanded accents: NFD input must produce the same output.
+        expect(Array.from(parseDictionaryLines(decomposedLines))).toEqual(expected);
+    });
+
+    // The trie is expected to return the same entries as above, in sorted order.
+    test('basic test', () => {
+        const words = Array.from(parseDictionary(dictionary()).words());
+        expect(words).toEqual([
+            '!forbid',
+            '+End',
+            '+Middle+',
+            'Begin',
+            'Begin+',
+            'Café',
+            'End',
+            '~+end',
+            '~+middle+',
+            '~begin',
+            '~begin+',
+            '~cafe',
+            '~end',
+        ]);
+    });
+});
+
+/**
+ * Sample dictionary text used by the tests in this file: a comment line, a blank line,
+ * entries using the `*`, `+` and `!` markers, and trailing `#` comments.
+ */
+function dictionary() {
+    const text = `
+    # This is a comment.
+
+    Begin*
+    *End
+    +Middle+
+    Café        # é becomes e
+    !forbid     # do not allow "forbid"
+    `;
+    return text;
+}
diff --git a/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.ts b/packages/cspell-trie-lib/src/lib/SimpleDictionaryParser.ts
@@ -0,0 +1,92 @@
+import { operators } from 'gensequence';
+import { normalizeWordToLowercase, normalizeWord } from './util';
+import { COMPOUND_FIX, OPTIONAL_COMPOUND_FIX, FORBID_PREFIX, CASE_INSENSITIVE_PREFIX, LINE_COMMENT } from './constants';
+import { Trie } from './trie';
+import { buildTrie } from './TrieBuilder';
+
+/** Marker characters recognised by `parseDictionaryLines` and `parseDictionary`. */
+export interface ParseDictionaryOptions {
+    /** Everything from this character to the end of a line is removed before parsing. */
+    commentCharacter: string;
+    /** Prepended / appended to a word when an optional compound marker is expanded. */
+    compoundCharacter: string;
+    /** A line starting or ending with this character yields both the bare word and the compound form. */
+    optionalCompoundCharacter: string;
+    /** Lines starting with this character do not get a case-insensitive entry. */
+    forbiddenPrefix: string;
+    /** Prefix placed in front of the lowercase-normalized copy of each word. */
+    caseInsensitivePrefix: string;
+}
+
+// Default marker characters, taken from the shared constants module.
+const _defaultOptions: ParseDictionaryOptions = {
+    commentCharacter: LINE_COMMENT,
+    optionalCompoundCharacter: OPTIONAL_COMPOUND_FIX,
+    compoundCharacter: COMPOUND_FIX,
+    forbiddenPrefix: FORBID_PREFIX,
+    caseInsensitivePrefix: CASE_INSENSITIVE_PREFIX,
+};
+
+// Object.freeze freezes `_defaultOptions` in place and returns it, so both names refer to the same frozen object.
+export const defaultParseDictionaryOptions: ParseDictionaryOptions = Object.freeze(_defaultOptions);
+
+/**
+ * Turn raw dictionary lines into the sequence of entries to store.
+ *
+ * Per line: the comment is removed and the line trimmed, empty lines are dropped,
+ * a leading and/or trailing optional-compound marker is expanded into the bare word
+ * plus its compound form, and finally every entry is emitted as its normalized form
+ * followed (unless it starts with the forbidden prefix) by the case-insensitive
+ * prefix + its lowercase-normalized form.
+ *
+ * @param lines the dictionary lines
+ * @param options marker characters to use
+ * @returns a lazily evaluated sequence of entries
+ */
+export function parseDictionaryLines(lines: Iterable<string>, options: ParseDictionaryOptions = _defaultOptions): Iterable<string> {
+    const optionalMarker = options.optionalCompoundCharacter;
+    const compoundMarker = options.compoundCharacter;
+    const lowercasePrefix = options.caseInsensitivePrefix;
+    const forbiddenMarker = options.forbiddenPrefix;
+    const commentPattern = new RegExp(escapeRegEx(options.commentCharacter) + '.*', 'g');
+
+    const stripComment = (line: string): string => line.replace(commentPattern, '').trim();
+
+    const isNotEmpty = (line: string): boolean => line !== '';
+
+    // `*word` -> `word`, `+word`
+    const expandPrefix = (line: string): string[] => {
+        if (line[0] !== optionalMarker) return [line];
+        const rest = line.slice(1);
+        return [rest, compoundMarker + rest];
+    };
+
+    // `word*` -> `word`, `word+`
+    const expandSuffix = (line: string): string[] => {
+        if (line.slice(-1) !== optionalMarker) return [line];
+        const rest = line.slice(0, -1);
+        return [rest, rest + compoundMarker];
+    };
+
+    function *withLowercaseForm(line: string) {
+        yield normalizeWord(line);
+        // Forbidden entries do not get a case-insensitive form.
+        if (line[0] === forbiddenMarker) return;
+        yield lowercasePrefix + normalizeWordToLowercase(line);
+    }
+
+    return operators.pipe(
+        operators.map(stripComment),
+        operators.filter(isNotEmpty),
+        operators.concatMap(expandPrefix),
+        operators.concatMap(expandSuffix),
+        operators.concatMap(withLowercaseForm),
+    )(lines);
+}
+
+/**
+ * Parse dictionary text into a Trie.
+ *
+ * The text is split on `\n`, expanded with `parseDictionaryLines`, de-duplicated,
+ * sorted, and handed to `buildTrie` together with the marker characters from `options`.
+ *
+ * @param text the full dictionary text
+ * @param options marker characters to use
+ */
+export function parseDictionary(text: string, options: ParseDictionaryOptions = _defaultOptions): Trie {
+    const uniqueWords = new Set(parseDictionaryLines(text.split('\n'), options));
+    const sortedWords = Array.from(uniqueWords).sort();
+    const trieOptions = {
+        compoundCharacter: options.compoundCharacter,
+        compoundOptionalCharacter: options.optionalCompoundCharacter,
+        forbiddenWordPrefix: options.forbiddenPrefix,
+        stripCaseAndAccentsPrefix: options.caseInsensitivePrefix,
+    };
+    return buildTrie(sortedWords, trieOptions);
+}
+
+/** Backslash-escape the regular-expression special characters in `s` so it matches literally. */
+function escapeRegEx(s: string) {
+    const specialChars = /[-\/\\^$*+?.()|[\]{}]/g;
+    return s.replace(specialChars, '\\$&');
+}
diff --git a/packages/cspell-trie-lib/src/lib/TrieBuilder.ts b/packages/cspell-trie-lib/src/lib/TrieBuilder.ts
@@ -2,8 +2,8 @@ import { TrieNode } from './TrieNode';
 import { Trie, PartialTrieOptions, TrieOptions, mergeOptionalWithDefaults } from './trie';
 import { consolidate } from './consolidate';
 
-export function buildTrie(words: Iterable<string>): Trie {
-    return new TrieBuilder(words).build();
+export function buildTrie(words: Iterable<string>, trieOptions?: PartialTrieOptions): Trie {
+    return new TrieBuilder(words, trieOptions).build();
 }
 
 interface PathNode {

diff --git a/packages/cspell-trie-lib/src/lib/bufferLines.ts b/packages/cspell-trie-lib/src/lib/bufferLines.ts
@@ -1,6 +1,4 @@
-import { IterableLike } from './IterableLike';
-
-export function *buffer<T>(iter: IterableLike<T>, bufferSize: number): IterableIterator<T[]> {
+export function *buffer<T>(iter: Iterable<T>, bufferSize: number): IterableIterator<T[]> {
     const buffer: T[] = [];
     for (const s of iter) {
         buffer.push(s);
@@ -15,7 +13,7 @@ export function *buffer<T>(iter: IterableLike<T>, bufferSize: number): IterableI
     }
 }
 
-export function* bufferLines(iter: IterableLike<string>, bufferSize: number, eol: string): IterableIterator<string> {
+export function* bufferLines(iter: Iterable<string>, bufferSize: number, eol: string): IterableIterator<string> {
     if (eol) {
         for (const s of buffer(iter, bufferSize)) {
             yield s.join('') + eol;

diff --git a/packages/cspell-trie-lib/src/lib/compoundWalker.test.ts b/packages/cspell-trie-lib/src/lib/compoundWalker.test.ts
@@ -0,0 +1,132 @@
+import { parseDictionary } from './SimpleDictionaryParser';
+import { Trie } from './trie';
+import { findWord } from './find';
+import { WalkNext, WalkItem, compoundWalker, compoundWords } from './compoundWalker';
+
+// cspell:ignore errorerror
+
+describe('Verify compound walker', () => {
+    test('compoundWords', () => {
+        const trie = dictionary();
+        // The forbidden entry is flagged, and a mixed-case lookup is found as its lowercase form.
+        expect(findWord(trie.root, 'errorerror').forbidden).toBe(true);
+        expect(findWord(trie.root, 'ErrorCodes').found).toBe('errorcodes');
+
+        const singles = Array.from(compoundWords(trie, 1));
+        expect(singles).toEqual([
+            'Code',
+            'Codes',
+            'Error',
+            'Errors',
+            'Message',
+            'Time',
+        ]);
+
+        const pairs = Array.from(compoundWords(trie, 2));
+        expect(pairs).toEqual(expected2());
+
+        const triples = Array.from(compoundWords(trie, 3));
+        expect(triples).toContain('PrefixMiddleSuffix');
+        expect(triples).toContain('PrefixErrorCodes');
+        expect(triples).toHaveLength(216);
+        // Everything produced with max depth 2 must also be produced with max depth 3.
+        for (const word of pairs) {
+            expect(triples).toContain(word);
+        }
+    });
+
+    test('compoundWords lowercase', () => {
+        const trie = dictionary();
+        const pairs = Array.from(compoundWords(trie, 2, false));
+        const lowercased = expected2().map(word => word.toLowerCase());
+        expect(pairs).toEqual(lowercased);
+    });
+
+    // A trailing `+` in the expected list marks an item reached through a compound edge.
+    test('test compound edges', () => {
+        const trie = dictionary();
+        const edges = Array.from(filterWalker(compoundWalker(trie), 1));
+        expect(edges).toEqual([
+            'Code',
+            'Codes',
+            'Code+',
+            'Error',
+            'Errors',
+            'Error+',
+            'Message',
+            'Message+',
+            'Prefix+',
+            'Time',
+            'Time+',
+        ]);
+    });
+});
+
+/**
+ * Drive a compound walker by hand and report what it visits.
+ *
+ * Yields the prefix of every item whose node has `f` set, and the prefix + '+'
+ * for every item flagged as a compound edge. After each item the walker is told
+ * to go deeper only while the item's depth is below `maxDepth`.
+ */
+function *filterWalker(stream: Generator<WalkItem, any, WalkNext>, maxDepth: number): Generator<string> {
+    for (let step = stream.next(); !step.done; ) {
+        const { n: node, s: prefix, c: isCompoundEdge, d: depth } = step.value;
+        if (node.f) {
+            yield prefix;
+        }
+        if (isCompoundEdge) {
+            yield prefix + '+';
+        }
+        step = stream.next(depth < maxDepth);
+    }
+}
+
+/** Build the sample compound dictionary (including one forbidden entry) used by the tests in this file. */
+function dictionary(): Trie {
+    const source = `
+    # Sample dictionary
+    *Error*
+    *Errors
+    *Code*
+    *Codes
+    *Message*
+    *Message
+    *Time*
+    +Middle+
+    Prefix+
+    +Suffix
+
+    !errorerror
+    `;
+    return parseDictionary(source);
+}
+
+/**
+ * Expected result of `compoundWords(trie, 2)` for the sample dictionary, in walk order.
+ * The lowercase test compares against the same list mapped through `toLowerCase()`.
+ */
+function expected2() {
+    return [
+        'Code',
+        'Codes',
+        'CodeCode',
+        'CodeCodes',
+        'CodeError',
+        'CodeErrors',
+        'CodeMessage',
+        'CodeSuffix',
+        'CodeTime',
+        'Error',
+        'Errors',
+        'ErrorCode',
+        'ErrorCodes',
+        'ErrorError',
+        'ErrorErrors',
+        'ErrorMessage',
+        'ErrorSuffix',
+        'ErrorTime',
+        'Message',
+        'MessageCode',
+        'MessageCodes',
+        'MessageError',
+        'MessageErrors',
+        'MessageMessage',
+        'MessageSuffix',
+        'MessageTime',
+        'PrefixCode',
+        'PrefixCodes',
+        'PrefixError',
+        'PrefixErrors',
+        'PrefixMessage',
+        'PrefixSuffix',
+        'PrefixTime',
+        'Time',
+        'TimeCode',
+        'TimeCodes',
+        'TimeError',
+        'TimeErrors',
+        'TimeMessage',
+        'TimeSuffix',
+        'TimeTime',
+    ];
+}
diff --git a/packages/cspell-trie-lib/src/lib/compoundWalker.ts b/packages/cspell-trie-lib/src/lib/compoundWalker.ts
@@ -0,0 +1,74 @@
+import { Trie } from './trie';
+import { TrieNode } from './TrieNode';
+
+
+/** One step reported by `compoundWalker`. */
+export interface WalkItem {
+    /** prefix so far */
+    s: string;
+    /** the trie node reached for this prefix */
+    n: TrieNode;
+    /** compound depth */
+    d: number;
+    /** true iff compound edge */
+    c: boolean;
+}
+
+/** Value sent back to the walker through `next()`; `false` stops it from going deeper below the current item. */
+export type WalkNext = boolean;
+
+/**
+ *
+ * Depth first walk of a compound trie.
+ * If there are compounds, this becomes an infinite iterator.
+ * Use i.next(false) to prevent the walker from going deeper into the trie.
+ *
+ * Children stored under the compound, forbidden and case-insensitive prefix keys are
+ * never followed as ordinary letters. When a node has a compound edge, the walk
+ * continues in the compound subtree of the starting root with the depth increased by one.
+ *
+ * @param trie the compound Trie to walk
+ * @param caseSensitive when false, start from the subtree stored under the
+ *        case-insensitive prefix if it exists; otherwise start from the trie root
+ */
+export function *compoundWalker(trie: Trie, caseSensitive: boolean = true): Generator<WalkItem, any, WalkNext> {
+    const trieOptions = trie.options;
+    const compoundKey = trieOptions.compoundCharacter;
+    const caseInsensitiveKey = trieOptions.stripCaseAndAccentsPrefix;
+    const skippedKeys = new Set([compoundKey, trieOptions.forbiddenWordPrefix, caseInsensitiveKey]);
+    const start = caseSensitive ? trie.root : (trie.root.c?.get(caseInsensitiveKey) || trie.root);
+
+    function *visit(node: TrieNode, prefix: string, viaCompound: boolean, depth: number): Generator<WalkItem, any, WalkNext> {
+        const goDeeper = yield { n: node, s: prefix, c: viaCompound, d: depth };
+        if (goDeeper === false || !node.c) return;
+
+        for (const [key, child] of node.c) {
+            if (skippedKeys.has(key)) continue;
+            yield *visit(child, prefix + key, false, depth);
+        }
+
+        if (!node.c.has(compoundKey)) return;
+        // A compound edge: restart from the compound-only words of the starting root.
+        const compoundRoot = start.c!.get(compoundKey);
+        if (compoundRoot) {
+            yield *visit(compoundRoot, prefix, true, depth + 1);
+        }
+    }
+
+    // Make sure we do not walk forbidden and compound only words from the root.
+    if (!start.c) return;
+    for (const [key, child] of start.c) {
+        if (skippedKeys.has(key)) continue;
+        yield *visit(child, key, false, 0);
+    }
+}
+
+/**
+ * Yield the words found while walking the trie with `compoundWalker`.
+ * Items whose compound depth is `maxDepth` or more are not yielded and the walker
+ * is told not to go deeper below them.
+ *
+ * @param trie Trie to walk
+ * @param maxDepth Max compound depth
+ * @param caseSensitive case sensitive search.
+ */
+export function *compoundWords(trie: Trie, maxDepth: number, caseSensitive: boolean = true) {
+    const walker = compoundWalker(trie, caseSensitive);
+    let step = walker.next();
+    while (!step.done) {
+        const { n: node, s: word, d: depth } = step.value;
+        const withinDepth = depth < maxDepth;
+        if (withinDepth && node.f) {
+            yield word;
+        }
+        step = withinDepth ? walker.next() : walker.next(false);
+    }
+}
diff --git a/packages/cspell-trie-lib/src/lib/constants.ts b/packages/cspell-trie-lib/src/lib/constants.ts
@@ -3,3 +3,4 @@ export const COMPOUND_FIX = '+';
 export const OPTIONAL_COMPOUND_FIX = '*';
 export const CASE_INSENSITIVE_PREFIX = '~';
 export const FORBID_PREFIX = '!';
+export const LINE_COMMENT = '#';