Merge bd47e4f into 39ded08

streetsidesoftware · Jan 3, 2020 · cfe37c2 · cfe37c2
2 parents 39ded08 + bd47e4f
commit cfe37c2
Show file tree

Hide file tree

Showing 11 changed files with 507 additions and 39 deletions.
diff --git a/packages/Samples/dicts/nl_compound_trie3.trie.gz b/packages/Samples/dicts/nl_compound_trie3.trie.gz
diff --git a/packages/Samples/dicts/sampleCodeDic.txt b/packages/Samples/dicts/sampleCodeDic.txt
@@ -12,4 +12,4 @@ Code*
 
 Café    # will get normalized and will only match if case sensitive matching is turned off.
 
-~!codecode # Do not allow `codecode` or `Codecode` when using case insensitive matching.
+!codecode # Do not allow `codecode` or `Codecode` when using case insensitive matching.
diff --git a/packages/cspell-tools/src/compiler/Reader.test.ts b/packages/cspell-tools/src/compiler/Reader.test.ts
@@ -34,7 +34,10 @@ describe('Validate the iterateWordsFromFile', () => {
         expect(reader.size).toBe(3);
         const results = [...reader.annotatedWords()];
         // this might break if the processing order of hunspell changes.
-        expect(results).toEqual('hello tried try rework reworked work worked'.split(' ').sort());
+        expect(results).toEqual((
+            'hello tried try rework reworked work worked ' +
+            '~hello ~tried ~try ~rework ~reworked ~work ~worked'
+        ).split(' ').sort());
     });
 
     test('annotatedWords: hunspell Dutch', async () => {
@@ -74,8 +77,8 @@ describe('Validate the iterateWordsFromFile', () => {
         // cspell:ignore codecode errorerror codemsg
         // the results are sorted
         expect(results.join('|')).toBe(
-            '!Codemsg|!Errorerror|!err|+code|+code+|+error|+error+|+msg|Café|Code|Code+|Error|Error+|msg' +
-            '|~!codecode|~!codemsg|~!errorerror|~cafe|~code|~code+|~error|~error+'
+            '!Codemsg|!Errorerror|!codecode|!err|+code|+code+|+error|+error+|+msg|Café|Code|Code+|Error|Error+|msg' +
+            '|~!codecode|~!codemsg|~!err|~!errorerror|~+code|~+code+|~+error|~+error+|~+msg|~cafe|~code|~code+|~error|~error+|~msg'
         );
     });
 });
diff --git a/packages/cspell-tools/src/compiler/Reader.ts b/packages/cspell-tools/src/compiler/Reader.ts
@@ -146,12 +146,13 @@ const regNotLower = /[^a-z+!~]/;
 
 function *_stripCaseAndAccents(words: Iterable<AnnotatedWord>): Generator<AnnotatedWord> {
     for (const word of words) {
-        yield word;
-        if (regNotLower.test(word)) {
-            // covert to lower case and strip accents.
-            const n = word.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
-            yield NORMALIZED + n;
-        }
+        // Words are normalized to the composed (NFC) form: e + ` => è
+        yield word.normalize();
+        // convert to lower case and strip accents.
+        const n = word.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
+        // All words are added for case-insensitive searches.
+        // It is a space / speed trade-off. In this case, speed is more important.
+        yield NORMALIZED + n;
     }
 }
 

diff --git a/packages/cspell-trie-lib/src/lib/TrieBuilder.ts b/packages/cspell-trie-lib/src/lib/TrieBuilder.ts
@@ -1,7 +1,6 @@
 import { TrieNode } from './TrieNode';
 import { Trie, PartialTrieOptions, TrieOptions, mergeOptionalWithDefaults } from './trie';
 import { consolidate } from './consolidate';
-import * as util from './util';
 
 export function buildTrie(words: Iterable<string>): Trie {
     return new TrieBuilder(words).build();
@@ -23,13 +22,13 @@ export class TrieBuilder {
     /** position 0 of lastPath is always the root */
     private lastPath: PathNode[] = [{ s: '', n: { f: undefined, c: undefined } }];
     private tails = new Map([['', this._eow]]);
-    private trieOptions: TrieOptions;
+    public trieOptions: TrieOptions;
 
     constructor(words?: Iterable<string>, trieOptions?: PartialTrieOptions) {
         this._canBeCached(this._eow); // this line is just for coverage reasons
         this.signatures.set(this.signature(this._eow), this._eow);
         this.cached.set(this._eow, this.count++);
-        this.trieOptions = mergeOptionalWithDefaults(trieOptions);
+        this.trieOptions = Object.freeze(mergeOptionalWithDefaults(trieOptions));
 
         if (words) {
             this.insert(words);
@@ -187,10 +186,6 @@ export class TrieBuilder {
 
     insert(words: Iterable<string>) {
         for (const w of words) {
-            if (w[0] === this.trieOptions.stripCaseAndAccentsPrefix && util.has(this._root, w.slice(1))) {
-                // Do not store the normalized form if it already exists in the trie.
-                continue;
-            }
             w && this.insertWord(w);
         }
     }

diff --git a/packages/cspell-trie-lib/src/lib/constants.ts b/packages/cspell-trie-lib/src/lib/constants.ts
@@ -0,0 +1,5 @@
+
+export const COMPOUND_FIX = '+';
+export const OPTIONAL_COMPOUND_FIX = '*';
+export const CASE_INSENSITIVE_PREFIX = '~';
+export const FORBID_PREFIX = '!';
diff --git a/packages/cspell-trie-lib/src/lib/find.dutch.test.ts b/packages/cspell-trie-lib/src/lib/find.dutch.test.ts
@@ -0,0 +1,192 @@
+import { findWord, PartialFindOptions, FindResult } from './find';
+
+import * as fs from 'fs-extra';
+import * as zlib from 'zlib';
+import { importTrie } from './importExport';
+import { TrieNode } from './TrieNode';
+import * as path from 'path';
+import { normalizeWordToLowercase } from './util';
+
+const dutchDictionary = path.join(__dirname, ...'../../../Samples/dicts/nl_compound_trie3.trie.gz'.split('/'));
+
+describe('Validate findWord', () => {
+    const pTrie = readTrie(dutchDictionary);
+
+    test('test find exact words preserve case', async () => {
+        const trie = await pTrie;
+
+        // cspell:ignore aanvaardbaard
+        // `aanvaardbaard` is a forbidden word: it is found, but flagged as forbidden.
+        expect(findWord(trie, 'aanvaardbaard', { matchCase: true, compoundMode: 'none' }))
+        .toEqual(frFound('aanvaardbaard', true));
+
+        expect(findWord(trie, 'code', { matchCase: true, compoundMode: 'none' }))
+        .toEqual({ found: 'code', compoundUsed: false, forbidden: false });
+
+        expect(findWord(trie, 'code', { matchCase: true, compoundMode: 'compound' }))
+        .toEqual({ found: 'code', compoundUsed: false, forbidden: false });
+    });
+
+    const tests: [string, PartialFindOptions, FindResult][] = [
+        ['Code', { matchCase: true,  compoundMode: 'none' }, frNotFound()],
+        ['code', { matchCase: true,  compoundMode: 'none' }, frFound('code')],
+        ['cafe', { matchCase: true,  compoundMode: 'none' }, frNotFound()],
+        ['cafe', { matchCase: false, compoundMode: 'none' }, frFound('cafe')],
+
+        // Compounding enabled, but matching whole words (compounding not used).
+        ['Code', { matchCase: true,  compoundMode: 'compound' }, frCompoundFound(false)],
+        ['code', { matchCase: true,  compoundMode: 'compound' }, frFound('code')],
+        ['cafe', { matchCase: true,  compoundMode: 'compound' }, frFound(false)],
+        ['cafe', { matchCase: false, compoundMode: 'compound' }, frFound('cafe')],
+
+        // compound words
+        testCompound('buurtbewoner'), // cspell:ignore buurtbewoner
+        testCompound('buurtbewoners'), // cspell:ignore buurtbewoners
+
+
+        // forbidden compounds
+        ['aanvaardbaard', { matchCase: true,  compoundMode: 'compound' }, frCompoundFound('aanvaardbaard', true)],
+
+    ];
+
+    tests.forEach(function ([word, options, exResult]) {
+        test(`Find Word: ${word} ${JSON.stringify(options)}, ${JSON.stringify(exResult)}`, async () => {
+            const trie = await pTrie;
+            expect(findWord(trie, word, options)).toEqual(exResult);
+        });
+     } );
+
+     sampleWords().forEach(word => {
+        test(`Find Word: ${word}`, async () => {
+            const trie = await pTrie;
+            const word2 = word[0].toLowerCase() + word.slice(1);
+            const r1 = findWord(trie, word, { matchCase: true, compoundMode: 'compound'});
+            const r2 = r1.found || word === word2 ? r1 : (word = word2, findWord(trie, word, { matchCase: true, compoundMode: 'compound'}));
+            // console.log(r2);
+            expect(r2.found).toEqual(word);
+            expect(r2.forbidden).toBe(false);
+        });
+        test(`Find Word case insensitive: ${word}`, async () => {
+            const trie = await pTrie;
+            const r = findWord(trie, word, { matchCase: false, compoundMode: 'compound'});
+            // console.log(r);
+            expect(r.found).toEqual(normalizeWordToLowercase(word));
+            expect(r.forbidden).toBe(false);
+        });
+     });
+
+
+     sampleMisspellings().forEach(word => {
+        test(`Check misspelled words: ${word}`, async () => {
+            const trie = await pTrie;
+            const word2 = word[0].toLowerCase() + word.slice(1);
+            const r1 = findWord(trie, word, { matchCase: true, compoundMode: 'compound'});
+            const r2 = r1.found || word === word2 ? r1 : (word = word2, findWord(trie, word, { matchCase: true, compoundMode: 'compound'}));
+            // console.log(r2);
+            expect(r2.found).toEqual(false);
+            expect(r2.forbidden).toBe(false);
+        });
+        test(`Check misspelled words case insensitive: ${word}`, async () => {
+            const trie = await pTrie;
+            const r = findWord(trie, word, { matchCase: false, compoundMode: 'compound'});
+            // console.log(r);
+            expect(r.found).toEqual(false);
+            expect(r.forbidden).toBe(false);
+        });
+     });
+
+});
+
+function sampleMisspellings(): string[] {
+    // cspell:disable
+    const text = `
+    nieuwjaarnacht
+    burgersmeester
+    buurtsbewoners
+    herdenkingbijeenkomst
+    pankoekhuis
+    blauwetram
+    `;
+    // cspell:enable
+    return processText(text);
+}
+
+function sampleWords(): string[] {
+    // cspell:disable
+    const text = `
+    Arnhem basisschool    burgemeester    buurtbewoners    haarvaten    herdenkingsbijeenkomst
+    nabestaanden    onmenselijke    slachtoffers    uitgebrande    verdachten    voorbereiden
+    exposé
+
+    De Australische marine heeft honderden inwoners en toeristen uit de kustplaats geëvacueerd
+    zo'n mensen vluchtten maandagavond naar het strand toen bosbranden het dorp deels in de as legden en de
+    vluchtwegen blokkeerden.
+
+    In het zuidoosten van Australië zijn meer dan 200 brandhaarden.
+    De autoriteiten vrezen dat de situatie alleen maar erger wordt door de hoge
+    temperaturen en harde wind die voor dit weekend worden verwacht.
+    In de deelstaat New Zuid Wales, waar Sydney ligt, geldt de noodtoestand.
+    Het Nederlandse ministerie van Buitenlandse Zaken adviseert in het gebied alleen noodzakelijke reizen te maken.
+
+    Nooit eerder waren de jaarlijkse bosbranden in Australië zo ernstig.
+    Tot nu toe is een gebied groter dan Nederland afgebrand en zijn meer dan 1400 huizen verwoest.
+    Ten minste negentien mensen kwamen om en er zijn tientallen vermisten.
+
+    Verdachten flatbrand Arnhem hebben ook levenslang, zegt Kinderombudsman
+
+    Lange woorden:
+    Kindercarnavalsoptochtenvoorbereidingswerkzaamheden
+    Meervoudige persoonlijkheidsstoornissen
+    Zandzeep mineraalwatersteenstralen
+    Randjongerenhangplekkenbeleidsambtenarensalarisbesprekingsafspraken
+    Invaliditeitsuitkeringshoofdkwartiervestigingsgebouwfundamentenblauwdruk
+    Hottentottententententoonstellingsterrein
+    Vervoerdersaansprakelijkheidsverzekering
+    Bestuurdersaansprakelijkheidsverzekering
+    Overeenstemmingsbeoordelingsprocedures
+    `;
+    // cspell:enable
+    return processText(text);
+}
+
+function processText(text: string): string[] {
+    return [...new Set(text.replace(/[.0-9,"“():]/g, ' ').split(/\s+/).sort().filter(a => !!a))];
+}
+
+function testCompound(word: string, found: boolean = true): [string, PartialFindOptions, FindResult] {
+    return [word, { matchCase: true, compoundMode: 'compound' }, frCompoundFound(found && word)];
+}
+
+function frNotFound(compoundUsed: boolean = false): FindResult {
+    return {
+        found: false,
+        forbidden: false,
+        compoundUsed,
+    };
+}
+
+function frFound(found: string | false, forbidden: boolean = false, compoundUsed: boolean = false): FindResult {
+    return {
+        found,
+        forbidden,
+        compoundUsed,
+    };
+}
+
+function frCompoundFound(found: string | false, forbidden: boolean = false, compoundUsed: boolean = true): FindResult {
+    return frFound(found, forbidden, compoundUsed);
+}
+
+async function readTrie(filename: string): Promise<TrieNode> {
+    const lines = await readTextFile(filename);
+    return importTrie(lines);
+}
+
+function readTextFile(filename: string): Promise<string[]> {
+    const lines = fs.readFile(filename)
+        .then(buffer => (/\.gz$/).test(filename) ? zlib.gunzipSync(buffer) : buffer)
+        .then(buffer => buffer.toString('UTF-8'))
+        .then(content => content.split(/\r?\n/g))
+        ;
+    return lines;
+}