Skip to content

Commit

Permalink
Merge bd47e4f into 39ded08
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S committed Jan 3, 2020
2 parents 39ded08 + bd47e4f commit cfe37c2
Show file tree
Hide file tree
Showing 11 changed files with 507 additions and 39 deletions.
Binary file added packages/Samples/dicts/nl_compound_trie3.trie.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion packages/Samples/dicts/sampleCodeDic.txt
Expand Up @@ -12,4 +12,4 @@ Code*

Café # will get normalized and will only match if case sensitive matching is turned off.

~!codecode # Do not allow `codecode` or `Codecode` when using case insensitive matching.
!codecode # Do not allow `codecode` or `Codecode` when using case insensitive matching.
9 changes: 6 additions & 3 deletions packages/cspell-tools/src/compiler/Reader.test.ts
Expand Up @@ -34,7 +34,10 @@ describe('Validate the iterateWordsFromFile', () => {
expect(reader.size).toBe(3);
const results = [...reader.annotatedWords()];
// this might break if the processing order of hunspell changes.
expect(results).toEqual('hello tried try rework reworked work worked'.split(' ').sort());
expect(results).toEqual((
'hello tried try rework reworked work worked ' +
'~hello ~tried ~try ~rework ~reworked ~work ~worked'
).split(' ').sort());
});

test('annotatedWords: hunspell Dutch', async () => {
Expand Down Expand Up @@ -74,8 +77,8 @@ describe('Validate the iterateWordsFromFile', () => {
// cspell:ignore codecode errorerror codemsg
// the results are sorted
expect(results.join('|')).toBe(
'!Codemsg|!Errorerror|!err|+code|+code+|+error|+error+|+msg|Café|Code|Code+|Error|Error+|msg' +
'|~!codecode|~!codemsg|~!errorerror|~cafe|~code|~code+|~error|~error+'
'!Codemsg|!Errorerror|!codecode|!err|+code|+code+|+error|+error+|+msg|Café|Code|Code+|Error|Error+|msg' +
'|~!codecode|~!codemsg|~!err|~!errorerror|~+code|~+code+|~+error|~+error+|~+msg|~cafe|~code|~code+|~error|~error+|~msg'
);
});
});
13 changes: 7 additions & 6 deletions packages/cspell-tools/src/compiler/Reader.ts
Expand Up @@ -146,12 +146,13 @@ const regNotLower = /[^a-z+!~]/;

function *_stripCaseAndAccents(words: Iterable<AnnotatedWord>): Generator<AnnotatedWord> {
for (const word of words) {
yield word;
if (regNotLower.test(word)) {
// convert to lower case and strip accents.
const n = word.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
yield NORMALIZED + n;
}
// Words are normalized to the compact format: e + ` => è
yield word.normalize();
// convert to lower case and strip accents.
const n = word.toLowerCase().normalize('NFD').replace(/[\u0300-\u036f]/g, '');
// All words are added for case-insensitive searches.
// It is a space / speed trade-off. In this case, speed is more important.
yield NORMALIZED + n;
}
}

Expand Down
9 changes: 2 additions & 7 deletions packages/cspell-trie-lib/src/lib/TrieBuilder.ts
@@ -1,7 +1,6 @@
import { TrieNode } from './TrieNode';
import { Trie, PartialTrieOptions, TrieOptions, mergeOptionalWithDefaults } from './trie';
import { consolidate } from './consolidate';
import * as util from './util';

export function buildTrie(words: Iterable<string>): Trie {
return new TrieBuilder(words).build();
Expand All @@ -23,13 +22,13 @@ export class TrieBuilder {
/** position 0 of lastPath is always the root */
private lastPath: PathNode[] = [{ s: '', n: { f: undefined, c: undefined } }];
private tails = new Map([['', this._eow]]);
private trieOptions: TrieOptions;
public trieOptions: TrieOptions;

constructor(words?: Iterable<string>, trieOptions?: PartialTrieOptions) {
this._canBeCached(this._eow); // this line is just for coverage reasons
this.signatures.set(this.signature(this._eow), this._eow);
this.cached.set(this._eow, this.count++);
this.trieOptions = mergeOptionalWithDefaults(trieOptions);
this.trieOptions = Object.freeze(mergeOptionalWithDefaults(trieOptions));

if (words) {
this.insert(words);
Expand Down Expand Up @@ -187,10 +186,6 @@ export class TrieBuilder {

insert(words: Iterable<string>) {
for (const w of words) {
if (w[0] === this.trieOptions.stripCaseAndAccentsPrefix && util.has(this._root, w.slice(1))) {
// Do not store the normalized form if it already exists in the trie.
continue;
}
w && this.insertWord(w);
}
}
Expand Down
5 changes: 5 additions & 0 deletions packages/cspell-trie-lib/src/lib/constants.ts
@@ -0,0 +1,5 @@

// Single-character markers shared across the trie library.
// NOTE(review): the descriptions below are inferred from the constant names
// only — confirm against the code that consumes them.

// Marker for compound word parts — presumably placed where a word may join another.
export const COMPOUND_FIX = '+';
// Marker for an optional compound position — presumably the word is valid with or without compounding.
export const OPTIONAL_COMPOUND_FIX = '*';
// Prefix for the case-insensitive form of a word (presumably lower-cased, accent-stripped) — TODO confirm.
export const CASE_INSENSITIVE_PREFIX = '~';
// Prefix for words that are presumably to be reported as forbidden rather than accepted.
export const FORBID_PREFIX = '!';
192 changes: 192 additions & 0 deletions packages/cspell-trie-lib/src/lib/find.dutch.test.ts
@@ -0,0 +1,192 @@
import { findWord, PartialFindOptions, FindResult } from './find';

import * as fs from 'fs-extra';
import * as zlib from 'zlib';
import { importTrie } from './importExport';
import { TrieNode } from './TrieNode';
import * as path from 'path';
import { normalizeWordToLowercase } from './util';

const dutchDictionary = path.join(__dirname, ...'../../../Samples/dicts/nl_compound_trie3.trie.gz'.split('/'));

/**
 * Validates `findWord` against the compiled Dutch compound dictionary:
 * exact lookups, case-sensitive and case-insensitive matching, compound
 * words, a forbidden word, and a list of misspellings that must not be found.
 */
describe('Validate findWord', () => {
    // The dictionary is read once; every test awaits this shared promise.
    const pTrie = readTrie(dutchDictionary);

    /**
     * Case-sensitive compound lookup of `word`. If the word is not found and its
     * first letter is not already lowercase, the lookup is retried with the first
     * letter lowercased (sample words are taken from sentences, so some are
     * capitalized only because of their position).
     *
     * The form that was actually searched is returned instead of being written
     * back to a shared variable, so sibling tests always see the original word
     * regardless of which tests ran before them.
     *
     * @param trie - root of the dictionary trie.
     * @param word - non-empty word to look up.
     * @returns the searched form and the corresponding find result.
     */
    function findWithLowercaseFallback(trie: TrieNode, word: string): { searched: string; result: FindResult } {
        const lowered = word[0].toLowerCase() + word.slice(1);
        const first = findWord(trie, word, { matchCase: true, compoundMode: 'compound' });
        if (first.found || word === lowered) {
            return { searched: word, result: first };
        }
        return {
            searched: lowered,
            result: findWord(trie, lowered, { matchCase: true, compoundMode: 'compound' }),
        };
    }

    test('test find exact words preserve case', async () => {
        const trie = await pTrie;

        // cspell:ignore aanvaardbaard
        // `aanvaardbaard` is expected to be found and flagged as forbidden.
        expect(findWord(trie, 'aanvaardbaard', { matchCase: true, compoundMode: 'none' }))
            .toEqual(frFound('aanvaardbaard', true));

        expect(findWord(trie, 'code', { matchCase: true, compoundMode: 'none' }))
            .toEqual({ found: 'code', compoundUsed: false, forbidden: false });

        expect(findWord(trie, 'code', { matchCase: true, compoundMode: 'compound' }))
            .toEqual({ found: 'code', compoundUsed: false, forbidden: false });
    });

    // Table of [word, find options, expected result].
    const tests: [string, PartialFindOptions, FindResult][] = [
        ['Code', { matchCase: true, compoundMode: 'none' }, frNotFound()],
        ['code', { matchCase: true, compoundMode: 'none' }, frFound('code')],
        ['cafe', { matchCase: true, compoundMode: 'none' }, frNotFound()],
        ['cafe', { matchCase: false, compoundMode: 'none' }, frFound('cafe')],

        // Compounding enabled, but matching whole words (compounding not used).
        ['Code', { matchCase: true, compoundMode: 'compound' }, frCompoundFound(false)],
        ['code', { matchCase: true, compoundMode: 'compound' }, frFound('code')],
        ['cafe', { matchCase: true, compoundMode: 'compound' }, frFound(false)],
        ['cafe', { matchCase: false, compoundMode: 'compound' }, frFound('cafe')],

        // compound words
        testCompound('buurtbewoner'), // cspell:ignore buurtbewoner
        testCompound('buurtbewoners'), // cspell:ignore buurtbewoners

        // forbidden compounds
        ['aanvaardbaard', { matchCase: true, compoundMode: 'compound' }, frCompoundFound('aanvaardbaard', true)],
    ];

    tests.forEach(function ([word, options, exResult]) {
        test(`Find Word: ${word} ${JSON.stringify(options)}, ${JSON.stringify(exResult)}`, async () => {
            const trie = await pTrie;
            expect(findWord(trie, word, options)).toEqual(exResult);
        });
    });

    // Every sample word must be found, both case-sensitively (allowing a
    // sentence-initial capital) and case-insensitively.
    sampleWords().forEach(word => {
        test(`Find Word: ${word}`, async () => {
            const trie = await pTrie;
            const { searched, result } = findWithLowercaseFallback(trie, word);
            expect(result.found).toEqual(searched);
            expect(result.forbidden).toBe(false);
        });
        test(`Find Word case insensitive: ${word}`, async () => {
            const trie = await pTrie;
            const r = findWord(trie, word, { matchCase: false, compoundMode: 'compound' });
            expect(r.found).toEqual(normalizeWordToLowercase(word));
            expect(r.forbidden).toBe(false);
        });
    });

    // Misspellings must not be found in either mode, and must not be
    // reported as forbidden.
    sampleMisspellings().forEach(word => {
        test(`Check misspelled words: ${word}`, async () => {
            const trie = await pTrie;
            const { result } = findWithLowercaseFallback(trie, word);
            expect(result.found).toEqual(false);
            expect(result.forbidden).toBe(false);
        });
        test(`Check misspelled words case insensitive: ${word}`, async () => {
            const trie = await pTrie;
            const r = findWord(trie, word, { matchCase: false, compoundMode: 'compound' });
            expect(r.found).toEqual(false);
            expect(r.forbidden).toBe(false);
        });
    });
});

/**
 * Misspelled Dutch words used as negative test cases.
 *
 * @returns the words below, tokenized by `processText`.
 */
function sampleMisspellings(): string[] {
    // cspell:disable
    const misspelledSample = `
nieuwjaarnacht
burgersmeester
buurtsbewoners
herdenkingbijeenkomst
pankoekhuis
blauwetram
`;
    // cspell:enable
    const tokens = processText(misspelledSample);
    return tokens;
}

/**
 * Dutch sample text (single words, running sentences and very long compounds)
 * used as positive test cases.
 *
 * @returns the words of the text below, tokenized by `processText`.
 */
function sampleWords(): string[] {
    // cspell:disable
    const dutchSample = `
Arnhem basisschool burgemeester buurtbewoners haarvaten herdenkingsbijeenkomst
nabestaanden onmenselijke slachtoffers uitgebrande verdachten voorbereiden
exposé
De Australische marine heeft honderden inwoners en toeristen uit de kustplaats geëvacueerd
zo'n mensen vluchtten maandagavond naar het strand toen bosbranden het dorp deels in de as legden en de
vluchtwegen blokkeerden.
In het zuidoosten van Australië zijn meer dan 200 brandhaarden.
De autoriteiten vrezen dat de situatie alleen maar erger wordt door de hoge
temperaturen en harde wind die voor dit weekend worden verwacht.
In de deelstaat New Zuid Wales, waar Sydney ligt, geldt de noodtoestand.
Het Nederlandse ministerie van Buitenlandse Zaken adviseert in het gebied alleen noodzakelijke reizen te maken.
Nooit eerder waren de jaarlijkse bosbranden in Australië zo ernstig.
Tot nu toe is een gebied groter dan Nederland afgebrand en zijn meer dan 1400 huizen verwoest.
Ten minste negentien mensen kwamen om en er zijn tientallen vermisten.
Verdachten flatbrand Arnhem hebben ook levenslang, zegt Kinderombudsman
Lange woorden:
Kindercarnavalsoptochtenvoorbereidingswerkzaamheden
Meervoudige persoonlijkheidsstoornissen
Zandzeep mineraalwatersteenstralen
Randjongerenhangplekkenbeleidsambtenarensalarisbesprekingsafspraken
Invaliditeitsuitkeringshoofdkwartiervestigingsgebouwfundamentenblauwdruk
Hottentottententententoonstellingsterrein
Vervoerdersaansprakelijkheidsverzekering
Bestuurdersaansprakelijkheidsverzekering
Overeenstemmingsbeoordelingsprocedures
`;
    // cspell:enable
    const tokens = processText(dutchSample);
    return tokens;
}

/**
 * Splits free text into a sorted list of unique words.
 *
 * Periods, digits, commas, double quotes, parentheses and colons are treated
 * as separators; the remaining text is split on whitespace, sorted with the
 * default string ordering, and de-duplicated. Empty tokens are dropped.
 *
 * @param text - raw text to tokenize.
 * @returns the unique, non-empty tokens in sorted order.
 */
function processText(text: string): string[] {
    const withoutPunctuation = text.replace(/[.0-9,"“():]/g, ' ');
    const tokens = withoutPunctuation.split(/\s+/);
    tokens.sort();

    // A Set keeps first-insertion order, so the sorted order is preserved.
    const unique = new Set<string>();
    for (const token of tokens) {
        if (token) {
            unique.add(token);
        }
    }
    return Array.from(unique);
}

/**
 * Builds a test-table row for a case-sensitive lookup in compound mode.
 *
 * @param word - the word to look up.
 * @param found - whether the word is expected to be found (default `true`).
 * @returns `[word, options, expected]`, where `expected` comes from
 *     `frCompoundFound` and reports `word` when `found` is true, otherwise `false`.
 */
function testCompound(word: string, found: boolean = true): [string, PartialFindOptions, FindResult] {
    const options: PartialFindOptions = { matchCase: true, compoundMode: 'compound' };
    const expected = frCompoundFound(found ? word : false);
    return [word, options, expected];
}

/**
 * Expected result for a word that is neither found nor forbidden.
 *
 * @param compoundUsed - value for the `compoundUsed` field (default `false`).
 */
function frNotFound(compoundUsed: boolean = false): FindResult {
    const notFound: FindResult = { found: false, forbidden: false, compoundUsed };
    return notFound;
}

/**
 * Assembles an expected find result from its three fields.
 *
 * @param found - the matched word, or `false` when nothing matched.
 * @param forbidden - value for the `forbidden` field (default `false`).
 * @param compoundUsed - value for the `compoundUsed` field (default `false`).
 */
function frFound(found: string | false, forbidden: boolean = false, compoundUsed: boolean = false): FindResult {
    const result: FindResult = {
        found: found,
        forbidden: forbidden,
        compoundUsed: compoundUsed,
    };
    return result;
}

/**
 * Same as `frFound`, except that `compoundUsed` defaults to `true`.
 *
 * @param found - the matched word, or `false` when nothing matched.
 * @param forbidden - value for the `forbidden` field (default `false`).
 * @param compoundUsed - value for the `compoundUsed` field (default `true`).
 */
function frCompoundFound(found: string | false, forbidden: boolean = false, compoundUsed: boolean = true): FindResult {
    const expected = frFound(found, forbidden, compoundUsed);
    return expected;
}

/**
 * Reads a trie file and imports it.
 *
 * @param filename - path of the trie file, read via `readTextFile`.
 * @returns the value returned by `importTrie` for the file's lines.
 */
async function readTrie(filename: string): Promise<TrieNode> {
    return importTrie(await readTextFile(filename));
}

/**
 * Reads a text file and returns its lines.
 *
 * Files whose name ends in `.gz` are gunzipped first. The content is decoded
 * as UTF-8 and split on `\n` or `\r\n`.
 *
 * @param filename - path of the file to read.
 * @returns a promise for the file's lines.
 */
async function readTextFile(filename: string): Promise<string[]> {
    const raw = await fs.readFile(filename);
    const isGzipped = (/\.gz$/).test(filename);
    const buffer = isGzipped ? zlib.gunzipSync(raw) : raw;
    const content = buffer.toString('UTF-8');
    return content.split(/\r?\n/g);
}

0 comments on commit cfe37c2

Please sign in to comment.