Support limiting the suffix depth

streetsidesoftware · Nov 16, 2019 · d2e28b3
1 parent 619ef62
commit d2e28b3
Show file tree

Hide file tree

Showing 5 changed files with 59 additions and 32 deletions.
diff --git a/packages/cspell-tools/src/app.ts b/packages/cspell-tools/src/app.ts
@@ -27,12 +27,15 @@ program
     .description('compile words lists into simple dictionary files.')
     .option('-o, --output <path>', 'Specify the output directory, otherwise files are written back to the same location.')
     .option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.')
+    .option('-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.')
     .option('-s, --no-split', 'Treat each line as a dictionary entry, do not split')
     .option('--no-sort', 'Do not sort the result')
-    .action(async (src: string[], options: { output?: string, compress: boolean, split: boolean, sort: boolean, case: boolean }) => {
+    .action(async (src: string[], options: { output?: string, compress: boolean, split: boolean, sort: boolean, case: boolean, max_depth?: string }) => {
+        const { max_depth } = options;
+        const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined;
         return processAction(src, '.txt', options, async (src, dst) => {
             console.log('Process "%s" to "%s"', src, dst);
-            await compileWordList(src, dst, { splitWords: options.split, sort: options.sort }).then(() => src);
+            await compileWordList(src, dst, { splitWords: options.split, sort: options.sort, maxDepth }).then(() => src);
             console.log('Done "%s" to "%s"', src, dst);
             return src;
         });
@@ -42,11 +45,14 @@ program
     .command('compile-trie <src...>')
     .description('Compile words lists or Hunspell dictionary into trie files used by cspell.')
     .option('-o, --output <path>', 'Specify the output directory, otherwise files are written back to the same location.')
+    .option('-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.')
     .option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.')
-    .action((src: string[], options: { output?: string, compress: boolean }) => {
+    .action((src: string[], options: { output?: string, compress: boolean, max_depth?: string }) => {
+        const { max_depth } = options;
+        const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth, 10) : undefined; // radix 10: the limit is always read as decimal
         return processAction(src, '.trie', options, async (src, dst) => {
             console.log('Process "%s" to "%s"', src, dst);
-            return compileTrie(src, dst).then(() => src);
+            return compileTrie(src, dst, { maxDepth } ).then(() => src);
         });
     });
 
@@ -84,4 +90,3 @@ program.parse(process.argv);
 if (!process.argv.slice(2).length) {
     program.help();
 }
-
diff --git a/packages/cspell-tools/src/compiler/iterateWordsFromFile.test.ts b/packages/cspell-tools/src/compiler/iterateWordsFromFile.test.ts
@@ -4,7 +4,7 @@ import * as path from 'path';
 
 describe('Validate the iterateWordsFromFile', () => {
     test('streamWordsFromFile: hunspell', async () => {
-        const reader = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.aff'));
+        const reader = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.aff'), {});
         const results = [...reader];
         // this might break if the processing order of hunspell changes.
         expect(results.join(' ')).to.equal('hello try tried work rework reworked worked');

diff --git a/packages/cspell-tools/src/compiler/iterateWordsFromFile.ts b/packages/cspell-tools/src/compiler/iterateWordsFromFile.ts
@@ -4,11 +4,16 @@ import * as fs from 'fs-extra';
 
 const regHunspellFile = /\.(dic|aff)$/i;
 
-export async function readHunspellFiles(filename: string): Promise<Sequence<string>> {
+export interface HunspellOptions {
+    maxDepth?: number; // when defined, replaces the Hunspell reader's own maxDepth
+}
+
+export async function readHunspellFiles(filename: string, options: HunspellOptions): Promise<Sequence<string>> {
     const dicFile = filename.replace(regHunspellFile, '.dic');
     const affFile = filename.replace(regHunspellFile, '.aff');
 
     const reader = await HR.IterableHunspellReader.createFromFiles(affFile, dicFile);
+    reader.maxDepth = options.maxDepth !== undefined ? options.maxDepth : reader.maxDepth; // keep the reader's current maxDepth unless the caller supplied one
 
     return genSequence(reader);
 }
@@ -18,6 +23,6 @@ async function iterateFile(filename: string): Promise<Sequence<string>> {
     return genSequence(content.split('\n'));
 }
 
-export function streamWordsFromFile(filename: string): Promise<Sequence<string>> {
-    return regHunspellFile.test(filename) ? readHunspellFiles(filename) : iterateFile(filename);
+export function streamWordsFromFile(filename: string, options: HunspellOptions): Promise<Sequence<string>> {
+    return regHunspellFile.test(filename) ? readHunspellFiles(filename, options) : iterateFile(filename); // options are only passed on the .dic/.aff branch
 }
diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.test.ts b/packages/cspell-tools/src/compiler/wordListCompiler.test.ts
@@ -104,6 +104,26 @@ describe('Validate the wordListCompiler', () => {
         const words = [...Trie.iteratorTrieWords(node)].sort();
         expect(words).to.be.deep.equal(expected);
     });
+
+    test('test a simple hunspell dictionary depth 0', async () => {
+        // With maxDepth 0 the expected output lists only hello, try and work.
+        const pkgRoot = path.join(__dirname, '..', '..');
+        const dicFile = path.join(pkgRoot, 'Samples', 'hunspell', 'example.dic');
+        const outFile = path.join(pkgRoot, 'temp', 'example0.txt');
+        await compileWordList(dicFile, outFile, { splitWords: false, sort: true, maxDepth: 0 });
+        const written = await fsp.readFile(outFile, 'utf8');
+        expect(written).to.be.equal('hello\ntry\nwork\n');
+    });
+
+    test('test a simple hunspell dictionary depth 1', async () => {
+        const sourceName = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic');
+        // Write to a file of its own so this test never reads or overwrites the depth 0 output.
+        const destName = path.join(__dirname, '..', '..', 'temp', 'example-depth1.txt');
+        await compileWordList(sourceName, destName, { splitWords: false, sort: true, maxDepth: 1 });
+        const output = await fsp.readFile(destName, 'utf8');
+        // The last expected entry is '' because the split leaves an empty string after the final newline.
+        expect(output.split('\n')).to.be.deep.equal(['hello', 'rework', 'tried', 'try', 'work', 'worked', '']);
+    });
 });
 
 function distinct(): (word: string) => boolean {

diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.ts b/packages/cspell-tools/src/compiler/wordListCompiler.ts
@@ -6,7 +6,7 @@ import * as path from 'path';
 import { mkdirp } from 'fs-extra';
 import * as Trie from 'cspell-trie-lib';
 import * as HR from 'hunspell-reader';
-import { streamWordsFromFile } from './iterateWordsFromFile';
+import { streamWordsFromFile, HunspellOptions } from './iterateWordsFromFile';
 import { writeSeqToFile } from './fileWriter';
 import { uniqueFilter } from 'hunspell-reader/dist/util';
 
@@ -43,21 +43,19 @@ function splitCamelCase(word: string): Sequence<string> | string[] {
     return splitWords;
 }
 
-interface CompileWordListOptions {
+interface CompileWordListOptions extends HunspellOptions {
     splitWords: boolean;
     sort: boolean;
 }
 
 export async function compileWordList(filename: string, destFilename: string, options: CompileWordListOptions): Promise<void> {
-    const getWords = () => regHunspellFile.test(filename) ? readHunspellFiles(filename) : lib.asyncIterableToArray(lib.lineReaderAsync(filename));
-
+    const pWords = streamWordsFromFile(filename, options);
     const destDir = path.dirname(destFilename);
 
     const pDir = mkdirp(destDir);
 
     const compile = options.splitWords ? compileWordListWithSplitSeq : compileSimpleWordListSeq;
-
-    const words = genSequence(await getWords());
+    const words = await pWords;
     const seq = compile(words)
         .filter(a => !!a)
         .filter(uniqueFilter(10000));
@@ -71,12 +69,22 @@ export async function compileWordList(filename: string, destFilename: string, op
 
 
 
-export function compileWordListWithSplit(filename: string, destFilename: string): Promise<void> {
-    return compileWordList(filename, destFilename, { splitWords: true, sort: true });
+export function compileWordListWithSplit(
+    filename: string,
+    destFilename: string,
+    options: CompileWordListOptions = { splitWords: true, sort: true }
+): Promise<void> {
+    const { sort = true } = options; // sort falls back to true when it is undefined on options
+    return compileWordList(filename, destFilename, { ...options, splitWords: true, sort }); // splitWords is forced to true whatever the caller passed
 }
 
-export async function compileSimpleWordList(filename: string, destFilename: string, _options: CompileWordListOptions): Promise<void> {
-    return compileWordList(filename, destFilename, { splitWords: false, sort: true });
+export async function compileSimpleWordList(
+    filename: string,
+    destFilename: string,
+    options: CompileWordListOptions = { splitWords: false, sort: true }
+): Promise<void> {
+    const { sort = true } = options; // sort falls back to true when it is undefined on options
+    return compileWordList(filename, destFilename, { ...options, splitWords: false, sort }); // splitWords is forced to false whatever the caller passed
 }
 
 function sort(words: Iterable<string>): Iterable<string> {
@@ -106,18 +114,7 @@ export async function compileWordListToTrieFile(words: Sequence<string>, destFil
     return writeSeqToFile(Trie.serializeTrie(root, { base: 32, comment: 'Built by cspell-tools.' }), destFilename);
 }
 
-const regHunspellFile = /\.(dic|aff)$/i;
-
-async function readHunspellFiles(filename: string): Promise<Sequence<string>> {
-    const dicFile = filename.replace(regHunspellFile, '.dic');
-    const affFile = filename.replace(regHunspellFile, '.aff');
-
-    const reader = await HR.IterableHunspellReader.createFromFiles(affFile, dicFile);
-
-    return genSequence(reader.iterateWords());
-}
-
-export async function compileTrie(filename: string, destFilename: string): Promise<void> {
-    const words = await streamWordsFromFile(filename);
+export async function compileTrie(filename: string, destFilename: string, options?: HunspellOptions): Promise<void> {
+    const words = await streamWordsFromFile(filename, options || {}); // an omitted options argument becomes an empty object
     return compileWordListToTrieFile(words, destFilename);
 }