Skip to content

Commit

Permalink
Support limiting the suffix depth
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason3S committed Nov 16, 2019
1 parent 619ef62 commit d2e28b3
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 32 deletions.
15 changes: 10 additions & 5 deletions packages/cspell-tools/src/app.ts
Expand Up @@ -27,12 +27,15 @@ program
.description('compile words lists into simple dictionary files.')
.option('-o, --output <path>', 'Specify the output directory, otherwise files are written back to the same location.')
.option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.')
.option('-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.')
.option('-s, --no-split', 'Treat each line as a dictionary entry, do not split')
.option('--no-sort', 'Do not sort the result')
.action(async (src: string[], options: { output?: string, compress: boolean, split: boolean, sort: boolean, case: boolean }) => {
.action(async (src: string[], options: { output?: string, compress: boolean, split: boolean, sort: boolean, case: boolean, max_depth?: string }) => {
const { max_depth } = options;
const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined;
return processAction(src, '.txt', options, async (src, dst) => {
console.log('Process "%s" to "%s"', src, dst);
await compileWordList(src, dst, { splitWords: options.split, sort: options.sort }).then(() => src);
await compileWordList(src, dst, { splitWords: options.split, sort: options.sort, maxDepth }).then(() => src);
console.log('Done "%s" to "%s"', src, dst);
return src;
});
Expand All @@ -42,11 +45,14 @@ program
.command('compile-trie <src...>')
.description('Compile words lists or Hunspell dictionary into trie files used by cspell.')
.option('-o, --output <path>', 'Specify the output directory, otherwise files are written back to the same location.')
.option('-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.')
.option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.')
.action((src: string[], options: { output?: string, compress: boolean }) => {
.action((src: string[], options: { output?: string, compress: boolean, max_depth?: string }) => {
const { max_depth } = options;
const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined;
return processAction(src, '.trie', options, async (src, dst) => {
console.log('Process "%s" to "%s"', src, dst);
return compileTrie(src, dst).then(() => src);
return compileTrie(src, dst, { maxDepth } ).then(() => src);
});
});

Expand Down Expand Up @@ -84,4 +90,3 @@ program.parse(process.argv);
if (!process.argv.slice(2).length) {
program.help();
}

Expand Up @@ -4,7 +4,7 @@ import * as path from 'path';

describe('Validate the iterateWordsFromFile', () => {
test('streamWordsFromFile: hunspell', async () => {
const reader = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.aff'));
const reader = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.aff'), {});
const results = [...reader];
// this might break if the processing order of hunspell changes.
expect(results.join(' ')).to.equal('hello try tried work rework reworked worked');
Expand Down
11 changes: 8 additions & 3 deletions packages/cspell-tools/src/compiler/iterateWordsFromFile.ts
Expand Up @@ -4,11 +4,16 @@ import * as fs from 'fs-extra';

const regHunspellFile = /\.(dic|aff)$/i;

export async function readHunspellFiles(filename: string): Promise<Sequence<string>> {
/**
 * Options that influence how a Hunspell dictionary is read.
 */
export interface HunspellOptions {
/**
 * When defined, this value is assigned to the Hunspell reader's `maxDepth`;
 * when undefined, the reader keeps whatever `maxDepth` it already has.
 * The command line describes it as the maximum depth to apply suffix rules.
 */
maxDepth?: number;
}

/**
 * Creates a Hunspell reader for the dictionary that `filename` belongs to and
 * exposes it as a sequence of words.
 *
 * @param filename - path ending in `.dic` or `.aff`; the matching sibling
 *   file name is derived by swapping the extension.
 * @param options - when `options.maxDepth` is defined it replaces the
 *   reader's `maxDepth`; otherwise the reader's current value is kept.
 * @returns a sequence wrapping the Hunspell reader.
 */
export async function readHunspellFiles(filename: string, options: HunspellOptions): Promise<Sequence<string>> {
    // Both files share a base name, so either extension can be given by the caller.
    const withExtension = (ext: string) => filename.replace(regHunspellFile, ext);
    const dictionaryPath = withExtension('.dic');
    const affixPath = withExtension('.aff');

    const hunspellReader = await HR.IterableHunspellReader.createFromFiles(affixPath, dictionaryPath);

    // The destructuring default only kicks in when maxDepth is undefined,
    // in which case the reader's own value is written back unchanged.
    const { maxDepth = hunspellReader.maxDepth } = options;
    hunspellReader.maxDepth = maxDepth;

    return genSequence(hunspellReader);
}
Expand All @@ -18,6 +23,6 @@ async function iterateFile(filename: string): Promise<Sequence<string>> {
return genSequence(content.split('\n'));
}

export function streamWordsFromFile(filename: string): Promise<Sequence<string>> {
return regHunspellFile.test(filename) ? readHunspellFiles(filename) : iterateFile(filename);
/**
 * Produces the words found in `filename`.
 *
 * Files whose name ends in `.dic` or `.aff` (case-insensitive) are read with
 * `readHunspellFiles`, which receives `options`; every other file is handed
 * to `iterateFile`, which does not use `options`.
 */
export function streamWordsFromFile(filename: string, options: HunspellOptions): Promise<Sequence<string>> {
    if (regHunspellFile.test(filename)) {
        return readHunspellFiles(filename, options);
    }
    return iterateFile(filename);
}
20 changes: 20 additions & 0 deletions packages/cspell-tools/src/compiler/wordListCompiler.test.ts
Expand Up @@ -104,6 +104,26 @@ describe('Validate the wordListCompiler', () => {
const words = [...Trie.iteratorTrieWords(node)].sort();
expect(words).to.be.deep.equal(expected);
});

test('test a simple hunspell dictionary depth 0', async () => {
    const dictionaryPath = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic');
    const outputPath = path.join(__dirname, '..', '..', 'temp', 'example0.txt');

    // Compile with maxDepth 0, then read the generated file back.
    await compileWordList(dictionaryPath, outputPath, { splitWords: false, sort: true, maxDepth: 0 });
    const written = await fsp.readFile(outputPath, 'utf8');

    // At depth 0 only these three words are expected in the output.
    expect(written).to.be.equal('hello\ntry\nwork\n');
});

test('test a simple hunspell dictionary depth 1', async () => {
    const dictionaryPath = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic');
    // Write to a file of its own: the depth 0 test already uses `example0.txt`,
    // and sharing it would overwrite that test's output artifact.
    const outputPath = path.join(__dirname, '..', '..', 'temp', 'example1.txt');

    // Compile with maxDepth 1, then read the generated file back.
    await compileWordList(dictionaryPath, outputPath, { splitWords: false, sort: true, maxDepth: 1 });
    const written = await fsp.readFile(outputPath, 'utf8');

    // The trailing '' comes from the newline that terminates the last word.
    expect(written.split('\n')).to.be.deep.equal(['hello', 'rework', 'tried', 'try', 'work', 'worked', '']);
});
});

function distinct(): (word: string) => boolean {
Expand Down
43 changes: 20 additions & 23 deletions packages/cspell-tools/src/compiler/wordListCompiler.ts
Expand Up @@ -6,7 +6,7 @@ import * as path from 'path';
import { mkdirp } from 'fs-extra';
import * as Trie from 'cspell-trie-lib';
import * as HR from 'hunspell-reader';
import { streamWordsFromFile } from './iterateWordsFromFile';
import { streamWordsFromFile, HunspellOptions } from './iterateWordsFromFile';
import { writeSeqToFile } from './fileWriter';
import { uniqueFilter } from 'hunspell-reader/dist/util';

Expand Down Expand Up @@ -43,21 +43,19 @@ function splitCamelCase(word: string): Sequence<string> | string[] {
return splitWords;
}

interface CompileWordListOptions {
/**
 * Options accepted by `compileWordList`. The Hunspell reader options
 * (`maxDepth`) are inherited from `HunspellOptions`.
 */
interface CompileWordListOptions extends HunspellOptions {
/** When true, `compileWordListWithSplitSeq` is used; otherwise `compileSimpleWordListSeq`. */
splitWords: boolean;
/**
 * Whether the result should be sorted; the CLI `--no-sort` flag feeds this value.
 * NOTE(review): the code that consumes this flag is outside the visible hunk — confirm.
 */
sort: boolean;
}

export async function compileWordList(filename: string, destFilename: string, options: CompileWordListOptions): Promise<void> {
const getWords = () => regHunspellFile.test(filename) ? readHunspellFiles(filename) : lib.asyncIterableToArray(lib.lineReaderAsync(filename));

const pWords = streamWordsFromFile(filename, options);
const destDir = path.dirname(destFilename);

const pDir = mkdirp(destDir);

const compile = options.splitWords ? compileWordListWithSplitSeq : compileSimpleWordListSeq;

const words = genSequence(await getWords());
const words = await pWords;
const seq = compile(words)
.filter(a => !!a)
.filter(uniqueFilter(10000));
Expand All @@ -71,12 +69,22 @@ export async function compileWordList(filename: string, destFilename: string, op



export function compileWordListWithSplit(filename: string, destFilename: string): Promise<void> {
return compileWordList(filename, destFilename, { splitWords: true, sort: true });
/**
 * Compiles `filename` into `destFilename` with word splitting always enabled.
 *
 * @param filename - source word list.
 * @param destFilename - file to write the compiled list to.
 * @param options - forwarded to `compileWordList`; `splitWords` is forced to
 *   `true` and `sort` falls back to `true` when it is undefined.
 */
export function compileWordListWithSplit(
    filename: string,
    destFilename: string,
    options: CompileWordListOptions = { splitWords: true, sort: true }
): Promise<void> {
    // Renamed locally so it does not shadow the module-level `sort` function.
    const { sort: sortResult = true } = options;
    const splitOptions: CompileWordListOptions = { ...options, splitWords: true, sort: sortResult };
    return compileWordList(filename, destFilename, splitOptions);
}

export async function compileSimpleWordList(filename: string, destFilename: string, _options: CompileWordListOptions): Promise<void> {
return compileWordList(filename, destFilename, { splitWords: false, sort: true });
/**
 * Compiles `filename` into `destFilename` with word splitting always disabled.
 *
 * @param filename - source word list.
 * @param destFilename - file to write the compiled list to.
 * @param options - forwarded to `compileWordList`; `splitWords` is forced to
 *   `false` and `sort` falls back to `true` when it is undefined.
 */
export async function compileSimpleWordList(
    filename: string,
    destFilename: string,
    options: CompileWordListOptions = { splitWords: false, sort: true }
): Promise<void> {
    // Renamed locally so it does not shadow the module-level `sort` function.
    const { sort: sortResult = true } = options;
    const simpleOptions: CompileWordListOptions = { ...options, splitWords: false, sort: sortResult };
    return compileWordList(filename, destFilename, simpleOptions);
}

function sort(words: Iterable<string>): Iterable<string> {
Expand Down Expand Up @@ -106,18 +114,7 @@ export async function compileWordListToTrieFile(words: Sequence<string>, destFil
return writeSeqToFile(Trie.serializeTrie(root, { base: 32, comment: 'Built by cspell-tools.' }), destFilename);
}

const regHunspellFile = /\.(dic|aff)$/i;

async function readHunspellFiles(filename: string): Promise<Sequence<string>> {
const dicFile = filename.replace(regHunspellFile, '.dic');
const affFile = filename.replace(regHunspellFile, '.aff');

const reader = await HR.IterableHunspellReader.createFromFiles(affFile, dicFile);

return genSequence(reader.iterateWords());
}

export async function compileTrie(filename: string, destFilename: string): Promise<void> {
const words = await streamWordsFromFile(filename);
/**
 * Reads the words from `filename` with `streamWordsFromFile` and writes them
 * to `destFilename` through `compileWordListToTrieFile`.
 *
 * @param filename - source word list or Hunspell dictionary.
 * @param destFilename - trie file to write.
 * @param options - optional Hunspell options; an empty object is used when
 *   none are supplied.
 */
export async function compileTrie(filename: string, destFilename: string, options?: HunspellOptions): Promise<void> {
    const hunspellOptions = options || {};
    const wordSeq = await streamWordsFromFile(filename, hunspellOptions);
    return compileWordListToTrieFile(wordSeq, destFilename);
}

0 comments on commit d2e28b3

Please sign in to comment.