diff --git a/packages/cspell-tools/src/__snapshots__/app.test.ts.snap b/packages/cspell-tools/src/__snapshots__/app.test.ts.snap new file mode 100644 index 00000000000..22ff60e7afe --- /dev/null +++ b/packages/cspell-tools/src/__snapshots__/app.test.ts.snap @@ -0,0 +1,30 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`Validate the application test app compile merge 1`] = ` +"amsterdam +angeles +city +delhi +francisco +hello +london +los +los angeles +mexico +mexico city +new +new amsterdam +new delhi +new york +paris +rework +reworked +san +san francisco +tried +try +work +worked +york +" +`; diff --git a/packages/cspell-tools/src/app.test.ts b/packages/cspell-tools/src/app.test.ts index eb849b05a34..a011116ba48 100644 --- a/packages/cspell-tools/src/app.test.ts +++ b/packages/cspell-tools/src/app.test.ts @@ -1,6 +1,7 @@ import * as app from './app'; import * as Commander from 'commander'; import * as path from 'path'; +import * as fs from 'fs-extra'; const projectRoot = path.join(__dirname, '..'); const pathSamples = path.join(projectRoot, 'Samples'); @@ -16,6 +17,24 @@ function getCommander() { describe('Validate the application', () => { test('test app compile-trie', async () => { + const commander = getCommander(); + const log = jest.spyOn(console, 'log').mockImplementation(); + const args = argv('compile-trie', '-n', path.join(pathSamples, 'cities.txt')); + await expect(app.run(commander, args)).resolves.toBeUndefined(); + expect(log).toHaveBeenCalled(); + log.mockRestore(); + }); + + test('test app compile-trie compress', async () => { + const commander = getCommander(); + const log = jest.spyOn(console, 'log').mockImplementation(); + const args = argv('compile-trie', path.join(pathSamples, 'cities.txt')); + await expect(app.run(commander, args)).resolves.toBeUndefined(); + expect(log).toHaveBeenCalled(); + log.mockRestore(); + }); + + test('test app compile-trie -o', async () => { const commander = getCommander(); const log = jest.spyOn(console, 'log').mockImplementation(); const args = argv('compile-trie', '-n', path.join(pathSamples, 'cities.txt'), '-o', pathTemp); @@ -33,6 +52,31 @@ describe('Validate the application', () => { log.mockRestore(); }); + test('test app compile with compression', async () => { + const commander = getCommander(); + const log = jest.spyOn(console, 'log').mockImplementation(); + const args = argv('compile', path.join(pathSamples, 'cities.txt'), '-o', pathTemp); + await expect(app.run(commander, args)).resolves.toBeUndefined(); + expect(log).toHaveBeenCalled(); + log.mockRestore(); + }); + + test('test app compile merge', async () => { + const commander = getCommander(); + const targetDir = pathTemp; + const target = 'merge.txt'; + const cities = path.join(pathSamples, 'cities.txt'); + const exampleHunspell = path.join(pathSamples, 'hunspell', 'example.dic'); + const log = jest.spyOn(console, 'log').mockImplementation(); + const args = argv('compile', '-n', '-M', target, cities, exampleHunspell, '-o', targetDir); + await expect(app.run(commander, args)).resolves.toBeUndefined(); + const words = await fs.readFile(path.join(targetDir, target), 'UTF-8'); + expect(words).toMatchSnapshot(); + expect(log).toHaveBeenCalled(); + + log.mockRestore(); + }); + test('test app no args', () => { const commander = getCommander(); const mock = jest.fn(); diff --git a/packages/cspell-tools/src/app.ts b/packages/cspell-tools/src/app.ts index 6f0e3b1d23d..4418a79f06e 100644 --- a/packages/cspell-tools/src/app.ts +++ b/packages/cspell-tools/src/app.ts @@ -8,7 +8,8 @@ import { compileWordList, compileTrie } from './compiler'; import * as path from 'path'; import * as program from 'commander'; import * as glob from 'glob'; -import { genSequence } from 'gensequence'; +import { genSequence, Sequence } from 'gensequence'; +import { streamWordsFromFile } from './compiler/iterateWordsFromFile'; const npmPackage = require(path.join(__dirname, '..', 'package.json')); function globP(pattern: string): Promise { @@ -19,6 +20,22 @@ function globP(pattern: string): Promise { }); } +interface CompileCommonOptions { + output?: string; + compress: boolean; + case: boolean; + max_depth?: string; + merge: string; +} + +interface CompileOptions extends CompileCommonOptions { + split: boolean; + sort: boolean; +} + +interface CompileTrieOptions extends CompileCommonOptions { +} + export function run( program: program.Command, argv: string[] @@ -35,16 +52,12 @@ export function run( .option('-o, --output ', 'Specify the output directory, otherwise files are written back to the same location.') .option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.') .option('-m, --max_depth ', 'Maximum depth to apply suffix rules.') + .option('-M, --merge ', 'Merge all files into a single target file (extensions are applied)') .option('-s, --no-split', 'Treat each line as a dictionary entry, do not split') .option('--no-sort', 'Do not sort the result') - .action((src: string[], options: { output?: string, compress: boolean, split: boolean, sort: boolean, case: boolean, max_depth?: string }) => { - const { max_depth } = options; - const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined; + .action((src: string[], options: CompileOptions) => { const result = processAction(src, '.txt', options, async (src, dst) => { - console.log('Process "%s" to "%s"', src, dst); - await compileWordList(src, dst, { splitWords: options.split, sort: options.sort, maxDepth }).then(() => src); - console.log('Done "%s" to "%s"', src, dst); - return src; + return compileWordList(src, dst, { splitWords: options.split, sort: options.sort }).then(() => src); }); resolve(result); }); @@ -54,13 +67,11 @@ export function run( .description('Compile words lists or Hunspell dictionary into trie files used by cspell.') .option('-o, --output ', 'Specify the output directory, otherwise files are written back to the same location.') .option('-m, --max_depth ', 'Maximum depth to apply suffix rules.') + .option('-M, --merge ', 'Merge all files into a single target file (extensions are applied)') .option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.') - .action((src: string[], options: { output?: string, compress: boolean, max_depth?: string }) => { - const { max_depth } = options; - const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined; - const result = processAction(src, '.trie', options, async (src, dst) => { - console.log('Process "%s" to "%s"', src, dst); - return compileTrie(src, dst, { maxDepth }).then(() => src); + .action((src: string[], options: CompileTrieOptions) => { + const result = processAction(src, '.trie', options, async (words: Sequence, dst) => { + return compileTrie(words, dst); }); resolve(result); }); @@ -78,12 +89,16 @@ export function run( }); } +interface FileToProcess { + src: string; + words: Sequence; +} async function processAction( src: string[], fileExt: '.txt' | '.trie', - options: { output?: string, compress: boolean }, - action: (src: string, dst: string) => Promise) + options: CompileCommonOptions, + action: (words: Sequence, dst: string) => Promise) : Promise { console.log('Compile:\n output: %s\n compress: %s\n files:\n %s \n\n', options.output || 'default', @@ -91,21 +106,80 @@ async function processAction( src.join('\n ')); const ext = fileExt + (options.compress ? '.gz' : ''); + const { max_depth } = options; + const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined; + const readerOptions = { maxDepth }; const globResults = await Promise.all(src.map(s => globP(s))); - const toProcess = genSequence(globResults) + const filesToProcess = genSequence(globResults) .concatMap(files => files) - .map(s => { - const outFilename = path.basename(s).replace(/(\.txt|\.dic|\.aff)?$/, ext); - const dir = options.output ? options.output : path.dirname(s); - return [s, path.join(dir, outFilename)] as [string, string]; - }) - .map(([src, dst]) => action(src, dst)); + .map(async s => { + const words = await streamWordsFromFile(s, readerOptions); + const f: FileToProcess = { + src: s, + words, + }; + return f; + }); + + const r = options.merge + ? processFiles(action, filesToProcess, toMergeTargetFile(options.merge, options.output, ext)) + : processFilesIndividually(action, filesToProcess, s => toTargetFile(s, options.output, ext)); + await r; + console.log(`Complete.`); +} + +function toFilename(name: string, ext: string) { + return path.basename(name).replace(/((\.txt|\.dic|\.aff)(\.gz)?)?$/, '') + ext; +} + +function toTargetFile(filename: string, destination: string | undefined, ext: string) { + const outFileName = toFilename(filename, ext); + const dir = destination ?? path.dirname(filename); + return path.join(dir, outFileName); +} + +function toMergeTargetFile(filename: string, destination: string | undefined, ext: string) { + const outFileName = toFilename(filename, ext); + return path.resolve(destination ?? './', outFileName); +} + +async function processFilesIndividually( + action: (words: Sequence, dst: string) => Promise, + filesToProcess: Sequence>, + srcToTarget: (src: string) => string, +) { + const toProcess = filesToProcess + .map(async pFtp => { + const { src, words } = await pFtp; + const dst = srcToTarget(src); + console.log('Process "%s" to "%s"', src, dst); + await action(words, dst); + console.log('Done "%s" to "%s"', src, dst); + }); for (const p of toProcess) { await p; } - console.log(`Complete.`); +} + +async function processFiles( + action: (words: Sequence, dst: string) => Promise, + filesToProcess: Sequence>, + mergeTarget: string, +) { + const toProcess = await Promise.all([...filesToProcess]); + const dst = mergeTarget; + + const words = genSequence(toProcess) + .map(ftp => { + const { src } = ftp; + console.log('Process "%s" to "%s"', src, dst); + return ftp; + }) + .concatMap( ftp => ftp.words ); + await action(words, dst); + console.log('Done "%s"', dst); } if (require.main === module) { diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.test.ts b/packages/cspell-tools/src/compiler/wordListCompiler.test.ts index 422c1150ab3..b03a7513282 100644 --- a/packages/cspell-tools/src/compiler/wordListCompiler.test.ts +++ b/packages/cspell-tools/src/compiler/wordListCompiler.test.ts @@ -8,6 +8,7 @@ import * as Trie from 'cspell-trie-lib'; import * as path from 'path'; import { genSequence } from 'gensequence'; import { readFile } from 'cspell-io'; +import { streamWordsFromFile } from './iterateWordsFromFile'; const UTF8: BufferEncoding = 'utf8'; @@ -51,24 +52,20 @@ describe('Validate the wordListCompiler', () => { ]); }); - test('test reading and normalizing a file', () => { - const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt'); + test('test reading and normalizing a file', async () => { + const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {}); const destName = path.join(__dirname, '..', '..', 'temp', 'cities.txt'); - return compileWordList(sourceName, destName, { splitWords: true, sort: true }) - .then(() => fsp.readFile(destName, 'utf8')) - .then(output => { - expect(output).to.be.equal(citiesResultSorted); - }); + await compileWordList(sourceName, destName, { splitWords: true, sort: true }); + const output = await fsp.readFile(destName, 'utf8'); + expect(output).to.be.equal(citiesResultSorted); }); - test('test compiling to a file without split', () => { - const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt'); + test('test compiling to a file without split', async () => { + const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {}); const destName = path.join(__dirname, '..', '..', 'temp', 'cities2.txt'); - return compileWordList(sourceName, destName, { splitWords: false, sort: true }) - .then(() => fsp.readFile(destName, 'utf8')) - .then(output => { - expect(output).to.be.equal(citiesSorted.toLowerCase()); - }); + await compileWordList(sourceName, destName, { splitWords: false, sort: true }) + const output = await fsp.readFile(destName, 'utf8'); + expect(output).to.be.equal(citiesSorted.toLowerCase()); }); test('tests normalized to a trie', () => { @@ -79,22 +76,19 @@ describe('Validate the wordListCompiler', () => { expect(tWords.sort()).to.be.deep.equal([...(new Set(nWords.sort()))]); }); - test('test reading and normalizing to a trie file', () => { - const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt'); + test('test reading and normalizing to a trie file', async () => { + const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {}); const destName = path.join(__dirname, '..', '..', 'temp', 'cities.trie'); - return compileTrie(sourceName, destName) - .then(() => fsp.readFile(destName, UTF8)) - .then(output => output.split('\n')) - .then(srcWords => { - const node = Trie.importTrie(srcWords); - const expected = citiesResult.split('\n').filter(a => !!a).sort(); - const words = [...Trie.iteratorTrieWords(node)].sort(); - expect(words).to.be.deep.equal(expected); - }); + await compileTrie(sourceName, destName); + const srcWords = (await fsp.readFile(destName, 'utf8')).split('\n'); + const node = Trie.importTrie(srcWords); + const expected = citiesResult.split('\n').filter(a => !!a).sort(); + const words = [...Trie.iteratorTrieWords(node)].sort(); + expect(words).to.be.deep.equal(expected); }); test('test reading and normalizing to a trie gz file', async () => { - const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt'); + const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {}); const destName = path.join(__dirname, '..', '..', 'temp', 'cities.trie.gz'); await compileTrie(sourceName, destName); const resultFile = await readFile(destName, UTF8); @@ -106,23 +100,19 @@ describe('Validate the wordListCompiler', () => { }); test('test a simple hunspell dictionary depth 0', async () => { - const sourceName = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic'); + const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic'), { maxDepth: 0}); const destName = path.join(__dirname, '..', '..', 'temp', 'example0.txt'); - return compileWordList(sourceName, destName, { splitWords: false, sort: true, maxDepth: 0 }) - .then(() => fsp.readFile(destName, 'utf8')) - .then(output => { - expect(output).to.be.equal('hello\ntry\nwork\n'); - }); + await compileWordList(sourceName, destName, { splitWords: false, sort: true }); + const output = await fsp.readFile(destName, 'utf8'); + expect(output).to.be.equal('hello\ntry\nwork\n'); }); test('test a simple hunspell dictionary depth 1', async () => { - const sourceName = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic'); + const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic'), { maxDepth: 1}); const destName = path.join(__dirname, '..', '..', 'temp', 'example0.txt'); - return compileWordList(sourceName, destName, { splitWords: false, sort: true, maxDepth: 1 }) - .then(() => fsp.readFile(destName, 'utf8')) - .then(output => { - expect(output.split('\n')).to.be.deep.equal(['hello', 'rework', 'tried', 'try', 'work', 'worked', '']); - }); + await compileWordList(sourceName, destName, { splitWords: false, sort: true }); + const output = await fsp.readFile(destName, 'utf8'); + expect(output.split('\n')).to.be.deep.equal(['hello', 'rework', 'tried', 'try', 'work', 'worked', '']); }); }); diff --git a/packages/cspell-tools/src/compiler/wordListCompiler.ts b/packages/cspell-tools/src/compiler/wordListCompiler.ts index 48c069a67d5..03f27aad343 100644 --- a/packages/cspell-tools/src/compiler/wordListCompiler.ts +++ b/packages/cspell-tools/src/compiler/wordListCompiler.ts @@ -41,19 +41,17 @@ function splitCamelCase(word: string): Sequence | string[] { return splitWords; } -interface CompileWordListOptions extends ReaderOptions { +interface CompileWordListOptions { splitWords: boolean; sort: boolean; } -export async function compileWordList(filename: string, destFilename: string, options: CompileWordListOptions): Promise { - const pWords = streamWordsFromFile(filename, options); +export async function compileWordList(words: Sequence, destFilename: string, options: CompileWordListOptions): Promise { const destDir = path.dirname(destFilename); const pDir = mkdirp(destDir); const compile = options.splitWords ? compileWordListWithSplitSeq : compileSimpleWordListSeq; - const words = await pWords; const seq = compile(words) .filter(a => !!a) .filter(uniqueFilter(10000)); @@ -65,26 +63,6 @@ export async function compileWordList(filename: string, destFilename: string, op return writeSeqToFile(finalSeq.map(a => a + '\n'), destFilename); } - - -export function compileWordListWithSplit( - filename: string, - destFilename: string, - options: CompileWordListOptions = { splitWords: true, sort: true } -): Promise { - const { sort = true } = options; - return compileWordList(filename, destFilename, { ...options, splitWords: true, sort }); -} - -export async function compileSimpleWordList( - filename: string, - destFilename: string, - options: CompileWordListOptions = { splitWords: false, sort: true } -): Promise { - const { sort = true } = options; - return compileWordList(filename, destFilename, { ...options, splitWords: false, sort }); -} - function sort(words: Iterable): Iterable { return [...words].sort(); } @@ -103,7 +81,7 @@ export function normalizeWordsToTrie(words: Sequence): Trie.TrieNode { return result; } -export async function compileWordListToTrieFile(words: Sequence, destFilename: string): Promise { +export async function compileTrie(words: Sequence, destFilename: string): Promise { const destDir = path.dirname(destFilename); const pDir = mkdirp(destDir); const pRoot = normalizeWordsToTrie(words); @@ -111,8 +89,3 @@ export async function compileWordListToTrieFile(words: Sequence, destFil return writeSeqToFile(Trie.serializeTrie(root, { base: 32, comment: 'Built by cspell-tools.' }), destFilename); } - -export async function compileTrie(filename: string, destFilename: string, options?: ReaderOptions): Promise { - const words = await streamWordsFromFile(filename, options || {}); - return compileWordListToTrieFile(words, destFilename); -}