[cspell-tools] add ability to merge files into a single target. (#140)
* [cspell-tools] removed unused functions.

* [cspell-tools] Refactor the app so we can merge files.

* Refactor for merge

* Add ability to merge files
Jason3S committed Dec 8, 2019
1 parent 26479aa commit f6d8ada
Showing 5 changed files with 203 additions and 92 deletions.
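
For orientation, here is a minimal sketch of driving the new merge option programmatically, mirroring the "compile merge" test added below. The bare Commander.Command() construction and the leading 'node', 'app' argv entries are assumptions (the repository's getCommander() and argv() test helpers are not shown in full here), and the sample paths are illustrative.

    import * as app from './app';
    import * as Commander from 'commander';

    // Merge two word sources into a single, uncompressed (-n) "merge.txt" under ./temp.
    const args = [
        'node', 'app',                      // assumed argv prefix (Commander's default convention)
        'compile', '-n',
        '-M', 'merge.txt',                  // merge target; the tool adjusts the extension as needed
        'Samples/cities.txt',
        'Samples/hunspell/example.dic',
        '-o', 'temp',
    ];

    app.run(new Commander.Command(), args)
        .then(() => console.log('merged word list written to temp/merge.txt'));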
30 changes: 30 additions & 0 deletions packages/cspell-tools/src/__snapshots__/app.test.ts.snap
@@ -0,0 +1,30 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP

exports[`Validate the application test app compile merge 1`] = `
"amsterdam
angeles
city
delhi
francisco
hello
london
los
los angeles
mexico
mexico city
new
new amsterdam
new delhi
new york
paris
rework
reworked
san
san francisco
tried
try
work
worked
york
"
`;
44 changes: 44 additions & 0 deletions packages/cspell-tools/src/app.test.ts
@@ -1,6 +1,7 @@
import * as app from './app';
import * as Commander from 'commander';
import * as path from 'path';
import * as fs from 'fs-extra';

const projectRoot = path.join(__dirname, '..');
const pathSamples = path.join(projectRoot, 'Samples');
@@ -16,6 +17,24 @@ function getCommander() {

describe('Validate the application', () => {
test('test app compile-trie', async () => {
const commander = getCommander();
const log = jest.spyOn(console, 'log').mockImplementation();
const args = argv('compile-trie', '-n', path.join(pathSamples, 'cities.txt'));
await expect(app.run(commander, args)).resolves.toBeUndefined();
expect(log).toHaveBeenCalled();
log.mockRestore();
});

test('test app compile-trie compress', async () => {
const commander = getCommander();
const log = jest.spyOn(console, 'log').mockImplementation();
const args = argv('compile-trie', path.join(pathSamples, 'cities.txt'));
await expect(app.run(commander, args)).resolves.toBeUndefined();
expect(log).toHaveBeenCalled();
log.mockRestore();
});

test('test app compile-trie -o', async () => {
const commander = getCommander();
const log = jest.spyOn(console, 'log').mockImplementation();
const args = argv('compile-trie', '-n', path.join(pathSamples, 'cities.txt'), '-o', pathTemp);
@@ -33,6 +52,31 @@ describe('Validate the application', () => {
log.mockRestore();
});

test('test app compile with compression', async () => {
const commander = getCommander();
const log = jest.spyOn(console, 'log').mockImplementation();
const args = argv('compile', path.join(pathSamples, 'cities.txt'), '-o', pathTemp);
await expect(app.run(commander, args)).resolves.toBeUndefined();
expect(log).toHaveBeenCalled();
log.mockRestore();
});

test('test app compile merge', async () => {
const commander = getCommander();
const targetDir = pathTemp;
const target = 'merge.txt';
const cities = path.join(pathSamples, 'cities.txt');
const exampleHunspell = path.join(pathSamples, 'hunspell', 'example.dic');
const log = jest.spyOn(console, 'log').mockImplementation();
const args = argv('compile', '-n', '-M', target, cities, exampleHunspell, '-o', targetDir);
await expect(app.run(commander, args)).resolves.toBeUndefined();
const words = await fs.readFile(path.join(targetDir, target), 'UTF-8');
expect(words).toMatchSnapshot();
expect(log).toHaveBeenCalled();

log.mockRestore();
});

test('test app no args', () => {
const commander = getCommander();
const mock = jest.fn();
122 changes: 98 additions & 24 deletions packages/cspell-tools/src/app.ts
@@ -8,7 +8,8 @@ import { compileWordList, compileTrie } from './compiler';
import * as path from 'path';
import * as program from 'commander';
import * as glob from 'glob';
import { genSequence } from 'gensequence';
import { genSequence, Sequence } from 'gensequence';
import { streamWordsFromFile } from './compiler/iterateWordsFromFile';
const npmPackage = require(path.join(__dirname, '..', 'package.json'));

function globP(pattern: string): Promise<string[]> {
@@ -19,6 +20,22 @@ function globP(pattern: string): Promise<string[]> {
});
}

interface CompileCommonOptions {
output?: string;
compress: boolean;
case: boolean;
max_depth?: string;
merge: string;
}

interface CompileOptions extends CompileCommonOptions {
split: boolean;
sort: boolean;
}

interface CompileTrieOptions extends CompileCommonOptions {
}
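
For reference, the fields of these option interfaces correspond to the Commander flags registered below; this mapping is a reading aid and not part of the commit (the flag backing the case field is defined in a portion of the file not shown here).

    // output    <- -o, --output <path>
    // compress  <- -n, --no-compress    (defaults to true; -n turns compression off)
    // max_depth <- -m, --max_depth <limit>
    // merge     <- -M, --merge <target>
    // split     <- -s, --no-split       (defaults to true)
    // sort      <- --no-sort            (defaults to true)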

export function run(
program: program.Command,
argv: string[]
@@ -35,16 +52,12 @@ export function run(
.option('-o, --output <path>', 'Specify the output directory, otherwise files are written back to the same location.')
.option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.')
.option('-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.')
.option('-M, --merge <target>', 'Merge all files into a single target file (extensions are applied)')
.option('-s, --no-split', 'Treat each line as a dictionary entry, do not split')
.option('--no-sort', 'Do not sort the result')
.action((src: string[], options: { output?: string, compress: boolean, split: boolean, sort: boolean, case: boolean, max_depth?: string }) => {
const { max_depth } = options;
const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined;
.action((src: string[], options: CompileOptions) => {
const result = processAction(src, '.txt', options, async (src, dst) => {
console.log('Process "%s" to "%s"', src, dst);
await compileWordList(src, dst, { splitWords: options.split, sort: options.sort, maxDepth }).then(() => src);
console.log('Done "%s" to "%s"', src, dst);
return src;
return compileWordList(src, dst, { splitWords: options.split, sort: options.sort }).then(() => src);
});
resolve(result);
});
@@ -54,13 +67,11 @@
.description('Compile words lists or Hunspell dictionary into trie files used by cspell.')
.option('-o, --output <path>', 'Specify the output directory, otherwise files are written back to the same location.')
.option('-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.')
.option('-M, --merge <target>', 'Merge all files into a single target file (extensions are applied)')
.option('-n, --no-compress', 'By default the files are Gzipped, this will turn that off.')
.action((src: string[], options: { output?: string, compress: boolean, max_depth?: string }) => {
const { max_depth } = options;
const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined;
const result = processAction(src, '.trie', options, async (src, dst) => {
console.log('Process "%s" to "%s"', src, dst);
return compileTrie(src, dst, { maxDepth }).then(() => src);
.action((src: string[], options: CompileTrieOptions) => {
const result = processAction(src, '.trie', options, async (words: Sequence<string>, dst) => {
return compileTrie(words, dst);
});
resolve(result);
});
@@ -78,34 +89,97 @@
});
}

interface FileToProcess {
src: string;
words: Sequence<string>;
}

async function processAction(
src: string[],
fileExt: '.txt' | '.trie',
options: { output?: string, compress: boolean },
action: (src: string, dst: string) => Promise<any>)
options: CompileCommonOptions,
action: (words: Sequence<string>, dst: string) => Promise<any>)
: Promise<void> {
console.log('Compile:\n output: %s\n compress: %s\n files:\n %s \n\n',
options.output || 'default',
options.compress ? 'true' : 'false',
src.join('\n '));

const ext = fileExt + (options.compress ? '.gz' : '');
const { max_depth } = options;
const maxDepth = max_depth !== undefined ? Number.parseInt(max_depth) : undefined;
const readerOptions = { maxDepth };

const globResults = await Promise.all(src.map(s => globP(s)));
const toProcess = genSequence(globResults)
const filesToProcess = genSequence(globResults)
.concatMap(files => files)
.map(s => {
const outFilename = path.basename(s).replace(/(\.txt|\.dic|\.aff)?$/, ext);
const dir = options.output ? options.output : path.dirname(s);
return [s, path.join(dir, outFilename)] as [string, string];
})
.map(([src, dst]) => action(src, dst));
.map(async s => {
const words = await streamWordsFromFile(s, readerOptions);
const f: FileToProcess = {
src: s,
words,
};
return f;
});

const r = options.merge
? processFiles(action, filesToProcess, toMergeTargetFile(options.merge, options.output, ext))
: processFilesIndividually(action, filesToProcess, s => toTargetFile(s, options.output, ext));
await r;
console.log(`Complete.`);
}

function toFilename(name: string, ext: string) {
return path.basename(name).replace(/((\.txt|\.dic|\.aff)(\.gz)?)?$/, '') + ext;
}

function toTargetFile(filename: string, destination: string | undefined, ext: string) {
const outFileName = toFilename(filename, ext);
const dir = destination ?? path.dirname(filename);
return path.join(dir, outFileName);
}

function toMergeTargetFile(filename: string, destination: string | undefined, ext: string) {
const outFileName = toFilename(filename, ext);
return path.resolve(destination ?? './', outFileName);
}
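
To make the "(extensions are applied)" wording of the --merge help text concrete, here are a few illustrative evaluations of the helpers above (not part of the commit; the values follow from the regular expression in toFilename):

    // toFilename('example.dic', '.trie.gz')  -> 'example.trie.gz'   (known source extension stripped, target extension appended)
    // toFilename('merge.txt', '.txt')        -> 'merge.txt'
    // toMergeTargetFile('merge.txt', 'temp', '.txt.gz')
    //     -> path.resolve('temp', 'merge.txt.gz')   (falls back to resolving against './' when no output dir is given)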

async function processFilesIndividually(
action: (words: Sequence<string>, dst: string) => Promise<any>,
filesToProcess: Sequence<Promise<FileToProcess>>,
srcToTarget: (src: string) => string,
) {
const toProcess = filesToProcess
.map(async pFtp => {
const { src, words } = await pFtp;
const dst = srcToTarget(src);
console.log('Process "%s" to "%s"', src, dst);
await action(words, dst);
console.log('Done "%s" to "%s"', src, dst);
});

for (const p of toProcess) {
await p;
}
console.log(`Complete.`);
}

async function processFiles(
action: (words: Sequence<string>, dst: string) => Promise<any>,
filesToProcess: Sequence<Promise<FileToProcess>>,
mergeTarget: string,
) {
const toProcess = await Promise.all([...filesToProcess]);
const dst = mergeTarget;

const words = genSequence(toProcess)
.map(ftp => {
const { src } = ftp;
console.log('Process "%s" to "%s"', src, dst);
return ftp;
})
.concatMap( ftp => ftp.words );
await action(words, dst);
console.log('Done "%s"', dst);
}

if (require.main === module) {
66 changes: 28 additions & 38 deletions packages/cspell-tools/src/compiler/wordListCompiler.test.ts
@@ -8,6 +8,7 @@ import * as Trie from 'cspell-trie-lib';
import * as path from 'path';
import { genSequence } from 'gensequence';
import { readFile } from 'cspell-io';
import { streamWordsFromFile } from './iterateWordsFromFile';

const UTF8: BufferEncoding = 'utf8';

@@ -51,24 +52,20 @@ describe('Validate the wordListCompiler', () => {
]);
});

test('test reading and normalizing a file', () => {
const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt');
test('test reading and normalizing a file', async () => {
const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {});
const destName = path.join(__dirname, '..', '..', 'temp', 'cities.txt');
return compileWordList(sourceName, destName, { splitWords: true, sort: true })
.then(() => fsp.readFile(destName, 'utf8'))
.then(output => {
expect(output).to.be.equal(citiesResultSorted);
});
await compileWordList(sourceName, destName, { splitWords: true, sort: true });
const output = await fsp.readFile(destName, 'utf8');
expect(output).to.be.equal(citiesResultSorted);
});

test('test compiling to a file without split', () => {
const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt');
test('test compiling to a file without split', async () => {
const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {});
const destName = path.join(__dirname, '..', '..', 'temp', 'cities2.txt');
return compileWordList(sourceName, destName, { splitWords: false, sort: true })
.then(() => fsp.readFile(destName, 'utf8'))
.then(output => {
expect(output).to.be.equal(citiesSorted.toLowerCase());
});
await compileWordList(sourceName, destName, { splitWords: false, sort: true })
const output = await fsp.readFile(destName, 'utf8');
expect(output).to.be.equal(citiesSorted.toLowerCase());
});

test('tests normalized to a trie', () => {
@@ -79,22 +76,19 @@
expect(tWords.sort()).to.be.deep.equal([...(new Set(nWords.sort()))]);
});

test('test reading and normalizing to a trie file', () => {
const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt');
test('test reading and normalizing to a trie file', async () => {
const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {});
const destName = path.join(__dirname, '..', '..', 'temp', 'cities.trie');
return compileTrie(sourceName, destName)
.then(() => fsp.readFile(destName, UTF8))
.then(output => output.split('\n'))
.then(srcWords => {
const node = Trie.importTrie(srcWords);
const expected = citiesResult.split('\n').filter(a => !!a).sort();
const words = [...Trie.iteratorTrieWords(node)].sort();
expect(words).to.be.deep.equal(expected);
});
await compileTrie(sourceName, destName);
const srcWords = (await fsp.readFile(destName, 'utf8')).split('\n');
const node = Trie.importTrie(srcWords);
const expected = citiesResult.split('\n').filter(a => !!a).sort();
const words = [...Trie.iteratorTrieWords(node)].sort();
expect(words).to.be.deep.equal(expected);
});

test('test reading and normalizing to a trie gz file', async () => {
const sourceName = path.join(__dirname, '..', '..', 'Samples', 'cities.txt');
const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'cities.txt'), {});
const destName = path.join(__dirname, '..', '..', 'temp', 'cities.trie.gz');
await compileTrie(sourceName, destName);
const resultFile = await readFile(destName, UTF8);
Expand All @@ -106,23 +100,19 @@ describe('Validate the wordListCompiler', () => {
});

test('test a simple hunspell dictionary depth 0', async () => {
const sourceName = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic');
const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic'), { maxDepth: 0});
const destName = path.join(__dirname, '..', '..', 'temp', 'example0.txt');
return compileWordList(sourceName, destName, { splitWords: false, sort: true, maxDepth: 0 })
.then(() => fsp.readFile(destName, 'utf8'))
.then(output => {
expect(output).to.be.equal('hello\ntry\nwork\n');
});
await compileWordList(sourceName, destName, { splitWords: false, sort: true });
const output = await fsp.readFile(destName, 'utf8');
expect(output).to.be.equal('hello\ntry\nwork\n');
});

test('test a simple hunspell dictionary depth 1', async () => {
const sourceName = path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic');
const sourceName = await streamWordsFromFile(path.join(__dirname, '..', '..', 'Samples', 'hunspell', 'example.dic'), { maxDepth: 1});
const destName = path.join(__dirname, '..', '..', 'temp', 'example0.txt');
return compileWordList(sourceName, destName, { splitWords: false, sort: true, maxDepth: 1 })
.then(() => fsp.readFile(destName, 'utf8'))
.then(output => {
expect(output.split('\n')).to.be.deep.equal(['hello', 'rework', 'tried', 'try', 'work', 'worked', '']);
});
await compileWordList(sourceName, destName, { splitWords: false, sort: true });
const output = await fsp.readFile(destName, 'utf8');
expect(output.split('\n')).to.be.deep.equal(['hello', 'rework', 'tried', 'try', 'work', 'worked', '']);
});
});
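
Taken together, the updated tests show the shape of the refactored compiler API: a source file is first turned into a word sequence by streamWordsFromFile (with Hunspell suffix expansion controlled by maxDepth), and that sequence is what compileWordList and compileTrie now consume. A minimal sketch, with illustrative paths and the import locations used in app.ts:

    import { streamWordsFromFile } from './compiler/iterateWordsFromFile';
    import { compileWordList, compileTrie } from './compiler';

    async function buildDictionaries() {
        // Hunspell source: maxDepth limits how far suffix rules are expanded.
        const hunspellWords = await streamWordsFromFile('Samples/hunspell/example.dic', { maxDepth: 1 });
        await compileWordList(hunspellWords, 'temp/example.txt', { splitWords: false, sort: true });

        // Plain word-list source compiled into a trie.
        const cityWords = await streamWordsFromFile('Samples/cities.txt', {});
        await compileTrie(cityWords, 'temp/cities.trie');
    }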
