From ef13f2f0e29bf3855e1b32e9394dbf9f9c0d2611 Mon Sep 17 00:00:00 2001 From: Emiliano Heyns Date: Sat, 2 Mar 2024 14:08:03 +0100 Subject: [PATCH] prep chunker for dual use --- chunker.ts | 118 ++++++++++++++----------- docs/bibtex-parser.chunker.md | 1 - docs/bibtex-parser.chunker.parse.md | 4 +- docs/bibtex-parser.chunker.promises.md | 2 +- index.ts | 6 +- snap.sh | 2 +- test/all.js | 2 +- 7 files changed, 73 insertions(+), 62 deletions(-) diff --git a/chunker.ts b/chunker.ts index 94e908548..880c22e44 100644 --- a/chunker.ts +++ b/chunker.ts @@ -33,6 +33,12 @@ const letter = new RegExp('[' + [ /\u00AA\u00BA\u01BB\u01C0-\u01C3\u0294\u05D0-\u05EA\u05F0-\u05F2\u0620-\u063F\u0641-\u064A\u066E-\u066F\u0671-\u06D3\u06D5\u06EE-\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u0800-\u0815\u0840-\u0858\u08A0-\u08B4\u0904-\u0939\u093D\u0950\u0958-\u0961\u0972-\u0980\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC-\u09DD\u09DF-\u09E1\u09F0-\u09F1\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0-\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32-\u0B33\u0B35-\u0B39\u0B3D\u0B5C-\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60-\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0-\u0CE1\u0CF1-\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32-\u0E33\u0E40-\u0E45\u0E81-\u0E82\u0E84\u0E87-\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA-\u0EAB\u0EAD-\u0EB0\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065-\u1066\u106E-\u1070\u1075-\u1081\u108E\u10D0-\u10FA\u10FD-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17DC\u1820-\u1842\u1844-\u1877\u1880-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE-\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C77\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5-\u1CF6\u2135-\u2138\u2D30-\u2D67\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u3006\u303C\u3041-\u3096\u309F\u30A1-\u30FA\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FD5\uA000-\uA014\uA016-\uA48C\uA4D0-\uA4F7\uA500-\uA60B\uA610-\uA61F\uA62A-\uA62B\uA66E\uA6A0-\uA6E5\uA78F\uA7F7\uA7FB-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9E0-\uA9E4\uA9E7-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA6F\uAA71-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5-\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADC\uAAE0-\uAAEA\uAAF2\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uABC0-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF66-\uFF6F\uFF71-\uFF9D\uFFA0-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC/.source, ].join('') + ']') +type Entry = { + type: string + key?: string + fields: Record +} + export interface Chunk { /** * The text content of the chunk @@ -77,45 +83,71 @@ export interface ParserOptions { max_entries?: number } -type Entry = { - type: string - key?: string - fields: Record -} - -class BibtexParser { +class BibTeXParser { public parsing: string public chunks: Chunk[] + + public strings: Record public entries: Entry[] private pos: number // private linebreaks: { pos: number, line: number}[] private input: string - private strings: Record private max_entries: number - reset(input) { + constructor(input: string, options: ParserOptions = {}) { + this.max_entries = options.max_entries || 0 this.input = input this.pos = 0 this.entries = [] - this.strings = {} + this.strings = { + jan: '01', + feb: '02', + mar: '03', + apr: '04', + may: '05', + jun: '06', + jul: '07', + aug: '08', + sep: '09', + oct: '10', + nov: '11', + dec: '12', + acmcs: 'ACM Computing Surveys', + acta: 'Acta Informatica', + cacm: 'Communications of the ACM', + ibmjrd: 'IBM Journal of Research and Development', + ibmsj: 'IBM Systems Journal', + ieeese: 'IEEE Transactions on Software Engineering', + ieeetc: 'IEEE Transactions on Computers', + ieeetcad: 'IEEE Transactions on Computer-Aided Design of Integrated Circuits', + ipl: 'Information Processing Letters', + jacm: 'Journal of the ACM', + jcss: 'Journal of Computer and System Sciences', + scp: 'Science of Computer Programming', + sicomp: 'SIAM Journal on Computing', + tocs: 'ACM Transactions on Computer Systems', + tods: 'ACM Transactions on Database Systems', + tog: 'ACM Transactions on Graphics', + toms: 'ACM Transactions on Mathematical Software', + toois: 'ACM Transactions on Office Information Systems', + toplas: 'ACM Transactions on Programming Languages and Systems', + tcs: 'Theoretical Computer Science', + } this.parsing = null this.chunks = [] // this.linebreaks = undefined } - public parse(input, options: ParserOptions = {}): Chunk[] { - this.reset(input) - this.max_entries = options.max_entries || 0 + public parse() { this.bibtex() - return this.chunks + this.entries.reverse() } - public async parseAsync(input, options: ParserOptions = {}): Promise { - this.reset(input) - this.max_entries = options.max_entries || 0 + + public async parseAsync() { await this.bibtexAsync() - return this.chunks + this.entries.reverse() } private isWhitespace(s, horizontalOnly = false) { @@ -242,17 +274,14 @@ class BibtexParser { return values.join('') } - private key(allowUnicode = false) { + private key() { const start = this.pos while (true) { // eslint-disable-line no-constant-condition if (this.pos === this.input.length) { throw new ParseError('Runaway key', this) } - if (this.input[this.pos].match(/['a-zA-Z0-9&;_:\\./-]/)) { - this.pos++ - } - else if (allowUnicode && this.input[this.pos].match(letter)) { + if (this.input[this.pos].match(/['a-zA-Z0-9&;_:\\./-]/) || this.input[this.pos].match(letter)) { this.pos++ } else { @@ -279,8 +308,8 @@ class BibtexParser { } } - private entry(_d) { - this.entries[0].key = this.key(true) + private entry(d) { + this.entries.unshift({ type: d, key: this.key(), fields: {} }) this.match(',') this.key_equals_value() while (this.tryMatch(',')) { @@ -310,23 +339,14 @@ class BibtexParser { while (this.isWhitespace(this.input[this.pos], true)) this.pos++ if (this.input[this.pos] === '{') { - this.value_braces() - return + return this.value_braces() } + const start = this.pos while (this.input[this.pos] !== '\n' && this.pos < this.input.length) this.pos++ + return this.input.substring(start, this.pos) } - /* - private progress() { - const progress = Math.round((this.pos / this.input.length * 100) / 5) * 5 // eslint-disable-line no-magic-numbers - if (this._progress !== progress) { - this._progress = progress - process.stdout.write(` (${this._progress}%) `) - } - } - */ - private hasMore() { if (this.max_entries && this.entries.length >= this.max_entries) return false return (this.pos < this.input.length) @@ -355,8 +375,6 @@ class BibtexParser { } private parseNext() { - // this.progress() - const chunk: Chunk = { offset: { pos: this.pos, @@ -394,7 +412,6 @@ class BibtexParser { default: guard = this.matchGuard() - this.entries.unshift({ type: d, fields: {} }) this.entry(d) this.match(guard) chunk.entry = true @@ -444,21 +461,16 @@ class BibtexParser { * * @returns array of chunks, with markers for type and errors (if any) found. */ -export function parse(input: string, options: ParserOptions = {}): Chunk[] { - return (new BibtexParser).parse(input, options) -} - -export function entries(input: string, options: ParserOptions = {}): { entries: Entry[], errors: Chunk[] } { - const parser = new BibtexParser - parser.parse(input, options) - return { - entries: parser.entries.reverse(), - errors: parser.chunks.filter(chunk => chunk.error), - } +export function parse(input: string, options: ParserOptions = {}): BibTeXParser { + const parser = new BibTeXParser(input, options) + parser.parse() + return parser } export const promises = { - async parse(input: string, options: ParserOptions = {}): Promise { // eslint-disable-line prefer-arrow/prefer-arrow-functions - return await (new BibtexParser).parseAsync(input, options) + async parse(input: string, options: ParserOptions = {}): Promise { // eslint-disable-line prefer-arrow/prefer-arrow-functions + const parser = new BibTeXParser(input, options) + await parser.parseAsync() + return parser }, } diff --git a/docs/bibtex-parser.chunker.md b/docs/bibtex-parser.chunker.md index e9016bbfb..58fc22979 100644 --- a/docs/bibtex-parser.chunker.md +++ b/docs/bibtex-parser.chunker.md @@ -8,7 +8,6 @@ | Function | Description | | --- | --- | -| [entries(input, options)](./bibtex-parser.chunker.entries.md) | | | [parse(input, options)](./bibtex-parser.chunker.parse.md) | Reads the bibtex input and splits it into separate chunks of strings, @comments, and bibtex entries. Useful for detecting if a file is bibtex file and for filtering out basic errors that would make the more sophisticated \[\[bibtex.parse\]\] reject the whole file | ## Interfaces diff --git a/docs/bibtex-parser.chunker.parse.md b/docs/bibtex-parser.chunker.parse.md index f114909dc..2397c7370 100644 --- a/docs/bibtex-parser.chunker.parse.md +++ b/docs/bibtex-parser.chunker.parse.md @@ -9,7 +9,7 @@ Reads the bibtex input and splits it into separate chunks of `string`s, **Signature:** ```typescript -export declare function parse(input: string, options?: ParserOptions): Chunk[]; +export declare function parse(input: string, options?: ParserOptions): BibTeXParser; ``` ## Parameters @@ -21,7 +21,7 @@ export declare function parse(input: string, options?: ParserOptions): Chunk[]; **Returns:** -[Chunk](./bibtex-parser.chunker.chunk.md)\[\] +BibTeXParser array of chunks, with markers for type and errors (if any) found. diff --git a/docs/bibtex-parser.chunker.promises.md b/docs/bibtex-parser.chunker.promises.md index 162939550..f6044c6a2 100644 --- a/docs/bibtex-parser.chunker.promises.md +++ b/docs/bibtex-parser.chunker.promises.md @@ -8,6 +8,6 @@ ```typescript promises: { - parse(input: string, options?: ParserOptions): Promise; + parse(input: string, options?: ParserOptions): Promise; } ``` diff --git a/index.ts b/index.ts index d93f92013..08ae79e10 100644 --- a/index.ts +++ b/index.ts @@ -622,7 +622,7 @@ class Parser { public ast(input, clean = true): Node[] { let parsed: Node[] = [] - for (const chunk of chunker.parse(input)) { + for (const chunk of chunker.parse(input).chunks) { const { children } = bibtex.parse(chunk.text, {...this.options, combining: combining.macros}) if (clean) this.clean(children) parsed = parsed.concat(children) @@ -631,7 +631,7 @@ class Parser { } public parse(input): Bibliography { - for (const chunk of chunker.parse(input)) { + for (const chunk of chunker.parse(input).chunks) { this.parseChunk(chunk) } return this.parsed() @@ -639,7 +639,7 @@ class Parser { public async parseAsync(input): Promise { // eslint-disable-next-line @typescript-eslint/await-thenable - for (const chunk of await chunker.promises.parse(input)) { + for (const chunk of (await chunker.promises.parse(input)).chunks) { this.parseChunk(chunk) } return this.parsed() diff --git a/snap.sh b/snap.sh index c18e8c246..e130d3703 100755 --- a/snap.sh +++ b/snap.sh @@ -1,2 +1,2 @@ #!/bin/bash -TAP_SNAPSHOT=0 npm_config_test=apa-test npm_config_all=true npm test +TAP_SNAPSHOT=0 npm_config_test=664 npm_config_all=false npm test diff --git a/test/all.js b/test/all.js index 30bf2600a..f2b59e66f 100755 --- a/test/all.js +++ b/test/all.js @@ -108,7 +108,7 @@ for (let [option, value] of Object.entries(process.env)) { config[option] = value } -if (config.snapshot || config.all) { // reset to all for snapshots +if (config.snapshot || (config.all === 'true')) { // reset to all for snapshots for (const [key, value] of Object.entries(valid)) { if (!process.env[prefix + key]) config[key] = value }