From ef13f2f0e29bf3855e1b32e9394dbf9f9c0d2611 Mon Sep 17 00:00:00 2001
From: Emiliano Heyns <emiliano.heyns@iris-advies.com>
Date: Sat, 2 Mar 2024 14:08:03 +0100
Subject: [PATCH] prep chunker for dual use

---
 chunker.ts                             | 118 ++++++++++++++-----------
 docs/bibtex-parser.chunker.md          |   1 -
 docs/bibtex-parser.chunker.parse.md    |   4 +-
 docs/bibtex-parser.chunker.promises.md |   2 +-
 index.ts                               |   6 +-
 snap.sh                                |   2 +-
 test/all.js                            |   2 +-
 7 files changed, 73 insertions(+), 62 deletions(-)

diff --git a/chunker.ts b/chunker.ts
index 94e908548..880c22e44 100644
--- a/chunker.ts
+++ b/chunker.ts
@@ -33,6 +33,12 @@ const letter = new RegExp('[' + [
   /\u00AA\u00BA\u01BB\u01C0-\u01C3\u0294\u05D0-\u05EA\u05F0-\u05F2\u0620-\u063F\u0641-\u064A\u066E-\u066F\u0671-\u06D3\u06D5\u06EE-\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u0800-\u0815\u0840-\u0858\u08A0-\u08B4\u0904-\u0939\u093D\u0950\u0958-\u0961\u0972-\u0980\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC-\u09DD\u09DF-\u09E1\u09F0-\u09F1\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0-\u0AE1\u0AF9\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32-\u0B33\u0B35-\u0B39\u0B3D\u0B5C-\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C39\u0C3D\u0C58-\u0C5A\u0C60-\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0-\u0CE1\u0CF1-\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D5F-\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32-\u0E33\u0E40-\u0E45\u0E81-\u0E82\u0E84\u0E87-\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA-\u0EAB\u0EAD-\u0EB0\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065-\u1066\u106E-\u1070\u1075-\u1081\u108E\u10D0-\u10FA\u10FD-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u16F1-\u16F8\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17DC\u1820-\u1842\u1844-\u1877\u1880-\u18A8\u18AA\u18B0-\u18F5\u
1900-\u191E\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19B0-\u19C9\u1A00-\u1A16\u1A20-\u1A54\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE-\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C77\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5-\u1CF6\u2135-\u2138\u2D30-\u2D67\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u3006\u303C\u3041-\u3096\u309F\u30A1-\u30FA\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FD5\uA000-\uA014\uA016-\uA48C\uA4D0-\uA4F7\uA500-\uA60B\uA610-\uA61F\uA62A-\uA62B\uA66E\uA6A0-\uA6E5\uA78F\uA7F7\uA7FB-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA8FD\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9E0-\uA9E4\uA9E7-\uA9EF\uA9FA-\uA9FE\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA6F\uAA71-\uAA76\uAA7A\uAA7E-\uAAAF\uAAB1\uAAB5-\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADC\uAAE0-\uAAEA\uAAF2\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uABC0-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF66-\uFF6F\uFF71-\uFF9D\uFFA0-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC/.source,
 ].join('') + ']')
 
+type Entry = {
+  type: string
+  key?: string
+  fields: Record<string, string>
+}
+
 export interface Chunk {
   /**
    * The text content of the chunk
@@ -77,45 +83,71 @@ export interface ParserOptions {
   max_entries?: number
 }
 
-type Entry = {
-  type: string
-  key?: string
-  fields: Record<string, string>
-}
-
-class BibtexParser {
+class BibTeXParser {
   public parsing: string
   public chunks: Chunk[]
+
+  public strings: Record<string, string>
   public entries: Entry[]
 
   private pos: number
   // private linebreaks: { pos: number, line: number}[]
   private input: string
 
-  private strings: Record<string, string>
   private max_entries: number
 
-  reset(input) {
+  constructor(input: string, options: ParserOptions = {}) {
+    this.max_entries = options.max_entries || 0
     this.input = input
     this.pos = 0
     this.entries = []
-    this.strings = {}
+    this.strings = {
+      jan: '01',
+      feb: '02',
+      mar: '03',
+      apr: '04',
+      may: '05',
+      jun: '06',
+      jul: '07',
+      aug: '08',
+      sep: '09',
+      oct: '10',
+      nov: '11',
+      dec: '12',
+      acmcs: 'ACM Computing Surveys',
+      acta: 'Acta Informatica',
+      cacm: 'Communications of the ACM',
+      ibmjrd: 'IBM Journal of Research and Development',
+      ibmsj: 'IBM Systems Journal',
+      ieeese: 'IEEE Transactions on Software Engineering',
+      ieeetc: 'IEEE Transactions on Computers',
+      ieeetcad: 'IEEE Transactions on Computer-Aided Design of Integrated Circuits',
+      ipl: 'Information Processing Letters',
+      jacm: 'Journal of the ACM',
+      jcss: 'Journal of Computer and System Sciences',
+      scp: 'Science of Computer Programming',
+      sicomp: 'SIAM Journal on Computing',
+      tocs: 'ACM Transactions on Computer Systems',
+      tods: 'ACM Transactions on Database Systems',
+      tog: 'ACM Transactions on Graphics',
+      toms: 'ACM Transactions on Mathematical Software',
+      toois: 'ACM Transactions on Office Information Systems',
+      toplas: 'ACM Transactions on Programming Languages and Systems',
+      tcs: 'Theoretical Computer Science',
+    }
     this.parsing = null
     this.chunks = []
     // this.linebreaks = undefined
   }
 
-  public parse(input, options: ParserOptions = {}): Chunk[] {
-    this.reset(input)
-    this.max_entries = options.max_entries || 0
+  public parse() {
     this.bibtex()
-    return this.chunks
+    this.entries.reverse()
   }
-  public async parseAsync(input, options: ParserOptions = {}): Promise<Chunk[]> {
-    this.reset(input)
-    this.max_entries = options.max_entries || 0
+
+  public async parseAsync() {
     await this.bibtexAsync()
-    return this.chunks
+    this.entries.reverse()
   }
 
   private isWhitespace(s, horizontalOnly = false) {
@@ -242,17 +274,14 @@ class BibtexParser {
     return values.join('')
   }
 
-  private key(allowUnicode = false) {
+  private key() {
     const start = this.pos
     while (true) { // eslint-disable-line no-constant-condition
       if (this.pos === this.input.length) {
         throw new ParseError('Runaway key', this)
       }
 
-      if (this.input[this.pos].match(/['a-zA-Z0-9&;_:\\./-]/)) {
-        this.pos++
-      }
-      else if (allowUnicode && this.input[this.pos].match(letter)) {
+      if (this.input[this.pos].match(/['a-zA-Z0-9&;_:\\./-]/) || this.input[this.pos].match(letter)) {
         this.pos++
       }
       else {
@@ -279,8 +308,8 @@ class BibtexParser {
     }
   }
 
-  private entry(_d) {
-    this.entries[0].key = this.key(true)
+  private entry(d) {
+    this.entries.unshift({ type: d, key: this.key(), fields: {} })
     this.match(',')
     this.key_equals_value()
     while (this.tryMatch(',')) {
@@ -310,23 +339,14 @@ class BibtexParser {
     while (this.isWhitespace(this.input[this.pos], true)) this.pos++
 
     if (this.input[this.pos] === '{') {
-      this.value_braces()
-      return
+      return this.value_braces()
     }
 
+    const start = this.pos
     while (this.input[this.pos] !== '\n' && this.pos < this.input.length) this.pos++
+    return this.input.substring(start, this.pos)
   }
 
-  /*
-  private progress() {
-    const progress = Math.round((this.pos / this.input.length * 100) / 5) * 5 // eslint-disable-line no-magic-numbers
-    if (this._progress !== progress) {
-      this._progress = progress
-      process.stdout.write(` (${this._progress}%) `)
-    }
-  }
-  */
-
   private hasMore() {
     if (this.max_entries && this.entries.length >= this.max_entries) return false
     return (this.pos < this.input.length)
@@ -355,8 +375,6 @@ class BibtexParser {
   }
 
   private parseNext() {
-    // this.progress()
-
     const chunk: Chunk = {
       offset: {
         pos: this.pos,
@@ -394,7 +412,6 @@ class BibtexParser {
 
         default:
           guard = this.matchGuard()
-          this.entries.unshift({ type: d, fields: {} })
           this.entry(d)
           this.match(guard)
           chunk.entry = true
@@ -444,21 +461,16 @@ class BibtexParser {
  *
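- * @returns array of chunks, with markers for type and errors (if any) found.
+ * @returns the parser instance; its `chunks` property holds the array of chunks, with markers for type and errors (if any) found.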
  */
-export function parse(input: string, options: ParserOptions = {}): Chunk[] {
-  return (new BibtexParser).parse(input, options)
-}
-
-export function entries(input: string, options: ParserOptions = {}): { entries: Entry[], errors: Chunk[] } {
-  const parser = new BibtexParser
-  parser.parse(input, options)
-  return {
-    entries: parser.entries.reverse(),
-    errors: parser.chunks.filter(chunk => chunk.error),
-  }
+export function parse(input: string, options: ParserOptions = {}): BibTeXParser {
+  const parser = new BibTeXParser(input, options)
+  parser.parse()
+  return parser
 }
 
 export const promises = {
-  async parse(input: string, options: ParserOptions = {}): Promise<Chunk[]> { // eslint-disable-line prefer-arrow/prefer-arrow-functions
-    return await (new BibtexParser).parseAsync(input, options)
+  async parse(input: string, options: ParserOptions = {}): Promise<BibTeXParser> { // eslint-disable-line prefer-arrow/prefer-arrow-functions
+    const parser = new BibTeXParser(input, options)
+    await parser.parseAsync()
+    return parser
   },
 }
diff --git a/docs/bibtex-parser.chunker.md b/docs/bibtex-parser.chunker.md
index e9016bbfb..58fc22979 100644
--- a/docs/bibtex-parser.chunker.md
+++ b/docs/bibtex-parser.chunker.md
@@ -8,7 +8,6 @@
 
 |  Function | Description |
 |  --- | --- |
-|  [entries(input, options)](./bibtex-parser.chunker.entries.md) |  |
 |  [parse(input, options)](./bibtex-parser.chunker.parse.md) | Reads the bibtex input and splits it into separate chunks of <code>string</code>s, <code>@comment</code>s, and bibtex entries. Useful for detecting if a file is bibtex file and for filtering out basic errors that would make the more sophisticated \[\[bibtex.parse\]\] reject the whole file |
 
 ## Interfaces
diff --git a/docs/bibtex-parser.chunker.parse.md b/docs/bibtex-parser.chunker.parse.md
index f114909dc..2397c7370 100644
--- a/docs/bibtex-parser.chunker.parse.md
+++ b/docs/bibtex-parser.chunker.parse.md
@@ -9,7 +9,7 @@ Reads the bibtex input and splits it into separate chunks of `string`<!-- -->s,
 **Signature:**
 
 ```typescript
-export declare function parse(input: string, options?: ParserOptions): Chunk[];
+export declare function parse(input: string, options?: ParserOptions): BibTeXParser;
 ```
 
 ## Parameters
@@ -21,7 +21,7 @@ export declare function parse(input: string, options?: ParserOptions): Chunk[];
 
 **Returns:**
 
-[Chunk](./bibtex-parser.chunker.chunk.md)<!-- -->\[\]
+BibTeXParser
 
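-array of chunks, with markers for type and errors (if any) found.
+the parser instance; its `chunks` property holds the array of chunks, with markers for type and errors (if any) found.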
 
diff --git a/docs/bibtex-parser.chunker.promises.md b/docs/bibtex-parser.chunker.promises.md
index 162939550..f6044c6a2 100644
--- a/docs/bibtex-parser.chunker.promises.md
+++ b/docs/bibtex-parser.chunker.promises.md
@@ -8,6 +8,6 @@
 
 ```typescript
 promises: {
-    parse(input: string, options?: ParserOptions): Promise<Chunk[]>;
+    parse(input: string, options?: ParserOptions): Promise<BibTeXParser>;
 }
 ```
diff --git a/index.ts b/index.ts
index d93f92013..08ae79e10 100644
--- a/index.ts
+++ b/index.ts
@@ -622,7 +622,7 @@ class Parser {
 
   public ast(input, clean = true): Node[] {
     let parsed: Node[] = []
-    for (const chunk of chunker.parse(input)) {
+    for (const chunk of chunker.parse(input).chunks) {
       const { children } = bibtex.parse(chunk.text, {...this.options, combining: combining.macros})
       if (clean) this.clean(children)
       parsed = parsed.concat(children)
@@ -631,7 +631,7 @@ class Parser {
   }
 
   public parse(input): Bibliography {
-    for (const chunk of chunker.parse(input)) {
+    for (const chunk of chunker.parse(input).chunks) {
       this.parseChunk(chunk)
     }
     return this.parsed()
@@ -639,7 +639,7 @@ class Parser {
 
   public async parseAsync(input): Promise<Bibliography> {
     // eslint-disable-next-line @typescript-eslint/await-thenable
-    for (const chunk of await chunker.promises.parse(input)) {
+    for (const chunk of (await chunker.promises.parse(input)).chunks) {
       this.parseChunk(chunk)
     }
     return this.parsed()
diff --git a/snap.sh b/snap.sh
index c18e8c246..e130d3703 100755
--- a/snap.sh
+++ b/snap.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-TAP_SNAPSHOT=0 npm_config_test=apa-test npm_config_all=true npm test
+TAP_SNAPSHOT=0 npm_config_test=664 npm_config_all=false npm test
diff --git a/test/all.js b/test/all.js
index 30bf2600a..f2b59e66f 100755
--- a/test/all.js
+++ b/test/all.js
@@ -108,7 +108,7 @@ for (let [option, value] of Object.entries(process.env)) {
   config[option] = value
 }
 
-if (config.snapshot || config.all) { // reset to all for snapshots
+if (config.snapshot || (config.all === 'true')) { // reset to all for snapshots
   for (const [key, value] of Object.entries(valid)) {
     if (!process.env[prefix + key]) config[key] = value
   }