ditch xregexp

retorquere · Apr 15, 2024 · 86d5e8d · 86d5e8d
1 parent 2f8ab12
commit 86d5e8d
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 19 deletions.
diff --git a/sentence-case.ts b/sentence-case.ts
@@ -1,7 +1,6 @@
 /* eslint-disable @typescript-eslint/unbound-method */
 
-import XRegExp from 'xregexp'
-import { tokenize, Token } from './tokenizer'
+import { tokenize, Token, connectedInnerWord } from './tokenizer'
 import { merge } from './merge'
 
 // eslint-disable-next-line no-magic-numbers
@@ -11,17 +10,14 @@ function titleCase(s: string): string {
   return s.replace(/^(.)(.+)/, (match, car, cdr) => `${car}${cdr.toLowerCase()}`)
 }
 
-// const connectedWord = XRegExp('(^|-)\\p{Lu}\\p{Ll}*(?=-|$)', 'g')
-const connectedInnerWord = XRegExp('-\\p{Lu}\\p{Ll}*(?=-|$)', 'g')
-
 function wordSC(token: Token, allCaps: boolean, subSentence: boolean, hyphenated: boolean): string {
   if (token.type === 'domain') return token.text.toLowerCase()
   if (token.type !== 'word') return token.text
   if (token.text.match(/^I'/)) return titleCase(token.text)
   if (subSentence && token.subSentenceStart && token.text.match(/^a$/i)) return 'a'
 
   if ((subSentence && token.subSentenceStart) || token.sentenceStart) {
-    return allCaps ? titleCase(token.text) : XRegExp.replace(token.text, connectedInnerWord, match => match.toLowerCase())
+    return allCaps ? titleCase(token.text) : token.text.replace(connectedInnerWord, match => match.toLowerCase())
   }
 
   if (token.subtype === 'preposition') return token.text.toLowerCase()

diff --git a/tokenizer.ts b/tokenizer.ts
@@ -1,6 +1,4 @@
 import moo from 'moo'
-import XRegExp from 'xregexp'
-import { ReplacementDetail } from 'xregexp'
 
 // eslint-disable-next-line no-magic-numbers
 // const show = (obj: any): string => JSON.stringify(obj, null, 2).replace(/[\u007F-\uFFFF]/g, chr => `\\u${(`0000${chr.charCodeAt(0).toString(16)}`).substr(-4)}`)
@@ -14,18 +12,23 @@ type CharCategory =  {
 
 const charCategories: CharCategory[] = require('xregexp/tools/output/categories')
 
-function re(cats: CharCategory[], extra?: string, neg=false): string {
+function char(cats: CharCategory[], extra?: string, neg=false): string { // regex character-class source: joins each category's `bmp` string, appends `extra` verbatim, and negates the class with `^` when `neg` is set
   return `[${neg ? '^' : ''}${cats.map(cat => cat.bmp).join('')}${extra || ''}]`
 }
-const L: string = re(charCategories.filter(cat => cat.name === 'L'))
-const LNM: string = re(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060')
+
+const ciwLu: string = char(charCategories.filter(cat => cat.name === 'Lu')) // character class for category Lu
+const ciwLl: string = char(charCategories.filter(cat => cat.name === 'Ll')) // character class for category Ll
+export const connectedInnerWord = new RegExp(`-${ciwLu}${ciwLl}*(?=-|$)`, 'g') // a hyphen, one Lu character, then any number of Ll characters, immediately followed by another hyphen or the end of the input
+
+const L: string = char(charCategories.filter(cat => cat.name === 'L')) // character class for category L
+const LNM: string = char(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060') // every category whose name starts with L, N or M, plus U+00AD and U+2060
 const W = `${LNM}*?${L}${LNM}*`
-const B = `(?=(?:${re(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060').replace(/^./, '[^')}|$))`
+const B = `(?=(?:${char(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060').replace(/^./, '[^')}|$))` // lookahead: the leading `[` is rewritten to `[^`, so this requires a character outside the LNM class, or the end of the input
 
 const Word = new RegExp(`${W}${B}`)
-const P = new RegExp(re(charCategories.filter(cat => cat.name.match(/^P/))))
+const P = new RegExp(char(charCategories.filter(cat => cat.name.match(/^P/)))) // one character from any category whose name starts with P
 
-const Lu: string = re(charCategories.filter(cat => cat.name === 'Lu'), '\u2060')
+const Lu: string = char(charCategories.filter(cat => cat.name === 'Lu'), '\u2060') // category Lu plus U+2060
 const Acronym = new RegExp(`(?:(?:(?:${Lu}[.]){2,}${B})|(?:(?:vs?[.])(?=[ \t\n\r\u00A0])))`)
 
 const Contraction = new RegExp(`${W}['’]${W}${B}`)
@@ -64,14 +67,20 @@ const lexer = moo.compile({
   other:                  { match: /[\s\S]/, lineBreaks: true },
 })
 
-const shape: ReplacementDetail[] = [
-  [ XRegExp('\\p{Lu}'), 'X', 'all' ],
-  [ new RegExp(re(charCategories.filter(cat => cat.name.match(/^L[^Cu]/))), 'g'), 'x' ],
-  [ XRegExp('\\p{N}'), 'd', 'all' ],
+const shaper: [RegExp, string][] = [ // ordered [pattern, replacement] pairs that shape() applies one after another
+  [ new RegExp(char(charCategories.filter(cat => cat.name === 'Lu')), 'g'), 'X' ], // every category-Lu character becomes 'X'
+  [ new RegExp(char(charCategories.filter(cat => cat.name.match(/^L[^Cu]/))), 'g'), 'x' ], // categories named L + a second character that is neither C nor u become 'x'
+  [ new RegExp(char(charCategories.filter(cat => cat.name[0] === 'N')), 'g'), 'd' ], // every category whose name starts with N becomes 'd'; runs after the 'x' rule, so the inserted 'd' is not rewritten
   [ /’/g, "'" ],
   [ /–/g, '-' ],
   [ /[\u2060\u00AD]/g, '' ],
 ]
+function shape(s: string) { // returns `s` after applying every [pattern, replacement] pair from `shaper`, in array order
+  for (const [ re, repl ] of shaper) {
+    s = s.replace(re, repl) // each pass works on the previous pass's output, so the order of `shaper` is significant
+  }
+  return s
+}
 
 export type Token = {
   type: string
@@ -116,7 +125,7 @@ export function tokenize(title: string, markup?: RegExp): Token[] {
       text: token.text,
       start: token.offset,
       end: token.offset + token.text.length - 1,
-      shape: XRegExp.replaceEach(<string>token.text, shape),
+      shape: shape(<string>token.text),
       sentenceStart: type === 'word' && sentenceStart,
       subSentenceStart: type === 'word' && subSentenceStart,
     })