Skip to content

Commit

Permalink
ditch xregexp
Browse files Browse the repository at this point in the history
  • Loading branch information
retorquere committed Apr 15, 2024
1 parent 2f8ab12 commit 86d5e8d
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 19 deletions.
8 changes: 2 additions & 6 deletions sentence-case.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
/* eslint-disable @typescript-eslint/unbound-method */

import XRegExp from 'xregexp'
import { tokenize, Token } from './tokenizer'
import { tokenize, Token, connectedInnerWord } from './tokenizer'
import { merge } from './merge'

// eslint-disable-next-line no-magic-numbers
Expand All @@ -11,17 +10,14 @@ function titleCase(s: string): string {
return s.replace(/^(.)(.+)/, (match, car, cdr) => `${car}${cdr.toLowerCase()}`)
}

// const connectedWord = XRegExp('(^|-)\\p{Lu}\\p{Ll}*(?=-|$)', 'g')
const connectedInnerWord = XRegExp('-\\p{Lu}\\p{Ll}*(?=-|$)', 'g')

function wordSC(token: Token, allCaps: boolean, subSentence: boolean, hyphenated: boolean): string {
if (token.type === 'domain') return token.text.toLowerCase()
if (token.type !== 'word') return token.text
if (token.text.match(/^I'/)) return titleCase(token.text)
if (subSentence && token.subSentenceStart && token.text.match(/^a$/i)) return 'a'

if ((subSentence && token.subSentenceStart) || token.sentenceStart) {
return allCaps ? titleCase(token.text) : XRegExp.replace(token.text, connectedInnerWord, match => match.toLowerCase())
return allCaps ? titleCase(token.text) : token.text.replace(connectedInnerWord, match => match.toLowerCase())
}

if (token.subtype === 'preposition') return token.text.toLowerCase()
Expand Down
35 changes: 22 additions & 13 deletions tokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import moo from 'moo'
import XRegExp from 'xregexp'
import { ReplacementDetail } from 'xregexp'

// eslint-disable-next-line no-magic-numbers
// const show = (obj: any): string => JSON.stringify(obj, null, 2).replace(/[\u007F-\uFFFF]/g, chr => `\\u${(`0000${chr.charCodeAt(0).toString(16)}`).substr(-4)}`)
Expand All @@ -14,18 +12,23 @@ type CharCategory = {

const charCategories: CharCategory[] = require('xregexp/tools/output/categories')

function re(cats: CharCategory[], extra?: string, neg=false): string {
/**
 * Builds the source text of a regular-expression bracket expression from a
 * list of character categories.
 *
 * @param cats categories whose `bmp` strings are concatenated, in order, inside the brackets
 *             (presumably BMP-only range text, so astral code points would not be covered — confirm)
 * @param extra optional text appended verbatim after the category ranges
 * @param neg when true the class is negated by a leading `^`
 * @returns the bracket expression as a string, ready to be embedded in a larger pattern
 */
function char(cats: CharCategory[], extra?: string, neg=false): string {
  const negation = neg ? '^' : ''
  const ranges = cats.map(cat => cat.bmp).join('')
  const suffix = extra || ''
  return '[' + negation + ranges + suffix + ']'
}
const L: string = re(charCategories.filter(cat => cat.name === 'L'))
const LNM: string = re(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060')

// Character classes for the 'Lu' and 'Ll' categories (presumably the Unicode
// general categories "uppercase letter" and "lowercase letter").
const ciwLu: string = char(charCategories.filter(cat => cat.name === 'Lu'))
const ciwLl: string = char(charCategories.filter(cat => cat.name === 'Ll'))
// Matches a hyphen followed by one 'Lu' character and any number of 'Ll'
// characters, but only when the next thing is another hyphen or the end of
// the input. The 'g' flag makes String.prototype.replace rewrite every such
// hyphen-connected inner word rather than just the first.
export const connectedInnerWord = new RegExp(`-${ciwLu}${ciwLl}*(?=-|$)`, 'g')

// Character class for the 'L' category.
const L: string = char(charCategories.filter(cat => cat.name === 'L'))
// Character class for every category whose name starts with L, N or M, plus
// U+00AD (soft hyphen) and U+2060 (word joiner).
const LNM: string = char(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060')
// Pattern source for a word: a run of LNM characters that contains at least one
// L character (lazy LNM prefix, one L, then a greedy LNM tail).
const W = `${LNM}*?${L}${LNM}*`
const B = `(?=(?:${re(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060').replace(/^./, '[^')}|$))`
// Boundary lookahead: the next character is outside the LNM class, or the input ends.
// The class is negated by swapping its leading '[' for '[^' via replace(/^./, '[^').
const B = `(?=(?:${char(charCategories.filter(cat => cat.name.match(/^[LNM]/)), '\u00AD\u2060').replace(/^./, '[^')}|$))`

// A word (W) that is immediately followed by a boundary (B).
const Word = new RegExp(`${W}${B}`)
const P = new RegExp(re(charCategories.filter(cat => cat.name.match(/^P/))))
// Matches a single character from any category whose name starts with 'P'.
const P = new RegExp(char(charCategories.filter(cat => cat.name.match(/^P/))))

const Lu: string = re(charCategories.filter(cat => cat.name === 'Lu'), '\u2060')
// Character class for the 'Lu' category, extended with U+2060 (word joiner).
const Lu: string = char(charCategories.filter(cat => cat.name === 'Lu'), '\u2060')
// Two alternatives:
//  - two or more "<Lu character>." pairs in a row, followed by a boundary (B)
//  - "v." or "vs." followed by a space, tab, newline, carriage return or U+00A0
const Acronym = new RegExp(`(?:(?:(?:${Lu}[.]){2,}${B})|(?:(?:vs?[.])(?=[ \t\n\r\u00A0])))`)

// Two words (W) joined by a straight or typographic apostrophe, followed by a boundary (B).
const Contraction = new RegExp(`${W}['’]${W}${B}`)
Expand Down Expand Up @@ -64,14 +67,20 @@ const lexer = moo.compile({
other: { match: /[\s\S]/, lineBreaks: true },
})

const shape: ReplacementDetail[] = [
[ XRegExp('\\p{Lu}'), 'X', 'all' ],
[ new RegExp(re(charCategories.filter(cat => cat.name.match(/^L[^Cu]/))), 'g'), 'x' ],
[ XRegExp('\\p{N}'), 'd', 'all' ],
// Ordered [pattern, replacement] pairs; every pattern is global so all
// occurrences are rewritten:
//  - characters in the 'Lu' category become 'X'
//  - characters in categories whose name matches /^L[^Cu]/ become 'x'
//  - characters in categories whose name starts with 'N' become 'd'
//  - two single-character normalisations to "'" and '-'
//  - U+2060 and U+00AD are removed
const shaper: [RegExp, string][] = [
[ new RegExp(char(charCategories.filter(cat => cat.name === 'Lu')), 'g'), 'X' ],
[ new RegExp(char(charCategories.filter(cat => cat.name.match(/^L[^Cu]/))), 'g'), 'x' ],
[ new RegExp(char(charCategories.filter(cat => cat.name[0] === 'N')), 'g'), 'd' ],
// NOTE(review): the next two patterns show up empty in this view; an empty `//g` is not a
// regex literal (it starts a line comment). Confirm the intended source characters
// (presumably an apostrophe variant and a hyphen/dash variant) are present in the file.
[ //g, "'" ],
[ //g, '-' ],
[ /[\u2060\u00AD]/g, '' ],
]
/**
 * Runs `s` through every [pattern, replacement] pair of `shaper`, in array
 * order, feeding the result of each replacement into the next one.
 *
 * @param s the text to transform
 * @returns the text after all replacements have been applied
 */
function shape(s: string) {
  return shaper.reduce(
    (current, [ pattern, replacement ]) => current.replace(pattern, replacement),
    s
  )
}

export type Token = {
type: string
Expand Down Expand Up @@ -116,7 +125,7 @@ export function tokenize(title: string, markup?: RegExp): Token[] {
text: token.text,
start: token.offset,
end: token.offset + token.text.length - 1,
shape: XRegExp.replaceEach(<string>token.text, shape),
shape: shape(<string>token.text),
sentenceStart: type === 'word' && sentenceStart,
subSentenceStart: type === 'word' && subSentenceStart,
})
Expand Down

0 comments on commit 86d5e8d

Please sign in to comment.