-
Notifications
You must be signed in to change notification settings - Fork 3
/
sentence-case.ts
138 lines (105 loc) · 4.79 KB
/
sentence-case.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* eslint-disable @typescript-eslint/unbound-method */
import { tokenize, Token } from './tokenizer'
import { merge } from './merge'
import * as rx from './re'
// Pattern fragment built from the `rx.categories` entries named 'Lu' or 'Lt'.
// NOTE(review): presumably these are the Unicode "uppercase letter" / "titlecase letter" categories and
// `rx.match` returns regex source text (the result is interpolated into a RegExp below) — confirm against ./re
const Lu: string = rx.match(rx.categories.filter(cat => cat.name === 'Lu' || cat.name === 'Lt'))
// Same construction, for the `rx.categories` entries named 'Ll' (presumably "lowercase letter").
const Ll: string = rx.match(rx.categories.filter(cat => cat.name === 'Ll'))
// Global pattern: a hyphen, then one `Lu`, then any number of `Ll`, but only when that run is directly
// followed by another hyphen or by the end of the string (presumably e.g. the `-Organizing` in `Self-Organizing`).
// `wordSC` lowercases every such match in a word that starts a (sub)sentence.
const connectedInnerWord = new RegExp(`-${Lu}${Ll}*(?=-|$)`, 'g')
// eslint-disable-next-line no-magic-numbers
// const show = (obj: any): string => JSON.stringify(obj, null, 2).replace(/[\u007F-\uFFFF]/g, chr => `\\u${(`0000${chr.charCodeAt(0).toString(16)}`).substr(-4)}`)
/**
 * Keeps the first character of `s` exactly as given and lowercases what follows it.
 *
 * The pattern needs at least two characters and `.` does not cross line terminators, so
 * empty and one-character strings come back unchanged, and only the first line of a
 * multi-line string is affected.
 *
 * @param s text to adjust
 * @returns `s` with everything after its first character (on the first line) lowercased
 */
function titleCase(s: string): string {
  const keepHeadLowerTail = (_whole: string, head: string, tail: string): string => head + tail.toLowerCase()
  return s.replace(/^(.)(.+)/, keepHeadLowerTail)
}
/**
 * Produces the sentence-cased text for one token.
 *
 * The checks run in a fixed order and the first one that applies decides the result.
 * NOTE(review): `token.shape` is presumably a per-character outline of the text
 * ('X' upper, 'x' lower, 'd' digit, '-' hyphen) — confirm against ./tokenizer.
 *
 * @param token the token to convert
 * @param allCaps whether the whole title was upper case
 * @param subSentence whether tokens flagged `subSentenceStart` are treated like sentence starts
 * @param hyphenated whether `token` is one part of a hyphenated compound
 * @returns the text to emit for this token
 */
function wordSC(token: Token, allCaps: boolean, subSentence: boolean, hyphenated: boolean): string {
  const { text, type } = token

  // domain names are case-insensitive, so they can always be lowercased
  if (type === 'domain') return text.toLowerCase()

  // every other non-word token is passed through untouched: punctuation needs no handling,
  // and websites can carry case-sensitive parts in the URL
  if (type !== 'word') return text

  // contractions of the pronoun "I" (I'll, I'm)
  if (/^I'/.test(text)) return titleCase(text)

  const startsSubSentence = subSentence && token.subSentenceStart

  // a lone "a"/"A" opening a sub-sentence is always written in lower case
  if (startsSubSentence && /^a$/i.test(text)) return 'a'

  // first word of a sentence or sub-sentence
  if (startsSubSentence || token.sentenceStart) {
    if (allCaps) return titleCase(text)
    return text.replace(connectedInnerWord, part => part.toLowerCase())
  }

  switch (token.subtype) {
    // prepositions should already have been lowercased by now; do it anyway
    case 'preposition':
      return text.toLowerCase()
    // acronyms are kept as written
    case 'acronym':
      return text
  }

  // if (!allCaps && token.shape.match(/^[Xxd]+(-[Xxd]+)+/)) return XRegExp.replace(token.text, connectedWord, match => match.toLowerCase())

  // a single capital B-Z is usually some kind of noun, except when it is part of a hyphenated word
  if (/^[B-Z]$/.test(text)) {
    if (hyphenated) return text.toLowerCase()
    return text
  }

  // ?? if (!allCaps && token.shape.match(/^[-X]+$/)) return token.text

  // reduce the shape to the markers the remaining checks look at
  const outline = token.shape.replace(/[^-Xxd]/g, '')

  // ?? if (!allCaps && shape.match(/^[Xd-]+$/)) return token.text
  // hyphenated ?? if (shape.match(/^X[xd]*(-[Xxd]*)*$/)) return token.text.toLowerCase()

  // only capitals and digits: lowercase when the whole title was all-caps, otherwise keep
  if (/^[Xd]+$/.test(outline)) {
    if (allCaps) return text.toLowerCase()
    return text
  }

  // domain ?? if (token.text.includes('.')) return token.text

  // a capital somewhere after a lowercase letter: keep as written
  if (/x.*X/.test(outline)) return text

  return text.toLowerCase()
}
/** Options accepted by `toSentenceCase`; every field may be omitted. */
export type Options = {
/**
 * When truthy, text between “…”, ‘…’ or "…" in the result is overwritten with the text found at the
 * same offsets in the original title. `toSentenceCase` merges in `true` for this field
 * (presumably as the default when unset — confirm against ./merge).
 */
preserveQuoted?: boolean
/**
 * Passed to the per-word conversion as its `subSentence` flag, which decides whether tokens flagged
 * `subSentenceStart` are treated like sentence starts. `toSentenceCase` merges in `true` for this
 * field (presumably as the default when unset — confirm against ./merge).
 */
subSentenceCapitalization?: boolean
/**
 * Pattern for markup embedded in the title. It is handed to `tokenize`, and every match in the
 * original title is written back into the result at its original index. It is iterated with
 * `String.prototype.matchAll`, which throws a TypeError for a pattern without the `g` flag.
 */
markup?: RegExp
/**
 * Pattern for spans whose case must be kept: each match in the result is overwritten with the text
 * found at the same offsets in the original title.
 */
nocase?: RegExp
/**
 * When truthy and the title is mixed-case, `toSentenceCase` first counts word shapes and returns the
 * title unchanged if it judges it to be sentence-cased already. `toSentenceCase` merges in `false`
 * for this field (presumably as the default when unset — confirm against ./merge).
 */
guess?: boolean
}
/**
 * Converts a title to sentence case.
 *
 * Steps, in order:
 * 1. `options` is combined with `preserveQuoted: true`, `subSentenceCapitalization: true` and
 *    `guess: false` through `merge` (presumably these act as defaults — confirm against ./merge).
 * 2. With `options.guess` set and a mixed-case title, the words are counted after stripping
 *    `nocase` and `markup` matches; if the title already looks sentence-cased it is returned as-is.
 * 3. An all-caps title without any whitespace is returned as-is.
 * 4. The title is tokenized and every token converted with `wordSC`.
 * 5. `markup` matches, `nocase` matches and (with `preserveQuoted`) quoted spans are copied back
 *    from the original title at the same offsets.
 *
 * Step 5 relies on the converted text having the same length and offsets as `title`.
 * NOTE(review): that holds only if `tokenize` keeps the text length intact when given `markup` — confirm.
 *
 * @param title the title to convert
 * @param options see {@link Options}; `markup` must carry the `g` flag because it is iterated with
 *   `matchAll`, which throws a TypeError otherwise
 * @returns the sentence-cased title
 */
export function toSentenceCase(title: string, options: Options = {}): string {
  options = merge(options, {
    preserveQuoted: true,
    subSentenceCapitalization: true,
    guess: false,
  })

  const allCaps = title === title.toUpperCase()
  const allLower = title === title.toLowerCase()

  if (options.guess && !allCaps && !allLower) {
    // judge the title on its plain words only: drop protected spans (leaving a space where a
    // span contained whitespace, so neighbouring words do not fuse) and drop markup
    let $title = title
    if (options.nocase) $title = $title.replace(options.nocase, match => match.match(/\s/) ? ' ' : '')
    if (options.markup) $title = $title.replace(options.markup, '')

    const guess = {
      words: tokenize($title),
      sc: 0,
      other: 0,
    }
    guess.words.forEach((token, i) => {
      // only count multi-character words that are not prepositions/acronyms/ordinals and that
      // either open the title or directly follow whitespace
      if (token.type === 'word' && token.text.length > 1 && !token.subtype.match(/preposition|acronym|ordinal/) && (i === 0 || guess.words[i-1].type === 'whitespace')) {
        // a shape with at least one 'x' and no 'X' counts towards "sentence case"
        guess[token.shape.match(/^[^X]*x[^X]*$/) ? 'sc' : 'other'] += 1
      }
    })

    // at least one such word, and no fewer of them than other words: leave the title alone
    if (guess.sc && guess.sc >= guess.other) return title
  }

  // a single all-caps word is kept as written
  if (allCaps && !title.match(/\s/)) return title

  const tokens = tokenize(title, options.markup)
  let sentenceCased = ''
  for (const token of tokens) {
    if (token.hyphenated && token.shape[0] === 'x') {
      // hyphenated compound whose shape starts with 'x': keep as written
      sentenceCased += token.text
    }
    else if (token.hyphenated) {
      // other hyphenated compounds are converted part by part
      for (const t of token.hyphenated) {
        sentenceCased += wordSC(t, allCaps, options.subSentenceCapitalization, true)
      }
    }
    else {
      sentenceCased += wordSC(token, allCaps, options.subSentenceCapitalization, false)
    }
  }

  if (options.markup) {
    // put each markup match back at the index where it was found in the original title
    for (const match of title.matchAll(options.markup)) {
      sentenceCased = sentenceCased.substring(0, match.index) + match[0] + sentenceCased.substring(match.index + match[0].length)
    }
  }

  // Replacer that swaps a match for the text at the same offsets in the original title.
  // A replacer receives (match, ...captures, offset, input[, groups]); captures are strings or
  // undefined, so the first numeric argument is the offset however many capture groups the
  // pattern has. A fixed (match, capture, offset) signature only works for exactly one group.
  const restoreFromTitle = (match: string, ...args: unknown[]): string => {
    const offset = args.find((arg): arg is number => typeof arg === 'number')
    return offset === undefined ? match : title.substring(offset, offset + match.length)
  }

  if (options.nocase) {
    sentenceCased = sentenceCased.replace(options.nocase, restoreFromTitle)
  }

  if (options.preserveQuoted) {
    for (const q of [/“.*?”/g, /‘.*?’/g, /".*?"/g]) {
      sentenceCased = sentenceCased.replace(q, restoreFromTitle)
    }
  }

  return sentenceCased
}