Skip to content

Commit

Permalink
perf: improved tokenizer
Browse files Browse the repository at this point in the history
Faster and simpler tokenization.

Refs: #133
  • Loading branch information
Qtax committed Oct 4, 2023
1 parent bf2e2c2 commit f282666
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 85 deletions.
1 change: 1 addition & 0 deletions lib/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ declare module 'sql-highlight' {

export function getSegments(sqlString: string): Array<Segment>;
export function highlight(sqlString: string, options?: HighlightOptions): string;
export const DEFAULT_OPTIONS: HighlightOptions;
}
131 changes: 46 additions & 85 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,106 +18,66 @@ const DEFAULT_OPTIONS = {
}
}

const SPLIT_CHARS = '[^a-zA-Z_]'

const DEFAULT_KEYWORD = 'default'

const highlighters = [
{
name: 'keyword',
group: 1,
regex: new RegExp(`(^|${SPLIT_CHARS})(${keywords.join('|')})(?=${SPLIT_CHARS}|$)`, 'gi')
},
{
name: 'special',
regex: /(=|!=|%|\/|\*|-|,|;|:|\+|<|>)/g
},
{
name: 'function',
regex: /(\w+?)\(/g,
trimEnd: 1
},
{
name: 'number',
regex: /(\b\d+(?:\.\d+)?)/g
},
{
name: 'string',
regex: /(['](?:\\'|.)*?[']|["](?:\\"|.)*?["]|[`](?:\\`|.)*?[`])/g
},
{
name: 'bracket',
regex: /([()])/g
}
]
/\b(?<number>\d+(?:\.\d+)?)\b/,

function getSegments (sqlString) {
const matches = []

for (const hl of highlighters) {
let match

// This is probably the one time when an assignment inside a condition makes sense
// eslint-disable-next-line no-cond-assign
while (match = hl.regex.exec(sqlString)) {
let text = match[0]
let boringLength = 0

// If a specific group is requested, use that group instead, and make sure
// we offset the index by the length of the preceding groups
if (hl.group) {
text = match[hl.group + 1]
for (let i = 1; i <= hl.group; i++) {
boringLength += match[i].length
}
}
// Note: Repeating string escapes like 'sql''server' will also work as they are just repeating strings
/(?<string>'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|`(?:[^`\\]|\\.)*`)/,

const trimmedText = hl.trimEnd
? text.substring(0, text.length - hl.trimEnd)
: text
matches.push({
name: hl.name,
start: match.index + boringLength,
length: trimmedText.length
})
}
}
/\b(?<function>\w+)(?=\s*\()/,

/(?<bracket>[()])/,

const sortedMatches = matches.slice().sort((a, b) => a.start - b.start)
/(?<special>!=|[=%*/\-+,;:<>])/
]

/**
 * Returns the pattern text of a regular expression, i.e. the part between
 * the `/` delimiters with no trailing flags, so that it can be embedded as
 * an alternative inside a larger pattern string.
 *
 * @param {RegExp} regex Regular expression to extract the pattern from.
 * @returns {string} The pattern source, without delimiters or flags.
 */
function getRegexString (regex) {
  // `source` is exactly what `toString()` prints between the delimiters,
  // so there is no need to stringify and strip them with another regex.
  return regex.source
}

// filter/exclude nested matches (matches within the last match)
// Single tokenizer regex of the shape /(.*?)((?<token1>...)|(?<token2>...)|...|$)/isy
// Group 1 lazily captures the text preceding the next token; group 2 is the
// token itself (exactly one named group is set) or empty when `$` matched.
const tokenizer = new RegExp(
'(.*?)(' +
'\\b(?<keyword>' + keywords.join('|') + ')\\b|' +
highlighters.map(getRegexString).join('|') +
'|$)', // $ lets group 1 capture trailing "default" text up to the end of the string
'isy' // i: ignore case, s: `.` also matches newlines, y: sticky (match starts at lastIndex)
)

function getSegments (sqlString) {
const segments = []
let upperBound = 0
for (let i = 0; i < sortedMatches.length; i++) {
if (sortedMatches[i].start < upperBound) { continue }
let match

// Reset the starting position
tokenizer.lastIndex = 0

// If no match, add a default segment
if (sortedMatches[i].start > upperBound) {
// This is probably the one time when an assignment inside a condition makes sense
// eslint-disable-next-line no-cond-assign
while (match = tokenizer.exec(sqlString)) {
if (match[1]) {
segments.push({
name: DEFAULT_KEYWORD,
content: sqlString.substring(upperBound, sortedMatches[i].start)
content: match[1]
})
}

segments.push({
name: sortedMatches[i].name,
content: sqlString.substring(
sortedMatches[i].start,
sortedMatches[i].start + sortedMatches[i].length
)
})
upperBound = sortedMatches[i].start + sortedMatches[i].length
}
if (match[2]) {
const name = Object.keys(match.groups).find(key => match.groups[key])
segments.push({
name,
content: match.groups[name]
})
}

if (upperBound < sqlString.length - 1) {
segments.push({
name: DEFAULT_KEYWORD,
content: sqlString.substring(
upperBound,
upperBound + sqlString.length + 1
)
})
// Stop at the end of string
if (match.index + match[0].length >= sqlString.length) {
break
}
}

return segments
}

Expand All @@ -140,5 +100,6 @@ function highlight (sqlString, options) {

module.exports = {
getSegments,
highlight
highlight,
DEFAULT_OPTIONS
}

0 comments on commit f282666

Please sign in to comment.