Skip to content

Commit a2b6804

Browse files
feat(core): enhance string utils with robust edge case handling (#1154)
1 parent b4d3a57 commit a2b6804

File tree

2 files changed

+224
-7
lines changed

2 files changed

+224
-7
lines changed

packages/core/src/utils/strings.ts

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,30 @@ import type { HighlighterGeneric, Position } from '@shikijs/types'
22

33
/**
44
* Split a string into lines; line endings are kept only when `preserveEnding` is true.
5+
*
6+
* @param code - The code string to split into lines
7+
* @param preserveEnding - Whether to preserve line endings in the result
8+
* @returns Array of tuples containing [line content, start offset of that line within `code`]
9+
*
10+
* @example
11+
* ```ts
12+
* splitLines('hello\nworld', false)
13+
* // => [['hello', 0], ['world', 6]]
14+
*
15+
* splitLines('hello\nworld', true)
16+
* // => [['hello\n', 0], ['world', 6]]
17+
* ```
518
*/
619
export function splitLines(code: string, preserveEnding = false): [string, number][] {
20+
// Handle empty string edge case
21+
if (code.length === 0) {
22+
return [['', 0]]
23+
}
24+
725
const parts = code.split(/(\r?\n)/g)
826
let index = 0
927
const lines: [string, number][] = []
28+
1029
for (let i = 0; i < parts.length; i += 2) {
1130
const line = preserveEnding
1231
? parts[i] + (parts[i + 1] || '')
@@ -15,6 +34,7 @@ export function splitLines(code: string, preserveEnding = false): [string, numbe
1534
index += parts[i].length
1635
index += parts[i + 1]?.length || 0
1736
}
37+
1838
return lines
1939
}
2040

@@ -69,24 +89,62 @@ export function createPositionConverter(code: string): {
6989
* Guess embedded languages from given code and highlighter.
7090
*
7191
* When highlighter is provided, only bundled languages will be included.
92+
*
93+
* @param code - The code string to analyze
94+
* @param _lang - The primary language of the code (currently unused)
95+
* @param highlighter - Optional highlighter instance to validate languages
96+
* @returns Array of detected language identifiers
97+
*
98+
* @example
99+
* ```ts
100+
* // Detects 'javascript' from Vue SFC
101+
* guessEmbeddedLanguages('<script lang="javascript">', undefined)
102+
*
103+
* // Detects 'python' from markdown code block
104+
* guessEmbeddedLanguages('```python\nprint("hi")\n```', undefined)
105+
* ```
72106
*/
73107
export function guessEmbeddedLanguages(
74108
code: string,
75109
_lang: string | undefined,
76110
highlighter?: HighlighterGeneric<any, any>,
77111
): string[] {
78112
const langs = new Set<string>()
79-
// For HTML code blocks like Vue SFC
80-
for (const match of code.matchAll(/lang=["']([\w-]+)["']/g)) {
81-
langs.add(match[1])
113+
114+
// For HTML code blocks like Vue SFC, support both single and double quotes
115+
// Matches: lang="js", lang='ts', :lang="typescript", etc.
116+
// The capture accepts any run of non-quote characters; the value is lowercased and trimmed below
117+
for (const match of code.matchAll(/:?lang=["']([^"']+)["']/g)) {
118+
const lang = match[1].toLowerCase().trim()
119+
if (lang)
120+
langs.add(lang)
82121
}
83-
// For markdown code blocks
122+
123+
// For markdown code blocks, support both ``` and ~~~ fences
124+
// Matches: ```typescript, ~~~javascript, etc.
84125
for (const match of code.matchAll(/(?:```|~~~)([\w-]+)/g)) {
85-
langs.add(match[1])
126+
const lang = match[1].toLowerCase().trim()
127+
if (lang)
128+
langs.add(lang)
86129
}
87-
// For latex
130+
131+
// For LaTeX environments
132+
// Matches: \begin{equation}, \begin{align}, etc.
88133
for (const match of code.matchAll(/\\begin\{([\w-]+)\}/g)) {
89-
langs.add(match[1])
134+
const lang = match[1].toLowerCase().trim()
135+
if (lang)
136+
langs.add(lang)
137+
}
138+
139+
// For script tags in HTML/Vue
140+
// Matches: <script type="text/javascript">, <script lang="ts">, etc.
141+
// type/lang must be the first attribute after <script; the value is lowercased and trimmed below
142+
for (const match of code.matchAll(/<script\s+(?:type|lang)=["']([^"']+)["']/gi)) {
143+
// Extract language from MIME types like 'text/javascript' or 'application/typescript'
144+
const fullType = match[1].toLowerCase().trim()
145+
const lang = fullType.includes('/') ? fullType.split('/').pop() : fullType
146+
if (lang)
147+
langs.add(lang)
90148
}
91149

92150
if (!highlighter)
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { guessEmbeddedLanguages, splitLines } from '../src/utils'
3+
4+
describe('enhanced string utils', () => {
5+
describe('splitLines edge cases', () => {
6+
it('handles empty string', () => {
7+
const result = splitLines('', false)
8+
expect(result).toEqual([['', 0]])
9+
10+
const resultWithEnding = splitLines('', true)
11+
expect(resultWithEnding).toEqual([['', 0]])
12+
})
13+
14+
it('handles single line without newline', () => {
15+
const result = splitLines('hello world', false)
16+
expect(result).toEqual([['hello world', 0]])
17+
})
18+
19+
it('handles different line endings', () => {
20+
// Unix style \n
21+
const unix = splitLines('line1\nline2', true)
22+
expect(unix).toEqual([
23+
['line1\n', 0],
24+
['line2', 6],
25+
])
26+
27+
// Windows style \r\n
28+
const windows = splitLines('line1\r\nline2', true)
29+
expect(windows).toEqual([
30+
['line1\r\n', 0],
31+
['line2', 7],
32+
])
33+
})
34+
35+
it('preserves offsets correctly for multiline code', () => {
36+
const code = 'abc\ndef\nghi'
37+
const result = splitLines(code, false)
38+
expect(result[0][1]).toBe(0) // 'abc' starts at 0
39+
expect(result[1][1]).toBe(4) // 'def' starts at 4 (after 'abc\n')
40+
expect(result[2][1]).toBe(8) // 'ghi' starts at 8 (after 'abc\ndef\n')
41+
})
42+
})
43+
44+
describe('guessEmbeddedLanguages enhanced', () => {
45+
it('detects languages from Vue SFC with double quotes', () => {
46+
const code = '<template lang="pug"></template><script lang="typescript"></script>'
47+
const langs = guessEmbeddedLanguages(code, undefined)
48+
expect(langs).toContain('pug')
49+
expect(langs).toContain('typescript')
50+
})
51+
52+
it('detects languages from Vue SFC with single quotes', () => {
53+
const code = '<template lang=\'pug\'></template><script lang=\'ts\'></script>'
54+
const langs = guessEmbeddedLanguages(code, undefined)
55+
expect(langs).toContain('pug')
56+
expect(langs).toContain('ts')
57+
})
58+
59+
it('detects languages with colon prefix (Vue v-bind)', () => {
60+
const code = '<component :lang="javascript"></component>'
61+
const langs = guessEmbeddedLanguages(code, undefined)
62+
expect(langs).toContain('javascript')
63+
})
64+
65+
it('detects languages from markdown code blocks', () => {
66+
const code = '```typescript\nconst x = 1\n```\n\n~~~python\nprint("hi")\n~~~'
67+
const langs = guessEmbeddedLanguages(code, undefined)
68+
expect(langs).toContain('typescript')
69+
expect(langs).toContain('python')
70+
})
71+
72+
it('detects languages from LaTeX environments', () => {
73+
const code = '\\begin{equation}\\end{equation}\\begin{align}\\end{align}'
74+
const langs = guessEmbeddedLanguages(code, undefined)
75+
expect(langs).toContain('equation')
76+
expect(langs).toContain('align')
77+
})
78+
79+
it('detects languages from script type attribute', () => {
80+
const code = '<script type="text/javascript">alert("hi")</script>'
81+
const langs = guessEmbeddedLanguages(code, undefined)
82+
expect(langs).toContain('javascript')
83+
})
84+
85+
it('detects languages from script type with application/ prefix', () => {
86+
const code = '<script type="application/typescript">const x = 1</script>'
87+
const langs = guessEmbeddedLanguages(code, undefined)
88+
expect(langs).toContain('typescript')
89+
})
90+
91+
it('normalizes languages to lowercase', () => {
92+
const code = '<script lang="TypeScript"></script>```JavaScript\ncode\n```'
93+
const langs = guessEmbeddedLanguages(code, undefined)
94+
expect(langs).toContain('typescript')
95+
expect(langs).toContain('javascript')
96+
})
97+
98+
it('handles empty code', () => {
99+
const langs = guessEmbeddedLanguages('', undefined)
100+
expect(langs).toEqual([])
101+
})
102+
103+
it('returns unique languages only', () => {
104+
const code = '```js\n```\n```js\n```\n```javascript\n```'
105+
const langs = guessEmbeddedLanguages(code, undefined)
106+
const jsCount = langs.filter(l => l === 'js').length
107+
const javascriptCount = langs.filter(l => l === 'javascript').length
108+
// Should only appear once each
109+
expect(jsCount).toBe(1)
110+
expect(javascriptCount).toBe(1)
111+
})
112+
113+
it('handles mixed case and whitespace', () => {
114+
const code = '<script lang=" TypeScript "></script>'
115+
const langs = guessEmbeddedLanguages(code, undefined)
116+
expect(langs).toContain('typescript')
117+
})
118+
119+
it('handles complex Vue SFC', () => {
120+
const code = `
121+
<template lang="pug">
122+
div Hello
123+
</template>
124+
<script lang="ts">
125+
const x = 1
126+
</script>
127+
<style lang="scss">
128+
.test { color: red; }
129+
</style>
130+
`
131+
const langs = guessEmbeddedLanguages(code, undefined)
132+
expect(langs).toContain('pug')
133+
expect(langs).toContain('ts')
134+
expect(langs).toContain('scss')
135+
})
136+
137+
it('handles markdown with multiple code blocks', () => {
138+
const code = `
139+
# Title
140+
141+
\`\`\`typescript
142+
const foo = 'bar'
143+
\`\`\`
144+
145+
\`\`\`javascript
146+
const baz = 'qux'
147+
\`\`\`
148+
149+
~~~python
150+
print("hello")
151+
~~~
152+
`
153+
const langs = guessEmbeddedLanguages(code, undefined)
154+
expect(langs).toContain('typescript')
155+
expect(langs).toContain('javascript')
156+
expect(langs).toContain('python')
157+
})
158+
})
159+
})

0 commit comments

Comments
 (0)