Skip to content

Commit

Permalink
tokenizers.CharArrayReader: allow surrogate pairs (#3345)
Browse files Browse the repository at this point in the history
  • Loading branch information
kitbellew committed Sep 22, 2023
1 parent 8d63253 commit 5c5d27a
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 10 deletions.
Expand Up @@ -11,7 +11,7 @@ private[meta] case class CharArrayReader private (
dialect: Dialect,
reporter: Reporter,
/** the last read character, held as a Unicode code point (a surrogate pair is combined into one value) */
var ch: Char = SU,
var ch: Int = SU,
/** The offset one past the last read character */
var begCharOffset: Int = -1, // included
var endCharOffset: Int = 0, // excluded
Expand Down Expand Up @@ -58,9 +58,21 @@ private[meta] case class CharArrayReader private (
ch = SU
} else {
begCharOffset = endCharOffset
val (c, nextOffset) = readUnicodeChar(endCharOffset)
ch = c
endCharOffset = nextOffset
val (hi, hiEnd) = readUnicodeChar(endCharOffset)
if (!Character.isHighSurrogate(hi)) {
ch = hi
endCharOffset = hiEnd
} else if (hiEnd >= buf.length)
readerError("invalid unicode surrogate pair", at = begCharOffset)
else {
val (lo, loEnd) = readUnicodeChar(hiEnd)
if (!Character.isLowSurrogate(lo))
readerError("invalid unicode surrogate pair", at = begCharOffset)
else {
ch = Character.toCodePoint(hi, lo)
endCharOffset = loEnd
}
}
}
}

Expand Down
Expand Up @@ -82,8 +82,8 @@ class LegacyScanner(input: Input, dialect: Dialect) {
/**
* append a Unicode code point to the "cbuf" buffer
*/
private def putChar(c: Char): Unit = {
cbuf.append(c)
private def putChar(c: Int): Unit = {
cbuf.appendCodePoint(c)
}

/**
Expand Down Expand Up @@ -258,7 +258,7 @@ class LegacyScanner(input: Input, dialect: Dialect) {
(ch: @switch) match {
case ' ' | '\t' | CR | LF | FF =>
token = WHITESPACE
strVal = ch.toString
strVal = ch.toChar.toString
nextChar()
// nextToken()
case
Expand Down Expand Up @@ -704,7 +704,7 @@ class LegacyScanner(input: Input, dialect: Dialect) {
val start = begCharOffset
nextChar()
if ('0' <= ch && ch <= '7') {
val leadch: Char = ch
val leadch = ch
var oct: Int = digit2int(ch, 8)
nextChar()
if ('0' <= ch && ch <= '7') {
Expand All @@ -718,7 +718,7 @@ class LegacyScanner(input: Input, dialect: Dialect) {
val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
def msg(what: String) = s"Octal escape literals are $what, use $alt instead."
deprecationWarning(msg("deprecated"), at = start)
putChar(oct.toChar)
putChar(oct)
} else {
ch match {
case 'b' => putChar('\b')
Expand Down
Expand Up @@ -1170,7 +1170,23 @@ class TokenizerSuite extends BaseTokenizerSuite {
test("#3328") {
val code = "val \\uD835\\uDF11: Double"
val res = dialects.Scala212(code).tokenize
assertEquals(res.toEither.left.get.message, "illegal character '\\ud835'")
assertEquals(res.get.toString, code)
}

// Regression test (presumably for issue #3328 -- confirm against the tracker):
// an identifier written as a raw UTF-16 surrogate pair must tokenize as ONE token.
test("#3328 2") {
assertTokenizedAsStructureLines(
// Input: `val`, a space, a high+low surrogate pair (two UTF-16 chars that
// together encode a single supplementary code point), then `: Double`.
"val \uD835\uDF11: Double",
// Expected token structure, one token per line with its [start..end) offsets.
// The surrogate pair occupies offsets [4..6): two chars wide, but a single token.
"""
|BOF [0..0)
|val [0..3)
| [3..4)
|\uD835\uDF11 [4..6)
|: [6..7)
| [7..8)
|Double [8..14)
|EOF [14..14)
|""".stripMargin
)
}

}

0 comments on commit 5c5d27a

Please sign in to comment.