diff --git a/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/CharArrayReader.scala b/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/CharArrayReader.scala index d4946e7d28..68fe6bf2e9 100644 --- a/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/CharArrayReader.scala +++ b/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/CharArrayReader.scala @@ -11,7 +11,7 @@ private[meta] case class CharArrayReader private ( dialect: Dialect, reporter: Reporter, /** the last read character */ - var ch: Char = SU, + var ch: Int = SU, /** The offset one past the last read character */ var begCharOffset: Int = -1, // included var endCharOffset: Int = 0, // excluded @@ -58,9 +58,21 @@ private[meta] case class CharArrayReader private ( ch = SU } else { begCharOffset = endCharOffset - val (c, nextOffset) = readUnicodeChar(endCharOffset) - ch = c - endCharOffset = nextOffset + val (hi, hiEnd) = readUnicodeChar(endCharOffset) + if (!Character.isHighSurrogate(hi)) { + ch = hi + endCharOffset = hiEnd + } else if (hiEnd >= buf.length) + readerError("invalid unicode surrogate pair", at = begCharOffset) + else { + val (lo, loEnd) = readUnicodeChar(hiEnd) + if (!Character.isLowSurrogate(lo)) + readerError("invalid unicode surrogate pair", at = begCharOffset) + else { + ch = Character.toCodePoint(hi, lo) + endCharOffset = loEnd + } + } } } diff --git a/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/LegacyScanner.scala b/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/LegacyScanner.scala index 35651e2adf..5e4986a1ff 100644 --- a/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/LegacyScanner.scala +++ b/scalameta/tokenizers/shared/src/main/scala/scala/meta/internal/tokenizers/LegacyScanner.scala @@ -82,8 +82,8 @@ class LegacyScanner(input: Input, dialect: Dialect) { /** * append Unicode character to "cbuf" buffer */ - private def putChar(c: Char): Unit = { - cbuf.append(c) + private def putChar(c: Int): Unit = { + cbuf.appendCodePoint(c) } /** @@ -258,7 +258,7 @@ class LegacyScanner(input: Input, dialect: Dialect) { (ch: @switch) match { case ' ' | '\t' | CR | LF | FF => token = WHITESPACE - strVal = ch.toString + strVal = ch.toChar.toString nextChar() // nextToken() case @@ -704,7 +704,7 @@ class LegacyScanner(input: Input, dialect: Dialect) { val start = begCharOffset nextChar() if ('0' <= ch && ch <= '7') { - val leadch: Char = ch + val leadch = ch var oct: Int = digit2int(ch, 8) nextChar() if ('0' <= ch && ch <= '7') { @@ -718,7 +718,7 @@ class LegacyScanner(input: Input, dialect: Dialect) { val alt = if (oct == LF) "\\n" else "\\u%04x" format oct def msg(what: String) = s"Octal escape literals are $what, use $alt instead." deprecationWarning(msg("deprecated"), at = start) - putChar(oct.toChar) + putChar(oct) } else { ch match { case 'b' => putChar('\b') diff --git a/tests/shared/src/test/scala/scala/meta/tests/tokenizers/TokenizerSuite.scala b/tests/shared/src/test/scala/scala/meta/tests/tokenizers/TokenizerSuite.scala index 4c4ea1ee8e..eac14f811a 100644 --- a/tests/shared/src/test/scala/scala/meta/tests/tokenizers/TokenizerSuite.scala +++ b/tests/shared/src/test/scala/scala/meta/tests/tokenizers/TokenizerSuite.scala @@ -1170,7 +1170,23 @@ class TokenizerSuite extends BaseTokenizerSuite { test("#3328") { val code = "val \\uD835\\uDF11: Double" val res = dialects.Scala212(code).tokenize - assertEquals(res.toEither.left.get.message, "illegal character '\\ud835'") + assertEquals(res.get.toString, code) + } + + test("#3328 2") { + assertTokenizedAsStructureLines( + "val \uD835\uDF11: Double", + """ + |BOF [0..0) + |val [0..3) + | [3..4) + |\uD835\uDF11 [4..6) + |: [6..7) + | [7..8) + |Double [8..14) + |EOF [14..14) + |""".stripMargin + ) } }