diff --git a/build.sbt b/build.sbt index 82208895c7ad..fd3f0d2c768e 100644 --- a/build.sbt +++ b/build.sbt @@ -717,6 +717,7 @@ lazy val junit = project.in(file("test") / "junit") "-feature", "-Xlint:-valpattern,_", "-Wconf:msg=match may not be exhaustive:s", // if we missed a case, all that happens is the test fails + "-Wconf:cat=lint-nullary-unit&site=.*Test:s", // normal unit test style "-Ypatmat-exhaust-depth", "40", // despite not caring about patmat exhaustiveness, we still get warnings for this ), Compile / javacOptions ++= Seq("-Xlint"), diff --git a/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala b/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala index ff9b8747f17c..adc577f54c86 100644 --- a/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala +++ b/src/compiler/scala/tools/nsc/ast/parser/Parsers.scala @@ -264,7 +264,7 @@ self => if (syntaxErrors.isEmpty) firstTry else in.healBraces() match { case Nil => showSyntaxErrors() ; firstTry - case patches => (this withPatches patches).parse() + case patches => withPatches(patches).parse() } } } diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala index 17b46da9191c..b40ad37f6bf2 100644 --- a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala +++ b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala @@ -172,7 +172,45 @@ trait Scanners extends ScannersCommon { /** A switch whether operators at the start of lines can be infix operators. */ private var allowLeadingInfixOperators = true - private def isDigit(c: Char) = java.lang.Character isDigit c + private def isDigit(c: Char) = Character.isDigit(c) + + import Character.{isHighSurrogate, isLowSurrogate, isUnicodeIdentifierPart, isUnicodeIdentifierStart, isValidCodePoint, toCodePoint} + + // given char (ch) is high surrogate followed by low, codepoint passes predicate. + // true means supplementary chars were put to buffer. + // strict to require low surrogate (if not in string literal). + private def isSupplementary(high: Char, test: Int => Boolean, strict: Boolean = true): Boolean = + isHighSurrogate(high) && { + var res = false + nextChar() + val low = ch + if (isLowSurrogate(low)) { + nextChar() + val codepoint = toCodePoint(high, low) + if (isValidCodePoint(codepoint) && test(codepoint)) { + putChar(high) + putChar(low) + res = true + } else + syntaxError(f"illegal character '\\u$high%04x\\u$low%04x'") + } else if (!strict) { + putChar(high) + res = true + } else + syntaxError(f"illegal character '\\u$high%04x' missing low surrogate") + res + } + private def atSupplementary(ch: Char, f: Int => Boolean): Boolean = + isHighSurrogate(ch) && { + val hi = ch + val r = lookaheadReader + r.nextRawChar() + val lo = r.ch + isLowSurrogate(lo) && { + val codepoint = toCodePoint(hi, lo) + isValidCodePoint(codepoint) && f(codepoint) + } + } private var openComments = 0 final protected def putCommentChar(): Unit = { processCommentChar(); nextChar() } @@ -705,14 +743,18 @@ trait Scanners extends ScannersCommon { syntaxError("empty character literal (use '\\'' for single quote)") else { nextChar() - token = CHARLIT - setStrVal() + if (cbuf.length != 1) + syntaxError("illegal codepoint in Char constant: " + cbuf.toString.map(c => f"\\u$c%04x").mkString("'", "", "'")) + else { + token = CHARLIT + setStrVal() + } } - } else if (isEmptyCharLit) { + } + else if (isEmptyCharLit) syntaxError("empty character literal") - } else { + else unclosedCharLit() - } } else unclosedCharLit() } @@ -755,7 +797,7 @@ trait Scanners extends ScannersCommon { } else if (ch == '\u2190') { deprecationWarning("The unicode arrow `←` is deprecated, use `<-` instead. If you still wish to display it as one character, consider using a font with programming ligatures such as Fira Code.", "2.13.0") nextChar(); token = LARROW - } else if (Character.isUnicodeIdentifierStart(ch)) { + } else if (isUnicodeIdentifierStart(ch)) { putChar(ch) nextChar() getIdentRest() @@ -763,8 +805,10 @@ trait Scanners extends ScannersCommon { putChar(ch) nextChar() getOperatorRest() + } else if (isSupplementary(ch, isUnicodeIdentifierStart)) { + getIdentRest() } else { - syntaxError("illegal character '" + ("" + '\\' + 'u' + "%04x".format(ch.toInt)) + "'") + syntaxError(f"illegal character '\\u$ch%04x'") nextChar() } } @@ -831,13 +875,15 @@ trait Scanners extends ScannersCommon { case SU => // strangely enough, Character.isUnicodeIdentifierPart(SU) returns true! finishNamed() case _ => - if (Character.isUnicodeIdentifierPart(ch)) { + if (isUnicodeIdentifierPart(ch)) { putChar(ch) nextChar() getIdentRest() - } else { - finishNamed() } + else if (isSupplementary(ch, isUnicodeIdentifierPart)) + getIdentRest() + else + finishNamed() } @tailrec @@ -955,6 +1001,25 @@ trait Scanners extends ScannersCommon { } getStringPart(multiLine, seenEscapedQuote || q) } else if (ch == '$') { + @tailrec def getInterpolatedIdentRest(): Unit = + if (ch != SU && isUnicodeIdentifierPart(ch)) { + putChar(ch) + nextRawChar() + getInterpolatedIdentRest() + } else if (atSupplementary(ch, isUnicodeIdentifierPart)) { + putChar(ch) + nextRawChar() + putChar(ch) + nextRawChar() + getInterpolatedIdentRest() + } else { + next.token = IDENTIFIER + next.name = newTermName(cbuf.toCharArray) + cbuf.clear() + val idx = next.name.start - kwOffset + if (idx >= 0 && idx < kwArray.length) + next.token = kwArray(idx) + } nextRawChar() if (ch == '$' || ch == '"') { putChar(ch) @@ -968,32 +1033,29 @@ trait Scanners extends ScannersCommon { finishStringPart() nextRawChar() next.token = USCORE - } else if (Character.isUnicodeIdentifierStart(ch)) { + } else if (isUnicodeIdentifierStart(ch)) { finishStringPart() - do { - putChar(ch) - nextRawChar() - } while (ch != SU && Character.isUnicodeIdentifierPart(ch)) - next.token = IDENTIFIER - next.name = newTermName(cbuf.toString) - cbuf.clear() - val idx = next.name.start - kwOffset - if (idx >= 0 && idx < kwArray.length) { - next.token = kwArray(idx) - } + putChar(ch) + nextRawChar() + getInterpolatedIdentRest() + } else if (atSupplementary(ch, isUnicodeIdentifierStart)) { + finishStringPart() + putChar(ch) + nextRawChar() + putChar(ch) + nextRawChar() + getInterpolatedIdentRest() } else { val expectations = "$$, $\", $identifier or ${expression}" syntaxError(s"invalid string interpolation $$$ch, expected: $expectations") } } else { val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF))) - if (isUnclosedLiteral) { + if (isUnclosedLiteral) if (multiLine) incompleteInputError("unclosed multi-line string literal") - else { + else unclosedStringLit(seenEscapedQuote) - } - } else { putChar(ch) nextRawChar() @@ -1027,53 +1089,38 @@ trait Scanners extends ScannersCommon { false } - /** copy current character into cbuf, interpreting any escape sequences, - * and advance to next character. + /** Copy current character into cbuf, interpreting any escape sequences, + * and advance to next character. Surrogate pairs are consumed (see check + * at fetchSingleQuote), but orphan surrogate is allowed. */ protected def getLitChar(): Unit = if (ch == '\\') { nextChar() - if ('0' <= ch && ch <= '7') { - val start = charOffset - 2 - val leadch: Char = ch - var oct: Int = digit2int(ch, 8) - nextChar() - if ('0' <= ch && ch <= '7') { - oct = oct * 8 + digit2int(ch, 8) - nextChar() - if (leadch <= '3' && '0' <= ch && ch <= '7') { - oct = oct * 8 + digit2int(ch, 8) - nextChar() - } - } - val alt = if (oct == LF) "\\n" else "\\u%04x" format oct - syntaxError(start, s"octal escape literals are unsupported: use $alt instead") - putChar(oct.toChar) - } else { - if (ch == 'u') { - if (getUEscape()) nextChar() - } - else { - ch match { - case 'b' => putChar('\b') - case 't' => putChar('\t') - case 'n' => putChar('\n') - case 'f' => putChar('\f') - case 'r' => putChar('\r') - case '\"' => putChar('\"') - case '\'' => putChar('\'') - case '\\' => putChar('\\') - case _ => invalidEscape() - } - nextChar() - } - } - } else { + charEscape() + } else if (!isSupplementary(ch, _ => true, strict = false)) { putChar(ch) nextChar() } - private def getUEscape(): Boolean = { + private def charEscape(): Unit = { + var bump = true + ch match { + case 'b' => putChar('\b') + case 't' => putChar('\t') + case 'n' => putChar('\n') + case 'f' => putChar('\f') + case 'r' => putChar('\r') + case '\"' => putChar('\"') + case '\'' => putChar('\'') + case '\\' => putChar('\\') + case 'u' => bump = uEscape() + case x if '0' <= x && x <= '7' => bump = octalEscape() + case _ => invalidEscape() + } + if (bump) nextChar() + } + + private def uEscape(): Boolean = { while (ch == 'u') nextChar() var codepoint = 0 var digitsRead = 0 @@ -1094,7 +1141,25 @@ trait Scanners extends ScannersCommon { putChar(found) true } - + + private def octalEscape(): Boolean = { + val start = charOffset - 2 + val leadch: Char = ch + var oct: Int = digit2int(ch, 8) + nextChar() + if ('0' <= ch && ch <= '7') { + oct = oct * 8 + digit2int(ch, 8) + nextChar() + if (leadch <= '3' && '0' <= ch && ch <= '7') { + oct = oct * 8 + digit2int(ch, 8) + nextChar() + } + } + val alt = if (oct == LF) "\\n" else f"\\u$oct%04x" + syntaxError(start, s"octal escape literals are unsupported: use $alt instead") + putChar(oct.toChar) + false + } protected def invalidEscape(): Unit = { syntaxError(charOffset - 1, "invalid escape character") diff --git a/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala b/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala index ca1378e6c87e..faf69d5769e3 100644 --- a/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala +++ b/src/compiler/scala/tools/nsc/symtab/classfile/AbstractFileReader.scala @@ -27,9 +27,7 @@ import scala.tools.nsc.io.AbstractFile */ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader { @deprecated("Use other constructor", "2.13.0") - def this(file: AbstractFile) = { - this(file.toByteArray) - } + def this(file: AbstractFile) = this(file.toByteArray) /** the current input pointer */ @@ -67,9 +65,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader { def getByte(mybp: Int): Byte = buf(mybp) - def getBytes(mybp: Int, bytes: Array[Byte]): Unit = { + def getBytes(mybp: Int, bytes: Array[Byte]): Unit = System.arraycopy(buf, mybp, bytes, 0, bytes.length) - } /** extract a character at position bp from buf */ @@ -95,9 +92,8 @@ final class AbstractFileReader(val buf: Array[Byte]) extends DataReader { */ def getDouble(mybp: Int): Double = longBitsToDouble(getLong(mybp)) - def getUTF(mybp: Int, len: Int): String = { + def getUTF(mybp: Int, len: Int): String = new DataInputStream(new ByteArrayInputStream(buf, mybp, len)).readUTF - } /** skip next 'n' bytes */ diff --git a/src/partest/scala/tools/partest/DirectTest.scala b/src/partest/scala/tools/partest/DirectTest.scala index 17de444bb7c4..d923829b8c14 100644 --- a/src/partest/scala/tools/partest/DirectTest.scala +++ b/src/partest/scala/tools/partest/DirectTest.scala @@ -45,6 +45,7 @@ abstract class DirectTest { protected def pathOf(locations: String*) = locations.mkString(sys.props("path.separator")) // override to add additional settings besides -d testOutput.path + // default is -usejavacp def extraSettings: String = "-usejavacp" // a default Settings object using only extraSettings def settings: Settings = newSettings(CommandLineParser.tokenize(extraSettings)) diff --git a/src/partest/scala/tools/partest/package.scala b/src/partest/scala/tools/partest/package.scala index d3e5f070eed9..5484b5dc8b94 100644 --- a/src/partest/scala/tools/partest/package.scala +++ b/src/partest/scala/tools/partest/package.scala @@ -19,7 +19,6 @@ import scala.concurrent.duration.Duration import scala.io.Codec import scala.jdk.CollectionConverters._ import scala.tools.nsc.util.Exceptional -import scala.util.chaining._ package object partest { type File = java.io.File @@ -180,17 +179,4 @@ package object partest { def isDebug = sys.props.contains("partest.debug") || sys.env.contains("PARTEST_DEBUG") def debugSettings = sys.props.getOrElse("partest.debug.settings", "") def log(msg: => Any): Unit = if (isDebug) Console.err.println(msg) - - private val printable = raw"\p{Print}".r - - def hexdump(s: String): Iterator[String] = { - var offset = 0 - def hex(bytes: Array[Byte]) = bytes.map(b => f"$b%02x").mkString(" ") - def charFor(byte: Byte): Char = byte.toChar match { case c @ printable() => c ; case _ => '.' } - def ascii(bytes: Array[Byte]) = bytes.map(charFor).mkString - def format(bytes: Array[Byte]): String = - f"$offset%08x ${hex(bytes.slice(0, 8))}%-24s ${hex(bytes.slice(8, 16))}%-24s |${ascii(bytes)}|" - .tap(_ => offset += bytes.length) - s.getBytes(codec.charSet).grouped(16).map(format) - } } diff --git a/src/testkit/scala/tools/testkit/AssertUtil.scala b/src/testkit/scala/tools/testkit/AssertUtil.scala index 4b7083d83e2c..f6087c22d258 100644 --- a/src/testkit/scala/tools/testkit/AssertUtil.scala +++ b/src/testkit/scala/tools/testkit/AssertUtil.scala @@ -51,6 +51,25 @@ object AssertUtil { // junit fail is Unit def fail(message: String): Nothing = throw new AssertionError(message) + private val printable = raw"\p{Print}".r + + def hexdump(s: String): Iterator[String] = { + import scala.io.Codec + val codec: Codec = Codec.UTF8 + var offset = 0 + def hex(bytes: Array[Byte]) = bytes.map(b => f"$b%02x").mkString(" ") + def charFor(byte: Byte): Char = byte.toChar match { case c @ printable() => c ; case _ => '.' } + def ascii(bytes: Array[Byte]) = bytes.map(charFor).mkString + def format(bytes: Array[Byte]): String = + f"$offset%08x ${hex(bytes.slice(0, 8))}%-24s ${hex(bytes.slice(8, 16))}%-24s |${ascii(bytes)}|" + .tap(_ => offset += bytes.length) + s.getBytes(codec.charSet).grouped(16).map(format) + } + + private def dump(s: String) = hexdump(s).mkString("\n") + def assertEqualStrings(expected: String)(actual: String) = + assert(expected == actual, s"Expected:\n${dump(expected)}\nActual:\n${dump(actual)}") + private final val timeout = 60 * 1000L // wait a minute private implicit class `ref helper`[A](val r: Reference[A]) extends AnyVal { diff --git a/test/files/neg/surrogates.check b/test/files/neg/surrogates.check new file mode 100644 index 000000000000..8e691608df9c --- /dev/null +++ b/test/files/neg/surrogates.check @@ -0,0 +1,4 @@ +surrogates.scala:3: error: illegal codepoint in Char constant: '\ud801\udc00' + def c = '𐐀' + ^ +1 error diff --git a/test/files/neg/surrogates.scala b/test/files/neg/surrogates.scala new file mode 100644 index 000000000000..f9c91438627f --- /dev/null +++ b/test/files/neg/surrogates.scala @@ -0,0 +1,4 @@ + +class C { + def c = '𐐀' +} diff --git a/test/files/pos/surrogates.scala b/test/files/pos/surrogates.scala new file mode 100644 index 000000000000..69c5735b9147 --- /dev/null +++ b/test/files/pos/surrogates.scala @@ -0,0 +1,23 @@ + +class 𐐀 { + def 𐐀 = 42 + def x = "𐐀" + def y = s"$𐐀" + def z = s"$𐐀 𐐀" + def w = s" 𐐀" +} + +case class 𐐀𐐀(n: Int) { + def 𐐀𐐀 = n + def `𐐀𐐀1` = n + n +} + +class Construction { + def hi = '\ud801' + def lo = '\udc00' + def endhi = "abc\ud801" + def startlo = "\udc00xyz" + def reversed = "xyz\udc00\ud801abc" +} + +// was: error: illegal character '\ud801', '\udc00' diff --git a/test/files/run/t12276.scala b/test/files/run/t12276.scala index 50ef6b0edc5e..36fbbbc6c558 100644 --- a/test/files/run/t12276.scala +++ b/test/files/run/t12276.scala @@ -1,6 +1,7 @@ import scala.tools.nsc.Settings import scala.tools.nsc.interpreter.shell.{ILoop, ShellConfig} -import scala.tools.partest.{hexdump, ReplTest} +import scala.tools.partest.ReplTest +import scala.tools.testkit.AssertUtil.hexdump object Test extends ReplTest { def code = s""" diff --git a/test/files/run/t1406.scala b/test/files/run/t1406.scala new file mode 100644 index 000000000000..75b415b42586 --- /dev/null +++ b/test/files/run/t1406.scala @@ -0,0 +1,22 @@ + +import scala.tools.partest.DirectTest + +object Test extends DirectTest { + // for reference, UTF-8 of U0 + //val data = Array(0xed, 0xa0, 0x81).map(_.asInstanceOf[Byte]) + def U0 = "\ud801" + def U1 = "\udc00" + def code = + s"""class C { + | def x = "$U0" + | def y = "$U1" + | def `$U0` = x + | def `$U1` = y + |}""".stripMargin + + def show(): Unit = { + assert(U0.length == 1) + assert(compile()) + } +} + diff --git a/test/files/run/t1406b.check b/test/files/run/t1406b.check new file mode 100644 index 000000000000..407e44adf89d --- /dev/null +++ b/test/files/run/t1406b.check @@ -0,0 +1,6 @@ +newSource1.scala:4: error: illegal character '\ud801' missing low surrogate + def ? = x + ^ +newSource1.scala:5: error: illegal character '\udc00' + def ? = y + ^ diff --git a/test/files/run/t1406b.scala b/test/files/run/t1406b.scala new file mode 100644 index 000000000000..bd1868a642fb --- /dev/null +++ b/test/files/run/t1406b.scala @@ -0,0 +1,22 @@ + +import scala.tools.partest.DirectTest + +object Test extends DirectTest { + // for reference, UTF-8 of U0 + //val data = Array(0xed, 0xa0, 0x81).map(_.asInstanceOf[Byte]) + def U0 = "\ud801" + def U1 = "\udc00" + def code = + s"""class C { + | def x = "$U0" + | def y = "$U1" + | def $U0 = x + | def $U1 = y + |}""".stripMargin + + def show(): Unit = { + assert(U0.length == 1) + assert(!compile()) + } +} + diff --git a/test/files/run/t9915/Test_2.scala b/test/files/run/t9915/Test_2.scala index afed667cc6e5..f26f1c1a3d91 100644 --- a/test/files/run/t9915/Test_2.scala +++ b/test/files/run/t9915/Test_2.scala @@ -1,12 +1,14 @@ +import scala.tools.testkit.AssertUtil.assertEqualStrings + object Test extends App { val c = new C_1 - assert(c.nulled == "X\u0000ABC") // "X\000ABC" - assert(c.supped == "𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") + assert(C_1.NULLED.length == "XYABC".length) + assert(C_1.SUPPED.codePointCount(0, C_1.SUPPED.length) == 8) - assert(C_1.NULLED == "X\u0000ABC") // "X\000ABC" - assert(C_1.SUPPED == "𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") + assertEqualStrings(c.nulled)("X\u0000ABC") // "X\000ABC" in java source + assertEqualStrings(c.supped)("𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") - assert(C_1.NULLED.size == "XYABC".size) - assert(C_1.SUPPED.codePointCount(0, C_1.SUPPED.length) == 8) + assertEqualStrings(C_1.NULLED)("X\u0000ABC") // "X\000ABC" in java source + assertEqualStrings(C_1.SUPPED)("𐒈𐒝𐒑𐒛𐒐𐒘𐒕𐒖") } diff --git a/test/junit/scala/tools/testkit/AssertUtilTest.scala b/test/junit/scala/tools/testkit/AssertUtilTest.scala index 98e2c0308553..90e98e1598e3 100644 --- a/test/junit/scala/tools/testkit/AssertUtilTest.scala +++ b/test/junit/scala/tools/testkit/AssertUtilTest.scala @@ -110,4 +110,10 @@ class AssertUtilTest { assertEquals(1, sut.errors.size) assertEquals(0, sut.errors.head._2.getSuppressed.length) } + + /** TODO + @Test def `hexdump is supplementary-aware`: Unit = { + assertEquals("00000000 f0 90 90 80 |𐐀.|", hexdump("\ud801\udc00").next()) + } + */ }