Skip to content

Commit

Permalink
treat unicode escapes as any other escape
Browse files Browse the repository at this point in the history
except in triple-quoted strings and raw interpolations
for those, emit deprecation warnings, but keep the current
behaviour of processing unicode escapes
  • Loading branch information
Martijn Hoekstra committed Aug 2, 2019
1 parent 4854c5d commit 97ba959
Show file tree
Hide file tree
Showing 23 changed files with 250 additions and 149 deletions.
1 change: 1 addition & 0 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ val mimaFilterSettings = Seq(
ProblemFilters.exclude[MissingClassProblem]("scala.reflect.runtime.JavaMirrors$JavaMirror$typeTagCache$"),
ProblemFilters.exclude[DirectMissingMethodProblem]("scala.reflect.api.TypeTags.TypeTagImpl"),
ProblemFilters.exclude[DirectMissingMethodProblem]("scala.reflect.api.Universe.TypeTagImpl"),
ProblemFilters.exclude[DirectMissingMethodProblem]("scala.StringContext.processUnicode")
),
)

Expand Down
47 changes: 39 additions & 8 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,7 @@ trait Scanners extends ScannersCommon {

/** append Unicode character to "cbuf" buffer
*/
protected def putChar(c: Char): Unit = {
// assert(cbuf.size < 10000, cbuf)
cbuf.append(c)
}
protected def putChar(c: Char): Unit = cbuf.append(c)

/** Determines whether this scanner should emit identifier deprecation warnings,
* e.g. when seeing `macro` or `then`, which are planned to become keywords in future versions of Scala.
Expand Down Expand Up @@ -573,7 +570,7 @@ trait Scanners extends ScannersCommon {
charLitOr(() => getIdentRest())
else if (isOperatorPart(ch) && (ch != '\\'))
charLitOr(() => getOperatorRest())
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) {
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
val isEmptyCharLit = (ch == '\'')
getLitChar()
if (ch == '\'') {
Expand Down Expand Up @@ -763,11 +760,21 @@ trait Scanners extends ScannersCommon {

private def unclosedStringLit(): Unit = syntaxError("unclosed string literal")

/** Replaces the `\uXXXX` escapes in the current `strVal` with the characters they denote.
 *
 *  Does nothing when `strVal` is null. A malformed escape is reported as a syntax
 *  error and leaves `strVal` untouched, rather than letting the exception thrown by
 *  `StringContext.processUnicode` escape from the scanner.
 *
 *  @param warn whether to emit a deprecation warning when at least one escape was replaced
 */
private def replaceUnicodeEscapes(warn: Boolean): Unit =
  if (strVal != null) {
    try {
      val replaced = StringContext.processUnicode(strVal)
      if (warn && replaced != strVal) {
        deprecationWarning("Unicode escapes in triple quoted strings and raw interpolations are deprecated, use the literal character instead" , since="2.13.1")
      }
      strVal = replaced
    } catch {
      // processUnicode signals a malformed escape by throwing; surface it as a
      // regular scanner error instead
      case e: StringContext.InvalidEscapeException => syntaxError(e.getMessage)
    }
  }

@tailrec private def getRawStringLit(): Unit = {
if (ch == '\"') {
nextRawChar()
if (isTripleQuote()) {
setStrVal()
replaceUnicodeEscapes(true)
token = STRINGLIT
} else
getRawStringLit()
Expand Down Expand Up @@ -831,7 +838,7 @@ trait Scanners extends ScannersCommon {
syntaxError(s"invalid string interpolation $$$ch, expected: $$$$, $$identifier or $${expression}")
}
} else {
val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF)))
val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
if (isUnclosedLiteral) {
if (multiLine)
incompleteInputError("unclosed multi-line string literal")
Expand Down Expand Up @@ -903,6 +910,7 @@ trait Scanners extends ScannersCommon {
case '\"' => putChar('\"')
case '\'' => putChar('\'')
case '\\' => putChar('\\')
case 'u' => getUEscape()
case _ => invalidEscape()
}
nextChar()
Expand All @@ -912,13 +920,37 @@ trait Scanners extends ScannersCommon {
nextChar()
}

/** Reads the remainder of a unicode escape: any number of `u` characters followed by
 *  four hexadecimal digits, starting at the current character `ch`.
 *
 *  On success the decoded UTF-16 code unit is appended via `putChar` and `ch` is left
 *  on the last digit. Reading stops at the first character that is not a hexadecimal
 *  digit: the error is reported once through `invalidUnicodeEscape`, no further input
 *  is consumed, and no partially decoded code unit is appended.
 */
private def getUEscape(): Unit = {
  while (ch == 'u') nextChar()
  var codepoint = 0
  var digitsRead = 0
  var valid = true
  while (valid && digitsRead < 4) {
    // the first digit is already in `ch`; advance only for the following ones
    if (digitsRead > 0) nextChar()
    val digit = digit2int(ch, 16)
    digitsRead += 1
    if (digit >= 0) codepoint = (codepoint << 4) + digit
    else {
      // stop here so that the rest of the literal (e.g. its closing quote) is
      // not swallowed while looking for the remaining digits
      valid = false
      invalidUnicodeEscape(digitsRead)
    }
  }
  if (valid) putChar(codepoint.toChar)
}


/** Reports an "invalid escape character" syntax error one position before the
 *  current `charOffset`, then appends the current character `ch` unchanged.
 */
protected def invalidEscape(): Unit = {
  val errorOffset = charOffset - 1
  syntaxError(errorOffset, "invalid escape character")
  putChar(ch)
}

/** Reports an "invalid unicode escape" syntax error `n` positions before the current
 *  `charOffset`, then appends the current character `ch` unchanged.
 *
 *  @param n how far behind `charOffset` the error is reported
 */
protected def invalidUnicodeEscape(n: Int): Unit = {
  val errorOffset = charOffset - n
  syntaxError(errorOffset, "invalid unicode escape")
  putChar(ch)
}

private def getLitChars(delimiter: Char) = {
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF))
getLitChar()
}

Expand Down Expand Up @@ -1315,7 +1347,6 @@ trait Scanners extends ScannersCommon {
*/
class SourceFileScanner(val source: SourceFile) extends Scanner {
val buf = source.content
override val decodeUni: Boolean = !settings.nouescape

// suppress warnings, throw exception on errors
def deprecationWarning(off: Offset, msg: String, since: String): Unit = ()
Expand Down
50 changes: 0 additions & 50 deletions src/compiler/scala/tools/nsc/util/CharArrayReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,11 @@ trait CharArrayReaderData {
/** The start offset of the line before the current one */
var lastLineStartOffset: Int = 0

protected var lastUnicodeOffset = -1

def copyFrom(cd: CharArrayReaderData): this.type = {
this.ch = cd.ch
this.charOffset = cd.charOffset
this.lineStartOffset = cd.lineStartOffset
this.lastLineStartOffset = cd.lastLineStartOffset
this.lastUnicodeOffset = cd.lastUnicodeOffset
this
}
}
Expand All @@ -44,14 +41,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>

val buf: Array[Char]

def decodeUni: Boolean = true

/** An error routine to call on bad unicode escapes \\uxxxx. */
protected def error(offset: Int, msg: String): Unit

/** Is last character a unicode escape \\uxxxx? */
def isUnicodeEscape = charOffset == lastUnicodeOffset

/** Advance one character; reducing CR;LF pairs to just LF */
final def nextChar(): Unit = {
if (charOffset >= buf.length) {
Expand All @@ -60,7 +49,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
val c = buf(charOffset)
ch = c
charOffset += 1
if (c == '\\') potentialUnicode()
if (ch < ' ') {
skipCR()
potentialLineEnd()
Expand All @@ -79,38 +67,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
val c = buf(charOffset)
ch = c
charOffset += 1
if (c == '\\') potentialUnicode()
}
}

/** Interpret \\uxxxx escapes */
private def potentialUnicode() = {
def evenSlashPrefix: Boolean = {
var p = charOffset - 2
while (p >= 0 && buf(p) == '\\') p -= 1
(charOffset - p) % 2 == 0
}
def udigit: Int = {
if (charOffset >= buf.length) {
// Since the positioning code is very insistent about throwing exceptions,
// we have to decrement the position so our error message can be seen, since
// we are one past EOF. This happens with e.g. val x = \ u 1 <EOF>
error(charOffset - 1, "incomplete unicode escape")
SU
}
else {
val d = digit2int(buf(charOffset), 16)
if (d >= 0) charOffset += 1
else error(charOffset, "error in unicode escape")
d
}
}
if (charOffset < buf.length && buf(charOffset) == 'u' && decodeUni && evenSlashPrefix) {
do charOffset += 1
while (charOffset < buf.length && buf(charOffset) == 'u')
val code = udigit << 12 | udigit << 8 | udigit << 4 | udigit
lastUnicodeOffset = charOffset
ch = code.toChar
}
}

Expand All @@ -121,9 +77,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
case LF =>
charOffset += 1
ch = LF
case '\\' =>
if (lookaheadReader.getu == LF)
potentialUnicode()
case _ =>
}

Expand All @@ -142,10 +95,7 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
val buf = self.buf
charOffset = self.charOffset
ch = self.ch
override def decodeUni = self.decodeUni
def error(offset: Int, msg: String) = self.error(offset, msg)
/** A mystery why CharArrayReader.nextChar() returns Unit */
def getc() = { nextChar() ; ch }
def getu() = { require(buf(charOffset) == '\\') ; ch = '\\' ; charOffset += 1 ; potentialUnicode() ; ch }
}
}
20 changes: 9 additions & 11 deletions src/compiler/scala/tools/reflect/FastStringInterpolator.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,16 @@ trait FastStringInterpolator extends FormatInterpolator {
parts.forall(treeInfo.isLiteralString) &&
parts.length == (args.length + 1) =>

val treated =
if (isRaw) parts
else
try
parts.mapConserve { case lit@Literal(Constant(stringVal: String)) =>
val k = Constant(StringContext.processEscapes(stringVal))
// To avoid the backlash of backslash, taken literally by Literal, escapes are processed strictly (scala/bug#11196)
treeCopy.Literal(lit, k).setType(ConstantType(k))
}
catch {
case e: StringContext.InvalidEscapeException => c.abort(parts.head.pos.withShift(e.index), e.getMessage)
val treated =
try
parts.mapConserve { case lit@Literal(Constant(stringVal: String)) =>
val k = Constant(if(isRaw) StringContext.processUnicode(stringVal) else StringContext.processEscapes(stringVal))
// To avoid the backlash of backslash, taken literally by Literal, escapes are processed strictly (scala/bug#11196)
treeCopy.Literal(lit, k).setType(ConstantType(k))
}
catch {
case e: StringContext.InvalidEscapeException => c.abort(parts.head.pos.withShift(e.index), e.getMessage)
}

val argsIndexed = args.toVector
val concatArgs = collection.mutable.ListBuffer[Tree]()
Expand Down
72 changes: 57 additions & 15 deletions src/library/scala/StringContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -322,11 +322,38 @@ object StringContext {
class InvalidEscapeException(str: String, val index: Int) extends IllegalArgumentException(
s"""invalid escape ${
require(index >= 0 && index < str.length)
val ok = """[\b, \t, \n, \f, \r, \\, \", \']"""
val ok = s"""[\\b, \\t, \\n, \\f, \\r, \\, \\", \\', \\uxxxx]"""
if (index == str.length - 1) "at terminal" else s"'\\${str(index + 1)}' not one of $ok at"
} index $index in "$str". Use \\\\ for literal \\."""
)

/** Decodes a single unicode escape body from `src`: one or more `u` characters
 *  followed by exactly four hexadecimal digits.
 *
 *  @param src        the string containing the escape
 *  @param startindex index in `src` at which the `u` characters begin
 *  @return the decoded UTF-16 code unit, paired with the number of characters
 *          consumed from `startindex` (the `u`s plus the four digits)
 *  @throws InvalidEscapeException if `src` ends before four digits have been read,
 *          or if a character that is not a hexadecimal digit is found in their place
 */
private[this] def readUEscape(src: String, startindex: Int): (Char, Int) = {
  val len = src.length()
  def loop(uindex: Int): (Char, Int) = {
    def loopCP(dindex: Int, codepoint: Int): (Char, Int) = {
      //supports BMP + surrogate escapes
      //but only in four hex-digit code units (uxxxx)
      if (dindex >= 4) {
        val usRead = uindex - startindex
        val digitsRead = dindex
        (codepoint.toChar, usRead + digitsRead)
      }
      // the string ended in the middle of the digits: report it as an invalid
      // escape instead of indexing past the end of `src`
      else if (uindex + dindex >= len) throw new InvalidEscapeException(src, startindex)
      else {
        val ch = src(dindex + uindex)
        // asDigit accepts letters beyond 'f' as well, hence the upper bound
        val e = ch.asDigit
        if (e >= 0 && e <= 15) loopCP(dindex + 1, (codepoint << 4) + e)
        else throw new InvalidEscapeException(src, startindex)
      }
    }
    // `uindex == len` is already out of bounds for `src(uindex)`
    if (uindex >= len) throw new InvalidEscapeException(src, startindex)
    //allow one or more `u` characters between the
    //backslash and the code unit
    else if (src(uindex) == 'u') loop(uindex + 1)
    else loopCP(0, 0)
  }
  loop(startindex)
}

/** Expands standard Scala escape sequences in a string.
* Escape sequences are:
* control: `\b`, `\t`, `\n`, `\f`, `\r`
Expand All @@ -347,18 +374,36 @@ object StringContext {
* @return The string with all escape sequences expanded.
*/
def processEscapes(str: String): String = {
val len = str.length
// replace escapes with given first escape
def replace(first: Int): String = {
val b = new JLSBuilder
str indexOf '\\' match {
case -1 => str
case i => replace(str, i, false)
}
}

/** Expands the unicode escapes in `str`, leaving all other escape sequences as they are.
 *
 *  @param str the string to process
 *  @return `str` itself when it contains no `\u` sequence, otherwise the result of
 *          `replace` in unicode-only mode, started at the first `\u`
 */
protected[scala] def processUnicode(str: String): String = {
  val firstEscape = str.indexOf("\\u")
  if (firstEscape == -1) str
  else replace(str, firstEscape, unicodeOnly = true)
}

// replace escapes with given first escape
private[this] def replace(str: String, first: Int, unicodeOnly: Boolean): String = {
val len = str.length()
val b = new JLSBuilder
// append replacement starting at index `i`, with `next` backslash
@tailrec def loop(i: Int, next: Int): String = {
if (next >= 0) {
//require(str(next) == '\\')
if (next > i) b.append(str, i, next)
if (next >= 0) {
//require(str(next) == '\\')
if (next > i) b.append(str, i, next)
var idx = next + 1
if (idx >= len) throw new InvalidEscapeException(str, next)
val c = str(idx) match {
case 'u' => 'u'
case chr if unicodeOnly => {
b.append('\\')
chr
}
case 'b' => '\b'
case 't' => '\t'
case 'n' => '\n'
Expand All @@ -369,8 +414,10 @@ object StringContext {
case '\\' => '\\'
case _ => throw new InvalidEscapeException(str, next)
}
idx += 1 // advance
b append c
val (ch, advance) = if (c == 'u') readUEscape(str, idx)
else (c, 1)
idx += advance
b append ch
loop(idx, str.indexOf('\\', idx))
} else {
if (i < len) b.append(str, i, len)
Expand All @@ -379,11 +426,6 @@ object StringContext {
}
loop(0, first)
}
str indexOf '\\' match {
case -1 => str
case i => replace(i)
}
}

def standardInterpolator(process: String => String, args: scala.collection.Seq[Any], parts: Seq[String]): String = {
StringContext.checkLengths(args, parts)
Expand Down
12 changes: 6 additions & 6 deletions test/files/neg/t4584.check
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
t4584.scala:1: error: error in unicode escape
class A { val \u2
^
t4584.scala:1: error: illegal character '\uffff'
class A { val \u2
^
t4584.scala:1: error: unclosed multi-line string literal
class A { val x = """\u2
^
t4584.scala:1: error: illegal start of simple expression
class A { val x = """\u2
^
two errors found
2 changes: 1 addition & 1 deletion test/files/neg/t4584.scala
Original file line number Diff line number Diff line change
@@ -1 +1 @@
class A { val \u2
class A { val x = """\u2
2 changes: 1 addition & 1 deletion test/files/neg/t6631.check
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
t6631.scala:2: error: invalid escape '\x' not one of [\b, \t, \n, \f, \r, \\, \", \'] at index 0 in "\x". Use \\ for literal \.
t6631.scala:2: error: invalid escape '\x' not one of [\b, \t, \n, \f, \r, \, \", \', \uxxxx] at index 0 in "\x". Use \\ for literal \.
s"""\x"""
^
one error found
8 changes: 4 additions & 4 deletions test/files/neg/t8015-ffb.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
//

trait G {
val c: Char = '\u000a' // disallowed!
def x\u000d\u000a = 9 // as nl
val c: Char = '\u000a' // allowed!
def x = 9
def y() = x
def z() = {
y()\u000a() // was Int does not take parameters
y()() // was Int does not take parameters
}
def v = y()\u000c() // was Int does not take parameters
def v = y()() // was Int does not take parameters
def w = { x () } // ^L is colored blue on this screen, hardly visible
}

0 comments on commit 97ba959

Please sign in to comment.