treat unicode escapes as any other escape

except in triple-quoted strings and raw interpolations; for those, emit deprecation warnings but keep the current behaviour of processing unicode escapes
scala · Aug 2, 2019 · 97ba959 · 97ba959
1 parent 4854c5d
commit 97ba959
Show file tree

Hide file tree

Showing 23 changed files with 250 additions and 149 deletions.
diff --git a/build.sbt b/build.sbt
@@ -94,6 +94,7 @@ val mimaFilterSettings = Seq(
     ProblemFilters.exclude[MissingClassProblem]("scala.reflect.runtime.JavaMirrors$JavaMirror$typeTagCache$"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("scala.reflect.api.TypeTags.TypeTagImpl"),
     ProblemFilters.exclude[DirectMissingMethodProblem]("scala.reflect.api.Universe.TypeTagImpl"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("scala.StringContext.processUnicode")
   ),
 )
 

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
@@ -206,10 +206,7 @@ trait Scanners extends ScannersCommon {
 
     /** append Unicode character to "cbuf" buffer
      */
-    protected def putChar(c: Char): Unit = {
-//      assert(cbuf.size < 10000, cbuf)
-      cbuf.append(c)
-    }
+    protected def putChar(c: Char): Unit = cbuf.append(c)
 
     /** Determines whether this scanner should emit identifier deprecation warnings,
      *  e.g. when seeing `macro` or `then`, which are planned to become keywords in future versions of Scala.
@@ -573,7 +570,7 @@ trait Scanners extends ScannersCommon {
               charLitOr(() => getIdentRest())
             else if (isOperatorPart(ch) && (ch != '\\'))
               charLitOr(() => getOperatorRest())
-            else if (!isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) {
+            else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
               val isEmptyCharLit = (ch == '\'')
               getLitChar()
               if (ch == '\'') {
@@ -763,11 +760,21 @@ trait Scanners extends ScannersCommon {
 
     private def unclosedStringLit(): Unit = syntaxError("unclosed string literal")
 
+    /** Expands unicode escapes in the current string value `strVal`, if one is set.
+     *
+     *  @param warn when true, a deprecation warning is issued if the expansion
+     *              changed the string
+     */
+    private def replaceUnicodeEscapes(warn: Boolean): Unit = {
+      val original = strVal
+      if (original != null) {
+        val expanded = StringContext.processUnicode(original)
+        val changed = expanded != original
+        if (warn && changed)
+          deprecationWarning("Unicode escapes in triple quoted strings and raw interpolations are deprecated, use the literal character instead" , since="2.13.1")
+        strVal = expanded
+      }
+    }
+
     @tailrec private def getRawStringLit(): Unit = {
       if (ch == '\"') {
         nextRawChar()
         if (isTripleQuote()) {
           setStrVal()
+          replaceUnicodeEscapes(true)
           token = STRINGLIT
         } else
           getRawStringLit()
@@ -831,7 +838,7 @@ trait Scanners extends ScannersCommon {
           syntaxError(s"invalid string interpolation $$$ch, expected: $$$$, $$identifier or $${expression}")
         }
       } else {
-        val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF)))
+        val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
         if (isUnclosedLiteral) {
           if (multiLine)
             incompleteInputError("unclosed multi-line string literal")
@@ -903,6 +910,7 @@ trait Scanners extends ScannersCommon {
             case '\"' => putChar('\"')
             case '\'' => putChar('\'')
             case '\\' => putChar('\\')
+            case 'u'  => getUEscape()
             case _    => invalidEscape()
           }
           nextChar()
@@ -912,13 +920,37 @@ trait Scanners extends ScannersCommon {
         nextChar()
       }
 
+    /** Reads a unicode escape inside a literal and appends the decoded character.
+     *
+     *  Expects `ch` to be the first `u` following the backslash. Any number of
+     *  additional `u` characters is skipped, then exactly four hexadecimal digits
+     *  are read as one UTF-16 code unit.
+     *
+     *  On the first character that is not a hex digit, a single syntax error is
+     *  reported via `invalidUnicodeEscape` and scanning of the escape stops there,
+     *  so that no further input is consumed and no bogus character is appended.
+     *  In both cases `ch` is left on the last character examined.
+     */
+    private def getUEscape(): Unit = {
+      while (ch == 'u') nextChar()
+      var codepoint = 0
+      var digitsRead = 0
+      var valid = true
+      while (valid && digitsRead < 4) {
+        // the first digit is already in `ch`; advance only for the following ones
+        if (digitsRead > 0) nextChar()
+        val digit = digit2int(ch, 16)
+        digitsRead += 1
+        if (digit >= 0) codepoint = (codepoint << 4) + digit
+        else {
+          invalidUnicodeEscape(digitsRead)
+          valid = false
+        }
+      }
+      if (valid) putChar(codepoint.toChar)
+    }
+
+
     /** Reports an "invalid escape character" syntax error at the position just
      *  before the current offset, then appends the current character `ch` to
      *  the literal buffer.
      */
     protected def invalidEscape(): Unit = {
       syntaxError(charOffset - 1, "invalid escape character")
       putChar(ch)
     }
 
+    /** Reports an "invalid unicode escape" syntax error and appends the current
+     *  character `ch` to the literal buffer.
+     *
+     *  @param n how far back from the current `charOffset` the error is positioned
+     */
+    protected def invalidUnicodeEscape(n: Int): Unit = {
+      val errorOffset = charOffset - n
+      syntaxError(errorOffset, "invalid unicode escape")
+      putChar(ch)
+    }
+
     private def getLitChars(delimiter: Char) = {
-      while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
+      while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF))
         getLitChar()
     }
 
@@ -1315,7 +1347,6 @@ trait Scanners extends ScannersCommon {
    */
   class SourceFileScanner(val source: SourceFile) extends Scanner {
     val buf = source.content
-    override val decodeUni: Boolean = !settings.nouescape
 
     // suppress warnings, throw exception on errors
     def deprecationWarning(off: Offset, msg: String, since: String): Unit = ()

diff --git a/src/compiler/scala/tools/nsc/util/CharArrayReader.scala b/src/compiler/scala/tools/nsc/util/CharArrayReader.scala
@@ -28,14 +28,11 @@ trait CharArrayReaderData {
   /** The start offset of the line before the current one */
   var lastLineStartOffset: Int = 0
 
-  protected var lastUnicodeOffset = -1
-
   def copyFrom(cd: CharArrayReaderData): this.type = {
     this.ch = cd.ch
     this.charOffset = cd.charOffset
     this.lineStartOffset = cd.lineStartOffset
     this.lastLineStartOffset = cd.lastLineStartOffset
-    this.lastUnicodeOffset = cd.lastUnicodeOffset
     this
   }
 }
@@ -44,14 +41,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
 
   val buf: Array[Char]
 
-  def decodeUni: Boolean = true
-
-  /** An error routine to call on bad unicode escapes \\uxxxx. */
-  protected def error(offset: Int, msg: String): Unit
-
-  /** Is last character a unicode escape \\uxxxx? */
-  def isUnicodeEscape = charOffset == lastUnicodeOffset
-
   /** Advance one character; reducing CR;LF pairs to just LF */
   final def nextChar(): Unit = {
     if (charOffset >= buf.length) {
@@ -60,7 +49,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
       val c = buf(charOffset)
       ch = c
       charOffset += 1
-      if (c == '\\') potentialUnicode()
       if (ch < ' ') {
         skipCR()
         potentialLineEnd()
@@ -79,38 +67,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
       val c = buf(charOffset)
       ch = c
       charOffset += 1
-      if (c == '\\') potentialUnicode()
-    }
-  }
-
-  /** Interpret \\uxxxx escapes */
-  private def potentialUnicode() = {
-    def evenSlashPrefix: Boolean = {
-      var p = charOffset - 2
-      while (p >= 0 && buf(p) == '\\') p -= 1
-      (charOffset - p) % 2 == 0
-    }
-    def udigit: Int = {
-      if (charOffset >= buf.length) {
-        // Since the positioning code is very insistent about throwing exceptions,
-        // we have to decrement the position so our error message can be seen, since
-        // we are one past EOF.  This happens with e.g. val x = \ u 1 <EOF>
-        error(charOffset - 1, "incomplete unicode escape")
-        SU
-      }
-      else {
-        val d = digit2int(buf(charOffset), 16)
-        if (d >= 0) charOffset += 1
-        else error(charOffset, "error in unicode escape")
-        d
-      }
-    }
-    if (charOffset < buf.length && buf(charOffset) == 'u' && decodeUni && evenSlashPrefix) {
-      do charOffset += 1
-      while (charOffset < buf.length && buf(charOffset) == 'u')
-      val code = udigit << 12 | udigit << 8 | udigit << 4 | udigit
-      lastUnicodeOffset = charOffset
-      ch = code.toChar
     }
   }
 
@@ -121,9 +77,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
         case LF =>
           charOffset += 1
           ch = LF
-        case '\\' =>
-          if (lookaheadReader.getu == LF)
-            potentialUnicode()
         case _ =>
       }
 
@@ -142,10 +95,7 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
     val buf = self.buf
     charOffset = self.charOffset
     ch = self.ch
-    override def decodeUni = self.decodeUni
-    def error(offset: Int, msg: String) = self.error(offset, msg)
     /** A mystery why CharArrayReader.nextChar() returns Unit */
     def getc() = { nextChar() ; ch }
-    def getu() = { require(buf(charOffset) == '\\') ; ch = '\\' ; charOffset += 1 ; potentialUnicode() ; ch }
   }
 }
diff --git a/src/compiler/scala/tools/reflect/FastStringInterpolator.scala b/src/compiler/scala/tools/reflect/FastStringInterpolator.scala
@@ -29,18 +29,16 @@ trait FastStringInterpolator extends FormatInterpolator {
       parts.forall(treeInfo.isLiteralString) &&
       parts.length == (args.length + 1) =>
 
-      val treated =
-        if (isRaw) parts
-        else
-          try
-            parts.mapConserve { case lit@Literal(Constant(stringVal: String)) =>
-              val k = Constant(StringContext.processEscapes(stringVal))
-              // To avoid the backlash of backslash, taken literally by Literal, escapes are processed strictly (scala/bug#11196)
-              treeCopy.Literal(lit, k).setType(ConstantType(k))
-            }
-          catch {
-            case e: StringContext.InvalidEscapeException => c.abort(parts.head.pos.withShift(e.index), e.getMessage)
+      val treated = 
+        try
+          parts.mapConserve { case lit@Literal(Constant(stringVal: String)) =>
+            val k = Constant(if(isRaw) StringContext.processUnicode(stringVal) else StringContext.processEscapes(stringVal))
+            // To avoid the backlash of backslash, taken literally by Literal, escapes are processed strictly (scala/bug#11196)
+            treeCopy.Literal(lit, k).setType(ConstantType(k))
           }
+        catch {
+          case e: StringContext.InvalidEscapeException => c.abort(parts.head.pos.withShift(e.index), e.getMessage)
+        }
 
       val argsIndexed = args.toVector
       val concatArgs = collection.mutable.ListBuffer[Tree]()

diff --git a/src/library/scala/StringContext.scala b/src/library/scala/StringContext.scala
@@ -322,11 +322,38 @@ object StringContext {
   class InvalidEscapeException(str: String, val index: Int) extends IllegalArgumentException(
     s"""invalid escape ${
       require(index >= 0 && index < str.length)
-      val ok = """[\b, \t, \n, \f, \r, \\, \", \']"""
+      val ok = s"""[\\b, \\t, \\n, \\f, \\r, \\, \\", \\', \\uxxxx]"""
       if (index == str.length - 1) "at terminal" else s"'\\${str(index + 1)}' not one of $ok at"
     } index $index in "$str". Use \\\\ for literal \\."""
   )
 
+  /** Decodes the unicode escape whose first `u` character is at `startindex`.
+   *
+   *  One or more `u` characters may follow the backslash; they must be followed
+   *  by exactly four hexadecimal digits, which form a single UTF-16 code unit
+   *  (so BMP characters and individual surrogates are supported).
+   *
+   *  @param src        the string containing the escape
+   *  @param startindex index in `src` of the first `u` after the backslash
+   *  @return the decoded character together with the number of characters
+   *          consumed, counted from `startindex`
+   *  @throws InvalidEscapeException if the string ends before the escape is
+   *          complete, or if one of the four digits is not a hex digit
+   */
+  private[this] def readUEscape(src: String, startindex: Int): (Char, Int) = {
+    val len = src.length()
+    def loop(uindex: Int): (Char, Int) = {
+      def loopCP(dindex: Int, codepoint: Int): (Char, Int) = {
+        if (dindex >= 4) {
+          val usRead = uindex - startindex
+          (codepoint.toChar, usRead + dindex)
+        }
+        // the string ends before four digits were read: report an invalid
+        // escape instead of failing with an index-out-of-bounds error
+        else if (uindex + dindex >= len) throw new InvalidEscapeException(src, startindex)
+        else {
+          val e = src(uindex + dindex).asDigit
+          if (e >= 0 && e <= 15) loopCP(dindex + 1, (codepoint << 4) + e)
+          else throw new InvalidEscapeException(src, startindex)
+        }
+      }
+      // `uindex == len` means the string ended while still skipping `u` characters
+      if (uindex >= len) throw new InvalidEscapeException(src, startindex)
+      // allow one or more `u` characters between the backslash and the code unit
+      else if (src(uindex) == 'u') loop(uindex + 1)
+      else loopCP(0, 0)
+    }
+    loop(startindex)
+  }
+
   /** Expands standard Scala escape sequences in a string.
    *  Escape sequences are:
    *   control: `\b`, `\t`, `\n`, `\f`, `\r`
@@ -347,18 +374,36 @@ object StringContext {
    *  @return The string with all escape sequences expanded.
    */
   def processEscapes(str: String): String = {
-    val len = str.length
-    // replace escapes with given first escape
-    def replace(first: Int): String = {
-      val b = new JLSBuilder
+    str indexOf '\\' match {
+      case -1 => str
+      case  i => replace(str, i, false)
+    }
+  }
+
+  /** Expands only the unicode escapes in `str`; every other backslash sequence
+   *  is copied to the result unchanged.
+   *
+   *  @param  str the string that may contain unicode escapes
+   *  @return `str` itself when it contains no backslash followed by `u`,
+   *          otherwise the string with its unicode escapes expanded
+   */
+  protected[scala] def processUnicode(str: String): String =
+    if (str.indexOf("\\u") < 0) str
+    // Start at the first backslash, not at the first backslash-`u` pair: `replace`
+    // then consumes backslashes pairwise, so an escaped backslash that happens to
+    // be followed by `u` is not mistaken for the start of a unicode escape.
+    else replace(str, str.indexOf('\\'), unicodeOnly = true)
+
+  // replace escapes with given first escape
+  private[this] def replace(str: String, first: Int, unicodeOnly: Boolean): String = {
+    val len = str.length()
+    val b = new JLSBuilder
       // append replacement starting at index `i`, with `next` backslash
       @tailrec def loop(i: Int, next: Int): String = {
-        if (next >= 0) {
-          //require(str(next) == '\\')
-          if (next > i) b.append(str, i, next)
+      if (next >= 0) {
+        //require(str(next) == '\\')
+        if (next > i) b.append(str, i, next)
           var idx = next + 1
           if (idx >= len) throw new InvalidEscapeException(str, next)
           val c = str(idx) match {
+            case 'u'  => 'u'
+            case chr if unicodeOnly => {
+              b.append('\\')
+              chr
+            }
             case 'b'  => '\b'
             case 't'  => '\t'
             case 'n'  => '\n'
@@ -369,8 +414,10 @@ object StringContext {
             case '\\' => '\\'
             case _    => throw new InvalidEscapeException(str, next)
           }
-          idx += 1       // advance
-          b append c
+          val (ch, advance) = if (c == 'u') readUEscape(str, idx)
+                              else (c, 1)
+          idx += advance
+          b append ch
           loop(idx, str.indexOf('\\', idx))
         } else {
           if (i < len) b.append(str, i, len)
@@ -379,11 +426,6 @@ object StringContext {
       }
       loop(0, first)
     }
-    str indexOf '\\' match {
-      case -1 => str
-      case  i => replace(i)
-    }
-  }
 
   def standardInterpolator(process: String => String, args: scala.collection.Seq[Any], parts: Seq[String]): String = {
     StringContext.checkLengths(args, parts)

diff --git a/test/files/neg/t4584.check b/test/files/neg/t4584.check
@@ -1,7 +1,7 @@
-t4584.scala:1: error: error in unicode escape
-class A { val \u2
-                 ^
-t4584.scala:1: error: illegal character '\uffff'
-class A { val \u2
-                ^
+t4584.scala:1: error: unclosed multi-line string literal
+class A { val x = """\u2
+                  ^
+t4584.scala:1: error: illegal start of simple expression
+class A { val x = """\u2
+                        ^
 two errors found
diff --git a/test/files/neg/t4584.scala b/test/files/neg/t4584.scala
@@ -1 +1 @@
-class A { val \u2
+class A { val x = """\u2
diff --git a/test/files/neg/t6631.check b/test/files/neg/t6631.check
@@ -1,4 +1,4 @@
-t6631.scala:2: error: invalid escape '\x' not one of [\b, \t, \n, \f, \r, \\, \", \'] at index 0 in "\x". Use \\ for literal \.
+t6631.scala:2: error: invalid escape '\x' not one of [\b, \t, \n, \f, \r, \, \", \', \uxxxx] at index 0 in "\x". Use \\ for literal \.
   s"""\x"""
       ^
 one error found
diff --git a/test/files/neg/t8015-ffb.scala b/test/files/neg/t8015-ffb.scala
@@ -2,12 +2,12 @@
 //
 
 trait G {
-  val c: Char = '\u000a'   // disallowed!
-  def x\u000d\u000a = 9    // as nl
+  val c: Char = '\u000a'   // allowed!
+  def x = 9
   def y() = x
   def z() = {
-    y()\u000a()           // was Int does not take parameters
+    y()()           // was Int does not take parameters
   }
-  def v = y()\u000c()     // was Int does not take parameters
+  def v = y()()     // was Int does not take parameters
   def w = { x() }       // ^L is colored blue on this screen, hardly visible
 }