scala · martijnhoekstra · May 15, 2018
diff --git a/spec/01-lexical-syntax.md b/spec/01-lexical-syntax.md
@@ -14,14 +14,6 @@ otherwise mentioned, the following descriptions of Scala tokens refer
 to _Scala mode_, and literal characters ‘c’ refer to the ASCII fragment
 `\u0000` – `\u007F`.
 
-In Scala mode, _Unicode escapes_ are replaced by the corresponding
-Unicode character with the given hexadecimal code.
-
-```ebnf
-UnicodeEscape ::= ‘\’ ‘u’ {‘u’} hexDigit hexDigit hexDigit hexDigit
-hexDigit      ::= ‘0’ | … | ‘9’ | ‘A’ | … | ‘F’ | ‘a’ | … | ‘f’
-```
-
 <!--
 TODO scala/bug#4583: UnicodeEscape used to allow additional backslashes,
 and there is something in the code `evenSlashPrefix` that alludes to it,

diff --git a/spec/13-syntax-summary.md b/spec/13-syntax-summary.md
@@ -30,6 +30,7 @@ delim            ::=  ‘`’ | ‘'’ | ‘"’ | ‘.’ | ‘;’ | ‘,’
 opchar           ::= // printableChar not matched by (whiteSpace | upper | lower |
                      // letter | digit | paren | delim | opchar | Unicode_Sm | Unicode_So)
 printableChar    ::= // all characters in [\u0020, \u007F] inclusive
+escapeSeq        ::= UnicodeEscape | charEscapeSeq
 charEscapeSeq    ::= ‘\’ (‘b’ | ‘t’ | ‘n’ | ‘f’ | ‘r’ | ‘"’ | ‘'’ | ‘\’)
 
 op               ::=  opchar {opchar}

diff --git a/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala b/src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
@@ -235,6 +235,17 @@ trait Scanners extends ScannersCommon {
       cbuf.clear()
     }
 
+    /** Set `strVal` from the buffer with unicode escapes processed (warning if any were replaced), then clear the buffer. */
+    private def setUEscapeStrVal(): Unit = {
+      strVal = cbuf.toString
+      val replacement = StringContext.processUnicodeEscapes(strVal) // NOTE(review): RefChecks guards this call against MalformedUnicodeEscapeException; confirm it cannot escape here
+      if(strVal != replacement) { // the text changed, so at least one escape was replaced
+        deprecationWarning("Unicode escapes in triple quoted strings are deprecated", "2.13.0")
+        strVal = replacement
+      }
+      cbuf.clear()
+    }
+
     /** a stack of tokens which indicates whether line-ends can be statement separators
      *  also used for keeping track of nesting levels.
      *  We keep track of the closing symbol of a region. This can be
@@ -566,7 +577,7 @@ trait Scanners extends ScannersCommon {
               charLitOr(() => getIdentRest())
             else if (isOperatorPart(ch) && (ch != '\\'))
               charLitOr(() => getOperatorRest())
-            else if (!isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) {
+            else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
               val isEmptyCharLit = (ch == '\'')
               getLitChar()
               if (ch == '\'') {
@@ -756,7 +767,7 @@ trait Scanners extends ScannersCommon {
       if (ch == '\"') {
         nextRawChar()
         if (isTripleQuote()) {
-          setStrVal()
+          setUEscapeStrVal()
           token = STRINGLIT
         } else
           getRawStringLit()
@@ -820,7 +831,7 @@ trait Scanners extends ScannersCommon {
           syntaxError(s"invalid string interpolation $$$ch, expected: $$$$, $$identifier or $${expression}")
         }
       } else {
-        val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF)))
+        val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
         if (isUnclosedLiteral) {
           if (multiLine)
             incompleteInputError("unclosed multi-line string literal")
@@ -867,6 +878,7 @@ trait Scanners extends ScannersCommon {
       if (ch == '\\') {
         nextChar()
         if ('0' <= ch && ch <= '7') {
+          //octal escape
           val start = charOffset - 2
           val leadch: Char = ch
           var oct: Int = digit2int(ch, 8)
@@ -882,7 +894,35 @@ trait Scanners extends ScannersCommon {
           val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
           syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
           putChar(oct.toChar)
-        } else {
+        } else if(ch == 'u'){
+          //unicode escape
+          while(ch == 'u') nextChar() //as many u's as you like
+          //four hexdigits: only BMP code points are supported.
+          var codepoint = 0
+          var digit = digit2int(ch, 16)
+          if(digit >= 0) {
+            codepoint += (0x1000 * digit)
+            nextChar()
+            digit = digit2int(ch, 16)
+            if(digit >= 0) {
+              codepoint += (0x100 * digit)
+              nextChar()
+              digit = digit2int(ch, 16)
+              if(digit >= 0) {
+                codepoint += (0x10 * digit)
+                nextChar()
+                digit = digit2int(ch, 16)
+                if(digit >= 0) {
+                  codepoint += digit
+                  val ch = codepoint.asInstanceOf[Char]
+                  putChar(ch)
+                  nextChar()
+                } else invalidUnicodeEscape(4)
+              } else invalidUnicodeEscape(3)
+            } else invalidUnicodeEscape(2)
+          } else invalidUnicodeEscape(1)
+        }
+        else {
           ch match {
             case 'b'  => putChar('\b')
             case 't'  => putChar('\t')
@@ -906,8 +946,13 @@ trait Scanners extends ScannersCommon {
       putChar(ch)
     }
 
+    protected def invalidUnicodeEscape(n: Int): Unit = { // n is 1..4 at the call sites: which hex digit of the escape failed to parse
+      syntaxError(charOffset -n, "invalid unicode escape") // the error is reported n characters before charOffset
+      putChar(ch) // after reporting, the current ch is still handed to putChar
+    }
+
     private def getLitChars(delimiter: Char) = {
-      while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
+      while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF)) // stop at the delimiter, at end of input, or at SU/CR/LF
         getLitChar()
     }
 
@@ -1279,7 +1324,6 @@ trait Scanners extends ScannersCommon {
    */
   class SourceFileScanner(val source: SourceFile) extends Scanner {
     val buf = source.content
-    override val decodeUni: Boolean = !settings.nouescape
 
     // suppress warnings, throw exception on errors
     def deprecationWarning(off: Offset, msg: String, since: String): Unit = ()

diff --git a/src/compiler/scala/tools/nsc/typechecker/RefChecks.scala b/src/compiler/scala/tools/nsc/typechecker/RefChecks.scala
@@ -1557,21 +1557,21 @@ abstract class RefChecks extends Transform {
                 lits.forall(lit => treeInfo.isLiteralString(lit)) &&
                 lits.length == (args.length + 1) =>
               val isRaw = sym == rd.StringContext_raw
-              if (isRaw) Some((lits, args))
-              else {
-                try {
-                  val treated = lits.mapConserve { lit =>
-                    val stringVal = lit.asInstanceOf[Literal].value.stringValue
-                    treeCopy.Literal(lit, Constant(StringContext.processEscapes(stringVal)))
+              val escape = if(isRaw) StringContext.processUnicodeEscapes _ else StringContext.processEscapes _
+              try {
+                  val treated = lits.mapConserve { lit => { // rebuild each literal part with its escapes processed
+                      val stringVal = lit.asInstanceOf[Literal].value.stringValue
+                      val escapedString = escape(stringVal) // processed once; reused for the check and for the new constant
+                      if(isRaw && escapedString != stringVal) reporter.warning(lit.pos, "Unicode escapes in raw interpolators are deprecated since scala 2.13.0") // warn at the offending literal, not at the `raw` method symbol
+                      treeCopy.Literal(lit, Constant(escapedString))
+                    }
                   }
                   Some((treated, args))
                 } catch {
-                  case _: StringContext.InvalidEscapeException =>
-                    None
+                  case _: StringContext.MalformedUnicodeEscapeException => None
+                  case _: StringContext.InvalidEscapeException =>  None
                 }
-              }
             case _ => None
-
           }
         } else None
       }

diff --git a/src/compiler/scala/tools/nsc/util/CharArrayReader.scala b/src/compiler/scala/tools/nsc/util/CharArrayReader.scala
@@ -21,14 +21,11 @@ trait CharArrayReaderData {
   /** The start offset of the line before the current one */
   var lastLineStartOffset: Int = 0
 
-  protected var lastUnicodeOffset = -1
-
   def copyFrom(cd: CharArrayReaderData): this.type = {
     this.ch = cd.ch
     this.charOffset = cd.charOffset
     this.lineStartOffset = cd.lineStartOffset
     this.lastLineStartOffset = cd.lastLineStartOffset
-    this.lastUnicodeOffset = cd.lastUnicodeOffset
     this
   }
 }
@@ -37,14 +34,9 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
 
   val buf: Array[Char]
 
-  def decodeUni: Boolean = true
-
   /** An error routine to call on bad unicode escapes \\uxxxx. */
   protected def error(offset: Int, msg: String): Unit
 
-  /** Is last character a unicode escape \\uxxxx? */
-  def isUnicodeEscape = charOffset == lastUnicodeOffset
-
   /** Advance one character; reducing CR;LF pairs to just LF */
   final def nextChar(): Unit = {
     if (charOffset >= buf.length) {
@@ -53,7 +45,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
       val c = buf(charOffset)
       ch = c
       charOffset += 1
-      if (c == '\\') potentialUnicode()
       if (ch < ' ') {
         skipCR()
         potentialLineEnd()
@@ -72,38 +63,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
       val c = buf(charOffset)
       ch = c
       charOffset += 1
-      if (c == '\\') potentialUnicode()
-    }
-  }
-
-  /** Interpret \\uxxxx escapes */
-  private def potentialUnicode() = {
-    def evenSlashPrefix: Boolean = {
-      var p = charOffset - 2
-      while (p >= 0 && buf(p) == '\\') p -= 1
-      (charOffset - p) % 2 == 0
-    }
-    def udigit: Int = {
-      if (charOffset >= buf.length) {
-        // Since the positioning code is very insistent about throwing exceptions,
-        // we have to decrement the position so our error message can be seen, since
-        // we are one past EOF.  This happens with e.g. val x = \ u 1 <EOF>
-        error(charOffset - 1, "incomplete unicode escape")
-        SU
-      }
-      else {
-        val d = digit2int(buf(charOffset), 16)
-        if (d >= 0) charOffset += 1
-        else error(charOffset, "error in unicode escape")
-        d
-      }
-    }
-    if (charOffset < buf.length && buf(charOffset) == 'u' && decodeUni && evenSlashPrefix) {
-      do charOffset += 1
-      while (charOffset < buf.length && buf(charOffset) == 'u')
-      val code = udigit << 12 | udigit << 8 | udigit << 4 | udigit
-      lastUnicodeOffset = charOffset
-      ch = code.toChar
     }
   }
 
@@ -114,9 +73,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
         case LF =>
           charOffset += 1
           ch = LF
-        case '\\' =>
-          if (lookaheadReader.getu == LF)
-            potentialUnicode()
         case _ =>
       }
 
@@ -135,10 +91,8 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
     val buf = self.buf
     charOffset = self.charOffset
     ch = self.ch
-    override def decodeUni = self.decodeUni
     def error(offset: Int, msg: String) = self.error(offset, msg)
     /** A mystery why CharArrayReader.nextChar() returns Unit */
     def getc() = { nextChar() ; ch }
-    def getu() = { require(buf(charOffset) == '\\') ; ch = '\\' ; charOffset += 1 ; potentialUnicode() ; ch }
   }
 }