Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unspecialcase unicode escapes [WIP] #6661

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions spec/01-lexical-syntax.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,6 @@ otherwise mentioned, the following descriptions of Scala tokens refer
to _Scala mode_, and literal characters ‘c’ refer to the ASCII fragment
`\u0000` – `\u007F`.

In Scala mode, _Unicode escapes_ are replaced by the corresponding
Unicode character with the given hexadecimal code.

```ebnf
UnicodeEscape ::= ‘\’ ‘u’ {‘u’} hexDigit hexDigit hexDigit hexDigit
hexDigit ::= ‘0’ | … | ‘9’ | ‘A’ | … | ‘F’ | ‘a’ | … | ‘f’
```

<!--
TODO scala/bug#4583: UnicodeEscape used to allow additional backslashes,
and there is something in the code `evenSlashPrefix` that alludes to it,
Expand Down
1 change: 1 addition & 0 deletions spec/13-syntax-summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ delim ::= ‘`’ | ‘'’ | ‘"’ | ‘.’ | ‘;’ | ‘,’
opchar ::= // printableChar not matched by (whiteSpace | upper | lower |
// letter | digit | paren | delim | opchar | Unicode_Sm | Unicode_So)
printableChar ::= // all characters in [\u0020, \u007F] inclusive
escapeSeq ::= UnicodeEscape | charEscapeSeq
charEscapeSeq ::= ‘\’ (‘b’ | ‘t’ | ‘n’ | ‘f’ | ‘r’ | ‘"’ | ‘'’ | ‘\’)

op ::= opchar {opchar}
Expand Down
56 changes: 50 additions & 6 deletions src/compiler/scala/tools/nsc/ast/parser/Scanners.scala
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,17 @@ trait Scanners extends ScannersCommon {
cbuf.clear()
}

/** Set `strVal` from the character buffer, expanding unicode escapes, and clear the buffer.
 *
 *  The buffered text is passed through `StringContext.processUnicodeEscapes`.
 *  If that changes the text, a deprecation warning is issued, because unicode
 *  escapes in triple quoted strings are deprecated.
 *
 *  A malformed escape is reported as a syntax error instead of letting the
 *  exception propagate out of the scanner; in that case `strVal` keeps the
 *  unprocessed text.
 *  NOTE(review): assumes `processUnicodeEscapes` signals malformed input with
 *  `StringContext.MalformedUnicodeEscapeException` -- confirm.
 */
private def setUEscapeStrVal(): Unit = {
  val rawText = cbuf.toString
  // Publish the raw text first so `strVal` is well-defined whatever happens below.
  strVal = rawText
  try {
    val expanded = StringContext.processUnicodeEscapes(rawText)
    if (expanded != rawText) {
      deprecationWarning("Unicode escapes in triple quoted strings are deprecated", "2.13.0")
      strVal = expanded
    }
  } catch {
    case e: StringContext.MalformedUnicodeEscapeException =>
      syntaxError(e.getMessage)
  }
  cbuf.clear()
}

/** a stack of tokens which indicates whether line-ends can be statement separators
* also used for keeping track of nesting levels.
* We keep track of the closing symbol of a region. This can be
Expand Down Expand Up @@ -566,7 +577,7 @@ trait Scanners extends ScannersCommon {
charLitOr(() => getIdentRest())
else if (isOperatorPart(ch) && (ch != '\\'))
charLitOr(() => getOperatorRest())
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape)) {
else if (!isAtEnd && (ch != SU && ch != CR && ch != LF)) {
val isEmptyCharLit = (ch == '\'')
getLitChar()
if (ch == '\'') {
Expand Down Expand Up @@ -756,7 +767,7 @@ trait Scanners extends ScannersCommon {
if (ch == '\"') {
nextRawChar()
if (isTripleQuote()) {
setStrVal()
setUEscapeStrVal()
token = STRINGLIT
} else
getRawStringLit()
Expand Down Expand Up @@ -820,7 +831,7 @@ trait Scanners extends ScannersCommon {
syntaxError(s"invalid string interpolation $$$ch, expected: $$$$, $$identifier or $${expression}")
}
} else {
val isUnclosedLiteral = !isUnicodeEscape && (ch == SU || (!multiLine && (ch == CR || ch == LF)))
val isUnclosedLiteral = (ch == SU || (!multiLine && (ch == CR || ch == LF)))
if (isUnclosedLiteral) {
if (multiLine)
incompleteInputError("unclosed multi-line string literal")
Expand Down Expand Up @@ -867,6 +878,7 @@ trait Scanners extends ScannersCommon {
if (ch == '\\') {
nextChar()
if ('0' <= ch && ch <= '7') {
//octal escape
val start = charOffset - 2
val leadch: Char = ch
var oct: Int = digit2int(ch, 8)
Expand All @@ -882,7 +894,35 @@ trait Scanners extends ScannersCommon {
val alt = if (oct == LF) "\\n" else "\\u%04x" format oct
syntaxError(start, s"octal escape literals are unsupported: use $alt instead")
putChar(oct.toChar)
} else {
} else if(ch == 'u'){
//unicode escape
while(ch == 'u') nextChar() //as many u's as you like
//four hexdigits: only BMP code points are supported.
var codepoint = 0
var digit = digit2int(ch, 16)
if(digit >= 0) {
codepoint += (0x1000 * digit)
nextChar()
digit = digit2int(ch, 16)
if(digit >= 0) {
codepoint += (0x100 * digit)
nextChar()
digit = digit2int(ch, 16)
if(digit >= 0) {
codepoint += (0x10 * digit)
nextChar()
digit = digit2int(ch, 16)
if(digit >= 0) {
codepoint += digit
val ch = codepoint.asInstanceOf[Char]
putChar(ch)
nextChar()
} else invalidUnicodeEscape(4)
} else invalidUnicodeEscape(3)
} else invalidUnicodeEscape(2)
} else invalidUnicodeEscape(1)
}
else {
ch match {
case 'b' => putChar('\b')
case 't' => putChar('\t')
Expand All @@ -906,8 +946,13 @@ trait Scanners extends ScannersCommon {
putChar(ch)
}

/** Report a malformed unicode escape and keep the current character.
 *
 *  @param n distance, counted back from `charOffset`, at which the error is reported
 */
protected def invalidUnicodeEscape(n: Int): Unit = {
  val errorOffset = charOffset - n
  syntaxError(errorOffset, "invalid unicode escape")
  // The offending character is still appended to the literal being built.
  putChar(ch)
}

private def getLitChars(delimiter: Char) = {
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF || isUnicodeEscape))
while (ch != delimiter && !isAtEnd && (ch != SU && ch != CR && ch != LF))
getLitChar()
}

Expand Down Expand Up @@ -1279,7 +1324,6 @@ trait Scanners extends ScannersCommon {
*/
class SourceFileScanner(val source: SourceFile) extends Scanner {
val buf = source.content
override val decodeUni: Boolean = !settings.nouescape

// suppress warnings, throw exception on errors
def deprecationWarning(off: Offset, msg: String, since: String): Unit = ()
Expand Down
20 changes: 10 additions & 10 deletions src/compiler/scala/tools/nsc/typechecker/RefChecks.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1557,21 +1557,21 @@ abstract class RefChecks extends Transform {
lits.forall(lit => treeInfo.isLiteralString(lit)) &&
lits.length == (args.length + 1) =>
val isRaw = sym == rd.StringContext_raw
if (isRaw) Some((lits, args))
else {
try {
val treated = lits.mapConserve { lit =>
val stringVal = lit.asInstanceOf[Literal].value.stringValue
treeCopy.Literal(lit, Constant(StringContext.processEscapes(stringVal)))
val escape = if(isRaw) StringContext.processUnicodeEscapes _ else StringContext.processEscapes _
try {
val treated = lits.mapConserve { lit => {
val stringVal = lit.asInstanceOf[Literal].value.stringValue
val escapedString = escape(stringVal)
if(isRaw && escapedString != stringVal) reporter.warning(sym.pos, "Unicode escapes in raw interpolators are deprecated since scala 2.13.0")
treeCopy.Literal(lit, Constant(escape(stringVal)))
}
}
Some((treated, args))
} catch {
case _: StringContext.InvalidEscapeException =>
None
case _: StringContext.MalformedUnicodeEscapeException => None
case _: StringContext.InvalidEscapeException => None
}
}
case _ => None

}
} else None
}
Expand Down
46 changes: 0 additions & 46 deletions src/compiler/scala/tools/nsc/util/CharArrayReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,11 @@ trait CharArrayReaderData {
/** The start offset of the line before the current one */
var lastLineStartOffset: Int = 0

protected var lastUnicodeOffset = -1

/** Copy the reader state of `cd` into this instance.
 *
 *  @param cd the reader data whose fields are copied
 *  @return this instance, to allow chaining
 */
def copyFrom(cd: CharArrayReaderData): this.type = {
  // Current character and position.
  ch = cd.ch
  charOffset = cd.charOffset
  // Line bookkeeping.
  lineStartOffset = cd.lineStartOffset
  lastLineStartOffset = cd.lastLineStartOffset
  // Offset recorded after the most recent unicode escape.
  lastUnicodeOffset = cd.lastUnicodeOffset
  this
}
}
Expand All @@ -37,14 +34,9 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>

val buf: Array[Char]

def decodeUni: Boolean = true

/** An error routine to call on bad unicode escapes \\uxxxx. */
protected def error(offset: Int, msg: String): Unit

/** Is last character a unicode escape \\uxxxx? */
def isUnicodeEscape = charOffset == lastUnicodeOffset

/** Advance one character; reducing CR;LF pairs to just LF */
final def nextChar(): Unit = {
if (charOffset >= buf.length) {
Expand All @@ -53,7 +45,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
val c = buf(charOffset)
ch = c
charOffset += 1
if (c == '\\') potentialUnicode()
if (ch < ' ') {
skipCR()
potentialLineEnd()
Expand All @@ -72,38 +63,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
val c = buf(charOffset)
ch = c
charOffset += 1
if (c == '\\') potentialUnicode()
}
}

/** Interpret \\uxxxx escapes.
 *
 *  If the next buffer character is `u`, decoding is enabled (`decodeUni`) and
 *  the backslash is not itself escaped (see `evenSlashPrefix`), this skips
 *  every consecutive `u`, reads four hex digits, stores the resulting
 *  character in `ch` and records the new `charOffset` in `lastUnicodeOffset`.
 *  Otherwise nothing is changed.
 *
 *  The callers visible here invoke this right after consuming a backslash,
 *  so that backslash sits at `charOffset - 1`.
 */
private def potentialUnicode() = {
// True when the backslash at `charOffset - 1` is preceded by an even number
// of backslashes: `p` walks left over the preceding run of backslashes, and
// `charOffset - p` has the same parity as the length of that run.
def evenSlashPrefix: Boolean = {
var p = charOffset - 2
while (p >= 0 && buf(p) == '\\') p -= 1
(charOffset - p) % 2 == 0
}
// Reads one hex digit at `charOffset`. On success it advances `charOffset`
// and returns the digit's value. On a non-hex character it reports an error,
// does not advance, and returns the `digit2int` result unchanged
// (presumably negative -- confirm against `digit2int`).
def udigit: Int = {
if (charOffset >= buf.length) {
// Since the positioning code is very insistent about throwing exceptions,
// we have to decrement the position so our error message can be seen, since
// we are one past EOF. This happens with e.g. val x = \ u 1 <EOF>
error(charOffset - 1, "incomplete unicode escape")
SU
}
else {
val d = digit2int(buf(charOffset), 16)
if (d >= 0) charOffset += 1
else error(charOffset, "error in unicode escape")
d
}
}
if (charOffset < buf.length && buf(charOffset) == 'u' && decodeUni && evenSlashPrefix) {
// Skip the first `u` and any further ones that follow it.
do charOffset += 1
while (charOffset < buf.length && buf(charOffset) == 'u')
// Each `udigit` call consumes one character, so the left-to-right evaluation
// order matters: the first digit read becomes the most significant nibble.
val code = udigit << 12 | udigit << 8 | udigit << 4 | udigit
// Remember where the escape ended; `isUnicodeEscape` compares against this.
lastUnicodeOffset = charOffset
ch = code.toChar
}
}

Expand All @@ -114,9 +73,6 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
case LF =>
charOffset += 1
ch = LF
case '\\' =>
if (lookaheadReader.getu == LF)
potentialUnicode()
case _ =>
}

Expand All @@ -135,10 +91,8 @@ abstract class CharArrayReader extends CharArrayReaderData { self =>
val buf = self.buf
charOffset = self.charOffset
ch = self.ch
override def decodeUni = self.decodeUni
def error(offset: Int, msg: String) = self.error(offset, msg)
/** A mystery why CharArrayReader.nextChar() returns Unit */
def getc() = { nextChar() ; ch }
def getu() = { require(buf(charOffset) == '\\') ; ch = '\\' ; charOffset += 1 ; potentialUnicode() ; ch }
}
}
Loading