From 8cdbbe4d0d42a020324dfce372036d38599b1ac6 Mon Sep 17 00:00:00 2001
From: Alec Theriault <alec.theriault@gmail.com>
Date: Tue, 23 Mar 2021 14:09:41 -0700
Subject: [PATCH] SI-12290: support JDK15 text blocks in Java parser

JDK15 introduced text blocks (JEP 378) for writing multiline strings.
This adds support for parsing these strings in the Java parser.

The logic for interpreting the literals is a little complicated, but
follows from section "3.10.6. Text Blocks" of the Java language specification.
The test cases include examples from there and from the JEP.

Fixes scala/bug#12290
---
 .../scala/tools/nsc/javac/JavaScanners.scala  | 132 ++++++++++++++++--
 test/files/run/t12290.check                   |  56 ++++++++
 test/files/run/t12290/Test.scala              |  27 ++++
 test/files/run/t12290/TextBlocks.java         |  68 +++++++++
 4 files changed, 275 insertions(+), 8 deletions(-)
 create mode 100644 test/files/run/t12290.check
 create mode 100644 test/files/run/t12290/Test.scala
 create mode 100644 test/files/run/t12290/TextBlocks.java

diff --git a/src/compiler/scala/tools/nsc/javac/JavaScanners.scala b/src/compiler/scala/tools/nsc/javac/JavaScanners.scala
index 3f8ee1166a08..0cd43574096e 100644
--- a/src/compiler/scala/tools/nsc/javac/JavaScanners.scala
+++ b/src/compiler/scala/tools/nsc/javac/JavaScanners.scala
@@ -239,6 +239,9 @@ trait JavaScanners extends ast.parser.ScannersCommon {
     */
     protected def putChar(c: Char): Unit = { cbuf.append(c) }
 
+    /** Remove the last `n` characters from `cbuf`; does nothing unless `n > 0` */
+    private def popNChars(n: Int): Unit = if (n > 0) cbuf.setLength(cbuf.length - n)
+
     /** Clear buffer and set name */
     private def setName(): Unit = {
       name = newTermName(cbuf.toString())
@@ -322,15 +325,26 @@ trait JavaScanners extends ast.parser.ScannersCommon {
 
               case '\"' =>
                 in.next()
-                while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
-                  getlitch()
-                }
-                if (in.ch == '\"') {
-                  token = STRINGLIT
-                  setName()
-                  in.next()
+                if (in.ch != '\"') { // "..." non-empty string literal
+                  while (in.ch != '\"' && (in.isUnicode || in.ch != CR && in.ch != LF && in.ch != SU)) {
+                    getlitch()
+                  }
+                  if (in.ch == '\"') {
+                    token = STRINGLIT
+                    setName()
+                    in.next()
+                  } else {
+                    syntaxError("unclosed string literal")
+                  }
                 } else {
-                  syntaxError("unclosed string literal")
+                  in.next()
+                  if (in.ch != '\"') { // "" empty string literal
+                    token = STRINGLIT
+                    setName()
+                  } else {
+                    in.next()
+                    getTextBlock()
+                  }
                 }
                 return
 
@@ -702,6 +716,108 @@ trait JavaScanners extends ast.parser.ScannersCommon {
         in.next()
       }
 
+    /** Read a triple-quote delimited text block, starting after the three opening
+      * double quotes; sets a STRINGLIT token on success, else reports a syntax error
+      */
+    private def getTextBlock(): Unit = {
+      // Open delimiter is followed by optional spaces/tabs/form feeds, then a newline
+      while (in.ch == ' ' || in.ch == '\t' || in.ch == FF) {
+        in.next()
+      }
+      if (in.ch != LF && in.ch != CR) {
+        syntaxError("illegal text block open delimiter sequence, missing line terminator")
+        return
+      }
+      in.next()
+
+      /* Do a lookahead scan over the full text block to:
+       *   - compute common white space prefix
+       *   - find the offset where the text block ends
+       */
+      var commonWhiteSpacePrefix = Int.MaxValue
+      var blockEndOffset = 0
+      val backtrackTo = in.copy
+      var blockClosed = false
+      var lineWhiteSpacePrefix = 0
+      var lineIsOnlyWhitespace = true
+      while (!blockClosed && (in.isUnicode || in.ch != SU)) {
+        if (in.ch == '\"') { // Potential end of the block
+          in.next()
+          if (in.ch == '\"') {
+            in.next()
+            if (in.ch == '\"') {
+              blockClosed = true
+              commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
+              blockEndOffset = in.cpos - 2
+            }
+          }
+
+          // Not the end of the block - just a single or double " character
+          if (!blockClosed) {
+            lineIsOnlyWhitespace = false
+          }
+        } else if (in.ch == CR || in.ch == LF) { // new line in the block
+          in.next()
+          if (!lineIsOnlyWhitespace) {
+            commonWhiteSpacePrefix = commonWhiteSpacePrefix min lineWhiteSpacePrefix
+          }
+          lineWhiteSpacePrefix = 0
+          lineIsOnlyWhitespace = true
+        } else if (lineIsOnlyWhitespace && isWhitespace(in.ch)) { // extend white space prefix
+          in.next()
+          lineWhiteSpacePrefix += 1
+        } else {
+          lineIsOnlyWhitespace = false
+          getlitch()
+        }
+      }
+      setName() // clear the literal buffer
+
+      // Bail out if the block never did have an end
+      if (!blockClosed) {
+        syntaxError("unclosed text block")
+        return
+      }
+
+      // Second pass: construct the literal string value this time
+      in = backtrackTo
+      while (in.cpos < blockEndOffset) {
+        // Drop the line's leading whitespace
+        var remainingPrefix = commonWhiteSpacePrefix
+        while (remainingPrefix > 0 && in.ch != CR && in.ch != LF && in.cpos < blockEndOffset) {
+          in.next()
+          remainingPrefix -= 1
+        }
+
+        var trailingWhitespaceLength = 0
+        while (in.ch != CR && in.ch != LF && in.cpos < blockEndOffset) {
+          if (isWhitespace(in.ch)) {
+            trailingWhitespaceLength += 1
+          } else {
+            trailingWhitespaceLength = 0
+          }
+          getlitch()
+        }
+
+        // Drop the line's trailing whitespace
+        popNChars(trailingWhitespaceLength)
+
+        // Normalize line terminators
+        if (in.ch == CR || in.ch == LF) {
+          in.next()
+          putChar('\n')
+        }
+      }
+
+      token = STRINGLIT
+      setName()
+
+      // Skip over the closing """
+      in.next()
+      in.next()
+      in.next()
+    }
+
     /** read fractional part and exponent of floating point number
      *  if one is present.
      */
diff --git a/test/files/run/t12290.check b/test/files/run/t12290.check
new file mode 100644
index 000000000000..c69c239803b8
--- /dev/null
+++ b/test/files/run/t12290.check
@@ -0,0 +1,56 @@
+====
+A text
+
+====
+<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+</html>
+
+====
+SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
+WHERE "CITY" = 'INDIANAPOLIS'
+ORDER BY "EMP_ID", "LAST_NAME";
+
+====
+<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+</html>
+
+====
+                            <html>
+                                <body>
+                                    <p>Hello, world</p>
+                                </body>
+                            </html>
+
+====
+<html>
+    <body>
+        <p>Hello, world</p>
+    </body>
+
+</html>
+
+====
+<html>
+
+    <body>
+        <p>Hello,	world</p>
+    </body>
+</html>
+
+====
+String text = """
+    A text block inside a text block
+""";
+
+====
+foo	bar
+baz
+====
+
+====
diff --git a/test/files/run/t12290/Test.scala b/test/files/run/t12290/Test.scala
new file mode 100644
index 000000000000..1984ed9ec832
--- /dev/null
+++ b/test/files/run/t12290/Test.scala
@@ -0,0 +1,27 @@
+/* Prints each Java constant through `valueOf` on its singleton type. This checks
+ * that the Java string literals were properly parsed, since the parsed value is
+ * what the Scala compiler will use when resolving the singleton types.
+ */
+object Test extends App {
+  println("====")
+  println(valueOf[TextBlocks.aText.type])
+  println("====")
+  println(valueOf[TextBlocks.html1.type])
+  println("====")
+  println(valueOf[TextBlocks.query.type])
+  println("====")
+  println(valueOf[TextBlocks.html2.type])
+  println("====")
+  println(valueOf[TextBlocks.html3.type])
+  println("====")
+  println(valueOf[TextBlocks.html4.type])
+  println("====")
+  println(valueOf[TextBlocks.html5.type])
+  println("====")
+  println(valueOf[TextBlocks.code.type])
+  println("====")
+  println(valueOf[TextBlocks.simpleString.type])
+  println("====")
+  println(valueOf[TextBlocks.emptyString.type])
+  println("====")
+}
diff --git a/test/files/run/t12290/TextBlocks.java b/test/files/run/t12290/TextBlocks.java
new file mode 100644
index 000000000000..0ba5526f47a4
--- /dev/null
+++ b/test/files/run/t12290/TextBlocks.java
@@ -0,0 +1,68 @@
+class TextBlocks {
+
+    final static String aText = """
+      A text
+      """;
+
+    final static String html1 = """
+                                <html>
+                                    <body>
+                                        <p>Hello, world</p>
+                                    </body>
+                                </html>
+                                """;
+
+    // quote characters are unescaped
+    final static String query = """
+                                SELECT "EMP_ID", "LAST_NAME" FROM "EMPLOYEE_TB"
+                                WHERE "CITY" = 'INDIANAPOLIS'
+                                ORDER BY "EMP_ID", "LAST_NAME";
+                                """;
+
+    // incidental trailing spaces
+    final static String html2 = """
+                                <html>   
+                                    <body>
+                                        <p>Hello, world</p>    
+                                    </body> 
+                                </html>   
+                                """;
+
+    // indentation of the closing delimiter influences the common whitespace prefix
+    final static String html3 = """
+                                <html>
+                                    <body>
+                                        <p>Hello, world</p>
+                                    </body>
+                                </html>
+    """;
+
+    // blank line does not affect the common whitespace prefix
+    final static String html4 = """
+                                <html>
+                                    <body>
+                                        <p>Hello, world</p>
+                                    </body>
+
+                                </html>
+                                    """;
+
+    // escape sequences
+    final static String html5 = """
+                                <html>\n
+                                    <body>
+                                        <p>Hello,\tworld</p>
+                                    </body>
+                                </html>
+                                """;
+    final static String code =
+        """
+        String text = \"""
+            A text block inside a text block
+        \""";
+        """;
+
+    final static String simpleString = "foo\tbar\nbaz";
+
+    final static String emptyString = "";
+}