
Changes to Unicode escape reading

mdr committed Jan 14, 2012
1 parent d2a402a commit 4ba012a889c2146f0eeafb63fb25dabc37561978
@@ -1,70 +1,34 @@
package scalariform.perf
-import scalariform.lexer.{ Token ⇒ _, _ }
-import scalariform.utils.Utils.time
import java.io.File
import scala.io.Source
import scalariform.parser._
+import scalariform.lexer.{ Token ⇒ _, _ }
+import scalariform.utils.Utils.time
import scalariform.formatter._
-object LexerPerformanceTest extends Application {
-
- def format(s: String) = {
- ScalaFormatter.format(s)
- }
-
- def parse(s: String) = {
- val (_, tokens) = ScalaLexer.tokeniseFull(file)
- // val parser = new ScalaCombinatorParser
- // val rawParseResult = parser.compilationUnitOrScript(new ScalaLexerReader(tokens))
- // rawParseResult.get.tokens
- new ScalaParser(tokens.toArray).compilationUnitOrScript().tokens
- }
-
- val file = new File("/home/matt/corpus2/" + "scala/src/compiler/scala/tools/nsc/symtab/Types.scala")
- val source = Source.fromFile(file).mkString
- if (true) {
- 1 to 10 foreach { _ ⇒ format(source) }
-
- time("Format") {
- 1 to 100 foreach { _ ⇒ format(source) }
- }
-
- } else if (false) {
+object LexerPerformanceTest {
- 1 to 10 foreach { _ ⇒ parse(source) }
+ def main(args: Array[String]) {
- val tokens = parse(source)
- println(file + " -- " + tokens.length + " tokens")
+ val file = new File("/home/matt/coding/scala/src/compiler/scala/tools/nsc/typechecker/Typers.scala")
+ val source = Source.fromFile(file).mkString
+ println("Source: " + source.length + " chars")
+ 1 to 1000 foreach { _ ⇒ doIt(source) }
- time("Parse") {
- 1 to 100 foreach { _ ⇒ parse(source) }
- }
- } else if (false) {
- 1 to 10 foreach { _ ⇒ ScalaLexer.tokeniseFull(source) }
-
- val (_, tokens) = ScalaLexer.tokeniseFull(source)
- println(file + " -- " + tokens.length + " tokens")
-
- time("Full tokenise") {
- 1 to 100 foreach { _ ⇒ ScalaLexer.tokeniseFull(source) }
- }
- } else {
- 1 to 10 foreach { _ ⇒ ScalaLexer.rawTokenise2(source) }
+ val its = 10000
+
+ val start = System.currentTimeMillis
+ 1 to its foreach { _ => doIt(source) }
+ val duration = System.currentTimeMillis - start
+ println(duration.toDouble / its + " ms")
+
+ }
- val tokens = ScalaLexer.rawTokenise2(source)
- println(file + " -- " + tokens.length + " tokens")
+ private def unicodeEscapeReader(s: String) = new UnicodeEscapeReader(s)
- time("Tokenise") {
- 1 to 100 foreach { _ ⇒ ScalaLexer.rawTokenise2(source) }
- }
+ private def doIt(s: String) = {
+ UnicodeEscapeDecoder.decode(s)
}
- // 22706ms / 100, 21829ms
- // 20070ms ,19879ms ==> switch to Chars
- // 16454ms, 16449 => unicodeescapereader not backed by a Reader
- // 18113ms => post EOF SU fix
- // 16956ms, after removal of string builder
- // 12760ms => Maps => java.util.HashMap
- // 10000ms => rm hiddenSuccessorsMap
}
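
The reworked performance test drops the old format/parse/tokenise branches and times UnicodeEscapeDecoder.decode directly: a warm-up loop followed by a measured loop whose total time is averaged per iteration. A minimal standalone sketch of that warm-up-then-measure pattern follows; the doIt workload below is a stand-in, not the scalariform call.

object TimingSketch extends App {
  // Stand-in workload; the real test calls UnicodeEscapeDecoder.decode(source)
  def doIt(s: String): Int = s.count(_ == '\\')

  val source = "val x = 1\n" * 10000

  1 to 1000 foreach { _ ⇒ doIt(source) } // warm-up, as in the updated test

  val its = 10000
  val start = System.currentTimeMillis
  1 to its foreach { _ ⇒ doIt(source) }
  val duration = System.currentTimeMillis - start
  println(duration.toDouble / its + " ms per iteration")
}

Averaging over a large iteration count after a warm-up reduces the influence of JIT compilation on the reported per-call time.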
@@ -19,17 +19,24 @@ abstract class Lexer(reader: UnicodeEscapeReader) extends TokenTests {
private var actualTokenTextOffset = 0
private var actualTokenTextLength = 0
- protected var eof = false
protected var builtToken: Token = _
// Two queues maintained in parallel. Invariant: chQueue.length == unicodeEscapesQueue.length
private val chQueue = new Queue[Char]
+
private val unicodeEscapesQueue = new Queue[Option[String]]
+ /**
+ * Number of characters left in queues before end of file. If -1, this is as yet unknown.
+ */
+ private var untilEof = -1
+
protected var lastCh: Char = SU
protected val modeStack = new Stack[LexerMode]
+ protected def eof = untilEof == 0
+
protected def isUnicodeEscape = unicodeEscapesQueue.last.isDefined
protected def ch: Char = {
@@ -46,11 +53,10 @@ abstract class Lexer(reader: UnicodeEscapeReader) extends TokenTests {
}
private def slurpOneChar() {
- val (c, unicodeEscapeOfPreviousRead) = reader.read()
- chQueue.enqueue(c)
- if (reader.isEof)
- eof = true
- unicodeEscapesQueue.enqueue(unicodeEscapeOfPreviousRead)
+ chQueue.enqueue(reader.read())
+ unicodeEscapesQueue.enqueue(reader.unicodeEscapeOpt)
+ if (untilEof == -1 && reader.isEof)
+ untilEof = chQueue.size
}
protected def nextChar() {
@@ -61,15 +67,17 @@ abstract class Lexer(reader: UnicodeEscapeReader) extends TokenTests {
case None ⇒ 1
case Some(s) ⇒ s.length
}
+ if (untilEof > 0)
+ untilEof -= 1
actualTokenTextLength += delta
}
protected def token(tokenType: TokenType) {
val startIndex = actualTokenTextOffset
val tokenLength = actualTokenTextLength
require(tokenType == EOF || tokenLength > 0)
- val stopIndex = min(startIndex + tokenLength - 1, reader.s.length - 1) // min protects against overeager consumption past EOF in forgiving mode
- val rawText = reader.s.substring(actualTokenTextOffset, stopIndex + 1)
+ val stopIndex = min(startIndex + tokenLength - 1, reader.text.length - 1) // min protects against overeager consumption past EOF in forgiving mode
+ val rawText = reader.text.substring(actualTokenTextOffset, stopIndex + 1)
val text = tokenTextBuffer.toString
builtToken = Token(tokenType, text, startIndex, rawText)
tokenTextBuffer.clear()
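
In the Lexer, eof is no longer a flag set inside slurpOneChar; instead untilEof records how many buffered characters remain once the underlying reader reports isEof, counts down as characters are consumed, and eof is derived as untilEof == 0. A simplified standalone sketch of that bookkeeping follows; CountdownBuffer and the Iterator input are illustrative, not the scalariform Lexer.

import scala.collection.mutable.Queue

// Simplified illustration of the untilEof bookkeeping: once the input is
// exhausted, record how many buffered characters remain; eof becomes true
// only when that countdown reaches zero.
class CountdownBuffer(input: Iterator[Char]) {
  private val chQueue = new Queue[Char]
  private var untilEof = -1 // -1 means "not yet known"

  def eof = untilEof == 0

  private def slurpOneChar() {
    chQueue.enqueue(if (input.hasNext) input.next() else '\u001a') // SU past end of input
    if (untilEof == -1 && !input.hasNext)
      untilEof = chQueue.size
  }

  def nextChar(): Char = {
    if (chQueue.isEmpty)
      slurpOneChar()
    if (untilEof > 0)
      untilEof -= 1
    chQueue.dequeue()
  }
}

For example, new CountdownBuffer("ab".iterator) yields 'a' then 'b', and eof turns true only after 'b' has been consumed.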
@@ -1,44 +1,81 @@
package scalariform.lexer
import scalariform.lexer.CharConstants.SU
+import scalariform.utils.Utils._
-import scalariform.utils.Utils.digit2int
+object UnicodeEscapeDecoder {
-class UnicodeEscapeReader(val s: String, forgiveLexerErrors: Boolean = false) {
+ /**
+ * Decode unicode escapes of the form "\u0061" in the given text.
+ * If forgiveErrors is true, then no exception will be thrown on malformed escapes.
+ */
+ @throws(classOf[ScalaLexerException])
+ def decode(text: String, forgiveErrors: Boolean = false): String = {
+ val reader = new UnicodeEscapeReader(text, forgiveErrors)
+ val sb = new StringBuilder(text.length)
+ while (!reader.isEof)
+ sb.append(reader.read())
+ sb.toString
+ }
+
+}
+
+class UnicodeEscapeReader(val text: String, forgiveErrors: Boolean = false) {
private var pos: Int = 0
- private var eof = s == ""
+ private var unicodeEscapeSequence: String = null
/**
* To distinguish cases like "\\u" from unicode escape sequences.
*/
private var consecutiveBackslashCount = 0
/**
- * @return the next logical character paired with the unicode escape sequence that encoded it, if any.
+ * @return true if all the available characters have been read.
+ */
+ def isEof = pos >= text.length
+
+ /**
+ * @return the next character from the post-decoded text
*/
@throws(classOf[ScalaLexerException])
- def read(): (Char, Option[String]) = {
+ def read(): Char = {
val ch = consumeNextCharacter()
+ unicodeEscapeSequence = null
if (ch == '\\')
if (nextChar == 'u' && consecutiveBackslashCount % 2 == 0) {
consecutiveBackslashCount = 0
- readUnicodeChar()
+ readUnicodeChar(pos - 1)
} else {
consecutiveBackslashCount += 1
- (ch, None)
+ ch
}
else {
consecutiveBackslashCount = 0
- (ch, None)
+ ch
}
}
- private def readUnicodeChar(): (Char, Option[String]) = {
- val unicodeEscapeSequence = consumeUnicodeEscape()
- val decodedChar = decodeUnicodeChar(unicodeEscapeSequence takeRight 4 toList, unicodeEscapeSequence)
- (decodedChar, Some(unicodeEscapeSequence))
+ /**
+ * @return the corresponding unicode escape sequence if the last character read was decoded from one, otherwise None.
+ */
+ def unicodeEscapeOpt: Option[String] = Option(unicodeEscapeSequence)
+
+ private def consumeNextCharacter(): Char = {
+ val result = safeGet(pos)
+ pos += 1
+ result
+ }
+
+ private def nextChar = safeGet(pos)
+
+ private def safeGet(pos: Int): Char = if (pos >= text.length) SU else text(pos)
+
+ private def readUnicodeChar(startPos: Int): Char = {
+ this.unicodeEscapeSequence = consumeUnicodeEscape()
+ val decodedChar = decodeUnicodeChar(unicodeEscapeSequence takeRight 4 toList, unicodeEscapeSequence, startPos)
+ decodedChar
}
private def consumeUnicodeEscape(): String = {
@@ -55,26 +92,30 @@ class UnicodeEscapeReader(val s: String, forgiveLexerErrors: Boolean = false) {
sb.toString
}
- private def decodeUnicodeChar(digits: List[Char], unicodeEscapeSequence: String): Char = {
+ private def decodeUnicodeChar(digits: List[Char], unicodeEscapeSequence: String, startPos: Int): Char = {
val List(digit1, digit2, digit3, digit4) = digits.map(digit2int(_, base = 16))
if (digit1 < 0 || digit2 < 0 || digit3 < 0 || digit4 < 0)
- if (forgiveLexerErrors) ' ' else throw new ScalaLexerException("Error in unicode escape: " + unicodeEscapeSequence)
+ if (forgiveErrors)
+ ' '
+ else {
+ val (line, column) = lineAndColumn(startPos)
+ throw new ScalaLexerException("[" + line + ":" + column + "] error in unicode escape: '" + unicodeEscapeSequence + "'")
+ }
else
(digit1 << 12 | digit2 << 8 | digit3 << 4 | digit4).toChar
}
- private def consumeNextCharacter(): Char = {
- val result = safeGet(pos)
- if (pos >= s.length)
- eof = true
- pos += 1
- result
+ private def lineAndColumn(offset: Int): (Int, Int) = {
+ var line = 1
+ var column = 1
+ for (i ← 0 until offset) {
+ if (text(i) == '\n') {
+ line += 1
+ column = 1
+ } else
+ column += 1
+ }
+ (line, column)
}
- private def nextChar = safeGet(pos)
-
- private def safeGet(pos: Int): Char = if (pos >= s.length) SU else s(pos)
-
- def isEof = eof
-
}
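
After this commit the escape handling is available in two forms: whole-string decoding via UnicodeEscapeDecoder.decode, and character-at-a-time reading via UnicodeEscapeReader, whose unicodeEscapeOpt reports the escape sequence (if any) behind the character just returned by read(). A small usage sketch, assuming scalariform at this revision is on the classpath (DecodeExample is just an illustrative name):

import scalariform.lexer.{ UnicodeEscapeDecoder, UnicodeEscapeReader }

object DecodeExample extends App {
  // Build "\u0061" at runtime so the compiler's own escape handling doesn't interfere
  val text = "val a = '" + "\\" + "u0061'"

  // Whole-string decoding; forgiveErrors defaults to false, so a malformed
  // escape raises a ScalaLexerException with line/column information
  println(UnicodeEscapeDecoder.decode(text)) // val a = 'a'

  // Character-at-a-time reading, with the escape sequence exposed per character
  val reader = new UnicodeEscapeReader(text)
  while (!reader.isEof) {
    val c = reader.read()
    reader.unicodeEscapeOpt foreach { esc ⇒ println("decoded " + esc + " as '" + c + "'") }
  }
}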
