
Merge pull request #226 from Liquidsoul/faster-scanner
Optimise Scanner performance
djbe committed Sep 25, 2018
2 parents fce3dc5 + cb4e514 commit 6f9bb3e
Showing 5 changed files with 1,279 additions and 80 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -40,6 +40,11 @@
- Update to Spectre 0.9.0.
[Ilya Puchka](https://github.com/ilyapuchka)
[#247](https://github.com/stencilproject/Stencil/pull/247)
- Optimise Scanner performance.
[Eric Thorpe](https://github.com/trametheka)
[Sébastien Duperron](https://github.com/Liquidsoul)
[David Jennes](https://github.com/djbe)
[#226](https://github.com/stencilproject/Stencil/pull/226)


## 0.12.1
144 changes: 87 additions & 57 deletions Sources/Lexer.swift
@@ -7,6 +7,18 @@ struct Lexer {
let templateString: String
let lines: [Line]

/// The potential token start characters. In a template these appear after a
/// `{` character, for example `{{`, `{%`, `{#`, ...
private static let tokenChars: [Unicode.Scalar] = ["{", "%", "#"]

/// The token end characters, corresponding to their token start characters.
/// For example, a variable token starts with `{{` and ends with `}}`
private static let tokenCharMap: [Unicode.Scalar: Unicode.Scalar] = [
"{": "}",
"%": "%",
"#": "#"
]
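// Illustration (not part of the commit): the two tables above pair up the
// delimiters, so a variable "{{ name }}" pairs "{" with "}", a block
// "{% if %}" pairs "%" with "%", and a comment "{# note #}" pairs "#" with "#".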

init(templateName: String? = nil, templateString: String) {
self.templateName = templateName
self.templateString = templateString
@@ -17,12 +29,19 @@
}
}

/// Create a token that will be passed on to the parser, with the given
/// content and a range. The content will be tested to see if it's a
/// `variable`, a `block` or a `comment`, otherwise it'll default to a simple
/// `text` token.
///
/// - Parameters:
/// - string: The content string of the token
/// - range: The range within the template content, used for smart
/// error reporting
func createToken(string: String, at range: Range<String.Index>) -> Token {
func strip() -> String {
guard string.count > 4 else { return "" }
let start = string.index(string.startIndex, offsetBy: 2)
let end = string.index(string.endIndex, offsetBy: -2)
let trimmed = String(string[start..<end])
let trimmed = String(string.dropFirst(2).dropLast(2))
.components(separatedBy: "\n")
.filter({ !$0.isEmpty })
.map({ $0.trim(character: " ") })
@@ -50,26 +69,22 @@ struct Lexer {
return .text(value: string, at: sourceMap)
}
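// Illustrative sketch (hypothetical calls; `someRange` stands in for a real
// range within the template), mirroring the tests further down:
//   createToken(string: "{{ name }}", at: someRange)   // .variable(value: "name", ...)
//   createToken(string: "{% if x %}", at: someRange)   // .block(value: "if x", ...)
//   createToken(string: "{# note #}", at: someRange)   // .comment(value: "note", ...)
//   createToken(string: "Hello", at: someRange)        // .text(value: "Hello", ...)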

/// Returns an array of tokens from a given template string.
/// Transforms the template into a list of tokens that will eventually be
/// passed on to the parser.
///
/// - Returns: The list of tokens (see `createToken(string: at:)`).
func tokenize() -> [Token] {
var tokens: [Token] = []

let scanner = Scanner(templateString)

let map = [
"{{": "}}",
"{%": "%}",
"{#": "#}",
]

while !scanner.isEmpty {
if let text = scanner.scan(until: ["{{", "{%", "{#"]) {
if !text.1.isEmpty {
tokens.append(createToken(string: text.1, at: scanner.range))
if let (char, text) = scanner.scanForTokenStart(Lexer.tokenChars) {
if !text.isEmpty {
tokens.append(createToken(string: text, at: scanner.range))
}

let end = map[text.0]!
let result = scanner.scan(until: end, returnUntil: true)
guard let end = Lexer.tokenCharMap[char] else { continue }
let result = scanner.scanForTokenEnd(end)
tokens.append(createToken(string: result, at: scanner.range))
} else {
tokens.append(createToken(string: scanner.content, at: scanner.range))
@@ -80,6 +95,11 @@
return tokens
}
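// Illustrative usage (taken from the "can tokenize" tests below):
//   let lexer = Lexer(templateString: "My name is {{ myname }}.")
//   let tokens = lexer.tokenize()
//   // tokens: text "My name is ", variable "myname", text "."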

/// Finds the line matching the given range (for a token)
///
/// - Parameter range: The range to search for.
/// - Returns: The content for that line, the line number and offset within
/// the line.
func rangeLocation(_ range: Range<String.Index>) -> ContentLocation {
guard let line = self.lines.first(where: { $0.range.contains(range.lowerBound) }) else {
return ("", 0, 0)
@@ -95,6 +115,11 @@ class Scanner {
var content: String
var range: Range<String.Index>

/// The start delimiter for a token.
private static let tokenStartDelimiter: Unicode.Scalar = "{"
/// And the corresponding end delimiter for a token.
private static let tokenEndDelimiter: Unicode.Scalar = "}"

init(_ content: String) {
self.originalContent = content
self.content = content
@@ -105,64 +130,69 @@
return content.isEmpty
}

func scan(until: String, returnUntil: Bool = false) -> String {
var index = content.startIndex

if until.isEmpty {
return ""
}

range = range.upperBound..<range.upperBound
while index != content.endIndex {
let substring = String(content[index...])

if substring.hasPrefix(until) {
let result = String(content[..<index])

if returnUntil {
range = range.lowerBound..<originalContent.index(range.upperBound, offsetBy: until.count)
content = String(substring[until.endIndex...])
return result + until
}

content = substring
/// Scans for the end of a token, with a specific ending character. If we're
/// searching for the end of a block token `%}`, this method receives a `%`.
/// The scanner will search for that `%` followed by a `}`.
///
/// Note: if the end of a token is found, the `content` and `range`
/// properties are updated to reflect this. `content` will be set to what
/// remains of the template after the token. `range` will be set to the range
/// of the token within the template.
///
/// - Parameter tokenChar: The token end character to search for.
/// - Returns: The content of a token, or "" if no token end was found.
func scanForTokenEnd(_ tokenChar: Unicode.Scalar) -> String {
var foundChar = false

for (index, char) in content.unicodeScalars.enumerated() {
if foundChar && char == Scanner.tokenEndDelimiter {
let result = String(content.prefix(index))
content = String(content.dropFirst(index + 1))
range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index + 1)
return result
} else {
foundChar = (char == tokenChar)
}

index = content.index(after: index)
range = range.lowerBound..<originalContent.index(after: range.upperBound)
}

content = ""
return ""
}
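// Rough trace (illustrative, for a block token): with content "{% if x %}tail"
// and tokenChar "%", the loop flags foundChar when it passes the "%" of "%}",
// then matches the following "}" (the tokenEndDelimiter). The token text is
// returned and `content` is left at "tail". If no end is found, the content is
// drained and "" is returned.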

func scan(until: [String]) -> (String, String)? {
if until.isEmpty {
return nil
}
/// Scans for the start of a token, with a list of potential starting
/// characters. To scan for the start of variables (`{{`), blocks (`{%`) and
/// comments (`{#`), this method receives the characters `{`, `%` and `#`.
/// The scanner will search for a `{`, followed by one of the search
/// characters. It will give the found character, and the content that came
/// before the token.
///
/// Note: if the start of a token is found, the `content` and `range`
/// properties are updated to reflect this. `content` will be set to what
/// remains of the template starting with the token. `range` will be set to
/// the start of the token within the template.
///
/// - Parameter tokenChars: List of token start characters to search for.
/// - Returns: The found token start character, together with the content
/// before the token, or nil if no token start was found.
func scanForTokenStart(_ tokenChars: [Unicode.Scalar]) -> (Unicode.Scalar, String)? {
var foundBrace = false

var index = content.startIndex
range = range.upperBound..<range.upperBound
while index != content.endIndex {
let substring = String(content[index...])
for string in until {
if substring.hasPrefix(string) {
let result = String(content[..<index])
content = substring
return (string, result)
}
for (index, char) in content.unicodeScalars.enumerated() {
if foundBrace && tokenChars.contains(char) {
let result = String(content.prefix(index - 1))
content = String(content.dropFirst(index - 1))
range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index - 1)
return (char, result)
} else {
foundBrace = (char == Scanner.tokenStartDelimiter)
}

index = content.index(after: index)
range = range.lowerBound..<originalContent.index(after: range.upperBound)
}

return nil
}
}
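// Rough trace (illustrative): scanning "Hello {{ name }}" with
// scanForTokenStart(["{", "%", "#"]) walks the unicode scalars until the "{"
// of "{{" is followed by one of the search characters (here a second "{"),
// then returns ("{", "Hello ") and leaves `content` at "{{ name }}".
// tokenize() maps that "{" to its end character "}" via Lexer.tokenCharMap
// and calls scanForTokenEnd("}") to consume the variable token. Unlike the
// removed scan(until:) methods, these loops avoid building a fresh substring
// at every index, which appears to be the main source of the speed-up.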


extension String {
func findFirstNot(character: Character) -> String.Index? {
var index = startIndex
78 changes: 55 additions & 23 deletions Tests/StencilTests/LexerSpec.swift
@@ -1,10 +1,16 @@
import XCTest
import PathKit
import Spectre
@testable import Stencil
import XCTest

class LexerTests: XCTestCase {
func testLexer() {
describe("Lexer") {
func makeSourceMap(_ token: String, for lexer: Lexer, options: String.CompareOptions = []) -> SourceMap {
guard let range = lexer.templateString.range(of: token, options: options) else { fatalError("Token not found") }
return SourceMap(location: lexer.rangeLocation(range))
}

$0.it("can tokenize text") {
let lexer = Lexer(templateString: "Hello World")
let tokens = lexer.tokenize()
@@ -44,9 +50,9 @@ class LexerTests: XCTestCase {
let tokens = lexer.tokenize()

try expect(tokens.count) == 3
try expect(tokens[0]) == Token.text(value: "My name is ", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "My name is ")!)))
try expect(tokens[1]) == Token.variable(value: "myname", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "myname")!)))
try expect(tokens[2]) == Token.text(value: ".", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: ".")!)))
try expect(tokens[0]) == Token.text(value: "My name is ", at: makeSourceMap("My name is ", for: lexer))
try expect(tokens[1]) == Token.variable(value: "myname", at: makeSourceMap("myname", for: lexer))
try expect(tokens[2]) == Token.text(value: ".", at: makeSourceMap(".", for: lexer))
}

$0.it("can tokenize two variables without being greedy") {
@@ -55,43 +61,69 @@
let tokens = lexer.tokenize()

try expect(tokens.count) == 2
try expect(tokens[0]) == Token.variable(value: "thing", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "thing")!)))
try expect(tokens[1]) == Token.variable(value: "name", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "name")!)))
try expect(tokens[0]) == Token.variable(value: "thing", at: makeSourceMap("thing", for: lexer))
try expect(tokens[1]) == Token.variable(value: "name", at: makeSourceMap("name", for: lexer))
}

$0.it("can tokenize an unclosed block") {
let lexer = Lexer(templateString: "{%}")
let _ = lexer.tokenize()
_ = lexer.tokenize()
}

$0.it("can tokenize incorrect syntax without crashing") {
let lexer = Lexer(templateString: "func some() {{% if %}")
_ = lexer.tokenize()
}

$0.it("can tokenize an empty variable") {
let lexer = Lexer(templateString: "{{}}")
let _ = lexer.tokenize()
_ = lexer.tokenize()
}

$0.it("can tokenize with new lines") {
let templateString = """
My name is {%
if name
and
name
%}{{
name
}}{%
endif %}.
"""

My name is {%
if name
and
name
%}{{
name
}}{%
endif %}.
"""
let lexer = Lexer(templateString: templateString)
let tokens = lexer.tokenize()

try expect(tokens.count) == 5
try expect(tokens[0]) == Token.text(value: "My name is ", at: makeSourceMap("My name is", for: lexer))
try expect(tokens[1]) == Token.block(value: "if name and name", at: makeSourceMap("{%", for: lexer))
try expect(tokens[2]) == Token.variable(value: "name", at: makeSourceMap("name", for: lexer, options: .backwards))
try expect(tokens[3]) == Token.block(value: "endif", at: makeSourceMap("endif", for: lexer))
try expect(tokens[4]) == Token.text(value: ".", at: makeSourceMap(".", for: lexer))
}

$0.it("can tokenize escape sequences") {
let templateString = "class Some {{ '{' }}{% if true %}{{ stuff }}{% endif %}"
let lexer = Lexer(templateString: templateString)
let tokens = lexer.tokenize()

try expect(tokens.count) == 5
try expect(tokens[0]) == Token.text(value: "My name is ", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "My name is")!)))
try expect(tokens[1]) == Token.block(value: "if name and name", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "{%")!)))
try expect(tokens[2]) == Token.variable(value: "name", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "name", options: [.backwards])!)))
try expect(tokens[3]) == Token.block(value: "endif", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "endif")!)))
try expect(tokens[4]) == Token.text(value: ".", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: ".")!)))
try expect(tokens[0]) == Token.text(value: "class Some ", at: makeSourceMap("class Some ", for: lexer))
try expect(tokens[1]) == Token.variable(value: "'{'", at: makeSourceMap("'{'", for: lexer))
try expect(tokens[2]) == Token.block(value: "if true", at: makeSourceMap("if true", for: lexer))
try expect(tokens[3]) == Token.variable(value: "stuff", at: makeSourceMap("stuff", for: lexer))
try expect(tokens[4]) == Token.block(value: "endif", at: makeSourceMap("endif", for: lexer))
}
}
}

func testPerformance() throws {
let path = Path(#file) + ".." + "fixtures" + "huge.html"
let content: String = try path.read()

measure {
let lexer = Lexer(templateString: content)
_ = lexer.tokenize()
}
}
}
1 change: 1 addition & 0 deletions Tests/StencilTests/XCTestManifests.swift
@@ -57,6 +57,7 @@ extension InheritenceTests {
extension LexerTests {
static let __allTests = [
("testLexer", testLexer),
("testPerformance", testPerformance),
]
}

1,131 changes: 1,131 additions & 0 deletions Tests/StencilTests/fixtures/huge.html
Large diffs are not rendered by default.
