sparklemotion · flavorjones · May 24, 2024 · May 20, 2024 · May 20, 2024 · May 20, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,8 +15,9 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
 
 ### Fixed
 
-* [CRuby] libgumbo (the HTML5 parser) treats reaching max-depth as EOF. This addresses a class of issues when the parser is interrupted in this way. [#3121] @stevecheckoway
 * `Node#clone`, `NodeSet#clone`, and `*::Document#clone` all properly copy the metaclass of the original as expected. Previously, `#clone` had been aliased to `#dup` for these classes (since v1.3.0 in 2009). [#316, #3117] @flavorjones
+* CSS queries for pseudo-selectors that cannot be transpiled into XPath queries now raise a more descriptive `Nokogiri::CSS::SyntaxError` when they are parsed. Previously, an invalid XPath query was generated and the query engine raised a hard-to-understand XPath error. [#3197] @flavorjones
+* [CRuby] libgumbo (the HTML5 parser) treats reaching max-depth as EOF. This addresses a class of issues when the parser is interrupted in this way. [#3121] @stevecheckoway
 * [CRuby] Update node GC lifecycle to avoid a potential memory leak with fragments in libxml 2.13.0 caused by changes in `xmlAddChild`. [#3156] @flavorjones
 
 

diff --git a/lib/nokogiri/css/parser.rb b/lib/nokogiri/css/parser.rb
diff --git a/lib/nokogiri/css/parser.y b/lib/nokogiri/css/parser.y
@@ -1,6 +1,6 @@
 class Nokogiri::CSS::Parser
 
-token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
+token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS MINUS GREATER S STRING IDENT
 token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
 token SLASH DOUBLESLASH NOT EQUAL RPAREN LSQUARE RSQUARE HAS
 
@@ -143,13 +143,17 @@ rule
         raise Racc::ParseError, "parse error on IDENT '#{val[1]}'"
       end
     }
-  | IDENT PLUS NUMBER {               # n+3, -n+3
+  | IDENT PLUS NUMBER {               # n+3
       if val[0] == 'n'
         val.unshift("1")
         result = Node.new(:NTH, val)
-      elsif val[0] == '-n'
-        val[0] = 'n'
-        val.unshift("-1")
+      else
+        raise Racc::ParseError, "parse error on IDENT '#{val[0]}'"
+      end
+    }
+  | MINUS IDENT PLUS NUMBER {         # -n+3
+      if val[1] == 'n'
+        val[0] = '-1'
         result = Node.new(:NTH, val)
       else
         raise Racc::ParseError, "parse error on IDENT '#{val[1]}'"

diff --git a/lib/nokogiri/css/tokenizer.rb b/lib/nokogiri/css/tokenizer.rb
@@ -63,13 +63,13 @@ def _next_token
                   when (text = @ss.scan(/has\([\s]*/))
                      action { [:HAS, text] }
 
-                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
+                  when (text = @ss.scan(/([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*\([\s]*/))
                      action { [:FUNCTION, text] }
 
-                  when (text = @ss.scan(/-?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
+                  when (text = @ss.scan(/([_A-Za-z]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*/))
                      action { [:IDENT, text] }
 
-                  when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
+                  when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))+/))
                      action { [:HASH, text] }
 
                   when (text = @ss.scan(/[\s]*~=[\s]*/))
@@ -120,6 +120,9 @@ def _next_token
                   when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
                      action { [:NUMBER, text] }
 
+                  when (text = @ss.scan(/[\s]*\-[\s]*/))
+                     action { [:MINUS, text] }
+
                   when (text = @ss.scan(/[\s]*\/\/[\s]*/))
                      action { [:DOUBLESLASH, text] }
 
@@ -132,7 +135,7 @@ def _next_token
                   when (text = @ss.scan(/[\s]+/))
                      action { [:S, text] }
 
-                  when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*(?<!\\)(?:\\{2})*'/))
+                  when (text = @ss.scan(/("([^\n\r\f"]|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*"|'([^\n\r\f']|(\n|\r\n|\r|\f)|[^\0-\177]|(\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f]))*(?<!\\)(?:\\{2})*')/))
                      action { [:STRING, text] }
 
                   when (text = @ss.scan(/./))

diff --git a/lib/nokogiri/css/tokenizer.rex b/lib/nokogiri/css/tokenizer.rex
@@ -4,29 +4,29 @@ module CSS
 class Tokenizer
 
 macro
-  nl        \n|\r\n|\r|\f
+  nl        (\n|\r\n|\r|\f)
   w         [\s]*
   nonascii  [^\0-\177]
   num       -?([0-9]+|[0-9]*\.[0-9]+)
   unicode   \\[0-9A-Fa-f]{1,6}(\r\n|[\s])?
 
-  escape    {unicode}|\\[^\n\r\f0-9A-Fa-f]
-  nmchar    [_A-Za-z0-9-]|{nonascii}|{escape}
-  nmstart   [_A-Za-z]|{nonascii}|{escape}
-  ident     -?({nmstart})({nmchar})*
-  name      ({nmchar})+
+  escape    ({unicode}|\\[^\n\r\f0-9A-Fa-f])
+  nmchar    ([_A-Za-z0-9-]|{nonascii}|{escape})
+  nmstart   ([_A-Za-z]|{nonascii}|{escape})
+  name      {nmstart}{nmchar}*
+  charref   {nmchar}+
   string1   "([^\n\r\f"]|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*"
   string2   '([^\n\r\f']|{nl}|{nonascii}|{escape})*(?<!\\)(?:\\{2})*'
-  string    {string1}|{string2}
+  string    ({string1}|{string2})
 
 rule
 
 # [:state]  pattern  [actions]
 
             has\({w}         { [:HAS, text] }
-            {ident}\({w}     { [:FUNCTION, text] }
-            {ident}          { [:IDENT, text] }
-            \#{name}         { [:HASH, text] }
+            {name}\({w}      { [:FUNCTION, text] }
+            {name}           { [:IDENT, text] }
+            \#{charref}      { [:HASH, text] }
             {w}~={w}         { [:INCLUDES, text] }
             {w}\|={w}        { [:DASHMATCH, text] }
             {w}\^={w}        { [:PREFIXMATCH, text] }
@@ -43,6 +43,7 @@ rule
             {w}~{w}          { [:TILDE, text] }
             \:not\({w}       { [:NOT, text] }
             {num}            { [:NUMBER, text] }
+            {w}\-{w}         { [:MINUS, text] }
             {w}\/\/{w}       { [:DOUBLESLASH, text] }
             {w}\/{w}         { [:SLASH, text] }
 

diff --git a/test/css/test_tokenizer.rb b/test/css/test_tokenizer.rb
@@ -262,7 +262,8 @@ def test_scan_nth
             [:IDENT, "x"],
             [":", ":"],
             [:FUNCTION, "nth-child("],
-            [:IDENT, "-n"],
+            [:MINUS, "-"],
+            [:IDENT, "n"],
             [:PLUS, "+"],
             [:NUMBER, "3"],
             [:RPAREN, ")"],

diff --git a/test/css/test_xpath_visitor.rb b/test/css/test_xpath_visitor.rb
@@ -369,6 +369,11 @@ def assert_xpath(expecteds, asts)
         assert_xpath("//*[not(@id='foo')]", parser.parse(":not(#foo)"))
         assert_xpath("//*[count(preceding-sibling::*)=0]", parser.parse(":first-child"))
       end
+
+      it "raises an exception for pseudo-classes that are not XPath Names" do
+        # a leading "-" is tokenized as MINUS, not as part of the IDENT; see https://github.com/sparklemotion/nokogiri/issues/3193
+        assert_raises(Nokogiri::CSS::SyntaxError) { parser.parse("div:-moz-drag-over") }
+      end
     end
 
     describe "combinators" do