Permalink
Browse files

Work around element/attribute names with special characters

For element names, these are equivalent in XPath:

    foo
    *[name() = "foo"]

And for attribute names:

    @foo
    attribute::*[name() = "foo"]

In each pair, the first (short) form is faster, but some characters are
not allowed in it. Since I am not sure exactly which characters, only use
the short form for "safe" names that match ^[a-zA-Z_][a-zA-Z0-9_.-]*$
This is overly restrictive, but should cover every name actually used
in XML, HTML, SVG, etc.
  • Loading branch information...
1 parent 7189f52 commit c221b7bdc5328368279732ad5e82736639a1c066 @SimonSapin SimonSapin committed Jun 14, 2012
Showing with 39 additions and 7 deletions.
  1. +14 −0 cssselect/tests.py
  2. +25 −7 cssselect/xpath.py
View
@@ -373,6 +373,17 @@ def xpath(css):
"e/following-sibling::f")
assert xpath('div#container p') == (
"div[@id = 'container']/descendant-or-self::*/p")
+
+ # Invalid characters in XPath element names
+ assert xpath(r'di\a0 v') == (
+ "*[name() = 'di\xa0v']")
+ assert xpath(r'di\[v') == (
+ "*[name() = 'di[v']")
+ assert xpath(r'[h\a0 ref]') == (
+ "*[attribute::*[name() = 'h\xa0ref']]")
+ assert xpath(r'[h\]ref]') == (
+ "*[attribute::*[name() = 'h]ref']]")
+
self.assertRaises(ExpressionError, xpath, ':first-of-type')
self.assertRaises(ExpressionError, xpath, ':only-of-type')
self.assertRaises(ExpressionError, xpath, ':last-of-type')
@@ -551,6 +562,9 @@ def pcss(main, *selectors, **kwargs):
assert pcss('ol :Not(li[class])') == [
'first-li', 'second-li', 'li-div',
'fifth-li', 'sixth-li', 'seventh-li']
+ # Invalid characters in XPath element names, should not crash
+ assert pcss(r'di\a0 v', r'div\[') == []
+ assert pcss(r'[h\a0 ref]', r'[h\]ref]') == []
# HTML-specific
assert pcss(':link', html_only=True) == [
View
@@ -83,6 +83,11 @@ def join(self, combiner, other):
split_at_single_quotes = re.compile("('+)").split
+# The spec is actually more permissive than that, but don’t bother.
+# This is just for the fast path.
+# http://www.w3.org/TR/REC-xml/#NT-NameStartChar
+is_safe_name = re.compile('^[a-zA-Z_][a-zA-Z0-9_.-]*$').match
+
#### Translation
@@ -181,7 +186,9 @@ def selector_to_xpath(self, selector, prefix='descendant-or-self::'):
tree = getattr(selector, 'parsed_tree', None)
if not tree:
raise TypeError('Expected a parsed selector, got %r' % (selector,))
- return (prefix or '') + _unicode(self.xpath(tree))
+ xpath = self.xpath(tree)
+ assert isinstance(xpath, XPathExpr) # help debug a missing 'return'
+ return (prefix or '') + _unicode(xpath)
@staticmethod
def xpath_literal(s):
@@ -250,15 +257,19 @@ def xpath_attrib(self, selector):
name = selector.attrib.lower()
else:
name = selector.attrib
+ safe = is_safe_name(name)
if selector.namespace:
- name = '@%s:%s' % (selector.namespace, name)
+ name = '%s:%s' % (selector.namespace, name)
+ safe = safe and is_safe_name(selector.namespace)
+ if safe:
+ attrib = '@' + name
else:
- name = '@' + name
+ attrib = 'attribute::*[name() = %s]' % self.xpath_literal(name)
if self.lower_case_attribute_values:
value = selector.value.lower()
else:
value = selector.value
- return method(self.xpath(selector.selector), name, value)
+ return method(self.xpath(selector.selector), attrib, value)
def xpath_class(self, class_selector):
"""Translate a class selector."""
@@ -277,13 +288,20 @@ def xpath_element(self, selector):
element = selector.element
if not element:
element = '*'
- elif self.lower_case_element_names:
- element = element.lower()
+ safe = True
+ else:
+ safe = is_safe_name(element)
+ if self.lower_case_element_names:
+ element = element.lower()
if selector.namespace:
# Namespace prefixes are case-sensitive.
# http://www.w3.org/TR/css3-namespace/#prefixes
element = '%s:%s' % (selector.namespace, element)
- return XPathExpr(element=element)
+ safe = safe and is_safe_name(selector.namespace)
+ xpath = XPathExpr(element=element)
+ if not safe:
+ xpath.add_name_test()
+ return xpath
# CombinedSelector: dispatch by combinator

0 comments on commit c221b7b

Please sign in to comment.