Fetching contributors…
Cannot retrieve contributors at this time
586 lines (486 sloc) 20.1 KB
"""
Translation of parsed CSS selectors to XPath expressions.

:copyright: (c) 2007-2012 Ian Bicking and contributors.
            See AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""
import sys
import re
from cssselect.parser import parse, parse_series, SelectorError
# Text-type aliases that work on both Python 2 and Python 3.
# On Python 2 the builtin ``basestring``/``unicode`` are used; on Python 3
# those names do not exist and ``str`` fills both roles.
if sys.version_info[0] < 3:
    _basestring = basestring
    _unicode = unicode
else:
    _basestring = str
    _unicode = str
class ExpressionError(SelectorError, RuntimeError):
    """Unknown or unsupported selector (eg. pseudo-class).

    Raised by the translators in this module when a selector cannot be
    turned into XPath.  It derives from both ``SelectorError`` and
    ``RuntimeError``, so handlers for either base class catch it.
    """
#### XPath Helpers
class XPathExpr(object):
    """Mutable builder for an XPath expression.

    The expression is stored in three parts which ``__str__`` concatenates
    as ``path + element`` followed by ``[condition]`` when a condition is
    set:

    * ``path`` -- everything before the final node test,
    * ``element`` -- the final node test (``'*'`` matches any element),
    * ``condition`` -- the predicate applied to that node test.

    The mutating methods ``add_condition`` and ``join`` return ``self`` so
    that calls can be chained.
    """

    def __init__(self, path='', element='*', condition='', star_prefix=False):
        # NOTE: ``star_prefix`` is accepted but not used by this constructor;
        # call add_star_prefix() to get that behaviour.
        self.path = path
        self.element = element
        self.condition = condition

    def __str__(self):
        path = _unicode(self.path) + _unicode(self.element)
        if self.condition:
            path += '[%s]' % self.condition
        return path

    def __repr__(self):
        return '%s[%s]' % (self.__class__.__name__, self)

    def add_condition(self, condition):
        """AND ``condition`` onto the current predicate and return ``self``.

        When a predicate already exists, both sides are parenthesised so
        that an ``or`` inside either of them cannot capture the other side
        (``and`` binds tighter than ``or`` in XPath).
        """
        if self.condition:
            self.condition = '(%s) and (%s)' % (self.condition, condition)
        else:
            self.condition = condition
        return self

    def add_name_test(self):
        """Move the element name from the node test into the predicate.

        ``e[...]`` becomes ``*[name() = 'e' and ...]``, so that positional
        predicates added afterwards count all elements rather than only
        those named ``e``.  Does nothing when the node test is already
        ``'*'``.
        """
        if self.element == '*':
            # We weren't doing a test anyway
            return
        self.add_condition(
            "name() = %s" % GenericTranslator.xpath_literal(self.element))
        self.element = '*'

    def add_star_prefix(self):
        """
        Append '*/' to the path to keep the context constrained
        to a single parent.
        """
        self.path += '*/'

    def join(self, combiner, other):
        """Append ``other`` to this expression, separated by ``combiner``.

        This expression (rendered as text) plus ``combiner`` becomes the new
        path; the element and condition are taken over from ``other``.
        Returns ``self``.
        """
        path = _unicode(self) + combiner
        # Any "star prefix" is redundant when joining.
        if other.path != '*/':
            path += other.path
        self.path = path
        self.element = other.element
        self.condition = other.condition
        return self
# Splits a string on runs of one or more single quotes.  Because the pattern
# has a capturing group, the quote runs themselves are kept as items of the
# resulting list (used by GenericTranslator.xpath_literal).
split_at_single_quotes = re.compile("('+)").split
#### Translation
class GenericTranslator(object):
    """
    Translator for "generic" XML documents.

    Everything is case-sensitive, no assumption is made on the meaning
    of element names and attribute names.

    Translation is dispatch-driven: :meth:`xpath` picks an ``xpath_<type>``
    method from the lower-cased class name of the parsed object, and those
    methods dispatch further on the combinator, attribute operator,
    function name or pseudo-class name.

    """

    #: Maps a combinator token to the ``<name>`` part of the
    #: ``xpath_<name>_combinator`` method that handles it.
    combinator_mapping = {
        ' ': 'descendant',
        '>': 'child',
        '+': 'direct_adjacent',
        '~': 'indirect_adjacent',
    }

    #: Maps an attribute operator to the ``<name>`` part of the
    #: ``xpath_attrib_<name>`` method that handles it.
    attribute_operator_mapping = {
        'exists': 'exists',
        '=': 'equals',
        '~=': 'includes',
        '|=': 'dashmatch',
        '^=': 'prefixmatch',
        '$=': 'suffixmatch',
        '*=': 'substringmatch',
        '!=': 'different',  # XXX Not in Level 3 but meh
    }

    #: The attribute used for ID selectors depends on the document language:
    id_attribute = 'id'

    #: The case sensitivity of document language element names,
    #: attribute names, and attribute values in selectors depends
    #: on the document language.
    #:
    #: When a document language defines one of these as case-insensitive,
    #: cssselect assumes that the document parser makes the parsed values
    #: lower-case. Making the selector lower-case too makes the comparison
    #: case-insensitive.
    #:
    #: In HTML, element names and attributes names (but not attribute values)
    #: are case-insensitive. All of lxml.html, html5lib, BeautifulSoup4
    #: and HTMLParser make them lower-case in their parse result, so
    #: the assumption holds.
    lower_case_element_names = False
    lower_case_attribute_names = False
    lower_case_attribute_values = False

    def css_to_xpath(self, css, prefix='descendant-or-self::'):
        """Translate a *group of selectors* to XPath.

        Pseudo-elements are not supported here since XPath only knows
        about "real" elements.

        :param css:
            A *group of selectors* as an Unicode string.
        :param prefix:
            String prepended to the XPath expression of every selector
            in the group.
        :raises:
            :class:`SelectorSyntaxError` on invalid selectors,
            :class:`ExpressionError` on unknown/unsupported selectors,
            including pseudo-elements.
        :returns:
            The equivalent XPath 1.0 expression as an Unicode string.

        """
        selectors = parse(css)
        # Reject the whole group up front rather than half-way through
        # building the result.
        for selector in selectors:
            if selector.pseudo_element:
                raise ExpressionError('Pseudo-elements are not supported.')

        return ' | '.join(
            self.selector_to_xpath(selector, prefix)
            for selector in selectors)

    def selector_to_xpath(self, selector, prefix='descendant-or-self::'):
        """Translate a parsed selector to XPath.

        The :attr:`~Selector.pseudo_element` attribute of the selector
        is ignored. It is the caller's responsibility to reject selectors
        with pseudo-elements, or to account for them somehow.

        :param selector:
            A parsed :class:`Selector` object.
        :param prefix:
            String prepended to the resulting expression; a false value
            is treated as the empty string.
        :raises:
            :class:`ExpressionError` on unknown/unsupported selectors.
            :class:`TypeError` if ``selector`` has no ``parsed_tree``.
        :returns:
            The equivalent XPath 1.0 expression as an Unicode string.

        """
        tree = getattr(selector, 'parsed_tree', None)
        if not tree:
            raise TypeError('Expected a parsed selector, got %r' % (selector,))
        return (prefix or '') + _unicode(self.xpath(tree))

    @staticmethod
    def xpath_literal(s):
        """Return ``s`` quoted as an XPath 1.0 string literal.

        XPath 1.0 has no escape sequences, so a string containing both
        quote characters is built with ``concat()`` from pieces that each
        contain only one kind of quote.
        """
        s = _unicode(s)
        if "'" not in s:
            s = "'%s'" % s
        elif '"' not in s:
            s = '"%s"' % s
        else:
            s = "concat(%s)" % ','.join([
                ('"%s"' if "'" in part else "'%s'") % part
                for part in split_at_single_quotes(s) if part
            ])
        return s

    def xpath(self, parsed_selector):
        """Translate any parsed selector object."""
        type_name = type(parsed_selector).__name__
        method = getattr(self, 'xpath_%s' % type_name.lower())
        return method(parsed_selector)

    # Dispatched by parsed object type

    def xpath_combinedselector(self, combined):
        """Translate a combined selector."""
        combinator = self.combinator_mapping[combined.combinator]
        method = getattr(self, 'xpath_%s_combinator' % combinator)
        return method(self.xpath(combined.selector),
                      self.xpath(combined.subselector))

    def xpath_negation(self, negation):
        """Translate a ``:not()`` selector."""
        xpath = self.xpath(negation.selector)
        sub_xpath = self.xpath(negation.subselector)
        # Fold the element name into the condition so that a type selector
        # argument such as :not(div) is negated as well.
        sub_xpath.add_name_test()
        if sub_xpath.condition:
            return xpath.add_condition('not(%s)' % sub_xpath.condition)
        # The argument matches every element, so the negation matches none.
        return xpath.add_condition('0')

    def xpath_function(self, function):
        """Translate a functional pseudo-class."""
        method = 'xpath_%s_function' % function.name.replace('-', '_').lower()
        method = getattr(self, method, None)
        if not method:
            raise ExpressionError(
                "The pseudo-class :%s() is unknown" % function.name)
        return method(self.xpath(function.selector), function)

    def xpath_pseudo(self, pseudo):
        """Translate a pseudo-class."""
        method = 'xpath_%s_pseudo' % pseudo.ident.replace('-', '_').lower()
        method = getattr(self, method, None)
        if not method:
            # TODO: better error message for pseudo-elements?
            raise ExpressionError(
                "The pseudo-class :%s is unknown" % pseudo.ident)
        return method(self.xpath(pseudo.selector))

    def xpath_attrib(self, selector):
        """Translate an attribute selector."""
        operator = self.attribute_operator_mapping[selector.operator]
        method = getattr(self, 'xpath_attrib_%s' % operator)
        if self.lower_case_attribute_names:
            name = selector.attrib.lower()
        else:
            name = selector.attrib
        if selector.namespace == '*':
            name = '@' + name
        else:
            name = '@%s:%s' % (selector.namespace, name)
        value = selector.value
        # Presumably the value is None for the 'exists' operator
        # (xpath_attrib_exists asserts that it is falsy); don't call
        # .lower() on it in that case.
        if self.lower_case_attribute_values and value is not None:
            value = value.lower()
        return method(self.xpath(selector.selector), name, value)

    def xpath_class(self, class_selector):
        """Translate a class selector."""
        # .foo is defined as [class~=foo] in the spec.
        xpath = self.xpath(class_selector.selector)
        return self.xpath_attrib_includes(
            xpath, '@class', class_selector.class_name)

    def xpath_hash(self, id_selector):
        """Translate an ID selector."""
        xpath = self.xpath(id_selector.selector)
        # Honour id_attribute so that sub-classes for other document
        # languages only have to override that attribute.
        return self.xpath_attrib_equals(xpath, '@' + self.id_attribute,
                                        id_selector.id)

    def xpath_element(self, selector):
        """Translate a type or universal selector."""
        if self.lower_case_element_names:
            element = selector.element.lower()
        else:
            element = selector.element
        if selector.namespace != '*':
            # Namespace prefixes are case-sensitive.
            element = '%s:%s' % (selector.namespace, element)
        return XPathExpr(element=element)

    # CombinedSelector: dispatch by combinator

    def xpath_descendant_combinator(self, left, right):
        """right is a child, grand-child or further descendant of left"""
        return left.join('/descendant-or-self::*/', right)

    def xpath_child_combinator(self, left, right):
        """right is an immediate child of left"""
        return left.join('/', right)

    def xpath_direct_adjacent_combinator(self, left, right):
        """right is a sibling immediately after left"""
        xpath = left.join('/following-sibling::', right)
        # position() must count every following sibling, not only those
        # with the right name, so the name has to become a condition.
        xpath.add_name_test()
        return xpath.add_condition('position() = 1')

    def xpath_indirect_adjacent_combinator(self, left, right):
        """right is a sibling after left, immediately or not"""
        return left.join('/following-sibling::', right)

    # Function: dispatch by function/pseudo-class name

    def xpath_nth_child_function(self, xpath, function, last=False,
                                 add_name_test=True):
        """Translate ``:nth-child()`` and its three variants.

        The argument is parsed into ``(a, b)`` meaning "every index of the
        form ``a*n + b`` for an integer ``n >= 0``", with indexes starting
        at 1.

        :param last:
            Count indexes from the last sibling instead of the first.
        :param add_name_test:
            If true, count siblings of any name (``*-child``); otherwise
            count only siblings selected by the node test (``*-of-type``).
        :raises:
            :class:`ExpressionError` if the argument is not a valid series.

        """
        try:
            a, b = parse_series(function.arguments)
        except ValueError:
            raise ExpressionError("Invalid series: '%r'" % function.arguments)

        # Positions are only meaningful among the children of one parent.
        xpath.add_star_prefix()
        if add_name_test:
            xpath.add_name_test()

        # 1-based index of the context node, from whichever end is wanted.
        if last:
            index = '(last() - position() + 1)'
        else:
            index = 'position()'

        if a == 0:
            # 0n+b: exactly the b-th element.
            return xpath.add_condition('%s = %s' % (index, b))

        if a > 0:
            # b, b+a, b+2a, ...
            conditions = []
            if b > 1:
                conditions.append('%s >= %s' % (index, b))
            if a != 1:
                if b >= 0:
                    offset = '- %s' % b
                else:
                    offset = '+ %s' % -b
                conditions.append('(%s %s) mod %s = 0' % (index, offset, a))
        else:
            # b, b-|a|, b-2|a|, ... down to 1
            if b < 1:
                return xpath.add_condition('0')
            conditions = ['%s <= %s' % (index, b)]
            if a != -1:
                # The dividend is kept non-negative: XPath's mod takes the
                # sign of the dividend.
                conditions.append('(%s - %s) mod %s = 0' % (b, index, -a))

        if conditions:
            xpath.add_condition(' and '.join(conditions))
        return xpath

    def xpath_nth_last_child_function(self, xpath, function):
        return self.xpath_nth_child_function(xpath, function, last=True)

    def xpath_nth_of_type_function(self, xpath, function):
        if xpath.element == '*':
            raise ExpressionError(
                "*:nth-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function,
                                             add_name_test=False)

    def xpath_nth_last_of_type_function(self, xpath, function):
        if xpath.element == '*':
            raise ExpressionError(
                "*:nth-last-of-type() is not implemented")
        return self.xpath_nth_child_function(xpath, function, last=True,
                                             add_name_test=False)

    def xpath_contains_function(self, xpath, function):
        return xpath.add_condition('contains(string(.), %s)'
                                   % self.xpath_literal(function.arguments))

    def function_unsupported(self, xpath, pseudo):
        """Common implementation for functions this translator rejects."""
        raise ExpressionError(
            "The pseudo-class :%s() is not supported" % pseudo.name)

    xpath_lang_function = function_unsupported

    # Pseudo: dispatch by pseudo-class name

    def xpath_root_pseudo(self, xpath):
        return xpath.add_condition("not(parent::*)")

    # The positional pseudo-classes below need the star prefix so that
    # position()/last() are evaluated among the children of one parent.
    # The *-child ones also need the name test so that all siblings are
    # counted, not only those with the selected name.

    def xpath_first_child_pseudo(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        return xpath.add_condition('position() = 1')

    def xpath_last_child_pseudo(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        return xpath.add_condition('position() = last()')

    def xpath_first_of_type_pseudo(self, xpath):
        if xpath.element == '*':
            raise ExpressionError(
                "*:first-of-type is not implemented")
        xpath.add_star_prefix()
        return xpath.add_condition('position() = 1')

    def xpath_last_of_type_pseudo(self, xpath):
        if xpath.element == '*':
            raise ExpressionError(
                "*:last-of-type is not implemented")
        xpath.add_star_prefix()
        return xpath.add_condition('position() = last()')

    def xpath_only_child_pseudo(self, xpath):
        xpath.add_star_prefix()
        xpath.add_name_test()
        return xpath.add_condition('last() = 1')

    def xpath_only_of_type_pseudo(self, xpath):
        if xpath.element == '*':
            raise ExpressionError(
                "*:only-of-type is not implemented")
        xpath.add_star_prefix()
        return xpath.add_condition('last() = 1')

    def xpath_empty_pseudo(self, xpath):
        return xpath.add_condition("not(*) and not(normalize-space())")

    def pseudo_never_matches(self, xpath):
        """Common implementation for pseudo-classes that never match."""
        return xpath.add_condition("0")

    xpath_link_pseudo = pseudo_never_matches
    xpath_visited_pseudo = pseudo_never_matches
    xpath_hover_pseudo = pseudo_never_matches
    xpath_active_pseudo = pseudo_never_matches
    xpath_focus_pseudo = pseudo_never_matches
    xpath_target_pseudo = pseudo_never_matches
    xpath_enabled_pseudo = pseudo_never_matches
    xpath_disabled_pseudo = pseudo_never_matches
    xpath_checked_pseudo = pseudo_never_matches

    # Attrib: dispatch by attribute operator

    def xpath_attrib_exists(self, xpath, name, value):
        assert not value
        xpath.add_condition(name)
        return xpath

    def xpath_attrib_equals(self, xpath, name, value):
        xpath.add_condition('%s = %s' % (name, self.xpath_literal(value)))
        return xpath

    def xpath_attrib_different(self, xpath, name, value):
        # FIXME: this seems like a weird hack...
        if value:
            xpath.add_condition('not(%s) or %s != %s'
                                % (name, name, self.xpath_literal(value)))
        else:
            xpath.add_condition('%s != %s'
                                % (name, self.xpath_literal(value)))
        return xpath

    def xpath_attrib_includes(self, xpath, name, value):
        # An empty word, or one containing whitespace, can never be one of
        # the whitespace-separated words of the attribute value.
        if value and not any(char in ' \t\r\n\f' for char in value):
            xpath.add_condition(
                "%s and contains(concat(' ', normalize-space(%s), ' '), %s)"
                % (name, name, self.xpath_literal(' ' + value + ' ')))
        else:
            xpath.add_condition('0')
        return xpath

    def xpath_attrib_dashmatch(self, xpath, name, value):
        # Weird, but true...
        xpath.add_condition('%s and (%s = %s or starts-with(%s, %s))' % (
            name,
            name, self.xpath_literal(value),
            name, self.xpath_literal(value + '-')))
        return xpath

    def xpath_attrib_prefixmatch(self, xpath, name, value):
        # With an empty value the selector does not represent anything.
        if value:
            xpath.add_condition('%s and starts-with(%s, %s)' % (
                name, name, self.xpath_literal(value)))
        else:
            xpath.add_condition('0')
        return xpath

    def xpath_attrib_suffixmatch(self, xpath, name, value):
        # With an empty value the selector does not represent anything.
        if value:
            # Oddly there is a starts-with in XPath 1.0, but not ends-with
            xpath.add_condition(
                '%s and substring(%s, string-length(%s)-%s) = %s'
                % (name, name, name, len(value) - 1,
                   self.xpath_literal(value)))
        else:
            xpath.add_condition('0')
        return xpath

    def xpath_attrib_substringmatch(self, xpath, name, value):
        # With an empty value the selector does not represent anything.
        if value:
            # Attribute selectors are case sensitive
            xpath.add_condition('%s and contains(%s, %s)' % (
                name, name, self.xpath_literal(value)))
        else:
            xpath.add_condition('0')
        return xpath
class HTMLTranslator(GenericTranslator):
    """
    Translator for (X)HTML documents.

    Has a more useful implementation of some pseudo-classes based on
    HTML-specific element names and attribute names, as described in
    the `HTML5 specification`_. It assumes no-quirks mode.
    The API is the same as :class:`GenericTranslator`.

    .. _HTML5 specification: http://www.w3.org/TR/html5/links.html#selectors

    :param xhtml:
        If false (the default), element names and attribute names
        are case-insensitive.

    """

    def __init__(self, xhtml=False):
        self.xhtml = xhtml  # Might be useful for sub-classes?
        if not xhtml:
            # See their definition in GenericTranslator.
            self.lower_case_element_names = True
            self.lower_case_attribute_names = True

    def xpath_checked_pseudo(self, xpath):
        """``:checked``: selected options and checked checkboxes/radios."""
        # FIXME: is this really all the elements?
        return xpath.add_condition(
            "(@selected and name(.) = 'option') or "
            "(@checked "
            "and (name(.) = 'input' or name(.) = 'command')"
            "and (@type = 'checkbox' or @type = 'radio'))")

    def xpath_link_pseudo(self, xpath):
        """``:link``: ``a``, ``link`` and ``area`` elements with an href."""
        return xpath.add_condition(
            "@href and "
            "(name(.) = 'a' or name(.) = 'link' or name(.) = 'area')")

    # Links are never visited, the implementation for :visited is the same
    # as in GenericTranslator

    def xpath_disabled_pseudo(self, xpath):
        """``:disabled``: elements disabled directly or through a fieldset."""
        # NOTE(review): "@type != 'hidden'" is false when the type attribute
        # is absent, so inputs without an explicit type never match here --
        # confirm whether that is intended.
        return xpath.add_condition('''
        (
            @disabled and
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup' or
                name(.) = 'option'
            )
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea'
            )
            and ancestor::fieldset[@disabled]
        )
        ''')
        # FIXME: in the second half, add "and is not a descendant of that
        # fieldset element's first legend element child, if any."

    def xpath_enabled_pseudo(self, xpath):
        """``:enabled``: links and form-related elements not disabled."""
        return xpath.add_condition('''
        (
            @href and (
                name(.) = 'a' or
                name(.) = 'link' or
                name(.) = 'area'
            )
        ) or (
            (
                name(.) = 'command' or
                name(.) = 'fieldset' or
                name(.) = 'optgroup'
            )
            and not(@disabled)
        ) or (
            (
                (name(.) = 'input' and @type != 'hidden') or
                name(.) = 'button' or
                name(.) = 'select' or
                name(.) = 'textarea' or
                name(.) = 'keygen'
            )
            and not (@disabled or ancestor::fieldset[@disabled])
        ) or (
            name(.) = 'option' and not(
                @disabled or ancestor::optgroup[@disabled]
            )
        )
        ''')
        # FIXME: ... or "li elements that are children of menu elements,
        # and that have a child element that defines a command, if the first
        # such element's Disabled State facet is false (not disabled)".
        # FIXME: after ancestor::fieldset[@disabled], add "and is not a
        # descendant of that fieldset element's first legend element child,
        # if any."