[MRG+1] Migrating selectors to use parsel #1409

Merged on Aug 11, 2015

Changes from all commits

17 commits:
ce21884  migrating scrapy Selector to use Parsel (eliasdorneles, Aug 3, 2015)
c7b29d1  fix support to legacy _root argument (eliasdorneles, Aug 3, 2015)
3a572e2  cleanup csstranslator module, keeping only imports (eliasdorneles, Aug 3, 2015)
01d948f  remove selector support for LxmlDocument DOM cache and add deprecatio… (eliasdorneles, Aug 4, 2015)
17d7347  update minimal parsel version, add deprecated classes for csstranslat… (eliasdorneles, Aug 5, 2015)
35c1dcd  use response.selector in link extractors instead of instantiating new… (eliasdorneles, Aug 5, 2015)
6287fc3  remove lxmldocument dependency from http.request.form (eliasdorneles, Aug 7, 2015)
94c3a34  remove deprecated module lxmldocument (eliasdorneles, Aug 7, 2015)
67c98b1  avoid harcoded check for selector type (eliasdorneles, Aug 7, 2015)
2fe6d12  upgrade parsel and using promoted root attribute (eliasdorneles, Aug 7, 2015)
26ebccd  upgrade parsel and use its function to instantiate root for finding form (eliasdorneles, Aug 7, 2015)
12579b9  warning when ambiguous root arguments and minor cleanups (eliasdorneles, Aug 9, 2015)
3a03ef7  cleanup tests for selectors and translators (eliasdorneles, Aug 9, 2015)
8ef5aa2  using bytes for response body in tests (eliasdorneles, Aug 11, 2015)
e50610b  set base_url in kwargs to be fully backward compatible (eliasdorneles, Aug 11, 2015)
766c255  upgrade parsel and add shim for deprecated selectorlist methods (eliasdorneles, Aug 11, 2015)
a5abd19  make Parsel's Selector more private, remove direct dependency of Pars… (eliasdorneles, Aug 11, 2015)
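
Taken together, the series moves scrapy.Selector onto parsel's Selector while keeping the public API intact. A rough illustration of the behavior the PR preserves (the sample markup is ours, not from the diff):

from scrapy.selector import Selector

# xpath/css/extract work exactly as before; only the backing
# implementation now lives in the parsel library
sel = Selector(text=u'<html><body><span>good</span></body></html>')
print(sel.css('span::text').extract())       # [u'good']
print(sel.xpath('//span/text()').extract())  # [u'good']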
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,3 +7,4 @@ queuelib
 six>=1.5.2
 PyDispatcher>=2.0.5
 service_identity
+parsel>=0.9.5
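
A quick environment check against the new floor; this assumes parsel exposes __version__ at the package top level:

import parsel

# requirements.txt now pins parsel>=0.9.5
print(parsel.__version__)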
5 changes: 3 additions & 2 deletions scrapy/http/request/form.py
@@ -7,6 +7,7 @@
 
 from six.moves.urllib.parse import urljoin, urlencode
 import lxml.html
+from parsel.selector import create_root_node
 import six
 from scrapy.http.request import Request
 from scrapy.utils.python import to_bytes, is_listlike
@@ -56,8 +57,8 @@ def _urlencode(seq, enc):
 
 def _get_form(response, formname, formid, formnumber, formxpath):
     """Find the form element """
-    from scrapy.selector.lxmldocument import LxmlDocument
-    root = LxmlDocument(response, lxml.html.HTMLParser)
+    text = response.body_as_unicode()
+    root = create_root_node(text, lxml.html.HTMLParser, base_url=response.url)
     forms = root.xpath('//form')
     if not forms:
         raise ValueError("No <form> element found in %s" % response)
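
For reference, a standalone sketch of the new code path in _get_form(), with a made-up page and URL standing in for the response; create_root_node comes from parsel exactly as in the diff:

import lxml.html
from parsel.selector import create_root_node

text = u'<html><body><form action="/login"></form></body></html>'
# base_url keeps relative form actions resolvable, as the
# response-based LxmlDocument parsing did before
root = create_root_node(text, lxml.html.HTMLParser, base_url='http://example.com')
forms = root.xpath('//form')
if not forms:
    raise ValueError("No <form> element found")
print(forms[0].action)  # http://example.com/login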
10 changes: 4 additions & 6 deletions scrapy/linkextractors/lxmlhtml.py
@@ -47,7 +47,7 @@ def _iter_links(self, document):
     def _extract_links(self, selector, response_url, response_encoding, base_url):
         links = []
         # hacky way to get the underlying lxml parsed document
-        for el, attr, attr_val in self._iter_links(selector._root):
+        for el, attr, attr_val in self._iter_links(selector.root):
             # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
             attr_val = urljoin(base_url, attr_val)
             url = self.process_attr(attr_val)
@@ -65,9 +65,8 @@ def _extract_links(self, selector, response_url, response_encoding, base_url):
             if self.unique else links
 
     def extract_links(self, response):
-        html = Selector(response)
         base_url = get_base_url(response)
-        return self._extract_links(html, response.url, response.encoding, base_url)
+        return self._extract_links(response.selector, response.url, response.encoding, base_url)
 
     def _process_links(self, links):
         """ Normalize and filter extracted links
@@ -95,14 +94,13 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
             canonicalize=canonicalize, deny_extensions=deny_extensions)
 
     def extract_links(self, response):
-        html = Selector(response)
         base_url = get_base_url(response)
         if self.restrict_xpaths:
             docs = [subdoc
                     for x in self.restrict_xpaths
-                    for subdoc in html.xpath(x)]
+                    for subdoc in response.xpath(x)]
         else:
-            docs = [html]
+            docs = [response.selector]
         all_links = []
         for doc in docs:
             links = self._extract_links(doc, response.url, response.encoding, base_url)
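
Both rewrites above rely on response.selector, which is built lazily and cached on the response, so the body is parsed at most once no matter how many times extract_links() runs. A small self-contained illustration (the sample response is ours):

from scrapy.http import HtmlResponse

response = HtmlResponse(
    url='http://example.com',
    body=b'<html><body><a href="/next">next</a></body></html>')
# repeated access returns the same cached Selector instance
assert response.selector is response.selector
# response.xpath(...) is shorthand for response.selector.xpath(...)
print(response.xpath('//a/@href').extract())  # [u'/next']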
3 changes: 1 addition & 2 deletions scrapy/linkextractors/sgml.py
@@ -127,11 +127,10 @@ def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restric
     def extract_links(self, response):
         base_url = None
         if self.restrict_xpaths:
-            sel = Selector(response)
             base_url = get_base_url(response)
             body = u''.join(f
                             for x in self.restrict_xpaths
-                            for f in sel.xpath(x).extract()
+                            for f in response.xpath(x).extract()
                             ).encode(response.encoding, errors='xmlcharrefreplace')
         else:
             body = response.body
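
At the user level the restrict_xpaths behavior is unchanged; it now just queries response.xpath() directly. A usage sketch with the lxml-based extractor (markup and element ids are ours):

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

response = HtmlResponse(
    url='http://example.com',
    body=b'<div id="nav"><a href="/a">a</a></div><a href="/b">b</a>')
le = LinkExtractor(restrict_xpaths='//div[@id="nav"]')
# only links inside the restricted region are returned
print([link.url for link in le.extract_links(response)])
# ['http://example.com/a']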
95 changes: 11 additions & 84 deletions scrapy/selector/csstranslator.py
@@ -1,88 +1,15 @@
-from cssselect import GenericTranslator, HTMLTranslator
-from cssselect.xpath import _unicode_safe_getattr, XPathExpr, ExpressionError
-from cssselect.parser import FunctionalPseudoElement
+from parsel.csstranslator import XPathExpr, GenericTranslator, HTMLTranslator
+from scrapy.utils.deprecate import create_deprecated_class
 
 
-class ScrapyXPathExpr(XPathExpr):
+ScrapyXPathExpr = create_deprecated_class(
+    'ScrapyXPathExpr', XPathExpr,
+    new_class_path='parsel.csstranslator.XPathExpr')
 
-    textnode = False
-    attribute = None
-
-    @classmethod
-    def from_xpath(cls, xpath, textnode=False, attribute=None):
-        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
-        x.textnode = textnode
-        x.attribute = attribute
-        return x
-
-    def __str__(self):
-        path = super(ScrapyXPathExpr, self).__str__()
-        if self.textnode:
-            if path == '*':
-                path = 'text()'
-            elif path.endswith('::*/*'):
-                path = path[:-3] + 'text()'
-            else:
-                path += '/text()'
-
-        if self.attribute is not None:
-            if path.endswith('::*/*'):
-                path = path[:-2]
-            path += '/@%s' % self.attribute
-
-        return path
-
-    def join(self, combiner, other):
-        super(ScrapyXPathExpr, self).join(combiner, other)
-        self.textnode = other.textnode
-        self.attribute = other.attribute
-        return self
-
-
-class TranslatorMixin(object):
-
-    def xpath_element(self, selector):
-        xpath = super(TranslatorMixin, self).xpath_element(selector)
-        return ScrapyXPathExpr.from_xpath(xpath)
-
-    def xpath_pseudo_element(self, xpath, pseudo_element):
-        if isinstance(pseudo_element, FunctionalPseudoElement):
-            method = 'xpath_%s_functional_pseudo_element' % (
-                pseudo_element.name.replace('-', '_'))
-            method = _unicode_safe_getattr(self, method, None)
-            if not method:
-                raise ExpressionError(
-                    "The functional pseudo-element ::%s() is unknown"
-                    % pseudo_element.name)
-            xpath = method(xpath, pseudo_element)
-        else:
-            method = 'xpath_%s_simple_pseudo_element' % (
-                pseudo_element.replace('-', '_'))
-            method = _unicode_safe_getattr(self, method, None)
-            if not method:
-                raise ExpressionError(
-                    "The pseudo-element ::%s is unknown"
-                    % pseudo_element)
-            xpath = method(xpath)
-        return xpath
-
-    def xpath_attr_functional_pseudo_element(self, xpath, function):
-        if function.argument_types() not in (['STRING'], ['IDENT']):
-            raise ExpressionError(
-                "Expected a single string or ident for ::attr(), got %r"
-                % function.arguments)
-        return ScrapyXPathExpr.from_xpath(xpath,
-                                          attribute=function.arguments[0].value)
-
-    def xpath_text_simple_pseudo_element(self, xpath):
-        """Support selecting text nodes using ::text pseudo-element"""
-        return ScrapyXPathExpr.from_xpath(xpath, textnode=True)
-
-
-class ScrapyGenericTranslator(TranslatorMixin, GenericTranslator):
-    pass
-
-
-class ScrapyHTMLTranslator(TranslatorMixin, HTMLTranslator):
-    pass
+ScrapyGenericTranslator = create_deprecated_class(
+    'ScrapyGenericTranslator', GenericTranslator,
+    new_class_path='parsel.csstranslator.GenericTranslator')
 
+ScrapyHTMLTranslator = create_deprecated_class(
+    'ScrapyHTMLTranslator', HTMLTranslator,
+    new_class_path='parsel.csstranslator.HTMLTranslator')
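
The old translator names keep importing, but create_deprecated_class makes them warn and point at their parsel replacements. A minimal check of the shim:

from scrapy.selector.csstranslator import ScrapyHTMLTranslator

# instantiating the shim emits ScrapyDeprecationWarning, directing users
# to parsel.csstranslator.HTMLTranslator; translation behavior is unchanged
translator = ScrapyHTMLTranslator()
print(translator.css_to_xpath('span::text'))
# descendant-or-self::span/text()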
31 changes: 0 additions & 31 deletions scrapy/selector/lxmldocument.py

This file was deleted.
