Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Use w3lib.html.replace_entities() (remove_entities() is deprecated)
Conflicts:
scrapy/utils/misc.py
- Loading branch information
|
@@ -84,6 +84,6 @@ def _has_ajaxcrawlable_meta(text): |
|
|
|
|
|
text = _script_re.sub(u'', text) |
|
|
text = _noscript_re.sub(u'', text) |
|
|
- text = html.remove_comments(html.remove_entities(text)) |
|
|
+ text = html.remove_comments(html.replace_entities(text)) |
|
|
return _ajax_crawlable_re.search(text) is not None |
|
|
|
|
|
@@ -1,7 +1,7 @@ |
|
|
import re |
|
|
from urlparse import urljoin |
|
|
|
|
|
- from w3lib.html import remove_tags, remove_entities, replace_escape_chars |
|
|
+ from w3lib.html import remove_tags, replace_entities, replace_escape_chars |
|
|
|
|
|
from scrapy.link import Link |
|
|
from .sgml import SgmlLinkExtractor |
|
@@ -21,7 +21,7 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur |
|
|
if base_url is None: |
|
|
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url |
|
|
|
|
|
- clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding)))) |
|
|
+ clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding)))) |
|
|
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip() |
|
|
|
|
|
links_text = linkre.findall(response_text) |
|
|
|
@@ -2,11 +2,11 @@ |
|
|
|
|
|
import re |
|
|
import hashlib |
|
|
|
|
|
from importlib import import_module |
|
|
from pkgutil import iter_modules |
|
|
|
|
|
- from w3lib.html import remove_entities |
|
|
+ from w3lib.html import replace_entities |
|
|
|
|
|
from scrapy.utils.python import flatten |
|
|
from scrapy.item import BaseItem |
|
|
|
|
@@ -87,9 +87,9 @@ def extract_regex(regex, text, encoding='utf-8'): |
|
|
strings = flatten(strings) |
|
|
|
|
|
if isinstance(text, unicode): |
|
|
- return [remove_entities(s, keep=['lt', 'amp']) for s in strings] |
|
|
+ return [replace_entities(s, keep=['lt', 'amp']) for s in strings] |
|
|
else: |
|
|
- return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings] |
|
|
+ return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings] |
|
|
|
|
|
def md5sum(file): |
|
|
"""Calculate the md5 checksum of a file-like object without reading its |
|
|