Use w3lib.html.replace_entities() (remove_entities() is deprecated)

redapple authored and dangra committed Aug 8, 2014
1 parent a51ee8b commit 180d3add0c969b83ce2e927b827885885fea920d
@@ -84,6 +84,6 @@ def _has_ajaxcrawlable_meta(text):

text = _script_re.sub(u'', text)
text = _noscript_re.sub(u'', text)
text = html.remove_comments(html.remove_entities(text))
text = html.remove_comments(html.replace_entities(text))
return _ajax_crawlable_re.search(text) is not None

@@ -1,7 +1,7 @@
import re
from urlparse import urljoin

from w3lib.html import remove_tags, remove_entities, replace_escape_chars
from w3lib.html import remove_tags, replace_entities, replace_escape_chars

from scrapy.link import Link
from .sgml import SgmlLinkExtractor
@@ -21,7 +21,7 @@ def _extract_links(self, response_text, response_url, response_encoding, base_ur
if base_url is None:
base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

links_text = linkre.findall(response_text)
@@ -2,11 +2,11 @@

import re
import hashlib

from importlib import import_module
from pkgutil import iter_modules

from w3lib.html import remove_entities
from w3lib.html import replace_entities

from scrapy.utils.python import flatten
from scrapy.item import BaseItem

@@ -87,9 +87,9 @@ def extract_regex(regex, text, encoding='utf-8'):
strings = flatten(strings)

if isinstance(text, unicode):
return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]

def md5sum(file):
"""Calculate the md5 checksum of a file-like object without reading its

