diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 7f41ee0..c25e44e 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -47,7 +47,6 @@ 'b' : 'strong', 'i' : 'em', } - # tags whoose content will be completely removed (recursively) # (overrides tags_to_keep and tags_to_replace) _TAGS_TO_PURGE = ('script', 'img', 'input') @@ -91,12 +90,11 @@ def text(region): HTML entities are converted to text >>> t(u"only £42") u'only \\xa342' + + >>> t(u"
The text
is here
") + u'The text is here' """ - chunks = _process_markup(region, - lambda text: remove_entities(text, encoding=region.htmlpage.encoding), - lambda tag: u' ' - ) - text = u''.join(chunks) + text = remove_entities(region.text_content, encoding=region.htmlpage.encoding) return _WS.sub(u' ', text).strip() def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index 258e3cc..373990b 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -200,7 +200,7 @@ def __repr__(self): _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" _DOCTYPE = r"" _SCRIPT = "(The text
is here
")) + self.assertFalse(parsed[3].is_text_content) + +