diff --git a/scrapely/extractors.py b/scrapely/extractors.py index 7f41ee0..c25e44e 100644 --- a/scrapely/extractors.py +++ b/scrapely/extractors.py @@ -47,7 +47,6 @@ 'b' : 'strong', 'i' : 'em', } - # tags whoose content will be completely removed (recursively) # (overrides tags_to_keep and tags_to_replace) _TAGS_TO_PURGE = ('script', 'img', 'input') @@ -91,12 +90,11 @@ def text(region): HTML entities are converted to text >>> t(u"only £42") u'only \\xa342' + + >>> t(u"

The text

is here

") + u'The text is here' """ - chunks = _process_markup(region, - lambda text: remove_entities(text, encoding=region.htmlpage.encoding), - lambda tag: u' ' - ) - text = u''.join(chunks) + text = remove_entities(region.text_content, encoding=region.htmlpage.encoding) return _WS.sub(u' ', text).strip() def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE, diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index 258e3cc..373990b 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -200,7 +200,7 @@ def __repr__(self): _TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?" _DOCTYPE = r"" _SCRIPT = "()(.*?)()" -_COMMENT = "()" +_COMMENT = "(|<\?.+?>)" _ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL) _HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), re.I | re.DOTALL) diff --git a/scrapely/tests/test_htmlpage.py b/scrapely/tests/test_htmlpage.py index 5d83dc1..a987a70 100644 --- a/scrapely/tests/test_htmlpage.py +++ b/scrapely/tests/test_htmlpage.py @@ -138,3 +138,10 @@ def test_malformed2(self): def test_empty_subregion(self): htmlpage = HtmlPage(body=u"") self.assertEqual(htmlpage.subregion(), u"") + + def test_ignore_xml_declaration(self): + """Ignore xml declarations inside html""" + parsed = list(parse_html(u"

The text

is here

")) + self.assertFalse(parsed[3].is_text_content) + +