Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions scrapely/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
'b' : 'strong',
'i' : 'em',
}

# tags whose content will be completely removed (recursively)
# (overrides tags_to_keep and tags_to_replace)
_TAGS_TO_PURGE = ('script', 'img', 'input')
Expand Down Expand Up @@ -91,12 +90,11 @@ def text(region):
HTML entities are converted to text
>>> t(u"only £42")
u'only \\xa342'

>>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
u'The text is here'
"""
chunks = _process_markup(region,
lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
lambda tag: u' '
)
text = u''.join(chunks)
text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
return _WS.sub(u' ', text).strip()

def safehtml(region, allowed_tags=_TAGS_TO_KEEP, replace_tags=_TAGS_TO_REPLACE,
Expand Down
2 changes: 1 addition & 1 deletion scrapely/htmlpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def __repr__(self):
_TAG = "<(\/?)(\w+(?::\w+)?)((?:\s*" + _ATTR + ")+\s*|\s*)(\/?)>?"
_DOCTYPE = r"<!DOCTYPE.*?>"
_SCRIPT = "(<script.*?>)(.*?)(</script.*?>)"
_COMMENT = "(<!--.*?-->)"
_COMMENT = "(<!--.*?-->|<\?.+?>)"

_ATTR_REGEXP = re.compile(_ATTR, re.I | re.DOTALL)
_HTML_REGEXP = re.compile("%s|%s|%s" % (_COMMENT, _SCRIPT, _TAG), re.I | re.DOTALL)
Expand Down
7 changes: 7 additions & 0 deletions scrapely/tests/test_htmlpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,10 @@ def test_malformed2(self):
def test_empty_subregion(self):
    """The subregion of a page built from an empty body is an empty string."""
    empty_page = HtmlPage(body=u"")
    region = empty_page.subregion()
    self.assertEqual(region, u"")

def test_ignore_xml_declaration(self):
    """An XML declaration embedded in HTML must not be parsed as text content."""
    markup = u"<p>The text</p><?xml:namespace blabla/><p>is here</p>"
    fragments = list(parse_html(markup))
    # The assertion targets the fourth parsed fragment (index 3).
    declaration = fragments[3]
    self.assertFalse(declaration.is_text_content)