Skip to content

Commit

Permalink
Merge pull request #39 from scrapinghub/wa-baseurl
Browse files: browse the repository at this point in the history
preserve URL in <base> tag
  • Loading branch information
kmike committed Apr 6, 2017
2 parents 97d6d37 + c4786cb commit 1499ad0
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 6 deletions.
8 changes: 4 additions & 4 deletions webstruct/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,22 +93,22 @@ def build_entity(self, html_tokens):
"""
return _join_tokens(html_tokens)

def annotate(self, bytes_data, pretty_print=False):
def annotate(self, bytes_data, url=None, pretty_print=False):
"""
Return annotated HTML data in WebAnnotator format.
"""
html_tokens, tags = self.extract_raw(bytes_data)
tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
tree = to_webannotator(tree, self.entity_colors)
tree = to_webannotator(tree, entity_colors=self.entity_colors, url=url)
return tostring(tree, pretty_print=pretty_print)

def annotate_url(self, url):
def annotate_url(self, url, pretty_print=False):
"""
Return annotated HTML data in WebAnnotator format; input is downloaded
from ``url``.
"""
data = urlopen(url).read()
return self.annotate(data)
return self.annotate(data, pretty_print=pretty_print, url=url)

def __getstate__(self):
dct = self.__dict__.copy()
Expand Down
36 changes: 36 additions & 0 deletions webstruct/tests/test_webannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,42 @@ def test_handle_nonxml_attributes(self):
wa_tree_str = tostring(wa_tree)
self.assertHtmlEqual(wa_tree_str, html)

def test_baseurl_nohead(self):
    """
    When the input document has no <head>, the converted output is
    expected to contain a <head> holding a <base> tag with the given url.
    """
    html = b"""<html><body><p>hello</p></body></html>"""
    tree = html_document_fromstring(html)
    wa_tree = webannotator.to_webannotator(tree,
                                           url='http://example.com/foo')
    self.assertHtmlEqual(tostring(wa_tree), """
<html>
<head><base href="http://example.com/foo"/></head>
<body><p>hello</p></body>
</html>
""")

def test_baseurl_head(self):
    """
    When the input document already has a <head>, the <base> tag is
    expected as its first child, before the existing <meta/> element.
    """
    html = b"""<html><head><meta/></head><body><p>hello</p></body></html>"""
    tree = html_document_fromstring(html)
    wa_tree = webannotator.to_webannotator(tree,
                                           url='http://example.com/foo')
    self.assertHtmlEqual(tostring(wa_tree), """
<html>
<head><base href="http://example.com/foo"/><meta/></head>
<body><p>hello</p></body>
</html>
""")

def test_baseurl_exists(self):
    """
    When the input document already has a <base> tag, the output is
    expected to equal the input: the different ``url`` passed to
    ``to_webannotator`` must not replace the existing href.
    """
    html = b"""
<html>
<head><base href="http://example.com/foo"/></head>
<body><p>hello</p></body>
</html>
"""
    tree = html_document_fromstring(html)
    wa_tree = webannotator.to_webannotator(tree,
                                           url='http://example.com/bar')
    self.assertHtmlEqual(tostring(wa_tree), html)


class EntityColorsTest(HtmlTest):
def test_entity_colors(self):
Expand Down
28 changes: 26 additions & 2 deletions webstruct/webannotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def _openSpan(self):

def _fix_sax_attributes(attrs):
""" Fix sax startElement attributes for lxml < 3.1.2 """
if LXML_VERSION >= (3,1,2):
if LXML_VERSION >= (3, 1, 2):
return attrs
items = [((None, key), value) for key, value in attrs.items()]
return OrderedDict(items)
Expand Down Expand Up @@ -241,7 +241,29 @@ def _copy_title(tree):
title.text = text


def to_webannotator(tree, entity_colors=None):
def _ensure_head(tree):
    """
    Return the <head> element of ``tree``, inserting one if it is missing.

    The first element matched by ``//head`` is returned when there is one.
    Otherwise a new empty <head> is inserted as the first child of the
    first <html> element, or of the tree's root element when there is no
    <html> element, and the new element is returned.

    NOTE(review): assumes ``tree`` is an lxml ElementTree (it needs
    ``xpath()`` and ``getroot()``) - confirm against callers.
    """
    heads = tree.xpath('//head')
    if heads:
        return heads[0]
    htmls = tree.xpath('//html')
    if htmls:
        root = htmls[0]
    else:
        # ElementTree objects expose their root element via getroot();
        # they have no ``root`` attribute, so the previous ``tree.root``
        # raised AttributeError for documents without an <html> element.
        root = tree.getroot()
    head = Element("head")
    root.insert(0, head)
    return head


def _set_base(tree, baseurl):
    """
    Make sure the document has a <base> tag.

    A <base> element found anywhere in ``tree`` is left untouched and
    nothing is added. Otherwise a new <base> element with ``href`` set to
    ``baseurl`` is inserted as the first child of the element returned by
    ``_ensure_head``.
    """
    if not tree.xpath('//base'):
        head_element = _ensure_head(tree)
        base_element = Element("base", href=baseurl)
        head_element.insert(0, base_element)


def to_webannotator(tree, entity_colors=None, url=None):
"""
Convert a tree loaded by one of WebStruct loaders to WebAnnotator format.
Expand All @@ -261,4 +283,6 @@ def to_webannotator(tree, entity_colors=None):
tree = handler.out.etree
_copy_title(tree)
_add_wacolor_elements(tree, handler.entity_colors)
if url is not None:
_set_base(tree, url)
return tree

0 comments on commit 1499ad0

Please sign in to comment.