diff --git a/scrapely/extraction/pageparsing.py b/scrapely/extraction/pageparsing.py index c7bea9e..a17c499 100644 --- a/scrapely/extraction/pageparsing.py +++ b/scrapely/extraction/pageparsing.py @@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag): annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1) attribute_annotations = jannotation.pop('annotations', {}).items() + content_key = jannotation.pop('text-content', 'content') for extract_attribute, tag_value in attribute_annotations: - if extract_attribute == 'content': + if extract_attribute == content_key: annotation.surrounds_attribute = tag_value self.unpairedtag_stack.append(annotation) else: @@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag): self.extra_required_attrs.extend(jannotation.pop('required', [])) attribute_annotations = jannotation.pop('annotations', {}).items() + content_key = jannotation.pop('text-content', 'content') for extract_attribute, tag_value in attribute_annotations: - if extract_attribute == 'content': + if extract_attribute == content_key: annotation.surrounds_attribute = tag_value else: annotation.tag_attributes.append((extract_attribute, tag_value)) diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py index 4c93fec..cb3c83e 100644 --- a/scrapely/tests/test_pageparsing.py +++ b/scrapely/tests/test_pageparsing.py @@ -178,6 +178,18 @@ """ +LABELLED_PAGE12 = u""" + + + +""" + +LABELLED_PAGE13 = u""" + +This is the name + +""" + def _parse_page(parser_class, pagetext): htmlpage = HtmlPage(None, {}, pagetext) parser = parser_class(TokenDict()) @@ -304,6 +316,22 @@ def test_variant_attribute(self): annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations self.assertEqual(annotations[0].variant_id, 1) + def test_content_attribute(self): + """ + Test that attribute with name content is unambiguously interpreted + """ + annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations + self.assertEqual(annotations[0].surrounds_attribute, None) + self.assertEqual(annotations[0].tag_attributes, [("content", "description")]) + + def test_content_and_content_attribute(self): + """ + Test that attribute with name content and the content itself are unambiguously interpreted + """ + annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations + self.assertEqual(annotations[0].surrounds_attribute, 'name') + self.assertEqual(annotations[0].tag_attributes, [("content", "description")]) + def test_site_pages(self): """ Tests from real pages. More reliable and easy to build for more complicated structures