From c8adbb38c6b0bc7ccef3e9c02cdcbfa7986c7616 Mon Sep 17 00:00:00 2001 From: Martin Olveyra Date: Wed, 3 Oct 2012 23:44:12 -0200 Subject: [PATCH] allow annotating attributes named "content" in an unambiguous way and with complete backward compatibility, by allowing one to specify which annotation key is used for the tag content, falling back to the default "content" if the "text-content" annotation meta attribute is not present --- scrapely/extraction/pageparsing.py | 6 ++++-- scrapely/tests/test_pageparsing.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/scrapely/extraction/pageparsing.py b/scrapely/extraction/pageparsing.py index c7bea9e..a17c499 100644 --- a/scrapely/extraction/pageparsing.py +++ b/scrapely/extraction/pageparsing.py @@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag): annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1) attribute_annotations = jannotation.pop('annotations', {}).items() + content_key = jannotation.pop('text-content', 'content') for extract_attribute, tag_value in attribute_annotations: - if extract_attribute == 'content': + if extract_attribute == content_key: annotation.surrounds_attribute = tag_value self.unpairedtag_stack.append(annotation) else: @@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag): self.extra_required_attrs.extend(jannotation.pop('required', [])) attribute_annotations = jannotation.pop('annotations', {}).items() + content_key = jannotation.pop('text-content', 'content') for extract_attribute, tag_value in attribute_annotations: - if extract_attribute == 'content': + if extract_attribute == content_key: annotation.surrounds_attribute = tag_value else: annotation.tag_attributes.append((extract_attribute, tag_value)) diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py index 4c93fec..cb3c83e 100644 --- a/scrapely/tests/test_pageparsing.py +++ b/scrapely/tests/test_pageparsing.py @@ -178,6 +178,18 @@ 
""" +LABELLED_PAGE12 = u""" + + + +""" + +LABELLED_PAGE13 = u""" + +This is the name + +""" + def _parse_page(parser_class, pagetext): htmlpage = HtmlPage(None, {}, pagetext) parser = parser_class(TokenDict()) @@ -304,6 +316,22 @@ def test_variant_attribute(self): annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations self.assertEqual(annotations[0].variant_id, 1) + def test_content_attribute(self): + """ + Test that attribute with name content is unambiguously interpreted + """ + annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations + self.assertEqual(annotations[0].surrounds_attribute, None) + self.assertEqual(annotations[0].tag_attributes, [("content", "description")]) + + def test_content_and_content_attribute(self): + """ + Test that attribute with name content and the content itself are unambiguously interpreted + """ + annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations + self.assertEqual(annotations[0].surrounds_attribute, 'name') + self.assertEqual(annotations[0].tag_attributes, [("content", "description")]) + def test_site_pages(self): """ Tests from real pages. More reliable and easy to build for more complicated structures