diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py index 57ad751..fec5d58 100644 --- a/scrapely/extraction/regionextract.py +++ b/scrapely/extraction/regionextract.py @@ -107,13 +107,9 @@ def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kw self._extract_attribute(page, start_index, end_index, ignored_regions) def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs): - # extract content between annotation indexes - if not ignored_regions: - region = extraction_page.htmlpage_region_inside(start_index, end_index) - else: - # assumes ignored_regions are completely contained within start and end index - assert (start_index <= ignored_regions[0].start_index and - end_index >= ignored_regions[-1].end_index) + """extract content between annotation indexes""" + if ignored_regions and (start_index <= ignored_regions[0].start_index and + end_index >= ignored_regions[-1].end_index): starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None] ends = [i.start_index for i in ignored_regions] if starts[-1] is not None: @@ -123,6 +119,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi included_regions.next() regions = starmap(extraction_page.htmlpage_region_inside, included_regions) region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions)) + else: + region = extraction_page.htmlpage_region_inside(start_index, end_index) validated = self.content_validate(region) return [(self.annotation.surrounds_attribute, validated)] if validated else [] diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index aedb7c9..0af030e 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -960,6 +960,24 @@ """ +# repeated elements with ignored region only in one of them +ANNOTATED_PAGE32 = u""" + +""" + +EXTRACT_PAGE32 = u""" + +""" + DEFAULT_DESCRIPTOR = ItemDescriptor('test', 'item test, removes tags from description attribute', [A('description', 'description field without tags', notags)]) @@ -1265,7 +1283,10 @@ u'name': [u'Product name'], u'image_urls': [['http://example.com/image.jpg']] } - ) + ), + ('single ignored region inside a repeated structure', [ANNOTATED_PAGE32], EXTRACT_PAGE32, DEFAULT_DESCRIPTOR, + {'features': [u'feature1', u'feature2', u'feature3']} + ), ] class TestIbl(TestCase):