diff --git a/scrapely/extraction/regionextract.py b/scrapely/extraction/regionextract.py index e1e6ffa..57ad751 100644 --- a/scrapely/extraction/regionextract.py +++ b/scrapely/extraction/regionextract.py @@ -409,7 +409,7 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs) if end_index is not None: pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs) - extracted_data += following_data + extracted_data += following_data elif nested_regions: _, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs) extracted_data += nested_data diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index b5d4600..aedb7c9 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -936,6 +936,29 @@
""" +ANNOTATED_PAGE31 = u""" + +
+Product name +

60.00

+description +features + +
+
+ +""" + +EXTRACT_PAGE31 = u""" + +
+Product name +

60.00

+ +
+
+ +""" DEFAULT_DESCRIPTOR = ItemDescriptor('test', 'item test, removes tags from description attribute', @@ -950,6 +973,15 @@ ] ) +SAMPLE_DESCRIPTOR1a = ItemDescriptor('test', 'product test', [ + A('name', "Product name"), + A('price', "Product price, including any discounts and tax or vat", + contains_any_numbers), + A('image_urls', "URLs for one or more images", image_url), + A('description', "The full description of the product", html), + ] + ) + SAMPLE_DESCRIPTOR2 = ItemDescriptor('test', 'item test', [ A('description', 'description field without tags', notags), A('price', "Product price, including any discounts and tax or vat", @@ -1227,6 +1259,13 @@ ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3, None ), + ('correctly extract regions that follows more than one consecutive misses', [ANNOTATED_PAGE31], EXTRACT_PAGE31, SAMPLE_DESCRIPTOR1a, + { + u'price': [u'60.00'], + u'name': [u'Product name'], + u'image_urls': [['http://example.com/image.jpg']] + } + ) ] class TestIbl(TestCase):