Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scrapely/extraction/regionextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ def _doextract(self, page, region_elements, start_index, end_index, nested_regio
end_index, _, following_data = self._doextract(page, following_regions, start_index, end_index, **kwargs)
if end_index is not None:
pindex, sindex, extracted_data = self._doextract(page, [first_region], start_index, end_index - 1, nested_regions, ignored_regions, **kwargs)
extracted_data += following_data
extracted_data += following_data
elif nested_regions:
_, _, nested_data = self._doextract(page, nested_regions, start_index, end_index, **kwargs)
extracted_data += nested_data
Expand Down
39 changes: 39 additions & 0 deletions scrapely/tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,29 @@
<div><span><script>var myvar= 10;</script></span></div>
"""

# Annotated template for the "more than one consecutive misses" test case.
# Five elements carry a data-scrapy-annotate attribute, in document order:
# name (span), price (p), description (span), features (span) and
# image_urls (the img's src attribute). All but the name sit inside a nested
# <div>, which is closed after an empty, un-annotated <table>.
ANNOTATED_PAGE31 = u"""
<html><body>
<div>
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">Product name</span>
<div><p data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;price&quot;}}">60.00</p>
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}">description</span>
<span data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">features</span>
<img data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;src&quot;: &quot;image_urls&quot;}}" src="image.jpg" />
<table></table>
</div></div>
</body></html>
"""

# Page to extract from using the ANNOTATED_PAGE31 template. It has the same
# layout but contains no description or features <span>: the price <p> is
# followed directly by the <img>, so the two annotated regions between them
# have nothing to match. The image src here is an absolute URL.
EXTRACT_PAGE31 = u"""
<html><body>
<div>
<span>Product name</span>
<div><p>60.00</p>
<img src="http://example.com/image.jpg" />
<table></table>
</div></div>
</body></html>
"""

DEFAULT_DESCRIPTOR = ItemDescriptor('test',
'item test, removes tags from description attribute',
Expand All @@ -950,6 +973,15 @@
]
)

# Item descriptor declaring four attributes: name, price, image_urls and
# description. Note that it declares no 'features' attribute.
SAMPLE_DESCRIPTOR1a = ItemDescriptor(
    'test',
    'product test',
    [
        A('name', "Product name"),
        A(
            'price',
            "Product price, including any discounts and tax or vat",
            contains_any_numbers,
        ),
        A('image_urls', "URLs for one or more images", image_url),
        A('description', "The full description of the product", html),
    ],
)

SAMPLE_DESCRIPTOR2 = ItemDescriptor('test', 'item test', [
A('description', 'description field without tags', notags),
A('price', "Product price, including any discounts and tax or vat",
Expand Down Expand Up @@ -1227,6 +1259,13 @@
('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
None
),
('correctly extract regions that follows more than one consecutive misses', [ANNOTATED_PAGE31], EXTRACT_PAGE31, SAMPLE_DESCRIPTOR1a,
{
u'price': [u'60.00'],
u'name': [u'Product name'],
u'image_urls': [['http://example.com/image.jpg']]
}
)
]

class TestIbl(TestCase):
Expand Down