Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions scrapely/extraction/regionextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,9 @@ def _extract_both(self, page, start_index, end_index, ignored_regions=None, **kw
self._extract_attribute(page, start_index, end_index, ignored_regions)

def _extract_content(self, extraction_page, start_index, end_index, ignored_regions=None, **kwargs):
# extract content between annotation indexes
if not ignored_regions:
region = extraction_page.htmlpage_region_inside(start_index, end_index)
else:
# assumes ignored_regions are completely contained within start and end index
assert (start_index <= ignored_regions[0].start_index and
end_index >= ignored_regions[-1].end_index)
"""extract content between annotation indexes"""
if ignored_regions and (start_index <= ignored_regions[0].start_index and
end_index >= ignored_regions[-1].end_index):
starts = [start_index] + [i.end_index for i in ignored_regions if i.end_index is not None]
ends = [i.start_index for i in ignored_regions]
if starts[-1] is not None:
Expand All @@ -123,6 +119,8 @@ def _extract_content(self, extraction_page, start_index, end_index, ignored_regi
included_regions.next()
regions = starmap(extraction_page.htmlpage_region_inside, included_regions)
region = FragmentedHtmlPageRegion(extraction_page.htmlpage, list(regions))
else:
region = extraction_page.htmlpage_region_inside(start_index, end_index)
validated = self.content_validate(region)
return [(self.annotation.surrounds_attribute, validated)] if validated else []

Expand Down
23 changes: 22 additions & 1 deletion scrapely/tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,24 @@
</body></html>
"""

# Template for the "single ignored region inside a repeated structure" case:
# two <li> elements carry the same "features" content annotation (variant 0),
# but only the first one contains a data-scrapy-ignore span.
ANNOTATED_PAGE32 = u"""
<ul>
<li data-scrapy-annotate="{&quot;variant&quot;: 0,
&quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature1<span data-scrapy-ignore="true"> ignore this</span></li>
<li data-scrapy-annotate="{&quot;variant&quot;: 0,
&quot;annotations&quot;: {&quot;content&quot;: &quot;features&quot;}}">feature2</li>
</ul>
"""

# Page extracted with the template above: three <li> elements, of which only
# the first contains a <span>. The test case expects the features
# feature1, feature2 and feature3.
EXTRACT_PAGE32 = u"""
<ul>
<li>feature1<span> ignore this</span></li>
<li>feature2</li>
<li>feature3</li>
</ul>
"""

DEFAULT_DESCRIPTOR = ItemDescriptor('test',
'item test, removes tags from description attribute',
[A('description', 'description field without tags', notags)])
Expand Down Expand Up @@ -1265,7 +1283,10 @@
u'name': [u'Product name'],
u'image_urls': [['http://example.com/image.jpg']]
}
)
),
('single ignored region inside a repeated structure', [ANNOTATED_PAGE32], EXTRACT_PAGE32, DEFAULT_DESCRIPTOR,
{'features': [u'feature1', u'feature2', u'feature3']}
),
]

class TestIbl(TestCase):
Expand Down