Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions scrapely/extraction/pageparsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag):

annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
attribute_annotations = jannotation.pop('annotations', {}).items()
content_key = jannotation.pop('text-content', 'content')
for extract_attribute, tag_value in attribute_annotations:
if extract_attribute == 'content':
if extract_attribute == content_key:
annotation.surrounds_attribute = tag_value
self.unpairedtag_stack.append(annotation)
else:
Expand Down Expand Up @@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag):
self.extra_required_attrs.extend(jannotation.pop('required', []))

attribute_annotations = jannotation.pop('annotations', {}).items()
content_key = jannotation.pop('text-content', 'content')
for extract_attribute, tag_value in attribute_annotations:
if extract_attribute == 'content':
if extract_attribute == content_key:
annotation.surrounds_attribute = tag_value
else:
annotation.tag_attributes.append((extract_attribute, tag_value))
Expand Down
28 changes: 28 additions & 0 deletions scrapely/tests/test_pageparsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,18 @@
</body></html>
"""

# Fixture: a <meta> tag whose annotation maps the HTML attribute named
# "content" to the field "description", while declaring a "text-content"
# key that is not the plain string "content".
# NOTE(review): the declared key is "text-content:" (trailing colon), unlike
# LABELLED_PAGE13 which uses "text-content" - confirm this is intentional.
LABELLED_PAGE12 = u"""
<head>
<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content:&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}" />
</head>
"""

# Fixture: a <meta> tag carrying two annotations at once - the HTML attribute
# named "content" is mapped to "description", and the key declared via
# "text-content" (here the string "text-content") is mapped to "name".
# The tag also has inner text ("This is the name").
LABELLED_PAGE13 = u"""
<head>
<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;, &quot;text-content&quot;: &quot;name&quot;}}">This is the name</meta>
</head>
"""

def _parse_page(parser_class, pagetext):
htmlpage = HtmlPage(None, {}, pagetext)
parser = parser_class(TokenDict())
Expand Down Expand Up @@ -304,6 +316,22 @@ def test_variant_attribute(self):
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations
self.assertEqual(annotations[0].variant_id, 1)

def test_content_attribute(self):
    """
    An annotated HTML attribute literally named ``content`` must be kept as
    a tag attribute, not mistaken for an annotation of the tag's text.
    """
    parsed_template = _parse_page(TemplatePageParser, LABELLED_PAGE12)
    first_annotation = parsed_template.annotations[0]
    # Nothing in the fixture annotates the tag's text.
    self.assertEqual(first_annotation.surrounds_attribute, None)
    self.assertEqual(
        first_annotation.tag_attributes,
        [("content", "description")],
    )

def test_content_and_content_attribute(self):
    """
    When a single tag annotates both its ``content`` HTML attribute and its
    text, each annotation must be recorded separately and not confused.
    """
    parsed_template = _parse_page(TemplatePageParser, LABELLED_PAGE13)
    first_annotation = parsed_template.annotations[0]
    # The text annotation is recorded as the surrounded field ...
    self.assertEqual(first_annotation.surrounds_attribute, 'name')
    # ... and the attribute annotation stays in the attribute list.
    self.assertEqual(
        first_annotation.tag_attributes,
        [("content", "description")],
    )

def test_site_pages(self):
"""
Tests from real pages. More reliable and easy to build for more complicated structures
Expand Down