Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge pull request #29 from kalessin/contentattr

allow annotating attributes named "content"
  • Loading branch information...
commit d292a6e09c34fd85366da0859cf87d8e79db067a 2 parents 4bbc3c7 + c8adbb3
@shaneaevans shaneaevans authored
View
6 scrapely/extraction/pageparsing.py
@@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag):
annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
attribute_annotations = jannotation.pop('annotations', {}).items()
+ content_key = jannotation.pop('text-content', 'content')
for extract_attribute, tag_value in attribute_annotations:
- if extract_attribute == 'content':
+ if extract_attribute == content_key:
annotation.surrounds_attribute = tag_value
self.unpairedtag_stack.append(annotation)
else:
@@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag):
self.extra_required_attrs.extend(jannotation.pop('required', []))
attribute_annotations = jannotation.pop('annotations', {}).items()
+ content_key = jannotation.pop('text-content', 'content')
for extract_attribute, tag_value in attribute_annotations:
- if extract_attribute == 'content':
+ if extract_attribute == content_key:
annotation.surrounds_attribute = tag_value
else:
annotation.tag_attributes.append((extract_attribute, tag_value))
View
28 scrapely/tests/test_pageparsing.py
@@ -178,6 +178,18 @@
</body></html>
"""
+LABELLED_PAGE12 = u"""
+<head>
+<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content:&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}" />
+</head>
+"""
+
+LABELLED_PAGE13 = u"""
+<head>
+<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;, &quot;text-content&quot;: &quot;name&quot;}}">This is the name</meta>
+</head>
+"""
+
def _parse_page(parser_class, pagetext):
htmlpage = HtmlPage(None, {}, pagetext)
parser = parser_class(TokenDict())
@@ -304,6 +316,22 @@ def test_variant_attribute(self):
annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations
self.assertEqual(annotations[0].variant_id, 1)
+ def test_content_attribute(self):
+ """
+ Test that an attribute named "content" is interpreted unambiguously
+ """
+ annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations
+ self.assertEqual(annotations[0].surrounds_attribute, None)
+ self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
+
+ def test_content_and_content_attribute(self):
+ """
+ Test that an attribute named "content" and the tag's text content are both interpreted unambiguously
+ """
+ annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations
+ self.assertEqual(annotations[0].surrounds_attribute, 'name')
+ self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
+
def test_site_pages(self):
"""
Tests from real pages. More reliable and easy to build for more complicated structures
Please sign in to comment.
Something went wrong with that request. Please try again.