From c8adbb38c6b0bc7ccef3e9c02cdcbfa7986c7616 Mon Sep 17 00:00:00 2001
From: Martin Olveyra <molveyra@gmail.com>
Date: Wed, 3 Oct 2012 23:44:12 -0200
Subject: [PATCH] allow annotating attributes named "content" in an
 unambiguous way and with complete backward compatibility, by letting the
 annotation specify which key is used for the tag content, falling back to the
 default "content" if the "text-content" annotation meta attribute is not present

---
 scrapely/extraction/pageparsing.py |  6 ++++--
 scrapely/tests/test_pageparsing.py | 28 ++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)
diff --git a/scrapely/extraction/pageparsing.py b/scrapely/extraction/pageparsing.py
index c7bea9e..a17c499 100644
--- a/scrapely/extraction/pageparsing.py
+++ b/scrapely/extraction/pageparsing.py
@@ -125,8 +125,9 @@ def _handle_unpaired_tag(self, html_tag):
                 
             annotation = AnnotationTag(self.next_tag_index, self.next_tag_index + 1)
             attribute_annotations = jannotation.pop('annotations', {}).items()
+            content_key = jannotation.pop('text-content', 'content')
             for extract_attribute, tag_value in attribute_annotations:
-                if extract_attribute == 'content':
+                if extract_attribute == content_key:
                     annotation.surrounds_attribute = tag_value
                     self.unpairedtag_stack.append(annotation)
                 else:
@@ -200,8 +201,9 @@ def _handle_open_tag(self, html_tag):
         self.extra_required_attrs.extend(jannotation.pop('required', []))
         
         attribute_annotations = jannotation.pop('annotations', {}).items()
+        content_key = jannotation.pop('text-content', 'content')
         for extract_attribute, tag_value in attribute_annotations:
-            if extract_attribute == 'content':
+            if extract_attribute == content_key:
                 annotation.surrounds_attribute = tag_value
             else:
                 annotation.tag_attributes.append((extract_attribute, tag_value))
diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py
index 4c93fec..cb3c83e 100644
--- a/scrapely/tests/test_pageparsing.py
+++ b/scrapely/tests/test_pageparsing.py
@@ -178,6 +178,18 @@
 </body></html>
 """
 
+LABELLED_PAGE12 = u"""
+<head>
+<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content:&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;}}" />
+</head>
+"""
+
+LABELLED_PAGE13 = u"""
+<head>
+<meta name="description" content="This is the description" data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false, &quot;text-content&quot;: &quot;text-content&quot;, &quot;annotations&quot;: {&quot;content&quot;: &quot;description&quot;, &quot;text-content&quot;: &quot;name&quot;}}">This is the name</meta>
+</head>
+"""
+
 def _parse_page(parser_class, pagetext):
     htmlpage = HtmlPage(None, {}, pagetext)
     parser = parser_class(TokenDict())
@@ -304,6 +316,22 @@ def test_variant_attribute(self):
         annotations = _parse_page(TemplatePageParser, LABELLED_PAGE11).annotations
         self.assertEqual(annotations[0].variant_id, 1)
 
+    def test_content_attribute(self):
+        """
+        Test that attribute with name content is unambiguously interpreted
+        """
+        annotations = _parse_page(TemplatePageParser, LABELLED_PAGE12).annotations
+        self.assertEqual(annotations[0].surrounds_attribute, None)
+        self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
+
+    def test_content_and_content_attribute(self):
+        """
+        Test that attribute with name content and the content itself are unambiguously interpreted
+        """
+        annotations = _parse_page(TemplatePageParser, LABELLED_PAGE13).annotations
+        self.assertEqual(annotations[0].surrounds_attribute, 'name')
+        self.assertEqual(annotations[0].tag_attributes, [("content", "description")])
+
     def test_site_pages(self):
         """
         Tests from real pages. More reliable and easy to build for more complicated structures