scrapy · shaneaevans · Feb 16, 2012 · Feb 15, 2012
diff --git a/scrapely/extraction/pageobjects.py b/scrapely/extraction/pageobjects.py
@@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions):
     def parsed_fragments(self):
         return chain(*(r.parsed_fragments for r in self.regions))
 
+    @property
+    def text_content(self):  # chains the text_content of every region in self.regions
+        return chain(*(r.text_content for r in self.regions))  # NOTE(review): a unicode text_content is iterated per character - confirm intended
+
 class Page(object):
     """Basic representation of a page. This consists of a reference to a
     dictionary of tokens and an array of raw token ids

diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py
@@ -75,6 +75,15 @@ def fragment_data(self, data_fragment):
         """portion of the body corresponding to the HtmlDataFragment"""
         return self.body[data_fragment.start:data_fragment.end]
 
+class TextPage(HtmlPage):
+    """An HtmlPage whose parsed body is a single text-content
+    HtmlDataFragment spanning the whole text: it offers plain text through the
+    same interface as an html page while avoiding unnecessary reparsing"""
+    def _set_body(self, text): 
+        self._body = text
+        self.parsed_body = [HtmlDataFragment(0, len(self._body), True)]
+    body = property(lambda x: x._body, _set_body, doc="raw text for the page")
+
 class HtmlPageRegion(unicode):
     """A Region of an HtmlPage that has been extracted
     """
@@ -87,7 +96,11 @@ def __init__(self, htmlpage, data):
         htmlpage is the original page and data is the raw html
         """
         self.htmlpage = htmlpage
-
+
+    @property
+    def text_content(self):  # the region itself is returned unchanged as its own text content
+        return self
+
 class HtmlPageParsedRegion(HtmlPageRegion):
     """A region of an HtmlPage that has been extracted
 
@@ -111,20 +124,31 @@ def parsed_fragments(self):
         end = self.end_index + 1 if self.end_index is not None else None
         return self.htmlpage.parsed_body[self.start_index:end]
 
+    @property
+    def text_content(self):
+        """Space-joined text of this region's text-content fragments (tags excluded), as a subregion of a new TextPage"""
+        text_all = u" ".join(self.htmlpage.body[_element.start:_element.end] \
+                for _element in self.parsed_fragments if \
+                not isinstance(_element, HtmlTag) and _element.is_text_content)
+        return TextPage(self.htmlpage.url, self.htmlpage.headers, \
+                text_all, encoding=self.htmlpage.encoding).subregion()
+
+
 class HtmlTagType(object):
     OPEN_TAG = 1
     CLOSE_TAG = 2 
     UNPAIRED_TAG = 3
 
 class HtmlDataFragment(object):
-    __slots__ = ('start', 'end')
+    __slots__ = ('start', 'end', 'is_text_content')
 
-    def __init__(self, start, end):
+    def __init__(self, start, end, is_text_content=False):
         self.start = start
         self.end = end
+        self.is_text_content = is_text_content  # parse_html passes True for data between markup matches; comment fragments keep False
 
     def __str__(self):
-        return "<HtmlDataFragment [%s:%s]>" % (self.start, self.end)
+        return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (self.start, self.end, self.is_text_content)
 
     def __repr__(self):
         return str(self)
@@ -171,7 +195,7 @@ def parse_html(text):
         end = match.end()
 
         if start > prev_end:
-            yield HtmlDataFragment(prev_end, start)
+            yield HtmlDataFragment(prev_end, start, True)
 
         if match.groups()[0] is not None: # comment
             yield HtmlDataFragment(start, end)
@@ -183,7 +207,7 @@ def parse_html(text):
         prev_end = end
     textlen = len(text)
     if prev_end < textlen:
-        yield HtmlDataFragment(prev_end, textlen)
+        yield HtmlDataFragment(prev_end, textlen, True)
 
 def _parse_script(match):
     """parse a <script>...</script> region matched by _HTML_REGEXP"""

diff --git a/scrapely/tests/samples/samples_htmlpage_0.json b/scrapely/tests/samples/samples_htmlpage_0.json
@@ -211,15 +211,18 @@
         },
         {       
                 "start": 1073,
-                "end": 1074
+                "end": 1074,
+                "is_text_content": false
         },
         {
                 "start": 1074, 
-                "end": 2052
+                "end": 2052,
+                "is_text_content": false
         },
         {
                 "start": 2052,
-                "end": 2053
+                "end": 2053,
+                "is_text_content": false
         },
         {
                 "attributes": {}, 

diff --git a/scrapely/tests/samples/samples_htmlpage_1.json b/scrapely/tests/samples/samples_htmlpage_1.json
@@ -212,7 +212,8 @@
         },
         {
                 "start": 1956,
-                "end": 1979
+                "end": 1979,
+                "is_text_content": false
         },
         {
                 "start": 1979,
@@ -274,15 +275,18 @@
         }, 
         {
                 "start": 2282, 
-                "end": 2283
+                "end": 2283,
+                "is_text_content": false
         },
         {
                 "start": 2283,
-                "end": 2437
+                "end": 2437,
+                "is_text_content": false
         },
         {
                 "start": 2437,
-                "end": 2438
+                "end": 2438,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -306,7 +310,8 @@
         }, 
         {
                 "start": 2482, 
-                "end": 2702
+                "end": 2702,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -330,7 +335,8 @@
         }, 
         {
                 "start": 2743, 
-                "end": 2851
+                "end": 2851,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -345,7 +351,8 @@
         },
         {
                 "start": 2861,
-                "end": 2882
+                "end": 2882,
+                "is_text_content": false
         },
         {
                 "start": 2882,
@@ -986,7 +993,8 @@
         }, 
         {
                 "start": 5346, 
-                "end": 5537
+                "end": 5537,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -8851,7 +8859,8 @@
         }, 
         {
                 "start": 30410, 
-                "end": 30920
+                "end": 30920,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -9551,7 +9560,8 @@
         }, 
         {
                 "start": 33433, 
-                "end": 33454
+                "end": 33454,
+                "is_text_content": false
         }, 
         {
                 "attributes": {
@@ -9576,7 +9586,8 @@
         }, 
         {
                 "start": 33669, 
-                "end": 33689
+                "end": 33689,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -21129,7 +21140,8 @@
         }, 
         {
                 "start": 70112, 
-                "end": 70136
+                "end": 70136,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 

diff --git a/scrapely/tests/samples/samples_htmlpage_2.json b/scrapely/tests/samples/samples_htmlpage_2.json
@@ -244,7 +244,8 @@
         },
         {
                 "start": 2182,
-                "end": 2205
+                "end": 2205,
+                "is_text_content": false
         },
         {
                 "start": 2205,
@@ -306,15 +307,18 @@
         }, 
         {
                 "start": 2508, 
-                "end": 2509
+                "end": 2509,
+                "is_text_content": false
         },
         {
                 "start": 2509,
-                "end": 2663
+                "end": 2663,
+                "is_text_content": false
         },
         {
                 "start": 2663,
-                "end": 2664
+                "end": 2664,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -338,7 +342,8 @@
         }, 
         {
                 "start": 2708, 
-                "end": 2928
+                "end": 2928,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -362,7 +367,8 @@
         }, 
         {
                 "start": 2969, 
-                "end": 3077
+                "end": 3077,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -377,7 +383,8 @@
         },
         {
                 "start": 3087,
-                "end": 3108
+                "end": 3108,
+                "is_text_content": false
         },
         {
                 "start": 3108,
@@ -993,7 +1000,8 @@
         }, 
         {
                 "start": 5449, 
-                "end": 5640
+                "end": 5640,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -8828,7 +8836,8 @@
         }, 
         {
                 "start": 30096, 
-                "end": 30606
+                "end": 30606,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -9501,7 +9510,8 @@
         }, 
         {
                 "start": 32812, 
-                "end": 32833
+                "end": 32833,
+                "is_text_content": false
         }, 
         {
                 "attributes": {
@@ -9526,7 +9536,8 @@
         }, 
         {
                 "start": 33044, 
-                "end": 33064
+                "end": 33064,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 
@@ -20956,7 +20967,8 @@
         }, 
         {
                 "start": 69651, 
-                "end": 69675
+                "end": 69675,
+                "is_text_content": false
         }, 
         {
                 "attributes": {}, 

diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
@@ -915,6 +915,28 @@
 </table>
 """
 
+ANNOTATED_PAGE30 = u"""
+<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
+ &quot;annotations&quot;: {&quot;content&quot;: &quot;phone&quot;}}"><span>029349293</span></div>
+"""
+
+EXTRACT_PAGE30a = u"""
+<div><span style="font-size:100%">Any text</span></div>
+"""
+
+EXTRACT_PAGE30b = u"""
+<div><span style="font-size:100%">029847272</span></div>
+"""
+
+EXTRACT_PAGE30c = u"""
+<div><span><!--item no. 100--></span></div>
+"""
+
+EXTRACT_PAGE30d = u"""
+<div><span><script>var myvar= 10;</script></span></div>
+"""
+
+
 DEFAULT_DESCRIPTOR = ItemDescriptor('test', 
         'item test, removes tags from description attribute',
         [A('description', 'description field without tags', notags)])
@@ -934,6 +956,9 @@
                 contains_any_numbers),
     ])
 
+SAMPLE_DESCRIPTOR3 = ItemDescriptor('test', 
+        'item test',
+        [A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])
 
 # A list of (test name, [templates], page, extractors, expected_result)
 TEST_DATA = [
@@ -1178,6 +1203,19 @@
             }
 
     ),
+    ('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
+        {}
+    ),
+    ('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
+        {u'phone': [u'029847272']}
+    ),
+    ('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
+        {}
+    ),
+    ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
+        {}
+    ),
+
 ]
 
 class TestIbl(TestCase):