diff --git a/scrapely/extraction/pageobjects.py b/scrapely/extraction/pageobjects.py index b5f852f..87a41cc 100644 --- a/scrapely/extraction/pageobjects.py +++ b/scrapely/extraction/pageobjects.py @@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions): def parsed_fragments(self): return chain(*(r.parsed_fragments for r in self.regions)) + @property + def text_content(self): + return chain(*(r.text_content for r in self.regions)) + class Page(object): """Basic representation of a page. This consists of a reference to a dictionary of tokens and an array of raw token ids diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index b73e24b..e80b85a 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -75,6 +75,15 @@ def fragment_data(self, data_fragment): """portion of the body corresponding to the HtmlDataFragment""" return self.body[data_fragment.start:data_fragment.end] +class TextPage(HtmlPage): + """An HtmlPage with one unique HtmlDataFragment, needed to have a + convenient text with same interface as html page but avoiding unnecesary + reparsing""" + def _set_body(self, text): + self._body = text + self.parsed_body = [HtmlDataFragment(0, len(self._body), True)] + body = property(lambda x: x._body, _set_body, doc="raw text for the page") + class HtmlPageRegion(unicode): """A Region of an HtmlPage that has been extracted """ @@ -87,7 +96,11 @@ def __init__(self, htmlpage, data): htmlpage is the original page and data is the raw html """ self.htmlpage = htmlpage - + + @property + def text_content(self): + return self + class HtmlPageParsedRegion(HtmlPageRegion): """A region of an HtmlPage that has been extracted @@ -111,20 +124,31 @@ def parsed_fragments(self): end = self.end_index + 1 if self.end_index is not None else None return self.htmlpage.parsed_body[self.start_index:end] + @property + def text_content(self): + """Text content of this parsed region""" + text_all = u" ".join(self.htmlpage.body[_element.start:_element.end] \ + for _element in self.parsed_fragments if \ + not isinstance(_element, HtmlTag) and _element.is_text_content) + return TextPage(self.htmlpage.url, self.htmlpage.headers, \ + text_all, encoding=self.htmlpage.encoding).subregion() + + class HtmlTagType(object): OPEN_TAG = 1 CLOSE_TAG = 2 UNPAIRED_TAG = 3 class HtmlDataFragment(object): - __slots__ = ('start', 'end') + __slots__ = ('start', 'end', 'is_text_content') - def __init__(self, start, end): + def __init__(self, start, end, is_text_content=False): self.start = start self.end = end + self.is_text_content = is_text_content def __str__(self): - return "" % (self.start, self.end) + return "" % (self.start, self.end, self.is_text_content) def __repr__(self): return str(self) @@ -171,7 +195,7 @@ def parse_html(text): end = match.end() if start > prev_end: - yield HtmlDataFragment(prev_end, start) + yield HtmlDataFragment(prev_end, start, True) if match.groups()[0] is not None: # comment yield HtmlDataFragment(start, end) @@ -183,7 +207,7 @@ def parse_html(text): prev_end = end textlen = len(text) if prev_end < textlen: - yield HtmlDataFragment(prev_end, textlen) + yield HtmlDataFragment(prev_end, textlen, True) def _parse_script(match): """parse a region matched by _HTML_REGEXP""" diff --git a/scrapely/tests/samples/samples_htmlpage_0.json b/scrapely/tests/samples/samples_htmlpage_0.json index 08826aa..a93cb0c 100644 --- a/scrapely/tests/samples/samples_htmlpage_0.json +++ b/scrapely/tests/samples/samples_htmlpage_0.json @@ -211,15 +211,18 @@ }, { "start": 1073, - "end": 1074 + "end": 1074, + "is_text_content": false }, { "start": 1074, - "end": 2052 + "end": 2052, + "is_text_content": false }, { "start": 2052, - "end": 2053 + "end": 2053, + "is_text_content": false }, { "attributes": {}, diff --git a/scrapely/tests/samples/samples_htmlpage_1.json b/scrapely/tests/samples/samples_htmlpage_1.json index f94837c..8cc36e6 100644 --- a/scrapely/tests/samples/samples_htmlpage_1.json +++ b/scrapely/tests/samples/samples_htmlpage_1.json @@ -212,7 +212,8 @@ }, { "start": 1956, - "end": 1979 + "end": 1979, + "is_text_content": false }, { "start": 1979, @@ -274,15 +275,18 @@ }, { "start": 2282, - "end": 2283 + "end": 2283, + "is_text_content": false }, { "start": 2283, - "end": 2437 + "end": 2437, + "is_text_content": false }, { "start": 2437, - "end": 2438 + "end": 2438, + "is_text_content": false }, { "attributes": {}, @@ -306,7 +310,8 @@ }, { "start": 2482, - "end": 2702 + "end": 2702, + "is_text_content": false }, { "attributes": {}, @@ -330,7 +335,8 @@ }, { "start": 2743, - "end": 2851 + "end": 2851, + "is_text_content": false }, { "attributes": {}, @@ -345,7 +351,8 @@ }, { "start": 2861, - "end": 2882 + "end": 2882, + "is_text_content": false }, { "start": 2882, @@ -986,7 +993,8 @@ }, { "start": 5346, - "end": 5537 + "end": 5537, + "is_text_content": false }, { "attributes": {}, @@ -8851,7 +8859,8 @@ }, { "start": 30410, - "end": 30920 + "end": 30920, + "is_text_content": false }, { "attributes": {}, @@ -9551,7 +9560,8 @@ }, { "start": 33433, - "end": 33454 + "end": 33454, + "is_text_content": false }, { "attributes": { @@ -9576,7 +9586,8 @@ }, { "start": 33669, - "end": 33689 + "end": 33689, + "is_text_content": false }, { "attributes": {}, @@ -21129,7 +21140,8 @@ }, { "start": 70112, - "end": 70136 + "end": 70136, + "is_text_content": false }, { "attributes": {}, diff --git a/scrapely/tests/samples/samples_htmlpage_2.json b/scrapely/tests/samples/samples_htmlpage_2.json index c856f4a..c6f99af 100644 --- a/scrapely/tests/samples/samples_htmlpage_2.json +++ b/scrapely/tests/samples/samples_htmlpage_2.json @@ -244,7 +244,8 @@ }, { "start": 2182, - "end": 2205 + "end": 2205, + "is_text_content": false }, { "start": 2205, @@ -306,15 +307,18 @@ }, { "start": 2508, - "end": 2509 + "end": 2509, + "is_text_content": false }, { "start": 2509, - "end": 2663 + "end": 2663, + "is_text_content": false }, { "start": 2663, - "end": 2664 + "end": 2664, + "is_text_content": false }, { "attributes": {}, @@ -338,7 +342,8 @@ }, { "start": 2708, - "end": 2928 + "end": 2928, + "is_text_content": false }, { "attributes": {}, @@ -362,7 +367,8 @@ }, { "start": 2969, - "end": 3077 + "end": 3077, + "is_text_content": false }, { "attributes": {}, @@ -377,7 +383,8 @@ }, { "start": 3087, - "end": 3108 + "end": 3108, + "is_text_content": false }, { "start": 3108, @@ -993,7 +1000,8 @@ }, { "start": 5449, - "end": 5640 + "end": 5640, + "is_text_content": false }, { "attributes": {}, @@ -8828,7 +8836,8 @@ }, { "start": 30096, - "end": 30606 + "end": 30606, + "is_text_content": false }, { "attributes": {}, @@ -9501,7 +9510,8 @@ }, { "start": 32812, - "end": 32833 + "end": 32833, + "is_text_content": false }, { "attributes": { @@ -9526,7 +9536,8 @@ }, { "start": 33044, - "end": 33064 + "end": 33064, + "is_text_content": false }, { "attributes": {}, @@ -20956,7 +20967,8 @@ }, { "start": 69651, - "end": 69675 + "end": 69675, + "is_text_content": false }, { "attributes": {}, diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index 0b627eb..7af4dcf 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -915,6 +915,28 @@ """ +ANNOTATED_PAGE30 = u""" +
029349293
+""" + +EXTRACT_PAGE30a = u""" +
Any text
+""" + +EXTRACT_PAGE30b = u""" +
029847272
+""" + +EXTRACT_PAGE30c = u""" +
+""" + +EXTRACT_PAGE30d = u""" +
+""" + + DEFAULT_DESCRIPTOR = ItemDescriptor('test', 'item test, removes tags from description attribute', [A('description', 'description field without tags', notags)]) @@ -934,6 +956,9 @@ contains_any_numbers), ]) +SAMPLE_DESCRIPTOR3 = ItemDescriptor('test', + 'item test', + [A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))]) # A list of (test name, [templates], page, extractors, expected_result) TEST_DATA = [ @@ -1178,6 +1203,19 @@ } ), + ('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3, + {} + ), + ('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3, + {u'phone': [u'029847272']} + ), + ('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3, + {} + ), + ('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3, + {} + ), + ] class TestIbl(TestCase): diff --git a/scrapely/tests/test_htmlpage.py b/scrapely/tests/test_htmlpage.py index fc06aaf..1d58280 100644 --- a/scrapely/tests/test_htmlpage.py +++ b/scrapely/tests/test_htmlpage.py @@ -19,7 +19,7 @@ def _encode_element(el): return {"tag": el.tag, "attributes": el.attributes, "start": el.start, "end": el.end, "tag_type": el.tag_type} if isinstance(el, HtmlDataFragment): - return {"start": el.start, "end": el.end} + return {"start": el.start, "end": el.end, "is_text_content": el.is_text_content} raise TypeError def _decode_element(dct): @@ -30,7 +30,7 @@ def _decode_element(dct): return HtmlTag(dct["tag_type"], dct["tag"], \ dct["attributes"], dct["start"], dct["end"]) if "start" in dct: - return HtmlDataFragment(dct["start"], dct["end"]) + return HtmlDataFragment(dct["start"], dct["end"], dct.get("is_text_content", True)) return dct class TestParseHtml(TestCase): @@ -64,6 +64,12 @@ def _test_sample(self, source, expected_parsed, samplecount=None): self.assertEqual(element.tag, expected.tag) self.assertEqual(element.attributes, expected.attributes) self.assertEqual(element.tag_type, expected.tag_type) + if type(element) == HtmlDataFragment: + msg = "Got: %s Expected: %s in sample: %d [%d:%d] (%s)" % \ + (element.is_text_content, expected.is_text_content, samplecount, element.start, element.end, repr(element_text)) \ + if samplecount is not None else None + self.assertEqual(element.is_text_content, expected.is_text_content, msg) + if expected_parsed: errstring = "Expected %s" % repr(expected_parsed) if samplecount is not None: diff --git a/scrapely/tests/test_htmlpage_data.py b/scrapely/tests/test_htmlpage_data.py index f54dc9f..72491bc 100644 --- a/scrapely/tests/test_htmlpage_data.py +++ b/scrapely/tests/test_htmlpage_data.py @@ -163,11 +163,11 @@ {'attributes': {}, 'end': 34, 'start': 31, 'tag': u'p', 'tag_type': 1}, {'end': 51, 'start': 34}, {'attributes': {}, 'end': 55, 'start': 51, 'tag': u'p', 'tag_type': 2}, - {'end': 70, 'start': 55}, + {'end': 70, 'start': 55, 'is_text_content': False}, {'attributes': {u'type': u'text/javascript'}, 'end': 101, 'start': 70, 'tag': u'script', 'tag_type': 1}, - {'end': 104, 'start': 101}, - {'end': 118, 'start': 104}, - {'end': 124, 'start': 118}, + {'end': 104, 'start': 101, 'is_text_content': False}, + {'end': 118, 'start': 104, 'is_text_content': False}, + {'end': 124, 'start': 118, 'is_text_content': False}, {'attributes': {}, 'end': 133, 'start': 124, 'tag': u'script', 'tag_type': 2}, {'attributes': {}, 'end': 140, 'start': 133, 'tag': u'body', 'tag_type': 2}, {'attributes': {}, 'end': 147, 'start': 140, 'tag': u'html', 'tag_type': 2} @@ -186,7 +186,7 @@ {'attributes': {}, 'end': 33, 'start': 28, 'tag': u'h1', 'tag_type': 2}, {'end': 38, 'start': 33}, {'attributes': {u'type': u'text/javascript'}, 'end': 69, 'start': 38, 'tag': u'script', 'tag_type': 1}, - {'end': 130, 'start': 69}, + {'end': 130, 'start': 69, 'is_text_content': False}, {'attributes': {}, 'end': 139, 'start': 130, 'tag': u'script', 'tag_type': 2}, {'end': 150, 'start': 139}, {'attributes': {}, 'end': 157, 'start': 150, 'tag': u'body', 'tag_type': 2}, @@ -200,14 +200,14 @@ {'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1}, {'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1}, {'attributes': {}, 'end': 20, 'start': 12, 'tag': u'script', 'tag_type': 1}, - {'end': 25, 'start': 20}, + {'end': 25, 'start': 20, 'is_text_content': False}, {'attributes': {}, 'end': 34, 'start': 25, 'tag': u'script', 'tag_type': 2}, {'attributes': {}, 'end': 42, 'start': 34, 'tag': u'script', 'tag_type': 1}, - {'end': 45, 'start': 42}, + {'end': 45, 'start': 42, 'is_text_content': False}, {'attributes': {}, 'end': 54, 'start': 45, 'tag': u'script', 'tag_type': 2}, {'attributes': {}, 'end': 61, 'start': 54, 'tag': u'body', 'tag_type': 2}, - {'end': 76, 'start': 61}, - {'end': 91, 'start': 76}, + {'end': 76, 'start': 61, 'is_text_content': False}, + {'end': 91, 'start': 76, 'is_text_content': False}, {'attributes': {}, 'end': 98, 'start': 91, 'tag': u'html', 'tag_type': 2}, ] @@ -218,13 +218,13 @@ {'attributes': {}, 'end': 6, 'start': 0, 'tag': u'html', 'tag_type': 1}, {'attributes': {}, 'end': 12, 'start': 6, 'tag': u'body', 'tag_type': 1}, {'attributes': {}, 'end': 20, 'start': 12, 'tag': u'script', 'tag_type': 1}, - {'end': 23, 'start': 20}, - {'end': 37, 'start': 23}, - {'end': 40, 'start': 37}, + {'end': 23, 'start': 20, 'is_text_content': False}, + {'end': 37, 'start': 23, 'is_text_content': False}, + {'end': 40, 'start': 37, 'is_text_content': False}, {'attributes': {}, 'end': 49, 'start': 40, 'tag': u'script', 'tag_type': 2}, {'end': 52, 'start': 49}, {'attributes': {}, 'end': 60, 'start': 52, 'tag': u'script', 'tag_type': 1}, - {'end': 63, 'start': 60}, + {'end': 63, 'start': 60, 'is_text_content': False}, {'attributes': {}, 'end': 72, 'start': 63, 'tag': u'script', 'tag_type': 2}, {'end': 74, 'start': 72}, {'attributes': {}, 'end': 81, 'start': 74, 'tag': u'body', 'tag_type': 2}, @@ -241,9 +241,9 @@ {'end': 53, 'start': 15}, {'attributes': {}, 'end': 57, 'start': 53, 'tag': u'p', 'tag_type': 2}, {'attributes' : {}, 'end': 65, 'start': 57, 'tag': u'script', 'tag_type': 1}, - {'end': 76, 'start': 65}, + {'end': 76, 'start': 65, 'is_text_content': False}, {'attributes' : {}, 'end': 85, 'start': 76, 'tag': u'script', 'tag_type': 2}, - {'end': 99, 'start': 85}, + {'end': 99, 'start': 85, 'is_text_content': False}, ] PAGE8 = u""""""