From 0c3eca9b56131307708704c2c25f7b8a4d823fbc Mon Sep 17 00:00:00 2001 From: Martin Olveyra Date: Thu, 26 Apr 2012 16:13:59 +0000 Subject: [PATCH] fix ignore region extraction when it is inside an annotation which uses text content extraction, and added test --- scrapely/extraction/pageobjects.py | 10 +++------- scrapely/tests/test_extraction.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/scrapely/extraction/pageobjects.py b/scrapely/extraction/pageobjects.py index 87a41cc..5c144e7 100644 --- a/scrapely/extraction/pageobjects.py +++ b/scrapely/extraction/pageobjects.py @@ -7,7 +7,7 @@ from itertools import chain from numpy import array, ndarray -from scrapely.htmlpage import HtmlTagType, HtmlPageRegion +from scrapely.htmlpage import HtmlTagType, HtmlPageRegion, HtmlPageParsedRegion class TokenType(HtmlTagType): """constants for token types""" @@ -82,7 +82,7 @@ def __str__(self): def __repr__(self): return str(self) -class FragmentedHtmlPageRegion(HtmlPageRegion): +class FragmentedHtmlPageRegion(HtmlPageParsedRegion, HtmlPageRegion): """An HtmlPageRegion consisting of possibly non-contiguous sub-regions""" def __new__(cls, htmlpage, regions): text = u''.join(regions) @@ -95,11 +95,7 @@ def __init__(self, htmlpage, regions): @property def parsed_fragments(self): return chain(*(r.parsed_fragments for r in self.regions)) - - @property - def text_content(self): - return chain(*(r.text_content for r in self.regions)) - + class Page(object): """Basic representation of a page. This consists of a reference to a dictionary of tokens and an array of raw token ids diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index de4d9af..b5d4600 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -960,6 +960,10 @@ 'item test', [A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))]) +SAMPLE_DESCRIPTOR4 = ItemDescriptor('test', + 'item test, removes tags from description attribute', + [A('description', 'description field without tags', lambda x: x.text_content)]) + # A list of (test name, [templates], page, extractors, expected_result) TEST_DATA = [ # extract from a similar page @@ -1052,6 +1056,14 @@ 'price': [u'\n12.00\n(VAT exc.)'], } ), + # ignored regions and text content extraction + ( + 'ignored_regions', [ANNOTATED_PAGE8], EXTRACT_PAGE8, SAMPLE_DESCRIPTOR4, + { + 'description': [u'\n A very nice product for all intelligent people \n \n'], + 'price': [u'\n12.00\n(VAT exc.)'], + } + ), # shifted ignored regions (detected by region similarity) ( 'shifted_ignored_regions', [ANNOTATED_PAGE9], EXTRACT_PAGE9, DEFAULT_DESCRIPTOR,