Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions scrapely/extraction/pageobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def __init__(self, htmlpage, regions):
def parsed_fragments(self):
return chain(*(r.parsed_fragments for r in self.regions))

@property
def text_content(self):
    """Return an iterator chaining the text_content of each region in
    self.regions, in region order."""
    # Collect every region's text content up front, then chain them.
    region_texts = [region.text_content for region in self.regions]
    return chain(*region_texts)

class Page(object):
"""Basic representation of a page. This consists of a reference to a
dictionary of tokens and an array of raw token ids
Expand Down
36 changes: 30 additions & 6 deletions scrapely/htmlpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ def fragment_data(self, data_fragment):
"""portion of the body corresponding to the HtmlDataFragment"""
return self.body[data_fragment.start:data_fragment.end]

class TextPage(HtmlPage):
    """HtmlPage variant whose body is plain text.

    The whole body is represented by one single HtmlDataFragment flagged as
    text content, so text can be handled through the same interface as an
    html page while avoiding unnecessary reparsing.
    """

    def _set_body(self, text):
        self._body = text
        # One fragment spanning the entire text; nothing is parsed.
        whole_text = HtmlDataFragment(0, len(self._body), True)
        self.parsed_body = [whole_text]

    body = property(lambda page: page._body, _set_body,
                    doc="raw text for the page")

class HtmlPageRegion(unicode):
"""A Region of an HtmlPage that has been extracted
"""
Expand All @@ -87,7 +96,11 @@ def __init__(self, htmlpage, data):
htmlpage is the original page and data is the raw html
"""
self.htmlpage = htmlpage


@property
def text_content(self):
    """Return this region itself as its text content."""
    return self

class HtmlPageParsedRegion(HtmlPageRegion):
"""A region of an HtmlPage that has been extracted

Expand All @@ -111,20 +124,31 @@ def parsed_fragments(self):
end = self.end_index + 1 if self.end_index is not None else None
return self.htmlpage.parsed_body[self.start_index:end]

@property
def text_content(self):
    """Text content of this parsed region.

    Joins, with single spaces, the body slices of every parsed fragment
    that is not an HtmlTag and is flagged ``is_text_content``, wraps the
    result in a TextPage built with this page's url, headers and encoding,
    and returns that page's ``subregion()``.
    """
    text_pieces = []
    for fragment in self.parsed_fragments:
        # Test the type first: only non-tag fragments are asked for the
        # is_text_content flag.
        if isinstance(fragment, HtmlTag):
            continue
        if fragment.is_text_content:
            text_pieces.append(
                self.htmlpage.body[fragment.start:fragment.end])
    text_all = u" ".join(text_pieces)
    text_page = TextPage(self.htmlpage.url, self.htmlpage.headers,
                         text_all, encoding=self.htmlpage.encoding)
    return text_page.subregion()


class HtmlTagType(object):
    """Integer codes naming the kinds of html tag."""
    OPEN_TAG, CLOSE_TAG, UNPAIRED_TAG = 1, 2, 3

class HtmlDataFragment(object):
__slots__ = ('start', 'end')
__slots__ = ('start', 'end', 'is_text_content')

def __init__(self, start, end):
def __init__(self, start, end, is_text_content=False):
self.start = start
self.end = end
self.is_text_content = is_text_content

def __str__(self):
return "<HtmlDataFragment [%s:%s]>" % (self.start, self.end)
return "<HtmlDataFragment [%s:%s] is_text_content: %s>" % (self.start, self.end, self.is_text_content)

def __repr__(self):
return str(self)
Expand Down Expand Up @@ -171,7 +195,7 @@ def parse_html(text):
end = match.end()

if start > prev_end:
yield HtmlDataFragment(prev_end, start)
yield HtmlDataFragment(prev_end, start, True)

if match.groups()[0] is not None: # comment
yield HtmlDataFragment(start, end)
Expand All @@ -183,7 +207,7 @@ def parse_html(text):
prev_end = end
textlen = len(text)
if prev_end < textlen:
yield HtmlDataFragment(prev_end, textlen)
yield HtmlDataFragment(prev_end, textlen, True)

def _parse_script(match):
"""parse a <script>...</script> region matched by _HTML_REGEXP"""
Expand Down
9 changes: 6 additions & 3 deletions scrapely/tests/samples/samples_htmlpage_0.json
Original file line number Diff line number Diff line change
Expand Up @@ -211,15 +211,18 @@
},
{
"start": 1073,
"end": 1074
"end": 1074,
"is_text_content": false
},
{
"start": 1074,
"end": 2052
"end": 2052,
"is_text_content": false
},
{
"start": 2052,
"end": 2053
"end": 2053,
"is_text_content": false
},
{
"attributes": {},
Expand Down
36 changes: 24 additions & 12 deletions scrapely/tests/samples/samples_htmlpage_1.json
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,8 @@
},
{
"start": 1956,
"end": 1979
"end": 1979,
"is_text_content": false
},
{
"start": 1979,
Expand Down Expand Up @@ -274,15 +275,18 @@
},
{
"start": 2282,
"end": 2283
"end": 2283,
"is_text_content": false
},
{
"start": 2283,
"end": 2437
"end": 2437,
"is_text_content": false
},
{
"start": 2437,
"end": 2438
"end": 2438,
"is_text_content": false
},
{
"attributes": {},
Expand All @@ -306,7 +310,8 @@
},
{
"start": 2482,
"end": 2702
"end": 2702,
"is_text_content": false
},
{
"attributes": {},
Expand All @@ -330,7 +335,8 @@
},
{
"start": 2743,
"end": 2851
"end": 2851,
"is_text_content": false
},
{
"attributes": {},
Expand All @@ -345,7 +351,8 @@
},
{
"start": 2861,
"end": 2882
"end": 2882,
"is_text_content": false
},
{
"start": 2882,
Expand Down Expand Up @@ -986,7 +993,8 @@
},
{
"start": 5346,
"end": 5537
"end": 5537,
"is_text_content": false
},
{
"attributes": {},
Expand Down Expand Up @@ -8851,7 +8859,8 @@
},
{
"start": 30410,
"end": 30920
"end": 30920,
"is_text_content": false
},
{
"attributes": {},
Expand Down Expand Up @@ -9551,7 +9560,8 @@
},
{
"start": 33433,
"end": 33454
"end": 33454,
"is_text_content": false
},
{
"attributes": {
Expand All @@ -9576,7 +9586,8 @@
},
{
"start": 33669,
"end": 33689
"end": 33689,
"is_text_content": false
},
{
"attributes": {},
Expand Down Expand Up @@ -21129,7 +21140,8 @@
},
{
"start": 70112,
"end": 70136
"end": 70136,
"is_text_content": false
},
{
"attributes": {},
Expand Down
36 changes: 24 additions & 12 deletions scrapely/tests/samples/samples_htmlpage_2.json
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@
},
{
"start": 2182,
"end": 2205
"end": 2205,
"is_text_content": false
},
{
"start": 2205,
Expand Down Expand Up @@ -306,15 +307,18 @@
},
{
"start": 2508,
"end": 2509
"end": 2509,
"is_text_content": false
},
{
"start": 2509,
"end": 2663
"end": 2663,
"is_text_content": false
},
{
"start": 2663,
"end": 2664
"end": 2664,
"is_text_content": false
},
{
"attributes": {},
Expand All @@ -338,7 +342,8 @@
},
{
"start": 2708,
"end": 2928
"end": 2928,
"is_text_content": false
},
{
"attributes": {},
Expand All @@ -362,7 +367,8 @@
},
{
"start": 2969,
"end": 3077
"end": 3077,
"is_text_content": false
},
{
"attributes": {},
Expand All @@ -377,7 +383,8 @@
},
{
"start": 3087,
"end": 3108
"end": 3108,
"is_text_content": false
},
{
"start": 3108,
Expand Down Expand Up @@ -993,7 +1000,8 @@
},
{
"start": 5449,
"end": 5640
"end": 5640,
"is_text_content": false
},
{
"attributes": {},
Expand Down Expand Up @@ -8828,7 +8836,8 @@
},
{
"start": 30096,
"end": 30606
"end": 30606,
"is_text_content": false
},
{
"attributes": {},
Expand Down Expand Up @@ -9501,7 +9510,8 @@
},
{
"start": 32812,
"end": 32833
"end": 32833,
"is_text_content": false
},
{
"attributes": {
Expand All @@ -9526,7 +9536,8 @@
},
{
"start": 33044,
"end": 33064
"end": 33064,
"is_text_content": false
},
{
"attributes": {},
Expand Down Expand Up @@ -20956,7 +20967,8 @@
},
{
"start": 69651,
"end": 69675
"end": 69675,
"is_text_content": false
},
{
"attributes": {},
Expand Down
38 changes: 38 additions & 0 deletions scrapely/tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,6 +915,28 @@
</table>
"""

# Template for the text_content tests: the div is annotated so that its
# content is extracted into the "phone" field.
ANNOTATED_PAGE30 = u"""
<div data-scrapy-annotate="{&quot;variant&quot;: 0, &quot;generated&quot;: false,
&quot;annotations&quot;: {&quot;content&quot;: &quot;phone&quot;}}"><span>029349293</span></div>
"""

# Digits occur only inside the style attribute, not in the visible text;
# the matching TEST_DATA entry expects nothing to be extracted.
EXTRACT_PAGE30a = u"""
<div><span style="font-size:100%">Any text</span></div>
"""

# The visible text itself is a number; the matching TEST_DATA entry expects
# it to be extracted as the phone.
EXTRACT_PAGE30b = u"""
<div><span style="font-size:100%">029847272</span></div>
"""

# Digits occur only inside an html comment; the matching TEST_DATA entry
# expects nothing to be extracted.
EXTRACT_PAGE30c = u"""
<div><span><!--item no. 100--></span></div>
"""

# Digits occur only inside a script element; the matching TEST_DATA entry
# expects nothing to be extracted.
EXTRACT_PAGE30d = u"""
<div><span><script>var myvar= 10;</script></span></div>
"""


# Descriptor with a single 'description' attribute that is passed through
# notags (per its own description text: removes tags from that attribute).
DEFAULT_DESCRIPTOR = ItemDescriptor('test',
'item test, removes tags from description attribute',
[A('description', 'description field without tags', notags)])
Expand All @@ -934,6 +956,9 @@
contains_any_numbers),
])

# Descriptor with a single 'phone' attribute whose extractor is applied to
# the region's text_content rather than to the region itself.
# NOTE(review): contains_any_numbers is defined elsewhere; presumably it
# accepts only values that contain digits -- confirm.
SAMPLE_DESCRIPTOR3 = ItemDescriptor('test',
'item test',
[A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))])

# A list of (test name, [templates], page, extractors, expected_result)
TEST_DATA = [
Expand Down Expand Up @@ -1178,6 +1203,19 @@
}

),
('avoid false positives by allowing to extract only from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30a, SAMPLE_DESCRIPTOR3,
{}
),
('only extract from text content', [ANNOTATED_PAGE30], EXTRACT_PAGE30b, SAMPLE_DESCRIPTOR3,
{u'phone': [u'029847272']}
),
('avoid false positives on comments', [ANNOTATED_PAGE30], EXTRACT_PAGE30c, SAMPLE_DESCRIPTOR3,
{}
),
('avoid false positives on scripts', [ANNOTATED_PAGE30], EXTRACT_PAGE30d, SAMPLE_DESCRIPTOR3,
{}
),

]

class TestIbl(TestCase):
Expand Down
Loading