scrapely/extraction/pageobjects.py (20 changes: 10 additions & 10 deletions)
@@ -101,9 +101,10 @@ class Page(object):
     dictionary of tokens and an array of raw token ids
     """
 
-    __slots__ = ('token_dict', 'page_tokens')
+    __slots__ = ('token_dict', 'page_tokens', 'htmlpage')
 
-    def __init__(self, token_dict, page_tokens):
+    def __init__(self, htmlpage, token_dict, page_tokens):
+        self.htmlpage = htmlpage
         self.token_dict = token_dict
         # use a numpy array because we can index/slice easily and efficiently
         if not isinstance(page_tokens, ndarray):
@@ -113,9 +114,9 @@ def __init__(self, token_dict, page_tokens):
 class TemplatePage(Page):
     __slots__ = ('annotations', 'id', 'ignored_regions', 'extra_required_attrs')
 
-    def __init__(self, token_dict, page_tokens, annotations, template_id=None, \
-                 ignored_regions=None, extra_required=None):
-        Page.__init__(self, token_dict, page_tokens)
+    def __init__(self, htmlpage, token_dict, page_tokens, annotations, \
+                 template_id=None, ignored_regions=None, extra_required=None):
+        Page.__init__(self, htmlpage, token_dict, page_tokens)
         # ensure order is the same as start tag order in the original page
         annotations = sorted(annotations, key=lambda x: x.end_index, reverse=True)
         self.annotations = sorted(annotations, key=lambda x: x.start_index)
@@ -136,7 +137,7 @@ class ExtractionPage(Page):
     """Parsed data belonging to a web page upon which we wish to perform
     extraction.
     """
-    __slots__ = ('htmlpage', 'token_page_indexes')
+    __slots__ = ('token_page_indexes', )
 
     def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes):
         """Construct a new ExtractionPage
@@ -147,12 +148,11 @@ def __init__(self, htmlpage, token_dict, page_tokens, token_page_indexes):
         `page_tokens': array of page tokens for matching
         `token_page_indexes`: indexes of each token in the parsed htmlpage
         """
-        Page.__init__(self, token_dict, page_tokens)
-        self.htmlpage = htmlpage
+        Page.__init__(self, htmlpage, token_dict, page_tokens)
         self.token_page_indexes = token_page_indexes
 
     def htmlpage_region(self, start_token_index, end_token_index):
-        """The region in the HtmlPage corresonding to the area defined by
+        """The region in the HtmlPage corresponding to the area defined by
         the start_token_index and the end_token_index
 
         This includes the tokens at the specified indexes
@@ -162,7 +162,7 @@ def htmlpage_region(self, start_token_index, end_token_index):
         return self.htmlpage.subregion(start, end)
 
     def htmlpage_region_inside(self, start_token_index, end_token_index):
-        """The region in the HtmlPage corresonding to the area between
+        """The region in the HtmlPage corresponding to the area between
         the start_token_index and the end_token_index.
 
         This excludes the tokens at the specified indexes
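A note on the `__slots__` change above: in a slotted hierarchy each attribute should be declared in exactly one class, so once `Page` declares `'htmlpage'`, `ExtractionPage` has to stop declaring it; re-listing it would allocate a second, shadowing descriptor on every instance. A minimal runnable sketch of that rule, using placeholder classes rather than the real scrapely ones:

```python
class Base(object):
    __slots__ = ('htmlpage',)            # declared once, in the base class


class Child(Base):
    # list only the attributes new to the subclass; repeating 'htmlpage'
    # here would create a redundant descriptor that shadows the base slot
    __slots__ = ('token_page_indexes',)


c = Child()
c.htmlpage = '<html>...</html>'     # uses the slot inherited from Base
c.token_page_indexes = [0, 5, 9]    # uses the slot declared on Child
```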
scrapely/extraction/pageparsing.py (2 changes: 1 addition & 1 deletion)
@@ -314,7 +314,7 @@ def _process_text(self, text):
 
     def to_template(self):
         """create a TemplatePage from the data fed to this parser"""
-        return TemplatePage(self.token_dict, self.token_list, self.annotations,
+        return TemplatePage(self.html_page, self.token_dict, self.token_list, self.annotations,
             self.html_page.page_id, self.ignored_regions, self.extra_required_attrs)
 
 class ExtractionPageParser(InstanceLearningParser):
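For callers, the visible effect of both files' changes is the extra leading argument: every page object is now constructed with its source `HtmlPage` first, and that object is afterwards reachable as `.htmlpage` via the base class. A sketch of the new calling convention, using simplified stand-ins (not the real scrapely classes, whose constructors take more arguments):

```python
class Page(object):
    __slots__ = ('token_dict', 'page_tokens', 'htmlpage')

    def __init__(self, htmlpage, token_dict, page_tokens):
        self.htmlpage = htmlpage        # now stored on the base class
        self.token_dict = token_dict
        self.page_tokens = page_tokens


class TemplatePage(Page):
    __slots__ = ('annotations',)

    def __init__(self, htmlpage, token_dict, page_tokens, annotations):
        Page.__init__(self, htmlpage, token_dict, page_tokens)
        self.annotations = annotations


html_page = '<html>...</html>'          # stands in for an HtmlPage instance
template = TemplatePage(html_page, {}, [], [])
assert template.htmlpage is html_page   # reachable on any Page subclass
```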