From c53e507024abd3ccd5dd266eb6f6aa542b6a1f7a Mon Sep 17 00:00:00 2001
From: Martin Olveyra
Date: Wed, 22 Aug 2012 20:37:31 +0000
Subject: [PATCH] avoid exception when instantiating htmlregion if parent htmlpage has empty body. Added test.

---
 scrapely/htmlpage.py            | 8 +++++---
 scrapely/tests/test_htmlpage.py | 5 ++++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py
index b79788c..258e3cc 100644
--- a/scrapely/htmlpage.py
+++ b/scrapely/htmlpage.py
@@ -133,9 +133,11 @@ class HtmlPageParsedRegion(HtmlPageRegion):
     fragments contained within this region
     """
     def __new__(cls, htmlpage, start_index, end_index):
-        text_start = htmlpage.parsed_body[start_index].start
-        text_end = htmlpage.parsed_body[end_index or -1].end
-        text = htmlpage.body[text_start:text_end]
+        text = htmlpage.body
+        if text:
+            text_start = htmlpage.parsed_body[start_index].start
+            text_end = htmlpage.parsed_body[end_index or -1].end
+            text = htmlpage.body[text_start:text_end]
         return HtmlPageRegion.__new__(cls, htmlpage, text)
 
     def __init__(self, htmlpage, start_index, end_index):
diff --git a/scrapely/tests/test_htmlpage.py b/scrapely/tests/test_htmlpage.py
index c59ed7f..5d83dc1 100644
--- a/scrapely/tests/test_htmlpage.py
+++ b/scrapely/tests/test_htmlpage.py
@@ -5,7 +5,7 @@ from unittest import TestCase
 
 
 from scrapely.tests import iter_samples
-from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment
+from scrapely.htmlpage import parse_html, HtmlTag, HtmlDataFragment, HtmlPage
 from scrapely.tests.test_htmlpage_data import *
 
 def _encode_element(el):
@@ -135,3 +135,6 @@ def test_malformed2(self):
         parsed = [_decode_element(d) for d in PARSED9]
         self._test_sample(PAGE9, parsed)
 
+    def test_empty_subregion(self):
+        htmlpage = HtmlPage(body=u"")
+        self.assertEqual(htmlpage.subregion(), u"")