From 10fbd1773e2892747d3dd562a80a59755a9ca669 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 25 Aug 2013 01:15:23 -0400 Subject: [PATCH 1/4] Ignoring pyc files --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc From 3c53a53c6dd5f669c0a062df24b3d246943e2234 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 25 Aug 2013 01:16:09 -0400 Subject: [PATCH 2/4] Converting to use the unittest assertions for better errors --- scrapely/tests/test_pageparsing.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py index de5d2ce..4e50fdd 100644 --- a/scrapely/tests/test_pageparsing.py +++ b/scrapely/tests/test_pageparsing.py @@ -228,11 +228,11 @@ def test_instance_parsing(self): self.assertEqual(_tags(pp, closep), ['

', '']) def _validate_annotation(self, parser, lable_region, name, start_tag, end_tag): - assert lable_region.surrounds_attribute == name + self.assertEqual(lable_region.surrounds_attribute, name) start_token = parser.token_list[lable_region.start_index] - assert parser.token_dict.token_string(start_token) == start_tag + self.assertEqual(parser.token_dict.token_string(start_token), start_tag) end_token = parser.token_list[lable_region.end_index] - assert parser.token_dict.token_string(end_token) == end_tag + self.assertEqual(parser.token_dict.token_string(end_token), end_tag) def test_template_parsing(self): lp = _parse_page(TemplatePageParser, LABELLED_PAGE1) @@ -246,16 +246,16 @@ def test_template_parsing(self): def test_extraction_page_parsing(self): epp = _parse_page(ExtractionPageParser, SIMPLE_PAGE) ep = epp.to_extraction_page() - assert len(ep.page_tokens) == 4 - assert ep.htmlpage.fragment_data(ep.htmlpage_tag(0)) == '' - assert ep.htmlpage.fragment_data(ep.htmlpage_tag(1)) == '

' + self.assertEqual(len(ep.page_tokens), 4) + self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(0)), '') + self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(1)), '

') - assert ep.htmlpage_region_inside(1, 2) == 'this is a test' - assert ep.htmlpage_region_inside(1, 3) == 'this is a test

' + self.assertEqual(ep.htmlpage_region_inside(1, 2), 'this is a test') + self.assertEqual(ep.htmlpage_region_inside(1, 3), 'this is a test

') def test_invalid_html(self): p = _parse_page(InstanceLearningParser, BROKEN_PAGE) - assert p + self.assertTrue(p) def test_ignore_region(self): """Test ignored regions""" From 3ec86eb3e70cafd954170a09c3f4d33dc0568f10 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 25 Aug 2013 01:16:48 -0400 Subject: [PATCH 3/4] Generate unittests instead of amassing them in one test --- scrapely/tests/test_extraction.py | 82 ++++++++++++------------------- 1 file changed, 32 insertions(+), 50 deletions(-) diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index 0af030e..b6f2815 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -4,8 +4,7 @@ Page parsing effectiveness is measured through the evaluation system. These tests should focus on specific bits of functionality work correctly. """ -from unittest import TestCase -import numpy +from functools import partial from scrapely.htmlpage import HtmlPage from scrapely.descriptor import (FieldDescriptor as A, @@ -1289,51 +1288,34 @@ ), ] -class TestIbl(TestCase): - - def _run_extraction(self, name, templates, page, descriptor, expected_output): - self.trace = None - template_pages = [HtmlPage(None, {}, t) for t in templates] - # extracts with trace enabled in order to generate traceback - extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True) - actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) - if actual_output is not None: - actual_output = actual_output[0] - self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace') - # extracts again with trace disabled in order to get the pure output - extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) - actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) - if actual_output is None: - if expected_output is None: - return - assert False, "failed to extract data for test '%s'" % name - else: - actual_output = actual_output[0] - expected_names = set(expected_output.keys()) - actual_names = set(actual_output.keys()) - - missing_in_output = filter(None, expected_names - actual_names) - error = "attributes '%s' were expected but were not present in test '%s'" % \ - ("', '".join(missing_in_output), name) - assert len(missing_in_output) == 0, error - - unexpected = actual_names - expected_names - error = "unexpected attributes %s in test '%s'" % \ - (', '.join(unexpected), name) - assert len(unexpected) == 0, error - - for k, v in expected_output.items(): - extracted = actual_output[k] - assert v == extracted, "in test '%s' for attribute '%s', " \ - "expected value '%s' but got '%s'" % (name, k, v, extracted) - - def test_expected_outputs(self): - try: - for data in TEST_DATA: - self._run_extraction(*data) - except AssertionError: - if self.trace: - print "Trace:" - for line in self.trace: - print "\n---\n%s" % line - raise +def _run_extraction(name, templates, page, descriptor, expected_output): + template_pages = [HtmlPage(None, {}, t) for t in templates] + + extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) + actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) + if actual_output is None: + assert not expected_output, "failed to extract data for test '%s'" % name + return + else: + actual_output = actual_output[0] + expected_names = set(expected_output.keys()) + actual_names = set(actual_output.keys()) + + missing_in_output = filter(None, expected_names - actual_names) + error = "attributes '%s' were expected but were not present in test '%s'" % \ + ("', '".join(missing_in_output), name) + assert not missing_in_output, error + + unexpected = actual_names - expected_names + error = "unexpected attributes %s in test '%s'" % \ + (', '.join(unexpected), name) + assert not unexpected, error + + for k, v in expected_output.items(): + extracted = actual_output[k] + assert v == extracted, "in test '%s' for attribute '%s', " \ + "expected value '%s' but got '%s'" % (name, k, v, extracted) + +def test_generator(): + for data in TEST_DATA: + yield partial(_run_extraction, *data) From fa5b55009b99d3d63d02a7c75a40e2b6c92dea38 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 25 Aug 2013 01:29:10 -0400 Subject: [PATCH 4/4] TestCase has some nice comparison methods that, together with nose-parameterized, make the actual test code quite clear --- scrapely/tests/test_extraction.py | 45 +++++++++---------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index b6f2815..8872497 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -4,7 +4,8 @@ Page parsing effectiveness is measured through the evaluation system. These tests should focus on specific bits of functionality work correctly. """ -from functools import partial +from unittest import TestCase +from nose_parameterized import parameterized from scrapely.htmlpage import HtmlPage from scrapely.descriptor import (FieldDescriptor as A, @@ -1288,34 +1289,14 @@ ), ] -def _run_extraction(name, templates, page, descriptor, expected_output): - template_pages = [HtmlPage(None, {}, t) for t in templates] - - extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) - actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) - if actual_output is None: - assert not expected_output, "failed to extract data for test '%s'" % name - return - else: - actual_output = actual_output[0] - expected_names = set(expected_output.keys()) - actual_names = set(actual_output.keys()) - - missing_in_output = filter(None, expected_names - actual_names) - error = "attributes '%s' were expected but were not present in test '%s'" % \ - ("', '".join(missing_in_output), name) - assert not missing_in_output, error - - unexpected = actual_names - expected_names - error = "unexpected attributes %s in test '%s'" % \ - (', '.join(unexpected), name) - assert not unexpected, error - - for k, v in expected_output.items(): - extracted = actual_output[k] - assert v == extracted, "in test '%s' for attribute '%s', " \ - "expected value '%s' but got '%s'" % (name, k, v, extracted) - -def test_generator(): - for data in TEST_DATA: - yield partial(_run_extraction, *data) + + +class TestExtraction(TestCase): + @parameterized.expand(TEST_DATA) + def test_extraction(self, name, templates, page, descriptor, expected_output): + template_pages = [HtmlPage(None, {}, t) for t in templates] + + extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) + actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) + + self.assertEqual(expected_output, actual_output and actual_output[0])