diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py index 0af030e..8872497 100644 --- a/scrapely/tests/test_extraction.py +++ b/scrapely/tests/test_extraction.py @@ -5,7 +5,7 @@ tests should focus on specific bits of functionality work correctly. """ from unittest import TestCase -import numpy +from nose_parameterized import parameterized from scrapely.htmlpage import HtmlPage from scrapely.descriptor import (FieldDescriptor as A, @@ -1289,51 +1289,14 @@ ), ] -class TestIbl(TestCase): - def _run_extraction(self, name, templates, page, descriptor, expected_output): - self.trace = None + +class TestExtraction(TestCase): + @parameterized.expand(TEST_DATA) + def test_extraction(self, name, templates, page, descriptor, expected_output): template_pages = [HtmlPage(None, {}, t) for t in templates] - # extracts with trace enabled in order to generate traceback - extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True) - actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) - if actual_output is not None: - actual_output = actual_output[0] - self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace') - # extracts again with trace disabled in order to get the pure output + extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages]) actual_output, _ = extractor.extract(HtmlPage(None, {}, page)) - if actual_output is None: - if expected_output is None: - return - assert False, "failed to extract data for test '%s'" % name - else: - actual_output = actual_output[0] - expected_names = set(expected_output.keys()) - actual_names = set(actual_output.keys()) - - missing_in_output = filter(None, expected_names - actual_names) - error = "attributes '%s' were expected but were not present in test '%s'" % \ - ("', '".join(missing_in_output), name) - assert len(missing_in_output) == 0, error - - unexpected = actual_names - expected_names - error = "unexpected attributes %s in test '%s'" % \ - (', '.join(unexpected), name) - assert len(unexpected) == 0, error - - for k, v in expected_output.items(): - extracted = actual_output[k] - assert v == extracted, "in test '%s' for attribute '%s', " \ - "expected value '%s' but got '%s'" % (name, k, v, extracted) - - def test_expected_outputs(self): - try: - for data in TEST_DATA: - self._run_extraction(*data) - except AssertionError: - if self.trace: - print "Trace:" - for line in self.trace: - print "\n---\n%s" % line - raise + + self.assertEqual(expected_output, actual_output and actual_output[0]) diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py index de5d2ce..4e50fdd 100644 --- a/scrapely/tests/test_pageparsing.py +++ b/scrapely/tests/test_pageparsing.py @@ -228,11 +228,11 @@ def test_instance_parsing(self): self.assertEqual(_tags(pp, closep), ['

', '']) def _validate_annotation(self, parser, lable_region, name, start_tag, end_tag): - assert lable_region.surrounds_attribute == name + self.assertEqual(lable_region.surrounds_attribute, name) start_token = parser.token_list[lable_region.start_index] - assert parser.token_dict.token_string(start_token) == start_tag + self.assertEqual(parser.token_dict.token_string(start_token), start_tag) end_token = parser.token_list[lable_region.end_index] - assert parser.token_dict.token_string(end_token) == end_tag + self.assertEqual(parser.token_dict.token_string(end_token), end_tag) def test_template_parsing(self): lp = _parse_page(TemplatePageParser, LABELLED_PAGE1) @@ -246,16 +246,16 @@ def test_template_parsing(self): def test_extraction_page_parsing(self): epp = _parse_page(ExtractionPageParser, SIMPLE_PAGE) ep = epp.to_extraction_page() - assert len(ep.page_tokens) == 4 - assert ep.htmlpage.fragment_data(ep.htmlpage_tag(0)) == '' - assert ep.htmlpage.fragment_data(ep.htmlpage_tag(1)) == '

' + self.assertEqual(len(ep.page_tokens), 4) + self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(0)), '') + self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(1)), '

') - assert ep.htmlpage_region_inside(1, 2) == 'this is a test' - assert ep.htmlpage_region_inside(1, 3) == 'this is a test

' + self.assertEqual(ep.htmlpage_region_inside(1, 2), 'this is a test') + self.assertEqual(ep.htmlpage_region_inside(1, 3), 'this is a test

') def test_invalid_html(self): p = _parse_page(InstanceLearningParser, BROKEN_PAGE) - assert p + self.assertTrue(p) def test_ignore_region(self): """Test ignored regions"""