Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
53 changes: 8 additions & 45 deletions scrapely/tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
tests should focus on specific bits of functionality work correctly.
"""
from unittest import TestCase
import numpy
from nose_parameterized import parameterized
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: this adds an external dependency (nose-parameterized), which you may want to avoid.

Since nose doesn't support generating tests with yield inside a TestCase class, I used nose-parameterized to do the test generation for me. The previous revision, by contrast, keeps the original assert style but moves the tests to module level so that the test generation works.


from scrapely.htmlpage import HtmlPage
from scrapely.descriptor import (FieldDescriptor as A,
Expand Down Expand Up @@ -1289,51 +1289,14 @@
),
]

class TestIbl(TestCase):

def _run_extraction(self, name, templates, page, descriptor, expected_output):
self.trace = None

class TestExtraction(TestCase):
@parameterized.expand(TEST_DATA)
def test_extraction(self, name, templates, page, descriptor, expected_output):
template_pages = [HtmlPage(None, {}, t) for t in templates]
# extracts with trace enabled in order to generate traceback
extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
if actual_output is not None:
actual_output = actual_output[0]
self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
# extracts again with trace disabled in order to get the pure output

extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
if actual_output is None:
if expected_output is None:
return
assert False, "failed to extract data for test '%s'" % name
else:
actual_output = actual_output[0]
expected_names = set(expected_output.keys())
actual_names = set(actual_output.keys())

missing_in_output = filter(None, expected_names - actual_names)
error = "attributes '%s' were expected but were not present in test '%s'" % \
("', '".join(missing_in_output), name)
assert len(missing_in_output) == 0, error

unexpected = actual_names - expected_names
error = "unexpected attributes %s in test '%s'" % \
(', '.join(unexpected), name)
assert len(unexpected) == 0, error

for k, v in expected_output.items():
extracted = actual_output[k]
assert v == extracted, "in test '%s' for attribute '%s', " \
"expected value '%s' but got '%s'" % (name, k, v, extracted)

def test_expected_outputs(self):
try:
for data in TEST_DATA:
self._run_extraction(*data)
except AssertionError:
if self.trace:
print "Trace:"
for line in self.trace:
print "\n---\n%s" % line
raise

self.assertEqual(expected_output, actual_output and actual_output[0])
18 changes: 9 additions & 9 deletions scrapely/tests/test_pageparsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,11 @@ def test_instance_parsing(self):
self.assertEqual(_tags(pp, closep), ['</p>', '</html>'])

def _validate_annotation(self, parser, lable_region, name, start_tag, end_tag):
assert lable_region.surrounds_attribute == name
self.assertEqual(lable_region.surrounds_attribute, name)
start_token = parser.token_list[lable_region.start_index]
assert parser.token_dict.token_string(start_token) == start_tag
self.assertEqual(parser.token_dict.token_string(start_token), start_tag)
end_token = parser.token_list[lable_region.end_index]
assert parser.token_dict.token_string(end_token) == end_tag
self.assertEqual(parser.token_dict.token_string(end_token), end_tag)

def test_template_parsing(self):
lp = _parse_page(TemplatePageParser, LABELLED_PAGE1)
Expand All @@ -246,16 +246,16 @@ def test_template_parsing(self):
def test_extraction_page_parsing(self):
epp = _parse_page(ExtractionPageParser, SIMPLE_PAGE)
ep = epp.to_extraction_page()
assert len(ep.page_tokens) == 4
assert ep.htmlpage.fragment_data(ep.htmlpage_tag(0)) == '<html>'
assert ep.htmlpage.fragment_data(ep.htmlpage_tag(1)) == '<p some-attr="foo">'
self.assertEqual(len(ep.page_tokens), 4)
self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(0)), '<html>')
self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(1)), '<p some-attr="foo">')

assert ep.htmlpage_region_inside(1, 2) == 'this is a test'
assert ep.htmlpage_region_inside(1, 3) == 'this is a test</p> '
self.assertEqual(ep.htmlpage_region_inside(1, 2), 'this is a test')
self.assertEqual(ep.htmlpage_region_inside(1, 3), 'this is a test</p> ')

def test_invalid_html(self):
p = _parse_page(InstanceLearningParser, BROKEN_PAGE)
assert p
self.assertTrue(p)

def test_ignore_region(self):
"""Test ignored regions"""
Expand Down