From 10fbd1773e2892747d3dd562a80a59755a9ca669 Mon Sep 17 00:00:00 2001
From: Alex Riina <alex.riina@gmail.com>
Date: Sun, 25 Aug 2013 01:15:23 -0400
Subject: [PATCH 1/4] Ignoring pyc files

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc

From 3c53a53c6dd5f669c0a062df24b3d246943e2234 Mon Sep 17 00:00:00 2001
From: Alex Riina <alex.riina@gmail.com>
Date: Sun, 25 Aug 2013 01:16:09 -0400
Subject: [PATCH 2/4] Converting to use the unittest assertions for better
 errors

---
 scrapely/tests/test_pageparsing.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scrapely/tests/test_pageparsing.py b/scrapely/tests/test_pageparsing.py
index de5d2ce..4e50fdd 100644
--- a/scrapely/tests/test_pageparsing.py
+++ b/scrapely/tests/test_pageparsing.py
@@ -228,11 +228,11 @@ def test_instance_parsing(self):
         self.assertEqual(_tags(pp, closep), ['</p>', '</html>'])
     
     def _validate_annotation(self, parser, lable_region, name, start_tag, end_tag):
-        assert lable_region.surrounds_attribute == name
+        self.assertEqual(lable_region.surrounds_attribute, name)
         start_token = parser.token_list[lable_region.start_index]
-        assert parser.token_dict.token_string(start_token) == start_tag
+        self.assertEqual(parser.token_dict.token_string(start_token), start_tag)
         end_token = parser.token_list[lable_region.end_index]
-        assert parser.token_dict.token_string(end_token) == end_tag
+        self.assertEqual(parser.token_dict.token_string(end_token), end_tag)
 
     def test_template_parsing(self):
         lp = _parse_page(TemplatePageParser, LABELLED_PAGE1)
@@ -246,16 +246,16 @@ def test_template_parsing(self):
     def test_extraction_page_parsing(self):
         epp = _parse_page(ExtractionPageParser, SIMPLE_PAGE)
         ep = epp.to_extraction_page()
-        assert len(ep.page_tokens) == 4
-        assert ep.htmlpage.fragment_data(ep.htmlpage_tag(0)) == '<html>'
-        assert ep.htmlpage.fragment_data(ep.htmlpage_tag(1)) == '<p some-attr="foo">'
+        self.assertEqual(len(ep.page_tokens), 4)
+        self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(0)), '<html>')
+        self.assertEqual(ep.htmlpage.fragment_data(ep.htmlpage_tag(1)), '<p some-attr="foo">')
         
-        assert ep.htmlpage_region_inside(1, 2) == 'this is a test'
-        assert ep.htmlpage_region_inside(1, 3) == 'this is a test</p> '
+        self.assertEqual(ep.htmlpage_region_inside(1, 2), 'this is a test')
+        self.assertEqual(ep.htmlpage_region_inside(1, 3), 'this is a test</p> ')
 
     def test_invalid_html(self):
         p = _parse_page(InstanceLearningParser, BROKEN_PAGE)
-        assert p
+        self.assertTrue(p)
         
     def test_ignore_region(self):
         """Test ignored regions"""

From 3ec86eb3e70cafd954170a09c3f4d33dc0568f10 Mon Sep 17 00:00:00 2001
From: Alex Riina <alex.riina@gmail.com>
Date: Sun, 25 Aug 2013 01:16:48 -0400
Subject: [PATCH 3/4] Generate unittests instead of amassing them in one test

---
 scrapely/tests/test_extraction.py | 82 ++++++++++++-------------------
 1 file changed, 32 insertions(+), 50 deletions(-)

diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
index 0af030e..b6f2815 100644
--- a/scrapely/tests/test_extraction.py
+++ b/scrapely/tests/test_extraction.py
@@ -4,8 +4,7 @@
 Page parsing effectiveness is measured through the evaluation system. These
 tests should focus on specific bits of functionality work correctly.
 """
-from unittest import TestCase
-import numpy
+from functools import partial
 
 from scrapely.htmlpage import HtmlPage
 from scrapely.descriptor import (FieldDescriptor as A, 
@@ -1289,51 +1288,34 @@
     ),
 ]
 
-class TestIbl(TestCase):
-
-    def _run_extraction(self, name, templates, page, descriptor, expected_output):
-        self.trace = None
-        template_pages = [HtmlPage(None, {}, t) for t in templates]
-        # extracts with trace enabled in order to generate traceback
-        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
-        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
-        if actual_output is not None:
-            actual_output = actual_output[0]
-            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
-        # extracts again with trace disabled in order to get the pure output
-        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
-        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
-        if actual_output is None:
-            if expected_output is None:
-                return
-            assert False, "failed to extract data for test '%s'" % name
-        else:
-            actual_output = actual_output[0]
-        expected_names = set(expected_output.keys())
-        actual_names = set(actual_output.keys())
-        
-        missing_in_output = filter(None, expected_names - actual_names)
-        error = "attributes '%s' were expected but were not present in test '%s'" % \
-                ("', '".join(missing_in_output), name)
-        assert len(missing_in_output) == 0, error
-
-        unexpected = actual_names - expected_names
-        error = "unexpected attributes %s in test '%s'" % \
-                (', '.join(unexpected), name)
-        assert len(unexpected) == 0, error
-
-        for k, v in expected_output.items():
-            extracted = actual_output[k]
-            assert v == extracted, "in test '%s' for attribute '%s', " \
-                "expected value '%s' but got '%s'" % (name, k, v, extracted)
-
-    def test_expected_outputs(self):
-        try:
-            for data in TEST_DATA:
-                self._run_extraction(*data)
-        except AssertionError:
-            if self.trace:
-                print "Trace:"
-                for line in self.trace:
-                    print "\n---\n%s" % line
-            raise
+def _run_extraction(name, templates, page, descriptor, expected_output):
+    template_pages = [HtmlPage(None, {}, t) for t in templates]
+
+    extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
+    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
+    if actual_output is None:
+        assert not expected_output, "failed to extract data for test '%s'" % name
+        return
+    else:
+        actual_output = actual_output[0]
+    expected_names = set(expected_output.keys())
+    actual_names = set(actual_output.keys())
+    
+    missing_in_output = filter(None, expected_names - actual_names)
+    error = "attributes '%s' were expected but were not present in test '%s'" % \
+            ("', '".join(missing_in_output), name)
+    assert not missing_in_output, error
+
+    unexpected = actual_names - expected_names
+    error = "unexpected attributes %s in test '%s'" % \
+            (', '.join(unexpected), name)
+    assert not unexpected, error
+
+    for k, v in expected_output.items():
+        extracted = actual_output[k]
+        assert v == extracted, "in test '%s' for attribute '%s', " \
+            "expected value '%s' but got '%s'" % (name, k, v, extracted)
+
+def test_generator():
+    for data in TEST_DATA:
+        yield partial(_run_extraction, *data)

From fa5b55009b99d3d63d02a7c75a40e2b6c92dea38 Mon Sep 17 00:00:00 2001
From: Alex Riina <alex.riina@gmail.com>
Date: Sun, 25 Aug 2013 01:29:10 -0400
Subject: [PATCH 4/4] TestCase has some nice comparison methods that, together
 with nose-parameterized, make the actual test code quite clear

---
 scrapely/tests/test_extraction.py | 45 +++++++++----------------------
 1 file changed, 13 insertions(+), 32 deletions(-)

diff --git a/scrapely/tests/test_extraction.py b/scrapely/tests/test_extraction.py
index b6f2815..8872497 100644
--- a/scrapely/tests/test_extraction.py
+++ b/scrapely/tests/test_extraction.py
@@ -4,7 +4,8 @@
 Page parsing effectiveness is measured through the evaluation system. These
 tests should focus on specific bits of functionality work correctly.
 """
-from functools import partial
+from unittest import TestCase
+from nose_parameterized import parameterized
 
 from scrapely.htmlpage import HtmlPage
 from scrapely.descriptor import (FieldDescriptor as A, 
@@ -1288,34 +1289,14 @@
     ),
 ]
 
-def _run_extraction(name, templates, page, descriptor, expected_output):
-    template_pages = [HtmlPage(None, {}, t) for t in templates]
-
-    extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
-    actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
-    if actual_output is None:
-        assert not expected_output, "failed to extract data for test '%s'" % name
-        return
-    else:
-        actual_output = actual_output[0]
-    expected_names = set(expected_output.keys())
-    actual_names = set(actual_output.keys())
-    
-    missing_in_output = filter(None, expected_names - actual_names)
-    error = "attributes '%s' were expected but were not present in test '%s'" % \
-            ("', '".join(missing_in_output), name)
-    assert not missing_in_output, error
-
-    unexpected = actual_names - expected_names
-    error = "unexpected attributes %s in test '%s'" % \
-            (', '.join(unexpected), name)
-    assert not unexpected, error
-
-    for k, v in expected_output.items():
-        extracted = actual_output[k]
-        assert v == extracted, "in test '%s' for attribute '%s', " \
-            "expected value '%s' but got '%s'" % (name, k, v, extracted)
-
-def test_generator():
-    for data in TEST_DATA:
-        yield partial(_run_extraction, *data)
+
+
+class TestExtraction(TestCase):
+    @parameterized.expand(TEST_DATA)
+    def test_extraction(self, name, templates, page, descriptor, expected_output):
+        template_pages = [HtmlPage(None, {}, t) for t in templates]
+
+        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
+        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
+
+        self.assertEqual(expected_output, actual_output and actual_output[0])