Commit

Merge pull request #100 from scrapinghub/ignore-parsing-and-unification-exceptions

ignore any exception if errors='ignore'
lopuhin committed Dec 5, 2018
2 parents c3ef088 + 49a8570 commit 3ab5592
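In practice this means the errors argument of extruct.extract() now covers every stage: previously only the per-syntax extractors were guarded, so a document that could not be parsed as HTML (or a failure during uniform post-processing) raised an exception regardless of errors. A minimal usage sketch of the behaviour added here, using an empty string as input because that is what the new test uses to make the HTML parser fail:

    import extruct

    broken = ''  # input the HTML parser rejects, as in the new test

    # errors='strict' (the default) still lets the parser's exception propagate.
    # errors='ignore' now swallows it and returns an empty dict instead.
    data = extruct.extract(broken, errors='ignore')
    assert data == {}

    # errors='log' logs the exception and likewise returns an empty dict.
    data = extruct.extract(broken, errors='log')
    assert data == {}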
Showing 2 changed files with 103 additions and 20 deletions.
107 changes: 87 additions & 20 deletions extruct/_extruct.py
@@ -13,7 +13,9 @@
 SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']


-def extract(htmlstring, base_url=None, encoding="UTF-8",
+def extract(htmlstring,
+            base_url=None,
+            encoding="UTF-8",
             syntaxes=SYNTAXES,
             errors='strict',
             uniform=False,
@@ -38,48 +40,113 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
                          Each node is of `lxml.etree.Element` type.
        schema_context: schema's context for current page"""
     if base_url is None and 'url' in kwargs:
-        warnings.warn('"url" argument is deprecated, please use "base_url"',
-                      DeprecationWarning, stacklevel=2)
+        warnings.warn(
+            '"url" argument is deprecated, please use "base_url"',
+            DeprecationWarning,
+            stacklevel=2)
         base_url = kwargs.pop('url')
     if kwargs:
         raise TypeError('Unexpected keyword arguments')
-    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
+    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
+                                               for v in syntaxes)):
         raise ValueError("syntaxes must be a list with any or all (default) of"
                          "these values: {}".format(SYNTAXES))
     if errors not in ['log', 'ignore', 'strict']:
         raise ValueError('Invalid error command, valid values are either "log"'
                          ', "ignore" or "strict"')
-    tree = parse_xmldom_html(htmlstring, encoding=encoding)
+    try:
+        tree = parse_xmldom_html(htmlstring, encoding=encoding)
+    except Exception as e:
+        if errors == 'ignore':
+            return {}
+        if errors == 'log':
+            logger.exception(
+                'Failed to parse html, raises {}'.format(e))
+            return {}
+        if errors == 'strict':
+            raise
     processors = []
     if 'microdata' in syntaxes:
-        processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))
+        processors.append(
+            ('microdata',
+             MicrodataExtractor(add_html_node=return_html_node).extract_items,
+             tree
+             ))
     if 'json-ld' in syntaxes:
-        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
+        processors.append(
+            ('json-ld',
+             JsonLdExtractor().extract_items,
+             tree,
+             ))
     if 'opengraph' in syntaxes:
-        processors.append(('opengraph', OpenGraphExtractor().extract_items, tree))
+        processors.append(
+            ('opengraph',
+             OpenGraphExtractor().extract_items,
+             tree
+             ))
     if 'microformat' in syntaxes:
-        processors.append(('microformat', MicroformatExtractor().extract_items, htmlstring))
+        processors.append(
+            ('microformat',
+             MicroformatExtractor().extract_items,
+             htmlstring
+             ))
     if 'rdfa' in syntaxes:
-        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
+        processors.append(
+            ('rdfa', RDFaExtractor().extract_items,
+             tree,
+             ))
     output = {}
-    for label, extract, document in processors:
+    for syntax, extract, document in processors:
         try:
-            output[label] = list(extract(document, base_url=base_url))
-        except Exception:
+            output[syntax] = list(extract(document, base_url=base_url))
+        except Exception as e:
             if errors == 'log':
-                logger.exception('Failed to extract {}'.format(label))
+                logger.exception('Failed to extract {}, raises {}'
+                                 .format(syntax, e)
+                                 )
             if errors == 'ignore':
                 pass
             if errors == 'strict':
                 raise

     if uniform:
+        uniform_processors = []
         if 'microdata' in syntaxes:
-            output['microdata'] = _umicrodata_microformat(output['microdata'],
-                                                          schema_context=schema_context)
+            uniform_processors.append(
+                ('microdata',
+                 _umicrodata_microformat,
+                 output['microdata'],
+                 schema_context,
+                 ))
         if 'microformat' in syntaxes:
-            output['microformat'] = _umicrodata_microformat(output['microformat'],
-                                                            schema_context='http://microformats.org/wiki/')
+            uniform_processors.append(
+                ('microformat',
+                 _umicrodata_microformat,
+                 output['microformat'],
+                 'http://microformats.org/wiki/',
+                 ))
         if 'opengraph' in syntaxes:
-            output['opengraph'] = _uopengraph(output['opengraph'])
+            uniform_processors.append(
+                ('opengraph',
+                 _uopengraph,
+                 output['opengraph'],
+                 None,
+                 ))
+        for syntax, uniform, raw, schema_context in uniform_processors:
+            try:
+                if syntax == 'opengraph':
+                    output[syntax] = uniform(raw)
+                else:
+                    output[syntax] = uniform(raw, schema_context)
+            except Exception as e:
+                if errors == 'ignore':
+                    output[syntax] = []
+                if errors == 'log':
+                    output[syntax] = []
+                    logger.exception(
+                        'Failed to uniform extracted for {}, raises {}'
+                        .format(syntax, e)
+                        )
+                if errors == 'strict':
+                    raise
+
     return output
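The same three-way policy now guards all three failure points: parsing the HTML, running each syntax's extractor, and converting raw results to the uniform format. As an illustration only (this helper is not part of the commit), the repeated branching could be written as a single wrapper:

    import logging

    logger = logging.getLogger(__name__)

    def run_with_policy(step, errors, what, fallback):
        """Run step() and apply the ignore/log/strict policy on failure:
        'strict' re-raises, 'log' records the traceback and returns the
        fallback, 'ignore' returns the fallback silently."""
        try:
            return step()
        except Exception as e:
            if errors == 'strict':
                raise
            if errors == 'log':
                logger.exception('Failed to %s, raises %s', what, e)
            return fallback

The commit keeps the three branches inline instead, which lets each stage pick its own fallback: {} is returned when parsing fails, a failing syntax is simply skipped in the extraction loop, and a failing uniform conversion leaves [] for that syntax.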
16 changes: 16 additions & 0 deletions tests/test_extruct.py
@@ -49,3 +49,19 @@ def _microdata_custom_url(self, test_file):
             get_testdata('schema.org', test_file)
             .decode('UTF-8'))}
         return body, expected
+
+    def test_errors(self):
+        body = ''
+
+        # raise exceptions
+        with pytest.raises(Exception):
+            data = extruct.extract(body)
+
+        # ignore exceptions
+        expected = {}
+        data = extruct.extract(body, errors='ignore')
+        assert data == expected
+
+        # ignore exceptions
+        data = extruct.extract(body, errors='log')
+        assert data == expected
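The new test checks the return values for errors='ignore' and errors='log'. A further check one could add (not part of this PR, shown as a stand-alone function for brevity) is that the 'log' branch really emits a log record, using pytest's caplog fixture:

    import extruct

    def test_errors_are_logged(caplog):
        # Hypothetical extra test: with errors='log' the parse failure is
        # logged ('Failed to parse html, raises ...') rather than raised.
        data = extruct.extract('', errors='log')
        assert data == {}
        assert 'Failed to parse html' in caplog.text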
