Commit

Merge pull request #100 from scrapinghub/ignore-parsing-and-unification-exceptions

ignore any exception if errors='ignore'
lopuhin committed Dec 5, 2018
2 parents c3ef088 + 49a8570 commit 3ab5592
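In practice this means the errors argument of extruct.extract() now covers every stage: previously only the per-syntax extractors were guarded, so a document that could not be parsed as HTML (or a failure during uniform post-processing) raised an exception regardless of errors. A minimal usage sketch of the behaviour added here, using an empty string as input because that is what the new test uses to make the HTML parser fail:

    import extruct

    broken = ''  # input the HTML parser rejects, as in the new test

    # errors='strict' (the default) still lets the parser's exception propagate.
    # errors='ignore' now swallows it and returns an empty dict instead.
    data = extruct.extract(broken, errors='ignore')
    assert data == {}

    # errors='log' logs the exception and likewise returns an empty dict.
    data = extruct.extract(broken, errors='log')
    assert data == {}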
Showing 2 changed files with 103 additions and 20 deletions.
107 changes: 87 additions & 20 deletions extruct/_extruct.py
@@ -13,7 +13,9 @@
 SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa']


-def extract(htmlstring, base_url=None, encoding="UTF-8",
+def extract(htmlstring,
+            base_url=None,
+            encoding="UTF-8",
             syntaxes=SYNTAXES,
             errors='strict',
             uniform=False,
@@ -38,48 +40,113 @@ def extract(htmlstring, base_url=None, encoding="UTF-8",
                          Each node is of `lxml.etree.Element` type.
        schema_context: schema's context for current page"""
     if base_url is None and 'url' in kwargs:
-        warnings.warn('"url" argument is deprecated, please use "base_url"',
-                      DeprecationWarning, stacklevel=2)
+        warnings.warn(
+            '"url" argument is deprecated, please use "base_url"',
+            DeprecationWarning,
+            stacklevel=2)
         base_url = kwargs.pop('url')
     if kwargs:
         raise TypeError('Unexpected keyword arguments')
-    if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
+    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
+                                               for v in syntaxes)):
         raise ValueError("syntaxes must be a list with any or all (default) of"
                          "these values: {}".format(SYNTAXES))
     if errors not in ['log', 'ignore', 'strict']:
         raise ValueError('Invalid error command, valid values are either "log"'
                          ', "ignore" or "strict"')
-    tree = parse_xmldom_html(htmlstring, encoding=encoding)
+    try:
+        tree = parse_xmldom_html(htmlstring, encoding=encoding)
+    except Exception as e:
+        if errors == 'ignore':
+            return {}
+        if errors == 'log':
+            logger.exception(
+                'Failed to parse html, raises {}'.format(e))
+            return {}
+        if errors == 'strict':
+            raise
     processors = []
     if 'microdata' in syntaxes:
-        processors.append(('microdata', MicrodataExtractor(add_html_node=return_html_node).extract_items, tree))
+        processors.append(
+            ('microdata',
+             MicrodataExtractor(add_html_node=return_html_node).extract_items,
+             tree
+             ))
     if 'json-ld' in syntaxes:
-        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
+        processors.append(
+            ('json-ld',
+             JsonLdExtractor().extract_items,
+             tree,
+             ))
     if 'opengraph' in syntaxes:
-        processors.append(('opengraph', OpenGraphExtractor().extract_items, tree))
+        processors.append(
+            ('opengraph',
+             OpenGraphExtractor().extract_items,
+             tree
+             ))
     if 'microformat' in syntaxes:
-        processors.append(('microformat', MicroformatExtractor().extract_items, htmlstring))
+        processors.append(
+            ('microformat',
+             MicroformatExtractor().extract_items,
+             htmlstring
+             ))
     if 'rdfa' in syntaxes:
-        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
+        processors.append(
+            ('rdfa', RDFaExtractor().extract_items,
+             tree,
+             ))
     output = {}
-    for label, extract, document in processors:
+    for syntax, extract, document in processors:
         try:
-            output[label] = list(extract(document, base_url=base_url))
-        except Exception:
+            output[syntax] = list(extract(document, base_url=base_url))
+        except Exception as e:
             if errors == 'log':
-                logger.exception('Failed to extract {}'.format(label))
+                logger.exception('Failed to extract {}, raises {}'
+                                 .format(syntax, e)
+                                 )
             if errors == 'ignore':
                 pass
             if errors == 'strict':
                 raise

     if uniform:
+        uniform_processors = []
         if 'microdata' in syntaxes:
-            output['microdata'] = _umicrodata_microformat(output['microdata'],
-                                                          schema_context=schema_context)
+            uniform_processors.append(
+                ('microdata',
+                 _umicrodata_microformat,
+                 output['microdata'],
+                 schema_context,
+                 ))
         if 'microformat' in syntaxes:
-            output['microformat'] = _umicrodata_microformat(output['microformat'],
-                                                            schema_context='http://microformats.org/wiki/')
+            uniform_processors.append(
+                ('microformat',
+                 _umicrodata_microformat,
+                 output['microformat'],
+                 'http://microformats.org/wiki/',
+                 ))
         if 'opengraph' in syntaxes:
-            output['opengraph'] = _uopengraph(output['opengraph'])
+            uniform_processors.append(
+                ('opengraph',
+                 _uopengraph,
+                 output['opengraph'],
+                 None,
+                 ))
+        for syntax, uniform, raw, schema_context in uniform_processors:
+            try:
+                if syntax == 'opengraph':
+                    output[syntax] = uniform(raw)
+                else:
+                    output[syntax] = uniform(raw, schema_context)
+            except Exception as e:
+                if errors == 'ignore':
+                    output[syntax] = []
+                if errors == 'log':
+                    output[syntax] = []
+                    logger.exception(
+                        'Failed to uniform extracted for {}, raises {}'
+                        .format(syntax, e)
+                        )
+                if errors == 'strict':
+                    raise
+
     return output
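The same three-way policy now guards all three failure points: parsing the HTML, running each syntax's extractor, and converting raw results to the uniform format. As an illustration only (this helper is not part of the commit), the repeated branching could be written as a single wrapper:

    import logging

    logger = logging.getLogger(__name__)

    def run_with_policy(step, errors, what, fallback):
        """Run step() and apply the ignore/log/strict policy on failure:
        'strict' re-raises, 'log' records the traceback and returns the
        fallback, 'ignore' returns the fallback silently."""
        try:
            return step()
        except Exception as e:
            if errors == 'strict':
                raise
            if errors == 'log':
                logger.exception('Failed to %s, raises %s', what, e)
            return fallback

The commit keeps the three branches inline instead, which lets each stage pick its own fallback: {} is returned when parsing fails, a failing syntax is simply skipped in the extraction loop, and a failing uniform conversion leaves [] for that syntax.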
16 changes: 16 additions & 0 deletions tests/test_extruct.py
@@ -49,3 +49,19 @@ def _microdata_custom_url(self, test_file):
             get_testdata('schema.org', test_file)
             .decode('UTF-8'))}
         return body, expected
+
+    def test_errors(self):
+        body = ''
+
+        # raise exceptions
+        with pytest.raises(Exception):
+            data = extruct.extract(body)
+
+        # ignore exceptions
+        expected = {}
+        data = extruct.extract(body, errors='ignore')
+        assert data == expected
+
+        # ignore exceptions
+        data = extruct.extract(body, errors='log')
+        assert data == expected
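The new test checks the return values for errors='ignore' and errors='log'. A further check one could add (not part of this PR, shown as a stand-alone function for brevity) is that the 'log' branch really emits a log record, using pytest's caplog fixture:

    import extruct

    def test_errors_are_logged(caplog):
        # Hypothetical extra test: with errors='log' the parse failure is
        # logged ('Failed to parse html, raises ...') rather than raised.
        data = extruct.extract('', errors='log')
        assert data == {}
        assert 'Failed to parse html' in caplog.text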
