Merge pull request #73 from scrapinghub/fix-errors-handling
fix errors docstring + add errors args to tool
kmike committed Apr 9, 2018
2 parents becf22e + 971c38b commit 9439fa4
Showing 2 changed files with 12 additions and 4 deletions.
2 changes: 1 addition & 1 deletion extruct/__init__.py
@@ -20,7 +20,7 @@ def extract(htmlstring, url=None, encoding="UTF-8",
     url: url of the html documents
     encoding: encoding of the html document
     syntaxes: list of syntaxes to extract, default SYNTAXES
-    errors: set to 'log' to save exceptions to file, 'ignore' to ignore them
+    errors: set to 'log' to log the exceptions, 'ignore' to ignore them
             or 'strict'(default) to raise them
     uniform: if True uniform output format of all syntaxes to a list of dicts.
              Returned dicts structure:
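
For reference, the corrected docstring corresponds to a call like the one below (a minimal sketch; the HTML string and URL are placeholders, not taken from the commit):

    import extruct

    # Placeholder document with no structured data in it.
    html = "<html><body><h1>hello</h1></body></html>"
    # errors='log' logs extraction exceptions instead of raising them;
    # 'ignore' suppresses them; 'strict' (the default) re-raises them.
    data = extruct.extract(html, url="http://example.com", errors="log")
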
14 changes: 11 additions & 3 deletions extruct/tool.py
@@ -4,7 +4,8 @@
 import extruct
 from extruct import SYNTAXES

-def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False, schema_context='http://schema.org'):
+def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False,
+                      schema_context='http://schema.org', errors='strict'):
     resp = requests.get(url, timeout=30)
     result = {'url': url, 'status': '{} {}'.format(resp.status_code, resp.reason)}
     try:
@@ -15,7 +16,8 @@ def metadata_from_url(url, syntaxes=SYNTAXES, uniform=False, schema_context='htt
             url=url,
             syntaxes=syntaxes,
             uniform=uniform,
-            schema_context=schema_context))
+            schema_context=schema_context,
+            errors=errors))
     return result
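
With this change the helper accepts an explicit errors mode as well. A minimal sketch of calling it directly (the URL is a placeholder):

    from extruct.tool import metadata_from_url

    # errors defaults to 'strict' in the library API; 'log' or 'ignore' soften it.
    result = metadata_from_url("http://example.com", errors="log")
    print(result["status"])  # e.g. "200 OK", built from resp.status_code and resp.reason
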


@@ -38,6 +40,12 @@ def main(args=None):
         }''')
     arg('--schema_context', default='http://schema.org',
         help="schema's context for current page")
+    arg('--errors',
+        default='log',
+        choices=['strict', 'log', 'ignore'],
+        help="errors: set to 'log'(default) to log the exceptions, 'ignore' to ignore"
+             " them or 'strict' to raise them")
     args = parser.parse_args(args)
-    metadata = metadata_from_url(args.url, args.syntaxes, args.uniform, args.schema_context)
+    metadata = metadata_from_url(args.url, args.syntaxes, args.uniform,
+                                 args.schema_context, args.errors)
     return json.dumps(metadata, indent=2, sort_keys=True)
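
The same option is exposed on the command line. The sketch below drives main() directly rather than the installed console script, and assumes the target URL is taken as a positional argument (only args.url is visible in this diff):

    from extruct.tool import main

    # Equivalent to passing --errors on the command line; 'log' is the CLI default,
    # so 'strict' is given here to restore raise-on-error behaviour.
    print(main(["http://example.com", "--errors", "strict"]))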
