Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Close spider by stats [#47][#50] #216

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 31 additions & 2 deletions spidermon/contrib/scrapy/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# Fallback value for the pipeline's ``errors_field`` when none is configured.
DEFAULT_ERRORS_FIELD = "_validation"
# Default for ``add_errors_to_items``; also the fallback when a falsy value is passed.
DEFAULT_ADD_ERRORS_TO_ITEM = False
# Default for ``drop_items_with_errors``.
DEFAULT_DROP_ITEMS_WITH_ERRORS = False
# Default for ``close_spider_by_stats``: a mapping of stat name -> maximum
# allowed value. An empty mapping is falsy, so no stat limits are checked.
# NOTE(review): this dict is a single shared object used as a default
# argument value; it appears to be read-only here -- confirm nobody mutates it.
DEFAULT_CLOSESPIDER_BY_STATS = {}


class ItemValidationPipeline(object):
Expand All @@ -29,13 +30,15 @@ def __init__(
stats,
drop_items_with_errors=DEFAULT_DROP_ITEMS_WITH_ERRORS,
add_errors_to_items=DEFAULT_ADD_ERRORS_TO_ITEM,
close_spider_by_stats=DEFAULT_CLOSESPIDER_BY_STATS,
errors_field=None,
):
self.drop_items_with_errors = drop_items_with_errors
self.add_errors_to_items = add_errors_to_items or DEFAULT_ADD_ERRORS_TO_ITEM
self.errors_field = errors_field or DEFAULT_ERRORS_FIELD
self.validators = validators
self.stats = ValidationStatsManager(stats)
self.close_spider_by_stats = close_spider_by_stats or DEFAULT_CLOSESPIDER_BY_STATS
for _type, vals in validators.items():
[self.stats.add_validator(_type, val.name) for val in vals]

Expand Down Expand Up @@ -70,7 +73,7 @@ def set_validators(loader, schema):
if not validators:
raise NotConfigured("No validators were found")

return cls(
pipeline = cls(
validators=validators,
stats=crawler.stats,
drop_items_with_errors=crawler.settings.getbool(
Expand All @@ -80,7 +83,14 @@ def set_validators(loader, schema):
"SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
),
errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
close_spider_by_stats=crawler.settings.get('SPIDERMON_CLOSESPIDER_BY_STATS'),
)
pipeline._set_crawler(crawler)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not pass it as a parameter to cls()?

return pipeline

@classmethod
def _set_crawler(self, crawler):
self.crawler = crawler

@classmethod
def _load_jsonschema_validator(cls, schema):
Expand All @@ -105,7 +115,7 @@ def _load_schematics_validator(cls, model_path):
)
return SchematicsValidator(model_class)

def process_item(self, item, _):
def process_item(self, item, spider):
validators = self.find_validators(item)
if not validators:
# No validators match this specific item type
Expand All @@ -122,6 +132,9 @@ def process_item(self, item, _):
self._add_errors_to_item(item, errors)
if self.drop_items_with_errors:
self._drop_item(item, errors)
if self.close_spider_by_stats:
stats_dict = self.close_spider_by_stats
self._close_spider_by_stats (spider, stats_dict)
return item

def find_validators(self, item):
Expand Down Expand Up @@ -168,3 +181,19 @@ def _add_error_stats(self, errors):
for message in messages:
self.stats.add_field_error(field_name, message)
self.stats.add_item_with_errors()

def _close_spider_by_stats(self, spider, stats_dict):
"""
This method evaluates each stat value and, if the number of errors is greater than
max_errors_allowed, the spider will be closed gracefully.
"""
for stat_name, max_errors_allowed in stats_dict.iteritems():
stat_current_value = self.stats.stats.get_value(stat_name, 0)

if stat_current_value > max_errors_allowed:
self.crawler.engine.close_spider(
spider,
'Spidermon: No of errors are greater then the expected no of errors {} > {}'.format(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should not call them errors (they could be warnings or something else entirely), and I would include the name of the specific stat that triggers the spider closing.

Suggested change
'Spidermon: No of errors are greater then the expected no of errors {} > {}'.format(
'Spidermon: The value of stat {} ({}) went above the configured limit ({})'.format(stat_name, stat_current_value, max_errors_allowed)

stat_current_value, max_errors_allowed
)
)