From 34d407f1b4272c962a6c13ddbea1de5d966adf77 Mon Sep 17 00:00:00 2001
From: rosheen33
Date: Fri, 20 Sep 2019 20:26:21 +0500
Subject: [PATCH 1/4] added additional setting for closing spider

---
 spidermon/contrib/scrapy/pipelines.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/spidermon/contrib/scrapy/pipelines.py b/spidermon/contrib/scrapy/pipelines.py
index d5818868..78388abc 100644
--- a/spidermon/contrib/scrapy/pipelines.py
+++ b/spidermon/contrib/scrapy/pipelines.py
@@ -20,6 +20,7 @@
 DEFAULT_ERRORS_FIELD = "_validation"
 DEFAULT_ADD_ERRORS_TO_ITEM = False
 DEFAULT_DROP_ITEMS_WITH_ERRORS = False
+DEFAULT_SPIDERMON_CLOSESPIDER_BY_STATS = {}
 
 
 class ItemValidationPipeline(object):
@@ -29,6 +30,7 @@ def __init__(
         stats,
         drop_items_with_errors=DEFAULT_DROP_ITEMS_WITH_ERRORS,
         add_errors_to_items=DEFAULT_ADD_ERRORS_TO_ITEM,
+        close_spider_by_stats=DEFAULT_SPIDERMON_CLOSESPIDER_BY_STATS,
         errors_field=None,
     ):
         self.drop_items_with_errors = drop_items_with_errors
@@ -36,6 +38,7 @@ def __init__(
         self.errors_field = errors_field or DEFAULT_ERRORS_FIELD
         self.validators = validators
         self.stats = ValidationStatsManager(stats)
+        self.close_spider_by_stats = close_spider_by_stats or DEFAULT_SPIDERMON_CLOSESPIDER_BY_STATS
         for _type, vals in validators.items():
             [self.stats.add_validator(_type, val.name) for val in vals]
 
@@ -122,6 +125,8 @@ def process_item(self, item, _):
             self._add_errors_to_item(item, errors)
         if self.drop_items_with_errors:
             self._drop_item(item, errors)
+        if self.close_spider_by_stats:
+            self._close_spider_by_stats(self.stats, self.close_spider_by_stats)
         return item
 
     def find_validators(self, item):
@@ -168,3 +173,11 @@ def _add_error_stats(self, errors):
             for message in messages:
                 self.stats.add_field_error(field_name, message)
         self.stats.add_item_with_errors()
+
+    def _close_spider_by_stats(self, stats, error_values):
+        from scrapy.exceptions import CloseSpider
+
+        for key, max_errors_allowed in error_values.items():
+            stat = stats.get(key, 0)
+            if stat > max_errors_allowed:
+                raise CloseSpider(reason="Number of errors exceeded the allowed maximum")

From df0f0a2d4de341f53296c3b9a07dd4ee246e727e Mon Sep 17 00:00:00 2001
From: rosheen33
Date: Mon, 23 Sep 2019 13:08:59 +0500
Subject: [PATCH 2/4] added spidermon setting

---
 spidermon/contrib/scrapy/pipelines.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spidermon/contrib/scrapy/pipelines.py b/spidermon/contrib/scrapy/pipelines.py
index 78388abc..b724b34b 100644
--- a/spidermon/contrib/scrapy/pipelines.py
+++ b/spidermon/contrib/scrapy/pipelines.py
@@ -83,6 +83,7 @@ def set_validators(loader, schema):
                 "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
             ),
             errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
+            close_spider_by_stats=crawler.settings.get("SPIDERMON_CLOSESPIDER_BY_STATS"),
         )
 
     @classmethod
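
Usage note (illustrative, not part of the patches): PATCH 1/4 and 2/4 make
ItemValidationPipeline read a SPIDERMON_CLOSESPIDER_BY_STATS setting, a dict
mapping a stat name to the maximum number of errors tolerated before the
spider is closed. A minimal settings sketch follows; the stat name and the
threshold value are assumptions chosen for illustration:

    # settings.py -- minimal sketch, assuming the validation pipeline and a
    # validation schema are already configured for the project
    ITEM_PIPELINES = {
        "spidermon.contrib.scrapy.pipelines.ItemValidationPipeline": 800,
    }
    # Close the spider once this stat exceeds its allowed maximum:
    SPIDERMON_CLOSESPIDER_BY_STATS = {
        "spidermon/validation/items/errors": 10,
    }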
From dabea97b56407b534a2ce6f76d54eaec0cb91abd Mon Sep 17 00:00:00 2001
From: rosheen33
Date: Mon, 23 Sep 2019 16:08:07 +0500
Subject: [PATCH 3/4] Code optimizations

---
 spidermon/contrib/scrapy/pipelines.py | 38 +++++++++++++++++----------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/spidermon/contrib/scrapy/pipelines.py b/spidermon/contrib/scrapy/pipelines.py
index b724b34b..688e3479 100644
--- a/spidermon/contrib/scrapy/pipelines.py
+++ b/spidermon/contrib/scrapy/pipelines.py
@@ -20,7 +20,7 @@
 DEFAULT_ERRORS_FIELD = "_validation"
 DEFAULT_ADD_ERRORS_TO_ITEM = False
 DEFAULT_DROP_ITEMS_WITH_ERRORS = False
-DEFAULT_SPIDERMON_CLOSESPIDER_BY_STATS = {}
+DEFAULT_CLOSESPIDER_BY_STATS = {}
 
 
 class ItemValidationPipeline(object):
@@ -30,7 +30,7 @@ def __init__(
         stats,
         drop_items_with_errors=DEFAULT_DROP_ITEMS_WITH_ERRORS,
         add_errors_to_items=DEFAULT_ADD_ERRORS_TO_ITEM,
-        close_spider_by_stats=DEFAULT_SPIDERMON_CLOSESPIDER_BY_STATS,
+        close_spider_by_stats=DEFAULT_CLOSESPIDER_BY_STATS,
         errors_field=None,
     ):
         self.drop_items_with_errors = drop_items_with_errors
@@ -38,7 +38,7 @@ def __init__(
         self.errors_field = errors_field or DEFAULT_ERRORS_FIELD
         self.validators = validators
         self.stats = ValidationStatsManager(stats)
-        self.close_spider_by_stats = close_spider_by_stats or DEFAULT_SPIDERMON_CLOSESPIDER_BY_STATS
+        self.close_spider_by_stats = close_spider_by_stats or DEFAULT_CLOSESPIDER_BY_STATS
         for _type, vals in validators.items():
             [self.stats.add_validator(_type, val.name) for val in vals]
 
@@ -73,7 +73,7 @@ def set_validators(loader, schema):
         if not validators:
             raise NotConfigured("No validators were found")
 
-        return cls(
+        pipeline = cls(
             validators=validators,
             stats=crawler.stats,
             drop_items_with_errors=crawler.settings.getbool(
@@ -85,8 +85,13 @@ def set_validators(loader, schema):
                 "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS"
             ),
             errors_field=crawler.settings.get("SPIDERMON_VALIDATION_ERRORS_FIELD"),
             close_spider_by_stats=crawler.settings.get("SPIDERMON_CLOSESPIDER_BY_STATS"),
         )
+        pipeline._set_crawler(crawler)
+        return pipeline
+
+    def _set_crawler(self, crawler):
+        self.crawler = crawler
 
     @classmethod
     def _load_jsonschema_validator(cls, schema):
@@ -109,7 +114,7 @@ def _load_schematics_validator(cls, model_path):
         )
         return SchematicsValidator(model_class)
 
-    def process_item(self, item, _):
+    def process_item(self, item, spider):
         validators = self.find_validators(item)
         if not validators:
             # No validators match this specific item type
@@ -127,8 +132,9 @@ def process_item(self, item, spider):
             self._add_errors_to_item(item, errors)
         if self.drop_items_with_errors:
             self._drop_item(item, errors)
         if self.close_spider_by_stats:
-            self._close_spider_by_stats(self.stats, self.close_spider_by_stats)
+            stats_dict = self.close_spider_by_stats
+            self._close_spider_by_stats(spider, stats_dict)
         return item
 
     def find_validators(self, item):
@@ -175,10 +181,16 @@ def _add_error_stats(self, errors):
                 self.stats.add_field_error(field_name, message)
         self.stats.add_item_with_errors()
 
-    def _close_spider_by_stats(self, stats, error_values):
-        from scrapy.exceptions import CloseSpider
-
-        for key, max_errors_allowed in error_values.items():
-            stat = stats.get(key, 0)
-            if stat > max_errors_allowed:
-                raise CloseSpider(reason="Number of errors exceeded the allowed maximum")
+    def _close_spider_by_stats(self, spider, stats_dict):
+        """
+        Evaluate each configured stat and close the spider gracefully when
+        its current value exceeds max_errors_allowed.
+        """
+        for stat_name, max_errors_allowed in stats_dict.items():
+            stat_current_value = self.stats.stats.get_value(stat_name, 0)
+
+            if stat_current_value > max_errors_allowed:
+                self.crawler.engine.close_spider(
+                    spider,
+                    "Spidermon: the number of errors exceeded the allowed maximum",
+                )
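
Note on the check itself (illustrative): the loop added in PATCH 3/4 compares
each configured stat against its threshold and asks the engine to close the
spider on the first violation. The same logic can be exercised in isolation
with a plain dict standing in for the crawler's stats collector; the names
below are assumptions made for this sketch:

    # standalone sketch of the threshold check
    def first_exceeded(stats, thresholds):
        """Return (stat_name, current, allowed) for the first stat over its
        limit, or None when every stat is within bounds."""
        for stat_name, max_errors_allowed in thresholds.items():
            current = stats.get(stat_name, 0)
            if current > max_errors_allowed:
                return stat_name, current, max_errors_allowed
        return None

    # usage: 12 errors recorded against an allowed maximum of 10
    print(first_exceeded(
        {"spidermon/validation/items/errors": 12},
        {"spidermon/validation/items/errors": 10},
    ))  # -> ('spidermon/validation/items/errors', 12, 10)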
+ """ + for stat_name, max_errors_allowed in stats_dict.iteritems(): + stat_current_value = self.stats.stats.get_value(stat_name, 0) + + if stat_current_value > max_errors_allowed: + self.crawler.engine.close_spider( + spider, + 'Spidermon: No of errors are greater then the expected no of errors' + ) From c7f61edb1b541892b285107c1966bc0da852efdb Mon Sep 17 00:00:00 2001 From: rosheen33 Date: Mon, 23 Sep 2019 16:12:17 +0500 Subject: [PATCH 4/4] Code optimizations --- spidermon/contrib/scrapy/pipelines.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spidermon/contrib/scrapy/pipelines.py b/spidermon/contrib/scrapy/pipelines.py index 688e3479..29c58bf4 100644 --- a/spidermon/contrib/scrapy/pipelines.py +++ b/spidermon/contrib/scrapy/pipelines.py @@ -193,5 +193,7 @@ def _close_spider_by_stats(self, spider, stats_dict): if stat_current_value > max_errors_allowed: self.crawler.engine.close_spider( spider, - 'Spidermon: No of errors are greater then the expected no of errors' + 'Spidermon: No of errors are greater then the expected no of errors {} > {}'.format( + stat_current_value, max_errors_allowed + ) )