Fix ItemValidationPipeline using __setitem__ without ItemAdapter (#415)
* Fix ItemValidationPipeline using __setitem__ without ItemAdapter

* Using typing Dict for python3.8 compatibility

* Refactor to avoid casting to list before len

* Add test case to check when validation field is None

* Simplify check for validation field in item or None

* Increase coverage with additional tests
VMRuiz committed Aug 31, 2023
1 parent e070dd8 commit ea21cee
Showing 2 changed files with 94 additions and 25 deletions.
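For context, a minimal sketch (not part of the commit) of the behaviour the fix relies on: a plain dataclass item has no __setitem__, so writing the validation-errors field directly fails, while the same write through ItemAdapter succeeds for fields declared on the item. The Product class and _validation field below are hypothetical stand-ins for a real item and the configured SPIDERMON_VALIDATION_ERRORS_FIELD.

from dataclasses import dataclass
from typing import Optional

from itemadapter import ItemAdapter


@dataclass
class Product:
    name: str
    # Hypothetical errors field; the real name comes from SPIDERMON_VALIDATION_ERRORS_FIELD.
    _validation: Optional[dict] = None


item = Product(name="foo")

try:
    item["_validation"] = {"name": ["invalid"]}  # plain dataclass: no __setitem__
except TypeError as exc:
    print(exc)  # 'Product' object does not support item assignment

adapter = ItemAdapter(item)
adapter["_validation"] = {"name": ["invalid"]}  # ItemAdapter writes the declared field
print(item._validation)  # {'name': ['invalid']}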
21 changes: 10 additions & 11 deletions spidermon/contrib/scrapy/pipelines.py
@@ -1,4 +1,6 @@
 from collections import defaultdict
+from typing import Dict
+
 from itemadapter import ItemAdapter
 
 from scrapy.exceptions import DropItem, NotConfigured
@@ -103,15 +105,16 @@ def process_item(self, item, _):
             # No validators match this specific item type
             return item
 
-        data = self._convert_item_to_dict(item)
+        item_adapter = ItemAdapter(item)
+        item_dict = item_adapter.asdict()
         self.stats.add_item()
-        self.stats.add_fields(len(list(data.keys())))
+        self.stats.add_fields(len(item_dict.keys()))
         for validator in validators:
-            ok, errors = validator.validate(data)
+            ok, errors = validator.validate(item_dict)
             if not ok:
                 self._add_error_stats(errors)
                 if self.add_errors_to_items:
-                    self._add_errors_to_item(item, errors)
+                    self._add_errors_to_item(item_adapter, errors)
                 if self.drop_items_with_errors:
                     self._drop_item(item, errors)
         return item
@@ -120,16 +123,12 @@ def find_validators(self, item):
         find = lambda x: self.validators.get(x.__name__, [])
         return find(item.__class__) or find(Item)
 
-    def _convert_item_to_dict(self, item):
-        return ItemAdapter(item).asdict()
-
-    def _add_errors_to_item(self, item, errors):
-        data = ItemAdapter(item)
-        if self.errors_field not in data:
+    def _add_errors_to_item(self, item: ItemAdapter, errors: Dict[str, str]):
+        if item.get(self.errors_field, None) is None:
             item[self.errors_field] = defaultdict(list)
 
         for field_name, messages in errors.items():
-            data[self.errors_field][field_name] += messages
+            item[self.errors_field][field_name] += messages
 
     def _drop_item(self, item, errors):
         """
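A rough illustration (assumed, not taken from the diff) of why the membership test in _add_errors_to_item became a get(...) is None check: when the item already carries the errors field but its value is None, the old check skipped initialisation and the subsequent += on None would fail; the new check treats "missing" and "explicitly None" the same way.

from collections import defaultdict

errors_field = "custom_validation_field"
item = {"no": "schema", errors_field: None}

# Old-style check: the key exists, so the defaultdict is never created,
# and item[errors_field]["no"] += [...] would then raise TypeError on None.
if errors_field not in item:
    item[errors_field] = defaultdict(list)

# New-style check: initialise when the field is absent or explicitly None.
if item.get(errors_field, None) is None:
    item[errors_field] = defaultdict(list)
item[errors_field]["no"] += ["obligatory field is missing"]
print(item[errors_field])  # defaultdict(<class 'list'>, {'no': ['obligatory field is missing']})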
98 changes: 84 additions & 14 deletions tests/contrib/validation/test_item_validation_pipeline.py
@@ -98,13 +98,19 @@ def test_validation_errors_field(dummy_schema):
         "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
     }
 
-    item = {"no": "schema"}
-
     crawler = get_crawler(settings_dict=settings)
     pipeline = ItemValidationPipeline.from_crawler(crawler)
+
+    # Instantiate validation field if not defined
+    item = {"no": "schema"}
     item = pipeline.process_item(item, None)
     assert "custom_validation_field" in item
 
+    # Instantiate validation field if None
+    item = {"no": "schema", "custom_validation_field": None}
+    item = pipeline.process_item(item, None)
+    assert item["custom_validation_field"] is not None
+
 
 def test_add_error_to_items_undefined_validation_field(dummy_schema):
     settings = {
@@ -138,20 +144,84 @@ class DataclassItem:
         foo: str
 
     item = DataclassItem(foo="invalid")
-    # Does not support item assignment
+    # Supports item assignment but does not support field
+    with pytest.raises(KeyError, match="custom_validation_field"):
+        item = pipeline.process_item(item, None)
+
+
+def test_not_configured():
+    # No validators
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
     with pytest.raises(
-        TypeError, match="'DataclassItem' object does not support item assignment"
+        scrapy.exceptions.NotConfigured, match="No validators were found"
     ):
-        item = pipeline.process_item(item, None)
+        ItemValidationPipeline.from_crawler(crawler)
 
-    @dataclass
-    class DataclassItemWithItemAssignment:
-        foo: str
+    # Invalid validator type
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": object(),
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
+    with pytest.raises(
+        scrapy.exceptions.NotConfigured,
+        match=r"Invalid <.*> type for <.*> settings",
+    ):
+        ItemValidationPipeline.from_crawler(crawler)
 
-        def __setitem__(self, key, value):
-            setattr(self, key, value)
+    # Invalid schema type
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": [False],
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
+    with pytest.raises(
+        scrapy.exceptions.NotConfigured,
+        match=r"Invalid schema, jsonschemas must be defined as:.*",
+    ):
+        ItemValidationPipeline.from_crawler(crawler)
 
-    item = DataclassItemWithItemAssignment(foo="invalid")
-    # Supports item assignment but does not support field
-    with pytest.raises(KeyError, match="custom_validation_field"):
-        item = pipeline.process_item(item, None)
+
+def test_drop_invalid_item(dummy_schema):
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": [dummy_schema],
+        "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS": True,
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+
+    crawler = get_crawler(settings_dict=settings)
+    pipeline = ItemValidationPipeline.from_crawler(crawler)
+
+    item = {"foo": "invalid"}
+    with pytest.raises(scrapy.exceptions.DropItem):
+        pipeline.process_item(item, None)
+
+
+def test_ignore_classes_without_schema(dummy_schema):
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": {scrapy.Item: dummy_schema},
+        "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS": True,
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
+    pipeline = ItemValidationPipeline.from_crawler(crawler)
+
+    @dataclass
+    class DummyItem:
+        foo: str = "bar"
+
+    item = DummyItem()
+    pipeline.process_item(item, None)
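For reference, the settings exercised by these tests map onto a Scrapy project roughly as follows. This is a sketch, not part of the commit: the pipeline priority, schema path and errors-field value are placeholders; the pipeline class path is the module modified above.

# settings.py (sketch)
ITEM_PIPELINES = {
    "spidermon.contrib.scrapy.pipelines.ItemValidationPipeline": 800,  # priority is a placeholder
}

SPIDERMON_ENABLED = True
SPIDERMON_VALIDATION_SCHEMAS = ["./schemas/product.json"]  # placeholder path
SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS = True
SPIDERMON_VALIDATION_ERRORS_FIELD = "_validation"  # placeholder name
SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS = False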
