Fix ItemValidationPipeline using __setitem__ without ItemAdapter (#415)
* Fix ItemValidationPipeline using __setitem__ without ItemAdapter

* Using typing Dict for python3.8 compatibility

* Refactor to avoid casting to list before len

* Add test case to check when validation field is None

* Simplify check for validation field in item or None

* Increase coverage with additional tests
VMRuiz committed Aug 31, 2023
1 parent e070dd8 commit ea21cee
Showing 2 changed files with 94 additions and 25 deletions.
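For context, a minimal sketch (not part of the commit) of the behaviour the fix relies on: a plain dataclass item has no __setitem__, so writing the validation-errors field directly fails, while the same write through ItemAdapter succeeds for fields declared on the item. The Product class and _validation field below are hypothetical stand-ins for a real item and the configured SPIDERMON_VALIDATION_ERRORS_FIELD.

from dataclasses import dataclass
from typing import Optional

from itemadapter import ItemAdapter


@dataclass
class Product:
    name: str
    # Hypothetical errors field; the real name comes from SPIDERMON_VALIDATION_ERRORS_FIELD.
    _validation: Optional[dict] = None


item = Product(name="foo")

try:
    item["_validation"] = {"name": ["invalid"]}  # plain dataclass: no __setitem__
except TypeError as exc:
    print(exc)  # 'Product' object does not support item assignment

adapter = ItemAdapter(item)
adapter["_validation"] = {"name": ["invalid"]}  # ItemAdapter writes the declared field
print(item._validation)  # {'name': ['invalid']}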
21 changes: 10 additions & 11 deletions spidermon/contrib/scrapy/pipelines.py
@@ -1,4 +1,6 @@
 from collections import defaultdict
+from typing import Dict
+
 from itemadapter import ItemAdapter
 
 from scrapy.exceptions import DropItem, NotConfigured
@@ -103,15 +105,16 @@ def process_item(self, item, _):
             # No validators match this specific item type
             return item
 
-        data = self._convert_item_to_dict(item)
+        item_adapter = ItemAdapter(item)
+        item_dict = item_adapter.asdict()
         self.stats.add_item()
-        self.stats.add_fields(len(list(data.keys())))
+        self.stats.add_fields(len(item_dict.keys()))
         for validator in validators:
-            ok, errors = validator.validate(data)
+            ok, errors = validator.validate(item_dict)
             if not ok:
                 self._add_error_stats(errors)
                 if self.add_errors_to_items:
-                    self._add_errors_to_item(item, errors)
+                    self._add_errors_to_item(item_adapter, errors)
                 if self.drop_items_with_errors:
                     self._drop_item(item, errors)
         return item
@@ -120,16 +123,12 @@ def find_validators(self, item):
         find = lambda x: self.validators.get(x.__name__, [])
         return find(item.__class__) or find(Item)
 
-    def _convert_item_to_dict(self, item):
-        return ItemAdapter(item).asdict()
-
-    def _add_errors_to_item(self, item, errors):
-        data = ItemAdapter(item)
-        if self.errors_field not in data:
+    def _add_errors_to_item(self, item: ItemAdapter, errors: Dict[str, str]):
+        if item.get(self.errors_field, None) is None:
             item[self.errors_field] = defaultdict(list)
 
         for field_name, messages in errors.items():
-            data[self.errors_field][field_name] += messages
+            item[self.errors_field][field_name] += messages
 
     def _drop_item(self, item, errors):
         """
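A rough illustration (assumed, not taken from the diff) of why the membership test in _add_errors_to_item became a get(...) is None check: when the item already carries the errors field but its value is None, the old check skipped initialisation and the subsequent += on None would fail; the new check treats "missing" and "explicitly None" the same way.

from collections import defaultdict

errors_field = "custom_validation_field"
item = {"no": "schema", errors_field: None}

# Old-style check: the key exists, so the defaultdict is never created,
# and item[errors_field]["no"] += [...] would then raise TypeError on None.
if errors_field not in item:
    item[errors_field] = defaultdict(list)

# New-style check: initialise when the field is absent or explicitly None.
if item.get(errors_field, None) is None:
    item[errors_field] = defaultdict(list)
item[errors_field]["no"] += ["obligatory field is missing"]
print(item[errors_field])  # defaultdict(<class 'list'>, {'no': ['obligatory field is missing']})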
98 changes: 84 additions & 14 deletions tests/contrib/validation/test_item_validation_pipeline.py
@@ -98,13 +98,19 @@ def test_validation_errors_field(dummy_schema):
         "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
     }
 
-    item = {"no": "schema"}
-
     crawler = get_crawler(settings_dict=settings)
     pipeline = ItemValidationPipeline.from_crawler(crawler)
+
+    # Instantiate validation field if not defined
+    item = {"no": "schema"}
     item = pipeline.process_item(item, None)
     assert "custom_validation_field" in item
 
+    # Instantiate validation field if None
+    item = {"no": "schema", "custom_validation_field": None}
+    item = pipeline.process_item(item, None)
+    assert item["custom_validation_field"] is not None
+
 
 def test_add_error_to_items_undefined_validation_field(dummy_schema):
     settings = {
@@ -138,20 +144,84 @@ class DataclassItem:
         foo: str
 
     item = DataclassItem(foo="invalid")
-    # Does not support item assignment
+    # Supports item assignment but does not support field
+    with pytest.raises(KeyError, match="custom_validation_field"):
+        item = pipeline.process_item(item, None)
+
+
+def test_not_configured():
+    # No validators
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
     with pytest.raises(
-        TypeError, match="'DataclassItem' object does not support item assignment"
+        scrapy.exceptions.NotConfigured, match="No validators were found"
     ):
-        item = pipeline.process_item(item, None)
+        ItemValidationPipeline.from_crawler(crawler)
 
-    @dataclass
-    class DataclassItemWithItemAssignment:
-        foo: str
+    # Invalid validator type
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": object(),
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
+    with pytest.raises(
+        scrapy.exceptions.NotConfigured,
+        match=r"Invalid <.*> type for <.*> settings",
+    ):
+        ItemValidationPipeline.from_crawler(crawler)
 
-        def __setitem__(self, key, value):
-            setattr(self, key, value)
+    # Invalid schema type
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": [False],
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
+    with pytest.raises(
+        scrapy.exceptions.NotConfigured,
+        match=r"Invalid schema, jsonschemas must be defined as:.*",
+    ):
+        ItemValidationPipeline.from_crawler(crawler)
 
-    item = DataclassItemWithItemAssignment(foo="invalid")
-    # Supports item assignment but does not support field
-    with pytest.raises(KeyError, match="custom_validation_field"):
-        item = pipeline.process_item(item, None)
+
+def test_drop_invalid_item(dummy_schema):
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": [dummy_schema],
+        "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS": True,
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+
+    crawler = get_crawler(settings_dict=settings)
+    pipeline = ItemValidationPipeline.from_crawler(crawler)
+
+    item = {"foo": "invalid"}
+    with pytest.raises(scrapy.exceptions.DropItem):
+        pipeline.process_item(item, None)
+
+
+def test_ignore_classes_without_schema(dummy_schema):
+    settings = {
+        "SPIDERMON_ENABLED": True,
+        "SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
+        "SPIDERMON_VALIDATION_SCHEMAS": {scrapy.Item: dummy_schema},
+        "SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS": True,
+        "SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
+    }
+    crawler = get_crawler(settings_dict=settings)
+    pipeline = ItemValidationPipeline.from_crawler(crawler)
+
+    @dataclass
+    class DummyItem:
+        foo: str = "bar"
+
+    item = DummyItem()
+    pipeline.process_item(item, None)
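For reference, the settings exercised by these tests map onto a Scrapy project roughly as follows. This is a sketch, not part of the commit: the pipeline priority, schema path and errors-field value are placeholders; the pipeline class path is the module modified above.

# settings.py (sketch)
ITEM_PIPELINES = {
    "spidermon.contrib.scrapy.pipelines.ItemValidationPipeline": 800,  # priority is a placeholder
}

SPIDERMON_ENABLED = True
SPIDERMON_VALIDATION_SCHEMAS = ["./schemas/product.json"]  # placeholder path
SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS = True
SPIDERMON_VALIDATION_ERRORS_FIELD = "_validation"  # placeholder name
SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS = False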
