diff --git a/CHANGELOG.md b/CHANGELOG.md index 721d49d..1e51836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ The format is (loosely) based on [Keep a Changelog](http://keepachangelog.com/) - Added sponsors and supporters section with logos ([#122](https://github.com/stac-utils/stac-check/pull/122)) - Added check to verify that bbox matches item's polygon geometry ([#123](https://github.com/stac-utils/stac-check/pull/123)) - Added configuration documentation to README ([#124](https://github.com/stac-utils/stac-check/pull/124)) +- Added `--pydantic` option for validating STAC objects using stac-pydantic models, providing enhanced type checking and validation ([#126](https://github.com/stac-utils/stac-check/pull/126)) + +### Enhanced + +- Improved bbox validation output to show detailed information about mismatches between bbox and geometry bounds, including which specific coordinates differ and by how much ([#126](https://github.com/stac-utils/stac-check/pull/126)) ### Updated diff --git a/README.md b/README.md index 5371fb8..09eb3d0 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,7 @@ Options: (enabled by default). --header KEY VALUE HTTP header to include in the requests. Can be used multiple times. + --pydantic Use stac-pydantic for enhanced validation with Pydantic models. --help Show this message and exit. ``` diff --git a/sample_files/1.0.0/bad-item.json b/sample_files/1.0.0/bad-item.json index 5bee838..2b89cf7 100644 --- a/sample_files/1.0.0/bad-item.json +++ b/sample_files/1.0.0/bad-item.json @@ -8,7 +8,7 @@ -122.59750209, 37.48803556, -122.2880486, - 37.613537207 + 37.613531207 ], "geometry": { "type": "Polygon", diff --git a/setup.py b/setup.py index dec29c6..5cc2e67 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import find_packages, setup -__version__ = "1.6.0" +__version__ = "1.7.0" with open("README.md", "r") as fh: long_description = fh.read() @@ -20,7 +20,7 @@ "requests>=2.32.3", "jsonschema>=4.23.0", "click>=8.1.8", - "stac-validator>=3.6.0", + "stac-validator[pydantic]>=3.7.0", "PyYAML", "python-dotenv", ], diff --git a/stac_check/cli.py b/stac_check/cli.py index b8f0a57..8319e84 100644 --- a/stac_check/cli.py +++ b/stac_check/cli.py @@ -91,6 +91,13 @@ def intro_message(linter: Linter) -> None: f"Validator: stac-validator {linter.validator_version}", bg="blue", fg="white" ) + # Always show validation method + validation_method = ( + "Pydantic" if hasattr(linter, "pydantic") and linter.pydantic else "JSONSchema" + ) + click.secho() + click.secho(f"Validation method: {validation_method}", bg="yellow", fg="black") + click.secho() @@ -111,7 +118,17 @@ def cli_message(linter: Linter) -> None: """ schemas validated for core object """ click.secho() - if len(linter.schema) > 0: + + # Determine if we're using Pydantic validation + using_pydantic = hasattr(linter, "pydantic") and linter.pydantic + + # For Pydantic validation, always show the appropriate schema model + if using_pydantic: + click.secho("Schemas validated: ", fg="blue") + asset_type = linter.asset_type.capitalize() if linter.asset_type else "Item" + click.secho(f" stac-pydantic {asset_type} model") + # For JSONSchema validation or when schemas are available + elif len(linter.schema) > 0: click.secho("Schemas validated: ", fg="blue") for schema in linter.schema: click.secho(f" {schema}") @@ -194,10 +211,15 @@ def cli_message(linter: Linter) -> None: multiple=True, help="HTTP header to include in the requests. Can be used multiple times.", ) +@click.option( + "--pydantic", + is_flag=True, + help="Use stac-pydantic for enhanced validation with Pydantic models.", +) @click.command() @click.argument("file") @click.version_option(version=importlib.metadata.distribution("stac-check").version) -def main(file, recursive, max_depth, assets, links, no_assets_urls, header): +def main(file, recursive, max_depth, assets, links, no_assets_urls, header, pydantic): linter = Linter( file, assets=assets, @@ -206,6 +228,7 @@ def main(file, recursive, max_depth, assets, links, no_assets_urls, header): max_depth=max_depth, assets_open_urls=not no_assets_urls, headers=dict(header), + pydantic=pydantic, ) intro_message(linter) if recursive > 0: diff --git a/stac_check/lint.py b/stac_check/lint.py index 2e90b7a..8e11f38 100644 --- a/stac_check/lint.py +++ b/stac_check/lint.py @@ -3,7 +3,7 @@ import json import os from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import requests import yaml @@ -27,6 +27,7 @@ class Linter: max_depth (Optional[int], optional): An optional integer indicating the maximum depth to validate recursively. Defaults to None. assets_open_urls (bool): Whether to open assets URLs when validating assets. Defaults to True. headers (dict): HTTP headers to include in the requests. + pydantic (bool, optional): A boolean value indicating whether to use pydantic validation. Defaults to False. Attributes: data (dict): A dictionary representing the STAC JSON file. @@ -122,14 +123,15 @@ def check_summaries(self) -> bool: Creates a message with best practices recommendations for the STAC JSON file. """ - item: Union[str, dict] # url, file name, or dictionary + item: Union[str, Dict] config_file: Optional[str] = None assets: bool = False links: bool = False recursive: bool = False max_depth: Optional[int] = None assets_open_urls: bool = True - headers: dict = field(default_factory=dict) + headers: Dict = field(default_factory=dict) + pydantic: bool = False def __post_init__(self): self.data = self.load_data(self.item) @@ -270,16 +272,21 @@ def validate_file(self, file: Union[str, dict]) -> Dict[str, Any]: assets=self.assets, assets_open_urls=self.assets_open_urls, headers=self.headers, + pydantic=self.pydantic, ) stac.run() elif isinstance(file, dict): stac = StacValidate( - assets_open_urls=self.assets_open_urls, headers=self.headers + assets_open_urls=self.assets_open_urls, + headers=self.headers, + pydantic=self.pydantic, ) stac.validate_dict(file) else: raise ValueError("Input must be a file path or STAC dictionary.") - return stac.message[0] + + message = stac.message[0] + return message def recursive_validation(self, file: Union[str, Dict[str, Any]]) -> str: """Recursively validate a STAC item or catalog file and its child items. @@ -302,6 +309,7 @@ def recursive_validation(self, file: Union[str, Dict[str, Any]]) -> str: max_depth=self.max_depth, assets_open_urls=self.assets_open_urls, headers=self.headers, + pydantic=self.pydantic, ) stac.run() else: @@ -310,6 +318,7 @@ def recursive_validation(self, file: Union[str, Dict[str, Any]]) -> str: max_depth=self.max_depth, assets_open_urls=self.assets_open_urls, headers=self.headers, + pydantic=self.pydantic, ) stac.validate_dict(file) return stac.message @@ -454,7 +463,9 @@ def check_geometry_null(self) -> bool: else: return False - def check_bbox_matches_geometry(self) -> bool: + def check_bbox_matches_geometry( + self, + ) -> Union[bool, Tuple[bool, List[float], List[float], List[float]]]: """Checks if the bbox of a STAC item matches its geometry. This function verifies that the bounding box (bbox) accurately represents @@ -462,8 +473,10 @@ def check_bbox_matches_geometry(self) -> bool: items with non-null geometry of type Polygon or MultiPolygon. Returns: - bool: True if the bbox matches the geometry or if the check is not applicable - (e.g., null geometry or non-polygon type). False if there's a mismatch. + Union[bool, Tuple[bool, List[float], List[float], List[float]]]: + - True if the bbox matches the geometry or if the check is not applicable + (e.g., null geometry or non-polygon type). + - When there's a mismatch: a tuple containing (False, calculated_bbox, actual_bbox, differences) """ # Skip check if geometry is null or bbox is not present if ( @@ -504,11 +517,14 @@ def check_bbox_matches_geometry(self) -> bool: calc_bbox = [min(lons), min(lats), max(lons), max(lats)] - # Allow for small floating point differences (epsilon) - epsilon = 1e-8 - for i in range(4): - if abs(bbox[i] - calc_bbox[i]) > epsilon: - return False + # Allow for differences that would be invisible when rounded to 6 decimal places + # 1e-6 would be exactly at the 6th decimal place, so use 5e-7 to be just under that threshold + epsilon = 5e-7 + differences = [abs(bbox[i] - calc_bbox[i]) for i in range(4)] + + if any(diff > epsilon for diff in differences): + # Return False along with the calculated bbox, actual bbox, and the differences + return (False, calc_bbox, bbox, differences) return True @@ -675,12 +691,60 @@ def create_best_practices_dict(self) -> Dict: best_practices_dict["null_geometry"] = [msg_1] # best practices - check if bbox matches geometry - if ( - not self.check_bbox_matches_geometry() - and config.get("check_bbox_geometry_match", True) == True - ): - msg_1 = "The bbox field does not match the bounds of the geometry. The bbox should be the minimum bounding rectangle of the geometry." - best_practices_dict["bbox_geometry_mismatch"] = [msg_1] + bbox_check_result = self.check_bbox_matches_geometry() + bbox_mismatch = False + + if isinstance(bbox_check_result, tuple): + bbox_mismatch = not bbox_check_result[0] + else: + bbox_mismatch = not bbox_check_result + + if bbox_mismatch and config.get("check_bbox_geometry_match", True) == True: + if isinstance(bbox_check_result, tuple): + # Unpack the result + _, calc_bbox, actual_bbox, differences = bbox_check_result + + # Format the bbox values for display + calc_bbox_str = ", ".join([f"{v:.6f}" for v in calc_bbox]) + actual_bbox_str = ", ".join([f"{v:.6f}" for v in actual_bbox]) + + # Create a more detailed message about which coordinates differ + coordinate_labels = [ + "min longitude", + "min latitude", + "max longitude", + "max latitude", + ] + mismatch_details = [] + + # Use the same epsilon threshold as in check_bbox_matches_geometry + epsilon = 5e-7 + + for i, (diff, label) in enumerate(zip(differences, coordinate_labels)): + if diff > epsilon: + mismatch_details.append( + f"{label}: calculated={calc_bbox[i]:.6f}, actual={actual_bbox[i]:.6f}, diff={diff:.7f}" + ) + + msg_1 = "The bbox field does not match the bounds of the geometry. The bbox should be the minimum bounding rectangle of the geometry." + msg_2 = f"Calculated bbox from geometry: [{calc_bbox_str}]" + msg_3 = f"Actual bbox in metadata: [{actual_bbox_str}]" + + messages = [msg_1, msg_2, msg_3] + if mismatch_details: + messages.append("Mismatched coordinates:") + messages.extend(mismatch_details) + else: + # If we got here but there are no visible differences at 6 decimal places, + # add a note explaining that the differences are too small to matter + messages.append( + "Note: The differences are too small to be visible at 6 decimal places and can be ignored." + ) + + best_practices_dict["bbox_geometry_mismatch"] = messages + else: + msg_1 = "The bbox field does not match the bounds of the geometry. The bbox should be the minimum bounding rectangle of the geometry." + best_practices_dict["bbox_geometry_mismatch"] = [msg_1] # check to see if there are too many links if ( diff --git a/tests/test_lint.py b/tests/test_lint.py index 926c549..8755c41 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -282,7 +282,7 @@ def test_bbox_matches_geometry(): # Test with matching bbox and geometry file = "sample_files/1.0.0/core-item.json" linter = Linter(file) - assert linter.check_bbox_matches_geometry() == True + assert linter.check_bbox_matches_geometry() is True # Test with mismatched bbox and geometry mismatched_item = { @@ -306,7 +306,30 @@ def test_bbox_matches_geometry(): "properties": {"datetime": "2020-12-11T22:38:32.125Z"}, } linter = Linter(mismatched_item) - assert linter.check_bbox_matches_geometry() == False + result = linter.check_bbox_matches_geometry() + + # Check that the result is a tuple and the first element is False + assert isinstance(result, tuple) + assert result[0] is False + + # Check that the tuple contains the expected elements (calculated bbox, actual bbox, differences) + assert len(result) == 4 + calc_bbox, actual_bbox, differences = result[1], result[2], result[3] + + # Verify the calculated bbox matches the geometry coordinates + assert calc_bbox == [ + 172.91173669923782, + 1.3438851951615003, + 172.95469614953714, + 1.3690476620161975, + ] + + # Verify the actual bbox is what we provided + assert actual_bbox == [100.0, 0.0, 105.0, 1.0] + + # Verify the differences are calculated correctly + expected_differences = [abs(actual_bbox[i] - calc_bbox[i]) for i in range(4)] + assert differences == expected_differences # Test with null geometry (should return True as check is not applicable) null_geom_item = { @@ -318,7 +341,7 @@ def test_bbox_matches_geometry(): "properties": {"datetime": "2020-12-11T22:38:32.125Z"}, } linter = Linter(null_geom_item) - assert linter.check_bbox_matches_geometry() == True + assert linter.check_bbox_matches_geometry() is True # Test with missing bbox (should return True as check is not applicable) no_bbox_item = { @@ -340,7 +363,7 @@ def test_bbox_matches_geometry(): "properties": {"datetime": "2020-12-11T22:38:32.125Z"}, } linter = Linter(no_bbox_item) - assert linter.check_bbox_matches_geometry() == True + assert linter.check_bbox_matches_geometry() is True def test_bloated_item(): @@ -633,3 +656,36 @@ def test_lint_assets_no_links(): "request_invalid": [], }, } + + +def test_lint_pydantic_validation_valid(): + """Test pydantic validation with a valid STAC item.""" + file = "sample_files/1.0.0/core-item.json" + linter = Linter(file, pydantic=True) + + assert linter.valid_stac == True + assert linter.asset_type == "ITEM" + assert "stac-pydantic Item model" in linter.message["schema"] + assert linter.message["validation_method"] == "pydantic" + + +def test_lint_pydantic_validation_invalid(): + """Test pydantic validation with an invalid STAC item (missing required fields).""" + file = "sample_files/1.0.0/bad-item.json" + linter = Linter(file, pydantic=True) + + assert linter.valid_stac == False + assert "PydanticValidationError" in linter.message["error_type"] + assert "id: Field required" in linter.message["error_message"] + assert linter.message["validation_method"] == "pydantic" + + +def test_lint_pydantic_validation_recursive(): + """Test pydantic validation with recursive option.""" + file = "sample_files/1.0.0/collection.json" + linter = Linter(file, recursive=True, max_depth=1, pydantic=True) + + assert linter.valid_stac == True + assert linter.asset_type == "COLLECTION" + assert "stac-pydantic Collection model" in linter.message["schema"] + assert linter.message["validation_method"] == "pydantic"