Skip to content
Merged
86 changes: 85 additions & 1 deletion src/check_datapackage/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
from dataclasses import dataclass, field
from functools import reduce
from types import TracebackType
from typing import Any, Callable, Iterator, Optional
from typing import Any, Callable, Iterator, Optional, cast

from jsonpath import findall, resolve
from jsonschema import Draft7Validator, FormatChecker, ValidationError

from check_datapackage.config import Config
Expand All @@ -16,8 +17,10 @@
from check_datapackage.exclusion import exclude
from check_datapackage.extensions import apply_extensions
from check_datapackage.internals import (
PropertyField,
_filter,
_flat_map,
_get_fields_at_jsonpath,
_map,
)
from check_datapackage.issue import Issue
Expand Down Expand Up @@ -126,6 +129,7 @@ class for more details, especially about the default values.
_set_should_fields_to_required(schema)

issues = _check_object_against_json_schema(properties, schema)
issues += _check_keys(properties, issues)
issues += apply_extensions(properties, config.extensions)
issues = exclude(issues, config.exclusions, properties)
issues = sorted(set(issues))
Expand All @@ -136,6 +140,86 @@ class for more details, especially about the default values.
return issues


def _check_keys(properties: dict[str, Any], issues: list[Issue]) -> list[Issue]:
    """Check that the fields named by resource keys exist on their resource.

    Args:
        properties: The Data Package properties being checked.
        issues: Issues found by earlier checks. Resources that already have an
            issue at their `schema.primaryKey` are not checked again here.

    Returns:
        The issues produced by `_check_primary_key` for the selected resources.
    """
    # Primary keys
    # Select resources whose primary key has a length above zero or is the
    # empty string, then drop those already reported at `schema.primaryKey`.
    checkable_resources = _keep_resources_with_no_issue_at_property(
        _get_fields_at_jsonpath(
            "$.resources[?(length(@.schema.primaryKey) > 0 || @.schema.primaryKey == '')]",
            properties,
        ),
        issues,
        "schema.primaryKey",
    )
    primary_key_issues = _flat_map(checkable_resources, _check_primary_key)

    # Foreign keys
    # No foreign key check is performed in this function.

    return primary_key_issues


def _issues_at_property(
    resource: PropertyField, issues: list[Issue], jsonpath: str
) -> list[Issue]:
    """Return the issues located at or under a property of the given resource.

    Args:
        resource: The resource whose `jsonpath` attribute is the base path.
        issues: The issues to search through.
        jsonpath: The property path relative to the resource, for example
            `schema.primaryKey`.

    Returns:
        The issues whose `jsonpath` is exactly the property's path, or extends
        it with a child segment (`.name` or `[index]`), in their original order.
    """
    property_path = f"{resource.jsonpath}.{jsonpath}"
    # Anchor the match at the start of the path and require a segment boundary
    # after it. A plain substring test would also accept sibling properties
    # whose name merely begins with the target (e.g. `primaryKeyNote`) and
    # array indexes that share a prefix (`[1]` vs `[10]`).
    child_prefixes = (f"{property_path}.", f"{property_path}[")
    return [
        issue
        for issue in issues
        if issue.jsonpath == property_path
        or issue.jsonpath.startswith(child_prefixes)
    ]


def _keep_resources_with_no_issue_at_property(
    resources: list[PropertyField], issues: list[Issue], jsonpath: str
) -> list[PropertyField]:
    """Keep only the resources that have no known issue at the given `jsonpath`.

    Args:
        resources: The resources to filter.
        issues: The issues already found.
        jsonpath: The property path, relative to each resource, to look at.

    Returns:
        The resources for which `_issues_at_property` finds no issue.
    """

    def _has_no_issue(resource: PropertyField) -> bool:
        return not _issues_at_property(resource, issues, jsonpath)

    return _filter(resources, _has_no_issue)


def _check_primary_key(resource: PropertyField) -> list[Issue]:
    """Check that every primary key field is declared in the resource's schema.

    Args:
        resource: The resource to check. Its `value` is read for the primary
            key and the schema fields; its `jsonpath` locates the issue.

    Returns:
        A list with a single `primary-key` issue when at least one primary key
        field is not found among the resource's fields, otherwise an empty list.
    """
    primary_key = resolve("/schema/primaryKey", resource.value)
    unknown_fields = _get_unknown_key_fields(
        _key_fields_as_str_list(primary_key), resource.value
    )

    if unknown_fields:
        missing_fields_issue = Issue(
            jsonpath=f"{resource.jsonpath}.schema.primaryKey",
            type="primary-key",
            message=(
                f"No fields found in resource for primary key fields: {unknown_fields}."
            ),
            # Report the key as written in the properties (string or list).
            instance=primary_key,
        )
        return [missing_fields_issue]

    return []


def _key_fields_as_str_list(key_fields: Any) -> list[str]:
    """Return primary or foreign key fields as a list of field names.

    A key can be written either as a single string (one field name) or as a
    list of strings. A list is returned as is; anything else is wrapped in a
    one-item list.

    The element types are not validated here: the caller is expected to pass
    a correctly typed `key_fields` object.
    """
    fields_list = key_fields if isinstance(key_fields, list) else [key_fields]
    return cast(list[str], fields_list)


def _get_unknown_key_fields(
    key_fields: list[str], properties: dict[str, Any], resource_path: str = ""
) -> str:
    """Return the key fields that don't exist on the specified resource.

    Args:
        key_fields: The field names that make up the key.
        properties: The object searched for `schema.fields[*].name` values.
        resource_path: A prefix placed directly in front of `schema.fields` in
            the lookup path. It is concatenated without a separator.

    Returns:
        The `repr` of each unknown field name, joined with `", "`. The string is
        empty when every key field is known.
    """
    known_fields = findall(f"{resource_path}schema.fields[*].name", properties)
    # `known_fields` stays a list on purpose: a malformed field name may be
    # unhashable, which would make a set-based lookup fail.
    unknown_fields = [
        repr(field) for field in key_fields if field not in known_fields
    ]
    return ", ".join(unknown_fields)


def _set_should_fields_to_required(schema: dict[str, Any]) -> dict[str, Any]:
"""Set 'SHOULD' fields to 'REQUIRED' in the schema."""
should_fields = ("name", "id", "licenses")
Expand Down
64 changes: 54 additions & 10 deletions tests/test_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,60 @@ def test_fails_properties_with_pattern_mismatch():
assert issues[0].jsonpath == "$.contributors[0].path"


@mark.parametrize("primary_key", ["id", ["id", "name"]])
def test_pass_good_primary_key(primary_key):
    # Declare the key fields on the first resource so the primary key,
    # given as a string or as a list, only names existing fields.
    properties = example_package_properties()
    schema = properties["resources"][0]["schema"]
    schema["primaryKey"] = primary_key
    schema["fields"].extend(
        [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    )

    found_issues = check(properties)

    assert found_issues == []


@mark.parametrize("primary_key", ["", "last_name", ["first_name", "last_name"]])
def test_fail_primary_key_with_unknown_fields(primary_key):
    # The key names are expected not to match any field of the example
    # resource, so exactly one primary-key issue should be reported.
    properties = example_package_properties()
    first_resource = properties["resources"][0]
    first_resource["schema"]["primaryKey"] = primary_key

    issues = check(properties)

    assert len(issues) == 1
    only_issue = issues[0]
    assert only_issue.jsonpath == "$.resources[0].schema.primaryKey"
    assert only_issue.type == "primary-key"
    assert only_issue.instance == primary_key


@mark.parametrize("primary_key", [None, 123, [], [123, "a_field"]])
def test_do_not_check_bad_primary_key_against_fields(primary_key):
    # A primary key that is itself malformed should produce a single issue
    # and no additional primary-key issue about unknown fields.
    properties = example_package_properties()
    first_resource = properties["resources"][0]
    first_resource["schema"]["primaryKey"] = primary_key

    issues = check(properties)

    assert len(issues) == 1
    only_issue = issues[0]
    assert only_issue.type != "primary-key"


def test_do_not_check_primary_key_against_bad_field():
    # NOTE(review): presumably "eye-colour" is a field of the example
    # resource, so the only issue comes from the malformed field name.
    properties = example_package_properties()
    schema = properties["resources"][0]["schema"]
    schema["primaryKey"] = "eye-colour"
    # Bad name
    bad_field = {"name": 123, "type": "integer"}
    schema["fields"].append(bad_field)

    issues = check(properties)

    assert len(issues) == 1
    only_issue = issues[0]
    assert only_issue.type != "primary-key"


# "SHOULD" checks


Expand Down Expand Up @@ -597,16 +651,6 @@ def test_fail_foreign_keys_with_bad_array_item():
)


@mark.parametrize("primary_key", ["id", ["name", "address"]])
def test_pass_good_primary_key(primary_key):
    # Set the primary key on the first resource without touching its fields;
    # no issue is expected.
    properties = example_package_properties()
    first_resource = properties["resources"][0]
    first_resource["schema"]["primaryKey"] = primary_key

    found_issues = check(properties)

    assert found_issues == []


def test_fail_primary_key_of_bad_type():
properties = example_package_properties()
properties["resources"][0]["schema"]["primaryKey"] = 123
Expand Down