Skip to content

Commit

Permalink
Skip _type, _key from schema validation, closes #104
Browse files (browse the repository at this point in the history)
  • Loading branch information
manycoding committed Jun 7, 2019
1 parent 0d20a92 commit e581d16
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 43 deletions.
14 changes: 11 additions & 3 deletions src/arche/arche.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,9 @@ def validate_with_json_schema(self) -> None:
"""Run JSON schema check and output results. It will try to find all errors, but
there are no guarantees. Slower than `glance()`.
"""
res = schema_rules.validate(self.schema, self.source_items.raw)
res = schema_rules.validate(
self.schema, self.source_items.raw, self.source_items.df.index
)
self.save_result(res)
res.show()

Expand All @@ -173,14 +175,20 @@ def glance(self) -> None:
only the first error per item. Usable for big jobs as it's about 100x faster than
`validate_with_json_schema()`.
"""
res = schema_rules.validate(self.schema, self.source_items.raw, fast=True)
res = schema_rules.validate(
self.schema, self.source_items.raw, self.source_items.df.index, fast=True
)
self.save_result(res)
res.show()

def run_schema_rules(self) -> None:
if not self.schema:
return
self.save_result(schema_rules.validate(self.schema, self.source_items.raw))
self.save_result(
schema_rules.validate(
self.schema, self.source_items.raw, self.source_items.df.index
)
)

tagged_fields = sr.Tags().get(self.schema)
target_columns = (
Expand Down
4 changes: 3 additions & 1 deletion src/arche/data_quality_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,9 @@ def create_figures(self, items):

validation_errors = self.report.results.get(
"JSON Schema Validation",
schema_rules.validate(self.schema, raw_items=items.raw, fast=False),
schema_rules.validate(
self.schema, raw_items=items.raw, keys=items.df.index, fast=False
),
).get_errors_count()

garbage_symbols_result = self.report.results.get(
Expand Down
7 changes: 5 additions & 2 deletions src/arche/rules/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
from arche.rules.result import Result
from arche.tools.json_schema_validator import JsonSchemaValidator
import numpy as np
import pandas as pd


def validate(schema: Schema, raw_items: RawItems, fast: bool = False) -> Result:
def validate(
schema: Schema, raw_items: RawItems, keys: pd.Index, fast: bool = False
) -> Result:
"""Run JSON schema validation against data.
Args:
Expand All @@ -14,7 +17,7 @@ def validate(schema: Schema, raw_items: RawItems, fast: bool = False) -> Result:
Returns:
Schema errors if any
"""
validator = JsonSchemaValidator(schema)
validator = JsonSchemaValidator(schema, keys)
validator.run(raw_items, fast)
result = Result("JSON Schema Validation")

Expand Down
12 changes: 9 additions & 3 deletions src/arche/tools/json_schema_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from arche.readers.schema import Schema
import fastjsonschema
from jsonschema import FormatChecker, validators
import pandas as pd
from tqdm import tqdm_notebook


class JsonSchemaValidator:
def __init__(self, schema: Schema):
def __init__(self, schema: Schema, keys: pd.Index):
self.errors = defaultdict(set)
self.schema = schema
self.keys = keys

def run(self, raw_items: RawItems, fast: bool) -> None:
if fast:
Expand All @@ -26,22 +28,26 @@ def fast_validate(self, raw_items: RawItems) -> None:
for i, raw_item in enumerate(
tqdm_notebook(raw_items, desc="Fast Schema Validation")
):
raw_item.pop("_key", None)
raw_item.pop("_type", None)
try:
validate(raw_item)
except fastjsonschema.JsonSchemaException as error:
self.errors[str(error)].add(raw_item.get("_key", str(i)))
self.errors[str(error)].add(self.keys[i])

def validate(self, raw_items: RawItems) -> None:
validator = validators.validator_for(self.schema)(self.schema)
validator.format_checker = FormatChecker()
for i, raw_item in enumerate(
tqdm_notebook(raw_items, desc="JSON Schema Validation")
):
raw_item.pop("_key", None)
raw_item.pop("_type", None)
for e in validator.iter_errors(raw_item):
error = self.format_validation_message(
e.message, e.path, e.schema_path, e.validator
)
self.errors[error].add(raw_item.get("_key", str(i)))
self.errors[error].add(self.keys[i])

@staticmethod
def format_validation_message(
Expand Down
2 changes: 2 additions & 0 deletions src/arche/tools/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def create_json_schema(
samples = []
for n in item_numbers:
item = api.get_items(source_key, start_index=n, count=1, p_bar=None)[0]
item.pop("_type", None)
item.pop("_key", None)
samples.append(item)

return infer_schema(samples)
Expand Down
23 changes: 10 additions & 13 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
from itertools import zip_longest
from typing import Dict, List, Optional

Expand All @@ -9,20 +10,16 @@


CLOUD_ITEMS = [
{"_key": "112358/13/21/0", "price": 0, "name": "Elizabeth"},
{"_key": "112358/13/21/1", "name": "Margaret"},
{"_key": "112358/13/21/2", "price": 10, "name": "Yulia"},
{"_key": "112358/13/21/3", "price": 11, "name": "Vivien"},
{"_key": "112358/13/21/0", "_type": "Type", "price": 0, "name": "Elizabeth"},
{"_key": "112358/13/21/1", "_type": "Type", "name": "Margaret"},
{"_key": "112358/13/21/2", "_type": "Type", "price": 10, "name": "Yulia"},
{"_key": "112358/13/21/3", "_type": "Type", "price": 11, "name": "Vivien"},
]
DEFAULT_SCHEMA = {
"$schema": "http://json-schema.org/draft-07/schema",
"required": ["_key", "name"],
"required": ["name"],
"type": "object",
"properties": {
"_key": {"type": "string"},
"price": {"type": "integer"},
"name": {"type": "string"},
},
"properties": {"price": {"type": "integer"}, "name": {"type": "string"}},
"additionalProperties": False,
}

Expand Down Expand Up @@ -152,13 +149,13 @@ def get_job_items(mocker):
mocker.patch(
"arche.readers.items.JobItems.job", return_value=get_job, autospec=True
)
raw_data = deepcopy(CLOUD_ITEMS)
mocker.patch(
"arche.readers.items.JobItems.fetch_data",
return_value=np.array(CLOUD_ITEMS),
return_value=np.array(raw_data),
autospec=True,
)

job_items = JobItems(key="112358/13/21", count=len(CLOUD_ITEMS))
job_items = JobItems(key="112358/13/21", count=len(raw_data))
return job_items


Expand Down
7 changes: 3 additions & 4 deletions tests/test_arche.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,16 +250,15 @@ def test_validate_with_json_schema(mocker, get_job_items, get_schema):

def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
mocked_html = mocker.patch("arche.report.HTML", autospec=True)
key = f"112358/13/21"
url_base = f"{SH_URL}/{key}/item"
url = f"{SH_URL}/112358/13/21/item/1"
res = create_result(
"JSON Schema Validation",
{
Level.ERROR: [
(
"4 items were checked, 1 error(s)",
None,
{"'price' is a required property": {f"{key}/1"}},
{"'price' is a required property": {url}},
)
]
},
Expand All @@ -272,7 +271,7 @@ def test_validate_with_json_schema_fails(mocker, get_job_items, get_schema):
assert len(a.report.results) == 1
assert a.report.results.get("JSON Schema Validation") == res
mocked_html.assert_any_call(
f"1 items affected - 'price' is a required property: <a href='{url_base}/1'>1</a>"
f"1 items affected - 'price' is a required property: <a href='{url}'>1</a>"
)


Expand Down
23 changes: 16 additions & 7 deletions tests/tools/test_json_schema_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,27 +66,36 @@ def test_format_validation_message(


def test_validate_fails():
jsv = JSV({"properties": {"NAME": {"type": "string"}}})
jsv = JSV({"properties": {"NAME": {"type": "string"}}}, keys=[0, 1])
jsv.validate(np.array([{"NAME": None, "_key": "0"}, {"NAME": None}]))
assert jsv.errors == {"NAME is not of type 'string'": {"0", "1"}}
assert jsv.errors == {"NAME is not of type 'string'": {0, 1}}


def test_validate_type_key():
jsv = JSV(
{"properties": {"A": {"type": "number"}}, "additionalProperties": False},
keys=[0, 1],
)
jsv.validate(np.array([{"A": 0, "_key": "0", "_type": "Some"}, {"A": 1}]))
assert not jsv.errors


def test_fast_validate_fails():
jsv = JSV({"type": "object", "properties": {"v": {"type": "number"}}})
jsv = JSV({"type": "object", "properties": {"v": {"type": "number"}}}, keys=[0, 1])
jsv.fast_validate(np.array([{"v": "0"}, {"v": "1"}]))
assert jsv.errors == {"data.v must be number": {"0", "1"}}
assert jsv.errors == {"data.v must be number": {0, 1}}


def test_fast_validate(get_schema, get_raw_items):
jsv = JSV(get_schema)
jsv = JSV(get_schema, keys=list(range(len(get_raw_items))))
jsv.fast_validate(get_raw_items)
assert not jsv.errors


def test_run():
schema = {"properties": {"NAME": {"type": "string"}}}
jsv = JSV(schema)
jsv = JSV(schema, keys=[0])
assert jsv.schema == schema
jsv.run(np.array([{"NAME": None, "__DEBUG": None}]), fast=False)

assert jsv.errors == {"NAME is not of type 'string'": {"0"}}
assert jsv.errors == {"NAME is not of type 'string'": {0}}
16 changes: 6 additions & 10 deletions tests/tools/test_schema_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
},
},
"type": "object",
"properties": {"url": {"type": "string"}, "_key": {"type": "integer"}},
"properties": {"url": {"type": "string"}, "id": {"type": "integer"}},
"additionalProperties": False,
"required": ["_key", "url"],
"required": ["id", "url"],
}


Expand All @@ -27,8 +27,8 @@ def test_set_item_no():


def test_infer_schema():
item1 = {"url": "https://example.com", "_key": 0}
item2 = {"url": "https://example.com", "_key": 1}
item1 = {"url": "https://example.com", "id": 0}
item2 = {"url": "https://example.com", "id": 1}

assert schema_tools.infer_schema([item1, item2]) == schema
assert schema_tools.infer_schema([item2]) == schema
Expand Down Expand Up @@ -59,12 +59,8 @@ def test_create_json_schema(mocker, get_job, get_raw_items):
},
"additionalProperties": False,
"type": "object",
"properties": {
"_key": {"type": "string"},
"name": {"type": "string"},
"price": {"type": "integer"},
},
"required": ["_key", "name", "price"],
"properties": {"name": {"type": "string"}, "price": {"type": "integer"}},
"required": ["name", "price"],
}


Expand Down

0 comments on commit e581d16

Please sign in to comment.