### Automating Validation With JSONSchema

#### Setting Up

In [1]:
# A well-formed fund record: price is a number and inceptionDate is a YYYY-MM-DD string.
document = {
    "name": "S&P 500 Index Fund",
    "symbol": "SPY",
    "price": 420.50,
    "currency": "USD",
    "inceptionDate": "1993-01-22",
}

In [2]:
# Schema for a single fund record: an object with five typed fields, all required.
schema = {
    # Declares the JSON Schema dialect this schema is written in.
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "symbol": {"type": "string"},
        # "number" is what rejects the string price "420.50" used later in this notebook.
        "price": {"type": "number"},
        "currency": {"type": "string"},
        "inceptionDate": {"type": "string", "format": "date"},
    },
    "required": ["name", "symbol", "price", "currency", "inceptionDate"],
}

In [4]:
from jsonschema import validate, ValidationError

In [5]:
# validate() raises ValidationError on failure; the else branch runs only on success.
try:
    validate(document, schema)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is valid against the schema


In [6]:
# Same record as before, except that price is now a string rather than a number.
document = {
    "name": "S&P 500 Index Fund",
    "symbol": "SPY",
    "price": "420.50",
    "currency": "USD",
    "inceptionDate": "1993-01-22",
}

In [7]:
# Re-run the check against the modified document; the failure message is printed, not raised.
try:
    validate(document, schema)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is not valid against the schema: '420.50' is not of type 'number'

Failed validating 'type' in schema['properties']['price']:
    {'type': 'number'}

On instance['price']:
    '420.50'


#### Format Validation

In [9]:
# Minimal schema: an object whose (optional) "date" property is a string tagged
# with the "date" format.
schema = {
    "type": "object",
    "properties": {
        "date": {"type": "string", "format": "date"},
    },
}

In [None]:
# The "date" format expects an RFC 3339 full-date (YYYY-MM-DD):
#     2024-07-23  -> VALID
#     2024 07     -> INVALID

In [10]:
# Deliberately malformed date: year and month separated by a space, no day.
document = {"date": "2024 07"}

In [11]:
# No format_checker is passed here, so the "format" keyword is not enforced and
# the malformed date is reported as valid.
try:
    validate(document, schema)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is valid against the schema


In [12]:
from jsonschema import Draft202012Validator

In [13]:
# Display the format checker attached to the Draft 2020-12 validator class; its
# repr lists the format names it can check.
Draft202012Validator.FORMAT_CHECKER

<FormatChecker checkers=['date', 'email', 'idn-email', 'idn-hostname', 'ipv4', 'ipv6', 'regex', 'uuid']>

In [14]:
# Supplying format_checker turns on enforcement of the "format" keyword.
try:
    validate(document, schema, format_checker=Draft202012Validator.FORMAT_CHECKER)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is not valid against the schema: '2024 07' is not a 'date'

Failed validating 'format' in schema['properties']['date']:
    {'format': 'date', 'type': 'string'}

On instance['date']:
    '2024 07'


#### Crafting Customized Formats

In [None]:
# US Zip Code format
    # 12345 
    # 12345-4321

In [16]:
# Nested schema: the document must contain an "address" object, and that object
# must contain a "zip_code" string tagged with the custom "us-zip-code" format.
schema = {
    "type": "object",
    "properties": {
        "address": {
            "type": "object",
            "properties": {
                "zip_code": {
                    "type": "string",
                    # Not a built-in format name; it only has an effect once a
                    # checker for it is registered with the format checker in use.
                    "format": "us-zip-code"
                }
            },
            "required": ["zip_code"]
        }
    },
    "required": ["address"]
}

In [17]:
# Zip code with trailing letters; it should not pass as a US zip code.
document = {"address": {"zip_code": "12345ANDY"}}

In [18]:
# The format checker has no entry for "us-zip-code" at this point, so the
# malformed zip code still passes.
try:
    validate(document, schema, format_checker=Draft202012Validator.FORMAT_CHECKER)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is valid against the schema


In [19]:
# 1. define a function that implements the validation logic

# target: US Zip Code format
    # 12345 
    # 12345-4321

In [20]:
import re

In [None]:
# ANDY12345BEK  X
# 12345 V
# 12345-1234 V

In [None]:
# \d -> [0-9]

In [21]:
def is_us_zip_code(instance):
    """Return whether ``instance`` is acceptable for the ``us-zip-code`` format.

    Accepted shapes are five digits (``12345``) or ZIP+4 (``12345-4321``).

    Non-string instances return ``True``: rejecting other types is the job of
    the schema's ``type`` keyword, and returning early keeps ``re`` from raising
    ``TypeError`` when it is handed something that is not a string.
    """
    if not isinstance(instance, str):
        return True

    # fullmatch (instead of match + "$") rejects a trailing newline, and
    # re.ASCII restricts \d to 0-9 so non-ASCII digits are not accepted.
    return re.fullmatch(r"\d{5}(-\d{4})?", instance, flags=re.ASCII) is not None

In [None]:
# 2. register the format & validation function with the validator

In [22]:
# Show the registered formats before adding the custom one ("us-zip-code" is not listed yet).
Draft202012Validator.FORMAT_CHECKER

<FormatChecker checkers=['date', 'email', 'idn-email', 'idn-hostname', 'ipv4', 'ipv6', 'regex', 'uuid']>

In [23]:
# checks("us-zip-code") returns a decorator; applying it to is_us_zip_code
# registers the function under that format name and returns the function.
# NOTE(review): FORMAT_CHECKER is an attribute of the validator class, so this
# registration presumably persists for every later use in this kernel - confirm.
Draft202012Validator.FORMAT_CHECKER.checks("us-zip-code")(is_us_zip_code)

<function __main__.is_us_zip_code(instance)>

In [26]:
# With the custom checker registered, the zip code with trailing letters is rejected.
document = {"address": {"zip_code": "12345ANDY"}}

try:
    validate(document, schema, format_checker=Draft202012Validator.FORMAT_CHECKER)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is not valid against the schema: '12345ANDY' is not a 'us-zip-code'

Failed validating 'format' in schema['properties']['address']['properties']['zip_code']:
    {'format': 'us-zip-code', 'type': 'string'}

On instance['address']['zip_code']:
    '12345ANDY'


In [28]:
# A ZIP+4 value (five digits, hyphen, four digits) is accepted by the custom checker.
document = {"address": {"zip_code": "12345-1234"}}

try:
    validate(document, schema, format_checker=Draft202012Validator.FORMAT_CHECKER)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is valid against the schema


#### Dereferencing 

> * $id is used to assign unique identifiers to schemas

In [29]:
# Standalone address schema: an object with three required string fields.
address_subschema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    # The URI other schemas in this notebook use in "$ref" to point at this schema.
    "$id": "https://somedomain.xyz/schemas/address.json",
    "type": "object",
    "properties": {
        "street": {"type": "string"},
        "city": {"type": "string"},
        "state": {"type": "string"}
    },
    "required": ["street", "city", "state"]
}

In [38]:
# Person schema: name, age and address are all required.
person_subschema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://somedomain.xyz/schemas/person.json",
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
        # Delegates to the address schema by its "$id" URI instead of inlining it.
        "address": {"$ref": "https://somedomain.xyz/schemas/address.json"}
    },
    "required": ["name", "age", "address"]
}

In [32]:
# Top-level schema: a person, an optional spouse, and a marital-status flag.
# When isMarried is true, a spouse becomes mandatory.
main_schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://somedomain.xyz/schemas/main.json",
    "type": "object",
    "properties": {
        "person": {"$ref": "https://somedomain.xyz/schemas/person.json"},
        "spouse": {"$ref": "https://somedomain.xyz/schemas/person.json"},
        "isMarried": {"type": "boolean"}
    },
    "required": ["person", "isMarried"],
    # "properties" on its own passes vacuously when the key is absent, which
    # would make "then" demand a spouse for documents that omit isMarried
    # altogether. Requiring the key inside "if" restricts the condition to
    # documents where isMarried is present and true.
    "if": {
        "properties": {"isMarried": {"const": True}},
        "required": ["isMarried"]
    },
    "then": {
        "required": ["person", "spouse"]
    }
}

In [33]:
# Incomplete document: the person only has a name, and there is no isMarried flag.
document = {"person": {"name": "John Doe"}}

In [35]:
# main_schema points at person.json through "$ref", but the validator has not
# been told where to find that schema. The resulting referencing error is not a
# ValidationError, so the except clause below does not catch it.
try:
    validate(document, main_schema)
    print("The document is valid against the schema")
except ValidationError as e:
    print(f"The document is not valid against the schema: {e}")

_WrappedReferencingError: Unresolvable: https://somedomain.xyz/schemas/person.json

In [36]:
from referencing import Registry, Resource

In [39]:
# Register both subschemas under the URI stored in their own "$id", so that
# "$ref" lookups for those URIs resolve locally.
registry = Registry().with_resources(
    [
        (subschema["$id"], Resource.from_contents(subschema))
        for subschema in (address_subschema, person_subschema)
    ]
)

In [40]:
from jsonschema import Draft202012Validator

In [41]:
# Bind the main schema to the registry so its "$ref" targets can be looked up.
validator = Draft202012Validator(main_schema, registry=registry)

In [42]:
# The person is missing required fields, so validation against the resolved
# person schema fails.
document = {"person": {"name": "John Doe"}}

try:
    validator.validate(document)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is not valid against the schema: 'age' is a required property

Failed validating 'required' in schema['properties']['person']:
    {'$id': 'https://somedomain.xyz/schemas/person.json',
     '$schema': 'https://json-schema.org/draft/2020-12/schema',
     'properties': {'address': {'$ref': 'https://somedomain.xyz/schemas/address.json'},
                    'age': {'type': 'integer'},
                    'name': {'type': 'string'}},
     'required': ['name', 'age', 'address'],
     'type': 'object'}

On instance['person']:
    {'name': 'John Doe'}


In [46]:
# Fully populated document: both people carry every required field, and a
# spouse is present alongside isMarried=True.
document = {
    "person": {
        "name": "John Doe",
        "age": 30,
        "address": {"street": "123 Main St", "city": "Anytown", "state": "Anystate"},
    },
    "spouse": {
        "name": "Jane Doe",
        "age": 30,
        "address": {"street": "123 Main St", "city": "Anytown", "state": "Anystate"},
    },
    "isMarried": True,
}

try:
    validator.validate(document)
except ValidationError as err:
    print(f"The document is not valid against the schema: {err}")
else:
    print("The document is valid against the schema")

The document is valid against the schema


#### Skill Challenge: Programmatic JSON Document Validation From API

> * Inspect the following JSON document, which contains USD and CAD stock price information: https://www.andybek.com/api/data/stock-tickers
> * Define a restrictive schema for the data that will identify invalid records
> * Using Python, read in the JSON document and validate it against the schema
> * Generate a report that indicates which records are invalid, e.g.

In [None]:
# Invalid Records:
# Record #2 is invalid
# Record:
# {
#   "ticker": "GOOGL",
#   "price": 100.2
# }
# Reason: 'currency' is a required property

# ========================================

# Record #3 is invalid
# Record:
# "invalid_data"
# Reason: 'invalid_data' is not of type 'number'

#### Solution

In [47]:
# Endpoint that serves the stock-ticker JSON document used in this challenge.
URL_CONSTANT = "https://www.andybek.com/api/data/stock-tickers"

In [53]:
import requests
from jsonschema import Draft202012Validator

In [49]:
# fetch the JSON document
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() surfaces HTTP errors here rather than as a
# confusing JSON decoding failure on the next line.
response = requests.get(URL_CONSTANT, timeout=10)
response.raise_for_status()
data = response.json()

In [51]:
# define our schema
# The document is an array of ticker records; each record must have exactly the
# three fields below (no extras), and currency is limited to USD or CAD.
schema = {
    # The Draft 2020-12 dialect identifier uses the https scheme.
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "ticker": {"type": "string"},
            "price": {"type": "number"},
            "currency": {"type": "string", "enum": ["USD", "CAD"]}
        },
        "required": ["ticker", "price", "currency"],
        "additionalProperties": False
    }
}

In [52]:
# validate the data against the schema

In [54]:
# Build the validator once so it can be reused for every record.
validator = Draft202012Validator(schema)

In [56]:
# Each record is wrapped in a one-element list so it can be checked on its own
# against the array schema. One entry is collected per validation error, and
# "index" is the record's zero-based position in `data`.
invalid_records = [
    {
        "index": position,
        "record": problem.instance,
        "reason": problem.message,
    }
    for position, entry in enumerate(data)
    for problem in validator.iter_errors([entry])
]

In [57]:
# Number of collected errors (one entry per validation error, not per record).
len(invalid_records)

6

In [60]:
import json

# Human-readable report: one section per collected error, numbered from 1.
print("Invalid Records:")
for entry in invalid_records:
    record_number = entry["index"] + 1
    pretty_record = json.dumps(entry["record"], indent=2)

    print(f"Record #{record_number} is invalid")
    print("Record:")
    print(pretty_record)
    print("Reason: ", entry["reason"])
    print("\n" + "=" * 40 + "\n")

Invalid Records:
Record #2 is invalid
Record:
{
  "ticker": "GOOGL",
  "price": 100.2
}
Reason:  'currency' is a required property


Record #3 is invalid
Record:
"invalid_data"
Reason:  'invalid_data' is not of type 'number'


Record #4 is invalid
Record:
{
  "ticker": "MSFT",
  "currency": "USD"
}
Reason:  'price' is a required property


Record #8 is invalid
Record:
{
  "ticker": "V",
  "price": 210.5
}
Reason:  'currency' is a required property


Record #13 is invalid
Record:
"MXN"
Reason:  'MXN' is not one of ['USD', 'CAD']


Record #23 is invalid
Record:
{
  "ticker": "PFE",
  "price": 44.25,
  "currency": "USD",
  "extra_property": "unexpected"
}
Reason:  Additional properties are not allowed ('extra_property' was unexpected)


