### Validate Inferred Schema from Data with Existing Schema

In [224]:
import pandas as pd
import json
import ast
from visions.typesets import CompleteSet
import numpy as np

In [46]:
# Schema as returned by the schema registry (a response listing one schema entry).
# The entry's "content" field is itself a JSON-encoded string holding the JSON Schema,
# so it needs a second json.loads before its properties can be read.
# A raw string (r''') is used so the \" escapes inside "content" reach json.loads
# unchanged; in a normal string literal Python would collapse them to plain quotes
# and the outer document would no longer be valid JSON.
schema = r'''
{
  "schemas": [
    {
      "id": "c28fb5ca-8219-436e-bcf8-057d4cb3aa70",
      "data_offer_state_id": "1baeed63-d360-47b7-94b9-915bb0191f2e",
      "source": "INFERRED",
      "encoding": "JSON",
      "content": "{\"$schema\": \"http://json-schema.org/schema#\", \"type\": \"object\", \"properties\": {\"random_string\": {\"type\": \"string\"}, \"random_integer\": {\"type\": \"integer\"}, \"random_float\": {\"type\": \"number\"}, \"random_boolean\": {\"type\": \"boolean\"}}, \"required\": [\"random_boolean\", \"random_float\", \"random_integer\", \"random_string\"]}",
      "created": {
        "username": "spoud-sdm-kafka-profiler",
        "timestamp": {
          "seconds": "1593592254",
          "nanos": 880627000
        }
      },
      "last_seen": {
        "seconds": "1593595565",
        "nanos": 4554000
      },
      "privilege": {
        "privilege": [
          "LINK",
          "NONE",
          "ADMIN",
          "READ_INFO",
          "READ",
          "WRITE"
        ],
        "path": "/default/"
      }
    }
  ]
}

'''

In [7]:
# Sample events from the pandas kafka-profiler, one JSON object per line.
# The triple-quoted literal starts and ends with a newline, so splitting it into
# lines yields an empty first line that the parser has to skip.
samples = '''
{"random_string": "wvvkuyfdjp", "random_integer": 502, "random_float": 0.9752454439023497, "random_boolean": true}
{"random_string": "aymtvxjtwz", "random_integer": 860, "random_float": 0.9027580964349539, "random_boolean": false}
{"random_string": "vaolbivfof", "random_integer": 351, "random_float": 0.9236287647478101, "random_boolean": true}
{"random_string": "zbbpezhjgo", "random_integer": 622, "random_float": 0.11856938563671815, "random_boolean": true}
{"random_string": "tffwklihfx", "random_integer": 692, "random_float": 0.8876732887875933, "random_boolean": true}
{"random_string": "vhtljjbezh", "random_integer": 2, "random_float": 0.9077423852437432, "random_boolean": true}
{"random_string": "rzphlmtmus", "random_integer": 696, "random_float": 0.7850585269196347, "random_boolean": false}
{"random_string": "sdlqpjhhcs", "random_integer": 855, "random_float": 0.9819586576361585, "random_boolean": true}
{"random_string": "rgryorahcr", "random_integer": 147, "random_float": 0.48084257765078875, "random_boolean": true}
{"random_string": "hmudbkazpo", "random_integer": 309, "random_float": 0.07027422214145584, "random_boolean": false}
{"random_string": "vnewmexoih", "random_integer": 356, "random_float": 0.9328964419808714, "random_boolean": true}
{"random_string": "qmwjaouamg", "random_integer": 376, "random_float": 0.5872442457740908, "random_boolean": true}
{"random_string": "spsmibvvdd", "random_integer": 575, "random_float": 0.38261395969774337, "random_boolean": false}
{"random_string": "cnyfzqqtkm", "random_integer": 843, "random_float": 0.7115285847863924, "random_boolean": true}
{"random_string": "oomjythkfx", "random_integer": 261, "random_float": 0.5624835234081578, "random_boolean": false}
{"random_string": "lhrswwdwfv", "random_integer": 729, "random_float": 0.39471356088979337, "random_boolean": true}
{"random_string": "auskkiztwo", "random_integer": 311, "random_float": 0.07334293049291563, "random_boolean": false}
{"random_string": "okovnasiev", "random_integer": 623, "random_float": 0.4056917539384999, "random_boolean": true}
{"random_string": "baejdnchnz", "random_integer": 467, "random_float": 0.4499033453809097, "random_boolean": false}
{"random_string": "rhoxsuiykr", "random_integer": 669, "random_float": 0.8635202434447838, "random_boolean": false}
{"random_string": "xwxwzydfqd", "random_integer": 385, "random_float": 0.062110682117167415, "random_boolean": true}
{"random_string": "awzdtvfywq", "random_integer": 600, "random_float": 0.9326239593142196, "random_boolean": true}
{"random_string": "gjotzehclo", "random_integer": 894, "random_float": 0.25929448092143925, "random_boolean": false}
{"random_string": "eyazhmwqfr", "random_integer": 56, "random_float": 0.48099136669964004, "random_boolean": false}
{"random_string": "nwosdqqvpe", "random_integer": 521, "random_float": 0.06270766544379236, "random_boolean": true}
{"random_string": "ubqtytuqpr", "random_integer": 810, "random_float": 0.7167855116538012, "random_boolean": false}
{"random_string": "csrdggrfwm", "random_integer": 607, "random_float": 0.25369408622809664, "random_boolean": true}
{"random_string": "eatxfosnwm", "random_integer": 188, "random_float": 0.29444418087368984, "random_boolean": false}
{"random_string": "gsjjeyqtfx", "random_integer": 848, "random_float": 0.61789618258569, "random_boolean": true}
{"random_string": "fciwmkiyld", "random_integer": 608, "random_float": 0.45189560474927004, "random_boolean": false}
'''

In [259]:
# Parse the newline-delimited JSON samples into a list of event dicts.
# Each line is stripped before the emptiness check so that blank *and*
# whitespace-only lines (stray spaces, indentation of the literal) are skipped
# instead of being handed to json.loads, which would raise JSONDecodeError.
events = [
    json.loads(stripped)
    for stripped in (line.strip() for line in samples.splitlines())
    if stripped
]

# Preview the first few parsed events.
events[:3]

[{'random_string': 'wvvkuyfdjp',
  'random_integer': 502,
  'random_float': 0.9752454439023497,
  'random_boolean': True},
 {'random_string': 'aymtvxjtwz',
  'random_integer': 860,
  'random_float': 0.9027580964349539,
  'random_boolean': False},
 {'random_string': 'vaolbivfof',
  'random_integer': 351,
  'random_float': 0.9236287647478101,
  'random_boolean': True}]

In [260]:
# Decode the outer registry response into Python objects. The nested "content"
# field is still a JSON string after this step and is decoded separately later.
schema_obj = json.loads(schema)
schema_obj

{'schemas': [{'id': 'c28fb5ca-8219-436e-bcf8-057d4cb3aa70',
   'data_offer_state_id': '1baeed63-d360-47b7-94b9-915bb0191f2e',
   'source': 'INFERRED',
   'encoding': 'JSON',
   'content': '{"$schema": "http://json-schema.org/schema#", "type": "object", "properties": {"random_string": {"type": "string"}, "random_integer": {"type": "integer"}, "random_float": {"type": "number"}, "random_boolean": {"type": "boolean"}}, "required": ["random_boolean", "random_float", "random_integer", "random_string"]}',
   'created': {'username': 'spoud-sdm-kafka-profiler',
    'timestamp': {'seconds': '1593592254', 'nanos': 880627000}},
   'last_seen': {'seconds': '1593595565', 'nanos': 4554000},
   'privilege': {'privilege': ['LINK',
     'NONE',
     'ADMIN',
     'READ_INFO',
     'READ',
     'WRITE'],
    'path': '/default/'}}]}

In [261]:
# Build a {property name: JSON Schema type} lookup from the first schema entry.
# Its "content" field is a JSON string of its own, hence the extra json.loads.
schema_properties = json.loads(schema_obj["schemas"][0]["content"])["properties"]
types = {name: spec['type'] for name, spec in schema_properties.items()}

types

{'random_string': 'string',
 'random_integer': 'integer',
 'random_float': 'number',
 'random_boolean': 'boolean'}

In [262]:
# import builtins
# builtin_types = [getattr(builtins, d) 
#                 for d in dir(builtins) if isinstance(getattr(builtins, d), type)]

In [263]:
# JSON Schema primitive types handled: boolean, number, integer, string,
# null, array, object (plus lists of those names).

def create_types(schema_string):
    """Translate a JSON Schema ``type`` value into the matching Python types.

    Args:
        schema_string: A JSON Schema type name (``'boolean'``, ``'number'``,
            ``'integer'``, ``'string'``, ``'null'``, ``'array'``,
            ``'object'``) or a list/tuple of such names, which the ``type``
            keyword also permits (e.g. ``["integer", "null"]``).

    Returns:
        A new list of the Python types that a value decoded by ``json.loads``
        may have for that schema type, or ``None`` when the type name (or any
        member of a list of names) is not recognised.
    """
    type_map = {
        'boolean': (bool,),
        # JSON Schema "number" covers integers as well as floats.
        'number': (int, float),
        'integer': (int,),
        'string': (str,),
        'null': (type(None),),
        'array': (list,),
        'object': (dict,),
    }

    # List form of "type": the value may be any of the listed types.
    if isinstance(schema_string, (list, tuple)):
        combined = []
        for name in schema_string:
            member_types = create_types(name)
            if member_types is None:
                # One unknown member makes the whole declaration unverifiable.
                return None
            combined.extend(t for t in member_types if t not in combined)
        return combined or None

    # Guard against unhashable / non-string input before the dict lookup.
    if not isinstance(schema_string, str):
        return None

    python_types = type_map.get(schema_string)
    return list(python_types) if python_types is not None else None

In [264]:
# One boolean per (event, attribute) pair: True when the value's Python type
# is among the types the schema allows for that attribute.
match = []

for event in events:
    for key, value in event.items():
        # An attribute that is absent from the schema (types.get -> None) or
        # declared with a type create_types does not recognise (-> None) is
        # counted as a mismatch instead of aborting the cell with
        # KeyError / "argument of type 'NoneType' is not iterable".
        schema_types = create_types(types.get(key)) or []
        # Exact type comparison rather than isinstance: bool is a subclass of
        # int, so isinstance would accept True/False as integers.
        is_match = type(value) in schema_types
        match.append(is_match)
        if not is_match:
            print(f"No match: {value}: {schema_types}")

In [265]:
# Share of checked attribute values whose type matched the schema, in percent.
match_percentage = np.mean(match) * 100
print(f"Attribute quality matching with schema: {match_percentage}%")

Attribute quality matching with schema: 100.0%
