In [1]:
%pip install faker
%pip install jsonschema

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import jsonschema
from faker import Faker
import string
import random

In [None]:
# Load the JSON Schema from a path relative to the kernel's working directory.
# JSON text is UTF-8, so the encoding is passed explicitly rather than relying
# on the platform's locale default.
with open("name_schema.json", "r", encoding="utf-8") as f:
    schema = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../name_schema.json'

In [None]:
class ValueGeneratorUtils:
    """Build zero-argument value generators for the properties of a JSON Schema.

    Every generator is a callable taking no arguments. For a nullable property
    it returns ``None`` with probability ``null_threshold`` and otherwise a
    value obtained from the wrapped Faker instance.
    """

    def __init__(self, faker=None):
        """Store the Faker instance used to produce values.

        Args:
            faker: Object exposing the Faker provider methods used by this
                class. A fresh ``Faker()`` is created when omitted or falsy.
        """
        self._faker = faker or Faker()
        # Probability that a nullable generator yields None on a given call.
        self.null_threshold = 0.2

    def _with_nulls(self, produce, allow_null):
        """Wrap ``produce`` so that it sometimes yields ``None``.

        ``allow_null`` is a parameter of this call, so each wrapped generator
        keeps its own setting even when the wrappers are created in a loop.
        ``null_threshold`` is read on every call, so later changes to it
        still apply to generators that already exist.
        """
        def generate():
            # random.random() is only consulted for nullable generators.
            if allow_null and random.random() < self.null_threshold:
                return None
            return produce()

        return generate

    def get_property_value_generators(self, schema: dict, include_nulls=True) -> dict:
        """Return a ``{property name: generator}`` dict for ``schema``.

        Args:
            schema: JSON Schema dict; its ``properties`` and ``required``
                entries are read (both optional).
            include_nulls: When truthy, properties not listed in ``required``
                may generate ``None``.

        Returns:
            Dict mapping each property name to a zero-argument callable.
            Properties whose ``type`` is not ``string``, ``integer`` or
            ``boolean`` get a generator that always returns ``None``.
        """
        properties = schema.get('properties', {})
        required_properties = schema.get('required', [])
        value_generators = {}

        for prop_name, prop_schema in properties.items():
            nullable = include_nulls and (prop_name not in required_properties)
            prop_type = prop_schema.get('type')

            if prop_type == 'string':
                value_generators[prop_name] = self.get_string_generator(prop_name, prop_schema, nullable)
            elif prop_type == 'integer':
                # `nullable` is handed to the helper as an argument: a lambda
                # reading the loop variable directly would see only the value
                # from the final iteration.
                value_generators[prop_name] = self._with_nulls(
                    lambda: self._faker.random_int(min=0, max=1000), nullable)
            elif prop_type == 'boolean':
                value_generators[prop_name] = self._with_nulls(
                    lambda: self._faker.boolean(), nullable)
            else:
                # 'null' and every unsupported type always produce None.
                value_generators[prop_name] = lambda: None

        return value_generators

    def get_string_generator(self, property_name=None, prop_schema=None, include_nulls=True):
        """Return a generator for a string property.

        The producer is chosen in this order: ``enum`` values, ``format`` of
        ``email`` or ``date``, a name-like value when ``minLength`` or
        ``maxLength`` is present, and finally a random 1-10 character string.

        Args:
            property_name: Property name, used for name-like keyword matching.
                ``None`` is treated as an empty name.
            prop_schema: The property's schema dict. ``None`` is treated as an
                empty schema.
            include_nulls: When truthy, the generator returns ``None`` with
                probability ``null_threshold``.

        Returns:
            A zero-argument callable returning a string or ``None``.
        """
        prop_schema = prop_schema or {}
        string_format = prop_schema.get("format")

        if "enum" in prop_schema:
            produce = lambda: self._faker.random_element(elements=prop_schema["enum"])
        elif string_format == "email":
            produce = lambda: self._faker.email()
        elif string_format == "date":
            produce = lambda: self._faker.date()
        elif "minLength" in prop_schema or "maxLength" in prop_schema:
            produce = self._length_constrained_producer(property_name, prop_schema)
        else:
            produce = lambda: self._faker.pystr(min_chars=1, max_chars=10)

        # Every branch, including the unconstrained fallback, honours include_nulls.
        return self._with_nulls(produce, include_nulls)

    def _length_constrained_producer(self, property_name, prop_schema):
        """Pick the producer for a string property that declares a length bound."""
        description = (prop_schema.get("description") or "").lower()
        name = (property_name or "").lower()
        sources = (description, name)

        def mentions(keyword):
            # "name" and the keyword must both occur in the same source text.
            return any("name" in text and keyword in text for text in sources)

        if mentions("first"):
            return lambda: self._faker.first_name()
        if mentions("last"):
            return lambda: self._faker.last_name()
        if mentions("middle"):
            # Middle names are generated as a single uppercase initial.
            return lambda: self._faker.random_element(elements=list(string.ascii_uppercase))
        if mentions("prefix"):
            return lambda: self._faker.prefix()
        if mentions("suffix"):
            return lambda: self._faker.suffix()
        if any("name" in text for text in sources):
            return lambda: self._faker.name()

        # Derive a missing bound from the one that is present, so the pair
        # handed to pystr never has min_chars greater than max_chars.
        min_chars = prop_schema.get("minLength")
        max_chars = prop_schema.get("maxLength")
        if min_chars is None:
            min_chars = 1 if max_chars is None else min(1, max_chars)
        if max_chars is None:
            max_chars = max(10, min_chars)
        return lambda: self._faker.pystr(min_chars=min_chars, max_chars=max_chars)

In [None]:
def validate_json(json_data, target_schema=None) -> bool:
    """Check ``json_data`` against ``target_schema`` with ``jsonschema.validate``.

    Args:
        json_data: The instance to validate.
        target_schema: The schema passed to ``jsonschema.validate``.

    Returns:
        ``True`` when validation passes. ``False`` when a
        ``jsonschema.ValidationError`` is raised, in which case the error is
        printed. Other exceptions propagate to the caller.
    """
    try:
        jsonschema.validate(instance=json_data, schema=target_schema)
    except jsonschema.ValidationError as e:
        print(f"Validation error for object: {e}")
        return False
    else:
        return True

In [None]:
def remove_none(d):
    """Recursively strip ``None`` entries out of dicts and lists.

    Dict items whose value is ``None`` and list elements that are ``None`` are
    dropped; remaining values are cleaned recursively. Any other object
    (including tuples) is returned unchanged.

    Args:
        d: The value to clean.

    Returns:
        A new dict or list without ``None`` entries, or ``d`` itself when it
        is neither a dict nor a list.
    """
    if isinstance(d, dict):
        cleaned = {}
        for key, value in d.items():
            if value is not None:
                cleaned[key] = remove_none(value)
        return cleaned

    if isinstance(d, list):
        return [remove_none(item) for item in d if item is not None]

    return d

In [None]:
def get_nm1_segment(rec):
    """Format a record's name fields as an ``NM1*...~`` segment string.

    Nine fields are read from ``rec`` in a fixed order, joined with ``*`` and
    terminated with ``~``. A field that is missing or explicitly ``None``
    becomes an empty element, so null values never appear in the segment as
    the literal text ``None``.

    Args:
        rec: Mapping (anything with a dict-style ``get``) holding the fields.

    Returns:
        The segment string, e.g. ``NM1*IL*1*Doe*Jane****MI*123~``.
    """
    # NOTE(review): "name_last_or_organizationName" mixes snake_case and
    # camelCase unlike the other keys -- confirm it matches the schema.
    field_keys = (
        "entity_identifier_code",
        "entity_type_qualifier",
        "name_last_or_organizationName",
        "name_first",
        "name_middle",
        "name_prefix",
        "name_suffix",
        "identification_code_qualifier",
        "identification_code",
    )

    elements = []
    for key in field_keys:
        value = rec.get(key)
        # rec.get(key, "") would still return None for a key that is present
        # with a None value, so the None check is done explicitly.
        elements.append("" if value is None else f"{value}")

    return "NM1*" + "*".join(elements) + "~"



In [None]:
# Load the JSON Schema that drives both value generation and validation.
with open("name_schema.json", "r", encoding="utf-8") as f:
    schema = json.load(f)

value_generator_utils = ValueGeneratorUtils()
generators = value_generator_utils.get_property_value_generators(schema, include_nulls=True)

no_of_training_records = 100
# Upper bound on generation attempts: a schema the generators can never
# satisfy should fail loudly instead of looping forever.
max_attempts = no_of_training_records * 100

with open("../data/combinations.jsonl", "w", encoding="utf-8") as outfile:
    records_generated = 0
    attempts = 0
    while records_generated < no_of_training_records:
        if attempts >= max_attempts:
            raise RuntimeError(
                f"Only {records_generated} of {no_of_training_records} records "
                f"passed validation after {attempts} attempts"
            )
        attempts += 1

        record = {key: generator() for key, generator in generators.items()}

        # Drop null-valued keys before building the segment, so a None field
        # becomes an empty element rather than the text "None".
        record = remove_none(record)
        record['nm1_segment'] = get_nm1_segment(record)

        # Only schema-valid records are written, one JSON object per line.
        if validate_json(record, schema):
            json.dump(record, outfile)
            outfile.write("\n")
            records_generated += 1
            print(record)