In [15]:
%pip install faker
%pip install jsonschema

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Imports

In [16]:
import json
import jsonschema
from faker import Faker
import string
import random

### Load Schema

In [17]:
with open("name_schema.json", "r") as f:
    schema = json.load(f)

### Random Data Generator for Json Schema Properties

In [18]:
class JsonPropertyValueGenerator:

    def __init__(self, faker=None):
        if faker:
            self._faker = faker
        else:
            self._faker = Faker()
        self.null_threshold = 0.2

    def get_property_value_generators(self, schema: dict, include_nulls=True) -> dict:

        properties = schema.get('properties', {})
        required_properties = schema.get('required', [])
        value_generators = {}

        for prop_name, prop_schema in properties.items():

            nulls = include_nulls and (prop_name not in required_properties)

            if prop_schema.get('type') == 'string':
                value_generators[prop_name] = self.get_string_generator(prop_name, prop_schema, nulls)
            elif prop_schema.get('type') == 'integer':
                value_generators[prop_name] = lambda: None if nulls and random.random() < self.null_threshold \
                    else self._faker.random_int(min=0, max=1000)
            elif prop_schema.get('type') == 'boolean':
                value_generators[prop_name] = lambda: None if nulls and random.random() < self.null_threshold \
                    else self._faker.boolean()
            elif prop_schema.get('type') == 'null' and nulls:
                value_generators[prop_name] = lambda: None
            else:
                value_generators[prop_name] = lambda: None

        return value_generators

    def get_string_generator(self, property_name=None, prop_schema=None, include_nulls=True):
        if "enum" in prop_schema:
            return lambda: None if include_nulls and random.random() < self.null_threshold \
                else self._faker.random_element(elements=prop_schema["enum"])
        elif prop_schema.get("format") == "email":
            return lambda: None if include_nulls and random.random() < self.null_threshold else self._faker.email()

        elif prop_schema.get("format") == "date":
            return lambda: None if include_nulls and random.random() < self.null_threshold else self._faker.date()

        elif "minLength" in prop_schema or "maxLength" in prop_schema:
            description = prop_schema.get("description", "")

            if ("name" in description.lower() and "first" in description.lower()) or \
               ("name" in property_name.lower() and "first" in property_name.lower()):
                return lambda: None if include_nulls and random.random() < self.null_threshold \
                    else self._faker.first_name()

            elif ("name" in description.lower() and "last" in description.lower()) or \
                 ("name" in property_name.lower() and "last" in property_name.lower()):
                return lambda: None if include_nulls and random.random() < self.null_threshold \
                    else self._faker.last_name()

            elif ("name" in description.lower() and "middle" in description.lower()) or \
                 ("name" in property_name.lower() and "middle" in property_name.lower()):
                return lambda: None if include_nulls and random.random() < self.null_threshold \
                    else self._faker.random_element(elements=list(string.ascii_uppercase))

            elif ("name" in description.lower() and "prefix" in description.lower()) or \
                 ("name" in property_name.lower() and "prefix" in property_name.lower()):
                return lambda: None if include_nulls and random.random() < self.null_threshold else self._faker.prefix()

            elif ("name" in description.lower() and "suffix" in description.lower()) or \
                 ("name" in property_name.lower() and "suffix" in property_name.lower()):
                return lambda: None if include_nulls and random.random() < self.null_threshold else self._faker.suffix()

            elif "name" in description.lower() or "name" in property_name.lower():
                return lambda: None if include_nulls and random.random() < self.null_threshold else self._faker.name()

            else:
                return lambda: None if include_nulls and random.random() < self.null_threshold \
                    else self._faker.pystr(min_chars=prop_schema.get("minLength", 1), max_chars=prop_schema.get("maxLength", 10))
        else:
            return lambda: self._faker.pystr(min_chars=1,max_chars=10)

### Util Functions

In [19]:
def validate_json(json_data, target_schema=None) -> bool:
    try:
        jsonschema.validate(instance=json_data, schema=target_schema)
        return True
    except jsonschema.ValidationError as e:
        print(f"Validation error for object: {e}")
        return False
    

def remove_none(d):
    if isinstance(d, dict):
        return {k: remove_none(v) for k, v in d.items() if v is not None}
    elif isinstance(d, list):
        return [remove_none(v) for v in d if v is not None]
    else:
        return d
    
def get_nm1_segment(rec):
    entity_identifier_code = rec.get("entity_identifier_code", "")
    entity_type_qualifier = rec.get("entity_type_qualifier", "")
    name_last_or_organization_name = rec.get("name_last_or_organizationName", "")
    name_first = rec.get("name_first", "")
    name_middle = rec.get("name_middle", "")
    name_prefix = rec.get("name_prefix", "")
    name_suffix = rec.get("name_suffix", "")
    identification_code_qualifier = rec.get("identification_code_qualifier", "")
    identification_code = rec.get("identification_code", "")
    return f"NM1*{entity_identifier_code}*{entity_type_qualifier}*{name_last_or_organization_name}*{name_first}*{name_middle}*{name_prefix}*{name_suffix}*{identification_code_qualifier}*{identification_code}~"



### Prompt Generator Functions

In [20]:

def generate_prompt(json_data, schema) -> dict:

    record = {
        "messages":[]
    }

    record['messages'].append(get_user_content(json_data, schema)) #user
    record['messages'].append(get_assistant_content(json_data)) #user

    return record


def get_user_content(json_data, schema):
    
    prompt = get_user_prompt(json_data, schema)
    content = f'''You are a Json test data generator generating valid json data based on a json schema. Given the below schema,
    
    ### Schema: {schema}
    
    {prompt}
    '''
    return  {
        "role": "user",
        "content":f"{content}"
    }


def get_user_prompt(json_data, schema):

    # convert json_data keys to a comma-separated string
    properties = ", ".join(json_data.keys())
    # convert json_data values to a comma-separated string
    values = ", ".join([str(value) for value in json_data.values() if value is not None])
    # convert schema required properties to a comma-separated string
    required_properties = ", ".join(schema.get('required', []))
    # convert json_data properties and values to a formatted string
    properties_and_values = ", ".join([f"{key}: {value}" for key, value in json_data.items() if value is not None])

    templates = [
        "Generate a valid JSON object with the following properties: {properties}",
        "Create a JSON object that adheres to the given schema",
        "Generate a JSON object that includes the following properties: {properties}",
        "Produce a JSON object that conforms to the specified schema",
        "Create a JSON object that meets the requirements of the schema",
        "Generate a JSON object that satisfies the schema's constraints",
        "Create a JSON object with the required properties: {required_properties}",
        "Generate a JSON object with the following properties: {properties} and values: {values}",
        "Produce a JSON object that includes the following properties: {properties} and values: {values}",
        "Create a JSON object that contains the following properties: {properties} and values: {values}",
        "Give me a JSON object that has the following properties and values: {properties_and_values}",
        "Generate a JSON object that includes the following properties and values: {properties_and_values}",
        "Create a JSON object that contains the following properties and values: {properties_and_values}"
    ]
    only_required_properties_templates = [
        "Generate a JSON object that includes only the required properties",
        "Create a JSON object that contains only the required properties",
        "Produce a JSON object that adheres to the schema with only the required properties: {required_properties}",
        "Generate a JSON object that includes the required properties",
        "Create a JSON object that contains the required properties: {required_properties}",
        "Produce a JSON object that adheres to the schema with only the required properties",
        "Generate a JSON object that includes the required properties: {required_properties}",
        "Create a sample JSON object that contains the required properties: {required_properties}"
    ]
    all_properties_templates = [
        "Generate a JSON object that includes all properties",
        "Create a JSON object that contains all properties",
        "Produce a JSON object that adheres to the schema with all properties",
        "Generate a JSON object that includes all properties",
        "Create a JSON object that contains all properties",
        "Produce a JSON object that adheres to the schema with all properties",
    ]
    
    # does json_data contain all properties in the schema?
    has_all_properties = set(json_data.keys()) == set(schema.get('properties', {}).keys())
    has_only_required_properties = set(json_data.keys()) == set(schema.get('required', []))

    if has_all_properties:
        template = random.choice(all_properties_templates)
    elif has_only_required_properties:
        template = random.choice(only_required_properties_templates)
    else:
        template = random.choice(templates)

    prompt = template.format(
        properties=properties,
        values=values,
        required_properties=required_properties,
        properties_and_values=properties_and_values
    )
    return prompt


def get_assistant_content(json_data):
    
    return  {
        "role": "assistant",
        "content": f"{json_data}"
    }

### Generate Data

In [21]:

value_generator_utils = JsonPropertyValueGenerator()
generators = value_generator_utils.get_property_value_generators(schema, include_nulls=True)


with open("../data/combinations.jsonl", "w") as outfile:
    no_of_training_records = 1000
    records_generated = 0
    while records_generated < no_of_training_records:
        record = {}
        for key, generator in generators.items():
            record[key] = generator()

        record = remove_none(record)
        record['nm1_segment'] = get_nm1_segment(record)
        
        if validate_json(record, schema):
            record = generate_prompt(record, schema)
            json.dump(record, outfile)
            outfile.write("\n")
            records_generated += 1
            print(record)

{'messages': [{'role': 'user', 'content': 'You are a Json test data generator generating valid json data based on a json schema. Given the below schema,\n\n    ### Schema: {\'$schema\': \'http://json-schema.org/draft-07/schema#\', \'title\': \'X12 NM1 Segment\', \'description\': "Schema for X12 NM1 segment, which contains subscriber\'s or organization name information.", \'type\': \'object\', \'properties\': {\'entity_identifier_code\': {\'type\': \'string\', \'description\': \'Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)\', \'enum\': [\'IL\', \'03\', \'PR\', \'1P\']}, \'entity_type_qualifier\': {\'type\': \'string\', \'description\': \'Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)\', \'enum\': [\'1\', \'2\']}, \'name_last_or_organizationName\': {\'type\': \'string\', \'description\': \'Name Last or Organization Name\', \'minLength\': 1, \'maxLength\': 60}, \'name_first\': {\'type\': \'string\', \'description\': \'Name