In [21]:
%pip install -q faker
%pip install -q jsonschema
%pip install rstr

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Setup

In [None]:
from property_gen_utils import properties_list
import property_gen_utils as utils
from faker import Faker
import rstr
import os

faker = Faker()

print("Properties List: ", properties_list)

n=20 # Number of records to generate for each property
file_path = "../data/"
file_name_prefix = "property_style_training_data_"

Properties List:  [{'name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Full name of the individual'}}, {'subscriber_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the subscriber'}}, {'patient_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the patient'}}, {'first_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'First name of the individual'}}, {'last_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Last name of the individual'}}, {'full_name': {'type': 'string', 'minLength': 1, 'maxLength': 100, 'description': 'Full name of the individual'}}, {'organization_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the organization'}}, {'company_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the company'}}, {'organization': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'descr

### Generator functions

In [23]:
import random

def get_property_generator(properties):
    
    prop_generator = {}
    for name, prop_schema in properties.items():
        type = prop_schema.get("type", None)
        if type == "string":
            prop_generator[name] = {"generator":get_string_generator(name,prop_schema), "property_schema": properties}
        elif type in ["integer","int","number"]:
            prop_generator[name] = {"generator":get_numeric_generator(name,prop_schema), "property_schema": properties}
        elif type == "boolean":
            prop_generator[name] = {"generator": lambda: random.choice([True, False]), "property_schema": properties}
        elif type == "object":
            prop_generator[name] = {"generator": get_object_generator(name,prop_schema), "property_schema": properties}
        elif type == "array":
            prop_generator[name] = {"generator":get_array_generator(name,prop_schema), "property_schema": properties}

    return prop_generator


def get_string_generator(name,prop_schema):
    type = prop_schema["type"]
    if type == "string":
        if "enum" in prop_schema:
            return lambda: faker.random_element(prop_schema["enum"])
        elif "format" in prop_schema:
            if prop_schema["format"] == "date-time":
                return lambda: faker.date_time().isoformat()
            elif prop_schema["format"] == "email":
                return lambda: faker.email()
            elif prop_schema["format"] == "date":
                return lambda: faker.date()
            elif prop_schema["format"] == "uri":
                return lambda: faker.uri()
            else:
                raise ValueError(f"Unsupported format: {prop_schema['format']}")
        elif "pattern" in prop_schema:
            pattern = prop_schema["pattern"]
            return lambda: rstr.xeger(pattern)
        else:
            if "name" in name:
                if any(word in name for word in ["subscriber", "customer", "patient", "full", "student", "doctor", "employee", "player"]):
                    return lambda: faker.name()
                elif "first" in name: 
                    return lambda: faker.first_name()
                elif "last" in name:
                    return lambda: faker.last_name()
                elif "middle" in name:
                    return lambda: faker.middle_name()
                elif any(word in name for word in ["organization", "company", "business", "firm", "org", "corp", "inc", "ltd", "enterprise", "association", "institution"]):
                    return lambda: faker.company()
                else:
                    return lambda: faker.name()
            elif "diagnosis" in name:
                return lambda: faker.random_choices(["Diabetes", "Hypertension", "High Cholesterol", "Asthma", "Seasonal Allergies", "Arthritis", "Obesity", "Anemia", "Depression", "Back Pain", "Migraines", "Sinusitis"])
            elif "address" in name:
                return lambda: faker.address().replace("\n", ", ")
            elif "department" in name:
                return lambda: faker.random_choices(["Surgical", "Diagnostics", "Pediatrics", "Cardiology", "Neurology", "Oncology", "Orthopedics", "Psychiatry", "Radiology", "Emergency"])
            else:
                return lambda: faker.word()
   

def get_numeric_generator(name,prop_schema):
    
    type = prop_schema["type"]

    if type in ["integer", "int"]:
        if "minimum" in prop_schema and "maximum" in prop_schema:
            return lambda: faker.random_int(min=prop_schema["minimum"], max=prop_schema["maximum"])
        else:
            return lambda: faker.random_int()
        
    elif type == "number":
        if "minimum" in prop_schema and "maximum" in prop_schema:
            return lambda: round(faker.random.uniform(prop_schema["minimum"], prop_schema["maximum"]), 2)
        else:
            return lambda: round(faker.random.uniform(0, 1), 2)
        
def get_array_generator(name,prop_schema):
    
    type = prop_schema["type"]

    if type == "array":
        if "items" in prop_schema:
            item_type = prop_schema["items"]["type"]
            if item_type == "string":
                return lambda: [get_string_generator(name, prop_schema["items"])() for _ in range(faker.random_int(min=1, max=5))]
            elif item_type in ["integer", "int"]:
                return lambda: [get_numeric_generator(name, prop_schema["items"])() for _ in range(faker.random_int(min=1, max=5))]
            else:
                raise ValueError(f"Unsupported item type: {item_type}")
        else:
            raise ValueError("Array schema must have 'items' defined")
        
def get_object_generator(name,prop_schema):
    
    type = prop_schema["type"]

    if type == "object":
        if "properties" in prop_schema:
            # return lambda: {prop_name: get_property_generator({prop_name:prop_details}) for prop_name, prop_details in prop_schema["properties"].items()}
            return lambda: get_property_generator(prop_schema["properties"])
        else:
            raise ValueError("Object schema must have 'properties' defined")

### Prompt functions

In [24]:
import json

def get_prompt(prop_name, prop_value, prop_schema):
    
    prompt_templates = [
        "Generate a {name} property with the value: {value} for the given schema. \n {schema}",
        "Generate a sample data for the given property schema.  \n {schema}",
        "Create a {name} property with the value: {value} based on the schema provided.  \n {schema}",
        "Produce a {name} property with the value: {value} according to the schema provided.  \n {schema}",
        "Create a sample data for the property schema with the value: {value}.  \n {schema}",
        "Generate a {name} property based on the schema.  \n {schema}",
        "Give a sample data for the property schema with the value: {value}.  \n {schema}",
        ]
    
    prompt = faker.random_element(prompt_templates)
    schema = json.dumps(prop_schema, indent=2)
    value = prop_value[prop_name]
    if isinstance(value, dict):
        value = ', '.join(f"{k}={v}" for k, v in value.items())

    return prompt.format(name=prop_name, value=value, schema=schema)

### Data Generation

In [25]:
def execute_lambdas(d):
    result = {}
    for k, v in d.items():
        if isinstance(v, dict):
            if "generator" in v:
                generated = v["generator"]()
                if isinstance(generated, dict):
                    result[k] = execute_lambdas(generated)
                else:
                    result[k] = generated

    return result

prop_generators = []
for properties in properties_list:
    prop_generators.append(get_property_generator(properties))


data_samples = []
for _ in range(n):  # Generate 10 records
    
    for prop_generator in prop_generators:

        name = list(prop_generator.keys())[0]  # Get details from the first property
        value = execute_lambdas(prop_generator)
        schema = prop_generator[name]["property_schema"]
        record = {
            "messages": [
                {
                    "role": "user",
                    "content": get_prompt(name, value, schema)
                },
                {
                    "role": "assistant",
                    "content": json.dumps(value, ensure_ascii=False)
                }
            ],
        }

        data_samples.append(record)


random.shuffle(data_samples)

file_name = f"{file_path}{file_name_prefix}v{utils.get_dataset_version(file_name_prefix, file_path)}.jsonl"

with open(file_name, "w") as outfile:
    for record in data_samples:
        json.dump(record, outfile)
        outfile.write('\n')

print(f"Data generation complete. {len(data_samples)} records written to {file_name}")
        


        

No valid dataset versions found. Starting with version 1.
Data generation complete. 560 records written to ../data/property_style_training_data_v1.jsonl
