In [None]:
%pip install -q faker
%pip install -q jsonschema
%pip install faker-healthcare-system
%pip install rstr

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Setup

In [None]:
from property_gen_utils import properties_list
import property_gen_utils as utils
from faker import Faker
import rstr

# Output location: the data-generation cell writes a file named
# f"{file_path}{file_name_prefix}v<version>.jsonl".
# The path is relative to the kernel's working directory.
file_name_prefix = "property_data_"
file_path = "../data/"

# Number of values to generate for each property.
n = 10

# Shared Faker instance used by the generator functions below.
faker = Faker()

print("Properties List: ", properties_list)

Properties List:  [{'name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Full name of the individual'}}, {'subscriber_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the subscriber'}}, {'patient_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the patient'}}, {'first_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'First name of the individual'}}, {'last_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Last name of the individual'}}, {'full_name': {'type': 'string', 'minLength': 1, 'maxLength': 100, 'description': 'Full name of the individual'}}, {'organization_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the organization'}}, {'company_name': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'description': 'Name of the company'}}, {'organization': {'type': 'string', 'minLength': 1, 'maxLength': 50, 'descr

### Prompt functions

### Generator functions

In [None]:
def get_string_generator(prop_schema):
    """Return a zero-argument callable that produces fake values for a string property.

    The schema is inspected in this order:

    1. ``enum``    -> one randomly chosen member of the enum.
    2. ``format``  -> a value for ``date-time``, ``email``, ``date`` or ``uri``.
    3. ``pattern`` -> a random string matching the regular expression.
    4. otherwise   -> a heuristic based on keywords found in the property name
       (person / organization names, diagnosis, address, department),
       falling back to a random word.

    Args:
        prop_schema: dict with at least the keys ``"name"`` (property name) and
            ``"type"``; may also contain ``"enum"``, ``"format"`` or ``"pattern"``.

    Returns:
        A callable taking no arguments and returning a generated value, or
        ``None`` when ``prop_schema["type"]`` is not ``"string"``.

    Raises:
        KeyError: if ``"name"`` or ``"type"`` is missing from ``prop_schema``.
        ValueError: if ``"format"`` is present but not one of the supported formats.
    """
    name = prop_schema["name"]
    # Local is deliberately not called `type`, which would shadow the builtin.
    schema_type = prop_schema["type"]
    if schema_type != "string":
        return None

    if "enum" in prop_schema:
        return lambda: faker.random_element(prop_schema["enum"])

    if "format" in prop_schema:
        fmt = prop_schema["format"]
        if fmt == "date-time":
            return lambda: faker.date_time().isoformat()
        if fmt == "email":
            return lambda: faker.email()
        if fmt == "date":
            # Faker's date() already returns a formatted string (default
            # pattern "%Y-%m-%d"); calling .isoformat() on it would raise
            # AttributeError.
            return lambda: faker.date()
        if fmt == "uri":
            return lambda: faker.uri()
        raise ValueError(f"Unsupported format: {fmt}")

    if "pattern" in prop_schema:
        pattern = prop_schema["pattern"]
        return lambda: rstr.xeger(pattern)

    # --- No explicit constraint: guess from keywords in the property name. ---
    is_name_like = "name" in name
    if is_name_like:
        person_words = ("subscriber", "customer", "patient", "full",
                        "student", "doctor", "employee", "player")
        org_words = ("organization", "company", "business", "firm", "org",
                     "corp", "inc", "ltd", "enterprise", "association",
                     "institution")
        if any(word in name for word in person_words):
            return lambda: faker.name()
        if "first" in name:
            return lambda: faker.first_name()
        if "last" in name:
            return lambda: faker.last_name()
        if "middle" in name:
            # NOTE(review): middle_name appears to exist only in some Faker
            # locales; fall back to a first name when it is not available.
            return lambda: getattr(faker, "middle_name", faker.first_name)()
        if any(word in name for word in org_words):
            return lambda: faker.company()
        # No specific match: continue with the topic keywords below so that
        # e.g. "department_name" still yields a department.

    if "diagnosis" in name:
        diagnoses = ["Diabetes", "Hypertension", "High Cholesterol", "Asthma",
                     "Seasonal Allergies", "Arthritis", "Obesity", "Anemia",
                     "Depression", "Back Pain", "Migraines", "Sinusitis"]
        # random_element returns one item; random_choices would return a list,
        # which is not a valid value for a string property.
        return lambda: faker.random_element(diagnoses)
    if "address" in name:
        # Faker addresses are multi-line; flatten them to a single line.
        return lambda: faker.address().replace("\n", ", ")
    if "department" in name:
        departments = ["Surgical", "Diagnostics", "Pediatrics", "Cardiology",
                       "Neurology", "Oncology", "Orthopedics", "Psychiatry",
                       "Radiology", "Emergency"]
        return lambda: faker.random_element(departments)

    if is_name_like:
        # Generic name-like property (e.g. plain "name"). Previously this case
        # fell through and returned None, which crashed when the generator was
        # called.
        return lambda: faker.name()
    return lambda: faker.word()
   

### Data Generation

In [None]:
import random
import json

# --- 1. Build one value generator per property, keyed by property name. ---
prop_generator = {}
skipped_props = []  # names of properties for which no generator could be built
for prop in properties_list:
    # Not named `type`: a top-level `type` variable would shadow the builtin
    # for every cell executed afterwards in this kernel.
    prop_type = prop.get("type", None)
    if prop_type == "string":
        generator = get_string_generator(prop)
    elif prop_type in ["integer", "int", "number"]:
        generator = get_numeric_generator(prop)
    elif prop_type == "boolean":
        generator = lambda: random.choice([True, False])
    elif prop_type == "object":
        generator = get_object_generator(prop)
    elif prop_type == "array":
        generator = get_array_generator(prop)
    else:
        generator = None

    if generator is None:
        # Unsupported type, or the factory returned nothing. Report it below
        # instead of dropping the property silently or crashing later.
        skipped_props.append(prop.get("name", "<unnamed>"))
        continue
    prop_generator[prop["name"]] = {"generator": generator, "property": prop}

if skipped_props:
    print(f"Warning: no generator for {len(skipped_props)} properties: {skipped_props}")

# --- 2. Generate `n` prompt/answer samples for every property. ---
data_samples = []
for _ in range(n):
    for prop_details in prop_generator.values():
        value = prop_details["generator"]()
        record = {
            "messages": [
                {
                    "role": "user",
                    # The schema is stored under "property" above; reading
                    # "prop" here raised KeyError.
                    "content": get_prompt(prop_details["property"], value)
                },
                {
                    "role": "assistant",
                    "content": json.dumps(value, ensure_ascii=False)
                }
            ],
        }
        data_samples.append(record)

random.shuffle(data_samples)

# --- 3. Write the samples as JSON Lines (one record per line). ---
file_name = f"{file_path}{file_name_prefix}v{utils.get_dataset_version(file_name_prefix, file_path)}.jsonl"

# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII text readable in the
# file and consistent with the assistant content encoded above.
with open(file_name, "w", encoding="utf-8") as outfile:
    for record in data_samples:
        outfile.write(json.dumps(record, ensure_ascii=False))
        outfile.write('\n')

print(f"Data generation complete. {len(data_samples)} records written to {file_name}")
        


        