### Evaluation

In [1]:
import logging
from transformers import BitsAndBytesConfig
#Model configs

logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
training_data = "../data/combinations.jsonl"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4"
)

In [2]:
! nvidia-smi

Tue Aug  5 23:19:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.40                 Driver Version: 576.40         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 5070 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
|  0%   38C    P8              8W /  300W |       0MiB /  16303MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
from datasets import load_dataset, Dataset, DatasetDict


# load the dataset and split it into train, validation and test sets
dataset = load_dataset("json", data_files=training_data, split='train')
shuffled_dataset = dataset.shuffle(seed=42)
train_temp_split = shuffled_dataset.train_test_split(test_size=0.3) #30% for validation and test
temp_dataset = train_temp_split['test']
validation_test_split = temp_dataset.train_test_split(test_size=1/3)# 10% for validation and 20% for test
split_datasets = DatasetDict({
    'train': train_temp_split['train'],
    'validation': validation_test_split['train'],
    'test': validation_test_split['test']
})

INFO - PyTorch version 2.7.0+cu128 available.


In [4]:
import os
import mlflow

def get_last_run_id():
    runs = mlflow.search_runs(order_by=["start_time desc"], max_results=1)
    if not runs.empty:
        return runs.iloc[0].run_id
    else:
        logger.warning("No runs found in MLflow.")
        return None

def get_run_artifact_uri(last_run_id):
    if last_run_id:
        run_info = mlflow.get_run(last_run_id)
        artifact_path = run_info.info.artifact_uri
        if artifact_path.startswith("file://"):
            artifact_path = artifact_path.replace("file://", "")
        elif artifact_path.startswith("file:"):
            artifact_path = artifact_path.replace("file:", "")
        if "/training/.." in artifact_path:
            artifact_path = artifact_path.replace("/training/..", "")
        artifact_path = artifact_path+ "/adapters"
        folders = [f for f in os.listdir(artifact_path) if os.path.isdir(os.path.join(artifact_path, f))]
        logger.info(f"Models available: {', '.join(folders)}")
        if folders:
            last_updated_folder = max(folders, key=lambda f: os.path.getmtime(os.path.join(artifact_path, f)))
            logger.info(f"Last updated model: {last_updated_folder}")
            return os.path.join(artifact_path, last_updated_folder)
        else:
            logger.warning("No folders found in the artifact URI.")
        return None
    else:
        logger.warning("No runs found in MLflow.")
        return None
    
mlflow.end_run()

mlflow_uri = "file:../mlruns"
mlflow_experiment_name = "Synthetic-data-generator-fine-tuning"

mlflow.set_tracking_uri(mlflow_uri)
mlflow.set_experiment(mlflow_experiment_name)

run_id = get_last_run_id()
mlflow.start_run(run_id=run_id)
adapter_dir = get_run_artifact_uri(run_id)

print(f"Last run ID: {run_id}")
print(f"Artifact URI: {adapter_dir}")

INFO - Models available: epoch-5
INFO - Last updated model: epoch-5


Last run ID: 219f4ed29a504b2cb00eea34fdf06c57
Artifact URI: c:\Users\Admin\Documents\Projects\Repositories\synthetic-data-generator\mlruns/686303321124169868/219f4ed29a504b2cb00eea34fdf06c57/artifacts/adapters\epoch-5


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                    quantization_config=quantization_config , 
                                                    device_map="auto")
    
model = PeftModel.from_pretrained(base_model, adapter_dir)
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import json
import re
import numpy as np
import torch
from jsonschema import validate, ValidationError
from transformers import StoppingCriteria, StoppingCriteriaList


# function to extact JSON from a string
def find_and_parse_json(text: str) -> dict:

    try:
        # Find the first opening curly brace
        first_brace_index = text.find('{')
        if first_brace_index == -1:
            return None # No JSON object found

        # Start searching from the first brace
        brace_level = 1
        for i, char in enumerate(text[first_brace_index + 1:]):
            if char == '{':
                brace_level += 1
            elif char == '}':
                brace_level -= 1
            
            if brace_level == 0:
                # We found the matching closing brace
                last_brace_index = first_brace_index + i + 1
                json_string = text[first_brace_index : last_brace_index + 1]
                
                # Now, use the built-in json library to parse it
                return json.loads(json_string)
        
        return None # No complete JSON object found
    except (json.JSONDecodeError, IndexError):
        # Handle cases where the substring is not valid JSON or string is malformed
        return None


# Validate a JSON string against a schema
def is_valid_json_schema(json_generated, schema):
    try:
        validate(instance=json_generated, schema=schema)
        return True
    except (json.JSONDecodeError, ValidationError):
        return False
    
# domainn specific checks
def domain_specific_checks(json_ref, json_gen, domain_field, pattern):
    score = 0
    if domain_field in json_ref.keys():
        # Check if NM1 segment is present in both JSONs
        logger.debug(f"{json_ref[domain_field]}  {json_gen.get(domain_field)}")
        if domain_field in json_gen.keys():
            #NM1*PR*2*Palmer*Darlene*M*Mr.*DVM*PI*SqLFzolMNdaVXE~
            # pattern = r'^NM1(?:\*[^*]*){9}~$'
            value = json_gen[domain_field]
            if not bool(re.match(pattern, value)):
                score += 2.5
                logger.debug(f"{domain_field} {value} does not match the expected pattern, reducing score by 2.5")
            # check if all values in the json_gen are present in the value except for the nm1_segment
            for key in json_gen.keys():
                if key != domain_field and json_gen[key] not in value:
                    score += 2.5
                    logger.debug(f"Value {json_gen[key]} not found in {domain_field} {value}, reducing score by 2.5")
                    break
        else:
            score += 5
            logger.debug(f"{domain_field} not found in json_gen, reducing score by 5")

    return score
    

# Compare two JSON strings at field level and match field values
def compare_json_field_values(json_ref, json_gen):
   
    if not isinstance(json_ref, dict) or not isinstance(json_gen, dict):
        return 0.0
    
    score = 10
    # json_gen has all fields of json_ref, if not reduce the score proportional to no of fields missing
    logger.debug(f"Starting score: {score}")
    if set(json_gen.keys()).issubset(set(json_ref.keys())):
        for key in json_ref.keys():
            if json_gen.get(key) is None:
                reduce_score = 3 / len(json_ref.keys())
                logger.debug(f"Key {key} is missing in json_gen, reducing score by {reduce_score}")
                score -= reduce_score
    else:
        reduce_score = (len(json_ref.keys()) - len(json_gen.keys())) * (3 / len(json_ref.keys()))
        logger.debug(f"Keys in json_ref not present in json_gen, reducing score by {reduce_score}")
        score -= reduce_score

    # domainn specific checks
    score -= domain_specific_checks(json_ref, json_gen, "nm1_segment", r'^NM1(?:\*[^*]*){9}~$')
    score -= domain_specific_checks(json_ref, json_gen, "dtp_segment", r'^DTP(?:\*[^*]*){3}~$')
    score -= domain_specific_checks(json_ref, json_gen, "ref_segment", r'^REF(?:\*[^*]*){2}~$')
    score -= domain_specific_checks(json_ref, json_gen, "isa_segment", r'^ISA(?:\*[^*]*){16}~$')

    # If the score is less than 0, set it to 0
    if score < 0.0:
        score = 0.0

    logger.debug(f"Final score: {score}")

    return score

def get_result(valid_json, valid_json_schema, comparison_results):
    return json.dumps({
        "valid_json": np.mean(valid_json) * 100,
        "valid_json_schema": np.mean(valid_json_schema) * 100,
        "domain_field_match": np.mean(comparison_results) * 10
    })


def test_model(model, tokenizer, test_dataset):

    torch.cuda.empty_cache()

    valid_json = []
    valid_json_schema = []
    comparison_results = []

    stop_words = ["[INST]"]
    stop_word_ids = [tokenizer(stop_word, add_special_tokens=False).input_ids[0] for stop_word in stop_words]
    stop_word_ids.append(tokenizer.eos_token_id)  # Add EOS token ID to the stop words
    print(f"Tokens to stop on: {stop_words}")
    print(f"Corresponding Token IDs: {stop_word_ids}")
        
    for i, row in enumerate(test_dataset):

        logger.info(f"Processing row {i+1}/{len(test_dataset)}")

        prompt = f"<s>[INST] {row['messages'][0]['content']} [/INST]"
        # print(f"Prompt: \n {prompt} \n ---------------------------------")
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=1000,
                temperature=0.7,
                do_sample=True,                
                top_p=0.9,
                eos_token_id=stop_word_ids
            )

        generated_text = tokenizer.decode(output[0], skip_special_tokens=False)

        logger.info(f"Generated text: \n {generated_text}")
        
        if("[/INST]" in generated_text):
            generated_json = find_and_parse_json(generated_text.split("[/INST]")[-1].strip())
            gold_json = find_and_parse_json(row['messages'][1]['content'])
            schema = find_and_parse_json(generated_text.split("[/INST]")[-2].strip())
            # print(f"Generated JSON: {generated_json} \n Goled JSON: {gold_json} \n Schema: {schema}")
        
        # is Json valid?
        if generated_json is not None:
            valid_json.append(True)
        else:
            valid_json.append(False)

        # is JSON valid against schema?
        if schema is not None:
            if generated_json is not None and is_valid_json_schema(generated_json, schema):
                valid_json_schema.append(True)
            else:
                valid_json_schema.append(False)
        
        # compare JSON field values
        if gold_json is not None:
            if generated_json is not None:
                comparison_score = compare_json_field_values(gold_json, generated_json)
            else:
                comparison_score = 0.0
            comparison_results.append(comparison_score)

        logger.info(get_result(valid_json, valid_json_schema, comparison_results))
        print("------------------------------------------------------------")

    
    return get_result(valid_json, valid_json_schema, comparison_results)
    

In [7]:
logger.setLevel(logging.DEBUG) 
result = test_model(model, tokenizer, split_datasets["test"])

result = json.loads(result)

epoch = re.search(r'(\d+)$', adapter_dir).group(1)
mlflow.log_metric("syntactic-accuracy", result['valid_json'], step=int(epoch))
mlflow.log_metric("schema-accuracy", result['valid_json_schema'], step=int(epoch))
mlflow.log_metric("domain-field-match", result['domain_field_match'], step=int(epoch))

mlflow.end_run()
logger.info(f"Final Result: {result}")
logger.info("Evaluation completed and results logged to MLflow.")

INFO - Processing row 1/30
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.


Tokens to stop on: ['[INST]']
Corresponding Token IDs: [3, 2]


INFO - Generated text: 
 <s><s>[INST] You are a Json test data generator generating valid json data based on a json schema. 
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 DTP Segment", "description": "Schema for X12 DTP segment, which contains date/time period information.", "type": "object", "properties": {"dtp_segment": {"type": "string", "description": "Date/Time Period (e.g., DTP*291*D8*20220101-20220131~)", "minLength": 1, "maxLength": 30}}, "required": ["dtp_segment"]}
Generate a JSON object that includes all properties [/INST] {"dtp_segment": "DTP*151*RD8*20010914-20010919"}</s>
DEBUG - Starting score: 10
DEBUG - DTP*349*RD8*19900821-19900826  DTP*151*RD8*20010914-20010919
DEBUG - dtp_segment DTP*151*RD8*20010914-20010919 does not match the expected pattern, reducing score by 2.5
DEBUG - Final score: 7.5
INFO - {"valid_json": 100.0, "valid_json_schema": 100.0, "domain_field_match": 75.0}
INFO - Processing row 2/30
Setting `pad_token_id` t

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 DTP Segment", "description": "Schema for X12 DTP segment, which contains date/time period information.", "type": "object", "properties": {"dtp_segment": {"type": "string", "description": "Date/Time Period (e.g., DTP*291*D8*20220101-20220131~)", "minLength": 1, "maxLength": 30}}, "required": ["dtp_segment"]}
Create a JSON object that contains all properties [/INST] {"dtp_segment": "DTP*349*D8*19860304-19860309"}</s>
DEBUG - Starting score: 10
DEBUG - DTP*291*RD8*19850517-19850522  DTP*349*D8*19860304-19860309
DEBUG - dtp_segment DTP*349*D8*19860304-19860309 does not match the expected pattern, reducing score by 2.5
DEBUG - Final score: 7.5
INFO - {"valid_json": 100.0, "valid_json_schema": 100.0, "domain_field_match": 75.0}
INFO - Processing row 3/30
Setting `pad_token_id` to `eos_t

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description"

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a Json test data generator generating valid json data based on a json schema. 
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "descripti

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Generate a sample json for the X12 NM1 Segment schema [/INST] {"entity_identifier_code": "IL", "entity_type_qualifier": "2", "name_last_or_organizationName": "Durham", "name_first": "Cassandra", "name_middle": "C", "name_prefix": "Mx.", "name_suffix": "DDS", "identification_code_qualifier": "MI", "identification_code": "GJqZNzWVXOcGKbxJmJNKkXLQYq", "nm1_segment": "NM1*IL*2*Durham*Cassandra*C*Mx.*DDS*MI*GJqZNzWVXOcGKbxJmJNKkXLQYq~"}</s>
DEBUG - Starting score: 10
DEBUG - Keys in json_ref not present in json_gen, reducing score by -0.3333333333333333
DEBUG - NM1*PR*1*Thompson*Karina**Mr.*Jr.*MI*FuGVCeXtPJeWRWzomDpMUuzgqRFmUJUvJSpDrqhgz~  NM1*IL*2*Durham*Cassandra*C*Mx.*DDS*MI*GJqZNzWVXOcGKbxJmJNKkXLQYq~
DEBUG - Final score: 10.333333333333334
INFO - {"valid_json": 100.0, "valid_json_schema": 100.0, "domain_field_match": 90.0}
INFO - Processing row 6/30
Setting `pad_to

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 DTP Segment", "description": "Schema for X12 DTP segment, which contains date/time period information.", "type": "object", "properties": {"dtp_segment": {"type": "string", "description": "Date/Time Period (e.g., DTP*291*D8*20220101-20220131~)", "minLength": 1, "maxLength": 30}}, "required": ["dtp_segment"]}
Produce a JSON object that adheres to the schema with all properties [/INST] {"dtp_segment": "DTP*153*D8*19900715-19900720"}</s>
DEBUG - Starting score: 10
DEBUG - DTP*155*D8*20180317-20180322  DTP*153*D8*19900715-19900720
DEBUG - dtp_segment DTP*153*D8*19900715-19900720 does not match the expected pattern, reducing score by 2.5
DEBUG - Final score: 7.5
INFO - {"valid_json": 85.71428571428571, "valid_json_schema": 83.33333333333334, "domain_field_match": 75.0}
INFO - Processing row

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Generate a sample json for the X12 NM1 Segment schema [/INST] {"entity_identifier_code": "IL", "entity_type_qualifier": "1", "name_last_or_organizationName": "Cruz", "name_first": "Sandra", "name_middle": "P", "name_prefix": "Mx.", "name_suffix": "MD", "identification_code_qualifier": "MI", "identification_code": "VwLzjJhqYcYzGjNnCgYwY", "nm1_segment": "NM1*IL*1*Cruz*Sandra*P*Mx.*MD*MI*VwLzjJhqYcYzGjNnCgYwY~"}</s>
DEBUG - Starting score: 10
DEBUG - Keys in json_ref not present in json_gen, reducing score by -0.3333333333333333
DEBUG - NM1*1P*1*Roberts*Anthony*X*Miss*DDS*XX*~  NM1*IL*1*Cruz*Sandra*P*Mx.*MD*MI*VwLzjJhqYcYzGjNnCgYwY~
DEBUG - Final score: 10.333333333333334
INFO - {"valid_json": 88.88888888888889, "valid_json_schema": 85.71428571428571, "domain_field_match": 80.92592592592592}
INFO - Processing row 10/30
Setting `pad_token_id` to `eos_token_id`:3 for open-e

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 DTP Segment", "description": "Schema for X12 DTP segment, which contains date/time period information.", "type": "object", "properties": {"dtp_segment": {"type": "string", "description": "Date/Time Period (e.g., DTP*291*D8*20220101-20220131~)", "minLength": 1, "maxLength": 30}}, "required": ["dtp_segment"]}
Create a JSON object that contains all properties [/INST] {"dtp_segment": "DTP*155*RD8*20100712-20100717"}</s>
DEBUG - Starting score: 10
DEBUG - DTP*152*D8*19890308-19890313  DTP*155*RD8*20100712-20100717
DEBUG - dtp_segment DTP*155*RD8*20100712-20100717 does not match the expected pattern, reducing score by 2.5
DEBUG - Final score: 7.5
INFO - {"valid_json": 90.0, "valid_json_schema": 87.5, "domain_field_match": 80.33333333333333}
INFO - Processing row 11/30
Setting `pad_token

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a Json test data generator generating valid json data based on a json schema. 
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "descripti

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Generate a sample json for the X12 REF Segment schema with all properties [/INST] {"reference_identification_qualifier": "MI", "reference_identification": "bXkfXCnzYGKBpJG", "ref_segment": "REF*MI*bXkfXCnzYGKBpJG~"}

{"reference_identification_qualifier": "MI", "reference_identification": "NvLmLkBhJYKhNw", "ref_segment": "REF*MI*NvLmLkBhJYKhNw~"}

{"reference_identification_qualifier": "MI", "reference_identification": "jKXwXlOYOIKyfK", "ref_segment": "REF*MI*jKXwXlOYOIKyfK~"}

{"reference_identification_qualifier": "MI", "reference_identification": "vkXDfPwTpHKh", "ref_segment": "REF*MI*vkXDfPwTpHKh~"}

{"reference_identification_qualifier": "MI", "reference_identification": "YHwYX", "ref_segment": "REF*MI*YHwYX~"}

{"reference_identification_qualifier": "MI", "reference_identification": "PfKY", "ref_segment": "REF*MI*PfKY~"}

{"reference_identification_qualifier": "MI

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Generate a sample json for the X12 NM1 Segment schema [/INST] {"entity_identifier_code": "IL", "entity_type_qualifier": "2", "name_last_or_organizationName": "Brown", "name_first": "Karen", "name_middle": "C", "name_prefix": "Mx.", "identification_code_qualifier": "MI", "identification_code": "oJDqwYLtVgJHVfQMhkCgJnKVXk", "nm1_segment": "NM1*IL*2*Brown*Karen*C*Mx.**MI*oJDqwYLtVgJHVfQMhkCgJnKVXk~"}</s>
DEBUG - Starting score: 10
DEBUG - Keys in json_ref not present in json_gen, reducing score by 0.0
DEBUG - NM1*IL*1*Escobar*Alan*L**MD*MI*UVKagFgbOxKHPeoaWQyeEUNTBZPCmYwUyEZZBXQq~  NM1*IL*2*Brown*Karen*C*Mx.**MI*oJDqwYLtVgJHVfQMhkCgJnKVXk~
DEBUG - Final score: 10.0
INFO - {"valid_json": 92.3076923076923, "valid_json_schema": 88.88888888888889, "domain_field_match": 85.44871794871794}
INFO - Processing row 14/30
Setting `pad_token_id` to `eos_token_id`:3 for open-end ge

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a Json test data generator generating valid json data based on a json schema. 
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "descripti

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Generate a sample json for the X12 REF Segment schema with all properties [/INST] {"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualifier": "ZZ", "reference_identification": "Wy", "ref_segment": "REF*ZZ*Wy~"}

{"reference_identification_qualif

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 REF Segment", "description": "Schema for X12 REF segment, which contains reference identification information.", "type": "object", "properties": {"reference_identification_qualifier": {"type": "string", "description": "Reference Identification Qualifier (e.g., ZZ = Mutually Defined, MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, PR = Payer Name, 1P = Provider Name)", "enum": ["ZZ", "MI", "XX", "SV", "FI", "PI", "PR", "1P"]}, "reference_identification": {"type": "string", "description": "Reference Identification (e.g., Member ID, National Provider Identifier)", "minLength": 1, "maxLength": 20}, "ref_segment": {"type": "string", "description": "Ref segment in the format REF*XX*123456789~", "minLength": 1, 

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 REF Segment", "description": "Schema for X12 REF segment, which contains reference identification information.", "type": "object", "properties": {"reference_identification_qualifier": {"type": "string", "description": "Reference Identification Qualifier (e.g., ZZ = Mutually Defined, MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, PR = Payer Name, 1P = Provider Name)", "enum": ["ZZ", "MI", "XX", "SV", "FI", "PI", "PR", "1P"]}, "reference_identification": {"type": "string", "description": "Reference Identification (e.g., Member ID, National Provider Identifier)", "minLength": 1, "maxLength": 20}, "ref_segment": {"type": "string", "description": "Ref segment in the format REF*XX*123456789~", "minLength": 1, "max

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Give a sample for the X12 NM1 Segment schema [/INST] {"entity_identifier_code": "ZZ", "entity_type_qualifier": "1", "name_last_or_organizationName": "Murphy", "name_first": "James", "name_middle": "B", "name_prefix": "Mr.", "name_suffix": "DDS", "identification_code_qualifier": "MI", "nm1_segment": "NM1*ZZ*1*Murphy*James*B*Mr.*DDS*MI*~"}</s>
DEBUG - Starting score: 10
DEBUG - Keys in json_ref not present in json_gen, reducing score by -0.375
DEBUG - NM1*PR*1*Ochoa*Ryan*S*Mr.*DDS**~  NM1*ZZ*1*Murphy*James*B*Mr.*DDS*MI*~
DEBUG - Final score: 10.375
INFO - {"valid_json": 91.30434782608695, "valid_json_schema": 88.23529411764706, "domain_field_match": 87.59057971014494}
INFO - Processing row 24/30
Setting `pad_token_id` to `eos_token_id`:3 for open-end generation.


------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 NM1 Segment", "description": "Schema for X12 NM1 segment, which contains subscriber's or organization name information.", "type": "object", "properties": {"entity_identifier_code": {"type": "string", "description": "Entity Identifier Code (e.g., IL for Insured or 03 for dependent or PR for Payer or 1P for Provider)", "enum": ["IL", "03", "PR", "1P"]}, "entity_type_qualifier": {"type": "string", "description": "Entity Type Qualifier (1 = Person, 2 = Non-Person Entity)", "enum": ["1", "2"]}, "name_last_or_organizationName": {"type": "string", "description": "Name Last or Organization Name", "minLength": 1, "maxLength": 60}, "name_first": {"type": "string", "description": "Name First", "nullable": true, "minLength": 1, "maxLength": 35}, "name_middle": {"type": "string", "description": "N

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a structured data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 REF Segment", "description": "Schema for X12 REF segment, which contains reference identification information.", "type": "object", "properties": {"reference_identification_qualifier": {"type": "string", "description": "Reference Identification Qualifier (e.g., ZZ = Mutually Defined, MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, PR = Payer Name, 1P = Provider Name)", "enum": ["ZZ", "MI", "XX", "SV", "FI", "PI", "PR", "1P"]}, "reference_identification": {"type": "string", "description": "Reference Identification (e.g., Member ID, National Provider Identifier)", "minLength": 1, "maxLength": 20}, "ref_segment": {"type": "string", "description": "Ref segment in the format REF*XX*123456789~", "minLength": 1, 

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 DTP Segment", "description": "Schema for X12 DTP segment, which contains date/time period information.", "type": "object", "properties": {"dtp_segment": {"type": "string", "description": "Date/Time Period (e.g., DTP*291*D8*20220101-20220131~)", "minLength": 1, "maxLength": 30}}, "required": ["dtp_segment"]}
Generate a JSON object that includes all properties [/INST] {"dtp_segment": "DTP*348*D8*20130513-20130518"}</s>
DEBUG - Starting score: 10
DEBUG - DTP*156*D8*19890717-19890722  DTP*348*D8*20130513-20130518
DEBUG - dtp_segment DTP*348*D8*20130513-20130518 does not match the expected pattern, reducing score by 2.5
DEBUG - Final score: 7.5
INFO - {"valid_json": 92.3076923076923, "valid_json_schema": 90.0, "domain_field_match": 88.2051282051282}
INFO - Processing row 27/30
Setting `pad

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a Json test data generator generating valid json data based on a json schema. 
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 REF Segment", "description": "Schema for X12 REF segment, which contains reference identification information.", "type": "object", "properties": {"reference_identification_qualifier": {"type": "string", "description": "Reference Identification Qualifier (e.g., ZZ = Mutually Defined, MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, PR = Payer Name, 1P = Provider Name)", "enum": ["ZZ", "MI", "XX", "SV", "FI", "PI", "PR", "1P"]}, "reference_identification": {"type": "string", "description": "Reference Identification (e.g., Member ID, National Provider Identifier)", "minLength": 1, "maxLength": 20}, "ref_segment": {"type": "string", "description": "Ref segment in the format REF*XX*123456789~", "minLength": 

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 REF Segment", "description": "Schema for X12 REF segment, which contains reference identification information.", "type": "object", "properties": {"reference_identification_qualifier": {"type": "string", "description": "Reference Identification Qualifier (e.g., ZZ = Mutually Defined, MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, PR = Payer Name, 1P = Provider Name)", "enum": ["ZZ", "MI", "XX", "SV", "FI", "PI", "PR", "1P"]}, "reference_identification": {"type": "string", "description": "Reference Identification (e.g., Member ID, National Provider Identifier)", "minLength": 1, "maxLength": 20}, "ref_segment": {"type": "string", "description": "Ref segment in the format REF*XX*123456789~", "minLength": 1, "max

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a sample data generator that creates valid JSON data based on a schema.
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 DTP Segment", "description": "Schema for X12 DTP segment, which contains date/time period information.", "type": "object", "properties": {"dtp_segment": {"type": "string", "description": "Date/Time Period (e.g., DTP*291*D8*20220101-20220131~)", "minLength": 1, "maxLength": 30}}, "required": ["dtp_segment"]}
Generate a JSON object that includes all properties [/INST] {"dtp_segment": "DTP*349*RD8*20021113-20021118"}</s>
DEBUG - Starting score: 10
DEBUG - DTP*349*RD8*20050102-20050107  DTP*349*RD8*20021113-20021118
DEBUG - dtp_segment DTP*349*RD8*20021113-20021118 does not match the expected pattern, reducing score by 2.5
DEBUG - Final score: 7.5
INFO - {"valid_json": 93.10344827586206, "valid_json_schema": 91.30434782608695, "domain_field_match": 88.56321839080462}
INFO - Processing row

------------------------------------------------------------


INFO - Generated text: 
 <s><s>[INST] You are a Json test data generator generating valid json data based on a json schema. 
Given the Schema: {"$schema": "http://json-schema.org/draft-07/schema#", "title": "X12 REF Segment", "description": "Schema for X12 REF segment, which contains reference identification information.", "type": "object", "properties": {"reference_identification_qualifier": {"type": "string", "description": "Reference Identification Qualifier (e.g., ZZ = Mutually Defined, MI = Member ID, XX = National Provider Identifier, SV = Service Provider Number, FI = Federal Tax ID, PI = Payer Identifier, PR = Payer Name, 1P = Provider Name)", "enum": ["ZZ", "MI", "XX", "SV", "FI", "PI", "PR", "1P"]}, "reference_identification": {"type": "string", "description": "Reference Identification (e.g., Member ID, National Provider Identifier)", "minLength": 1, "maxLength": 20}, "ref_segment": {"type": "string", "description": "Ref segment in the format REF*XX*123456789~", "minLength": 

------------------------------------------------------------
