In [164]:
import sys, os, time, pathlib, json, shelve
from src import LLMFrontEnd

# Customize the following variables
- eval_dir -- Path to the results 
- sample_dir -- Path to the system prompts
- cache_file -- Path to a temporary file for caching the results from runs
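- cache_status -- Whether cached results are reused: `True` looks each result up in the cache first, `False` forces every result to be recomputed (new results are still written to the cache)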
- models -- List of models for which the results will be analyzed

In [165]:
# Root folders: evaluation results and the sample system prompts.
eval_dir = pathlib.Path("evals") / "v3"
sample_dir = pathlib.Path("samples")

# Location of the shelve database used by the cache helpers.
cache_file = (pathlib.Path("evals") / "cache").as_posix()
cache_status = True  # True -> cached results are looked up before recomputing

# Models whose per-model result folders are reported.
models = [
    "gpt-4o-mini",
    "gemma2_9b",
    "qwen2.5_3b",
    "llama3.2_1b",
]

### Helper code for cache 

In [166]:
def store_to_cache(list_key, value):
    """Persist ``value`` in the shelve cache under a key derived from ``list_key``.

    The entry is written regardless of ``cache_status``; that flag only
    controls whether ``retrieve_from_cache`` reads entries back.

    Args:
        list_key: JSON-serialisable list identifying the cached result.
        value: Picklable object to store.
    """
    key = json.dumps(list_key)  # shelve keys must be strings
    # A plain assignment is written through immediately, so writeback mode
    # (which keeps accessed entries in memory and re-writes them on close)
    # is not needed here.
    with shelve.open(cache_file) as cache:
        cache[key] = value

def retrieve_from_cache(list_key):
    """Look up a previously stored value for ``list_key``.

    Returns:
        The cached value, or None when caching is switched off
        (``cache_status`` is falsy) or no entry exists for the key.
    """
    if cache_status:
        lookup_key = json.dumps(list_key)  # same key encoding as store_to_cache
        with shelve.open(cache_file, writeback=False) as store:
            return store.get(lookup_key)
    return None

def enable_cache():
    """Switch cache look-ups on by setting the module-level ``cache_status`` to True."""
    global cache_status
    cache_status = True

def disable_cache():
    """Switch cache look-ups off by setting the module-level ``cache_status`` to False."""
    global cache_status
    cache_status = False

### Helper code for analysis

In [167]:
def extract_system_prompt(system_prompt):
    """Return the system section of ``<sample_dir>/<name>/<name>.prompty``.

    The section is the text after the first ``system:`` marker, cut at the
    next ``system:`` or ``user:`` marker (whichever comes first, or the end
    of the file), with surrounding whitespace stripped.

    Args:
        system_prompt: Name of the sample; used both as the folder name and
            as the stem of the ``.prompty`` file.

    Returns:
        The extracted text, or None when the file does not exist or contains
        no ``system:`` marker.
    """
    system_prompt_file = pathlib.Path(sample_dir, system_prompt, f"{system_prompt}.prompty")
    if not system_prompt_file.exists():
        return None
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(system_prompt_file, "r", encoding="utf-8") as f:
        sections = f.read().split("system:")
    if len(sections) < 2:
        # No "system:" marker: report "no prompt" instead of raising IndexError.
        return None
    return sections[1].split("user:")[0].strip()

def check_test_result(result):
    """Classify an evaluation result as "ERR" or "OK".

    Returns "ERR" when ``result`` contains "ERR"; every other result is
    reported as "OK".
    """
    # A missing "OK" marker is tolerated on purpose: anything without "ERR"
    # counts as a pass and no warning is emitted.
    return "ERR" if "ERR" in result else "OK"

In [173]:
def check_rules_grounded(system_prompt):
    """Print how many rules of a sample are judged grounded in its system prompt.

    Each line of ``<eval_dir>/<name>/rules.txt`` is treated as one rule. The
    verdict for a rule is taken from the cache when available, otherwise it is
    requested from ``LLMFrontEnd().check_rule_grounded`` and stored.

    Prints ``Grounded: <grounded>/<total> (<ratio>)``; the ratio is omitted
    when the rules file is empty. Prints a notice and returns when the prompt
    file or the rules file is missing, and returns silently when no system
    prompt text can be extracted.

    Args:
        system_prompt: Name of the sample (folder name under ``eval_dir`` and
            ``sample_dir``).
    """
    rule_file = pathlib.Path(eval_dir, system_prompt, "rules.txt")
    system_prompt_file = pathlib.Path(sample_dir, system_prompt, f"{system_prompt}.prompty")

    if not system_prompt_file.exists():
        print(f"System prompt file not found: {system_prompt_file}")
        return

    prompt = extract_system_prompt(system_prompt)
    if not prompt:
        return

    if not rule_file.exists():
        print(f"Rules file not found: {rule_file}")
        return

    with open(rule_file, "r") as f:
        rules = f.read().splitlines()

    total = len(rules)
    grounded = 0
    for rule in rules:
        cache_key = ["check_grounded", rule, prompt]
        result = retrieve_from_cache(cache_key)
        if result is None:
            result = LLMFrontEnd().check_rule_grounded(rule, prompt)
            store_to_cache(cache_key, result)

        # The verdict "0" is what gets counted as grounded.
        # NOTE(review): confirm this encoding against LLMFrontEnd.check_rule_grounded.
        if result == "0":
            grounded += 1

    if total == 0:
        # Empty rules file: report the counts without dividing by zero.
        print("Grounded: 0/0")
    else:
        print(f"Grounded: {grounded}/{total} ({grounded/total:.2f})")

def check_test_validity(system_prompt):
    """Print the "Valid:" row: valid/total test counts for promptpex and baseline.

    Tests come from ``categorize_test``. A test counts as promptpex when its
    JSON has a non-null ``ruleid`` and as baseline otherwise. Valid tests add
    to both the total and the passed counter; invalid tests only to the total.
    Nothing is printed when ``categorize_test`` yields no result.
    """
    tests = categorize_test(system_prompt)
    if not tests:
        return

    coverage_dir = pathlib.Path(eval_dir, system_prompt, "coverage")
    totals = {"promptpex": 0, "baseline": 0}
    passes = {"promptpex": 0, "baseline": 0}

    for bucket, counts_as_pass in (("valid", True), ("invalid", False)):
        for file_name in tests[bucket]:
            run_path = pathlib.Path(coverage_dir, file_name)
            if run_path.suffix != ".json":
                continue
            with open(run_path, "r") as handle:
                run = json.load(handle)
                if "ruleid" in run and run["ruleid"] is not None:
                    origin = "promptpex"
                else:
                    origin = "baseline"
                totals[origin] += 1
                if counts_as_pass:
                    passes[origin] += 1

    # A zero total shows a tab in place of the ratio.
    if totals["promptpex"] == 0:
        promptpex_ratio = "\t"
    else:
        promptpex_ratio = f" ({passes['promptpex']/totals['promptpex']:.2f})"
    if totals["baseline"] == 0:
        baseline_ratio = "\t"
    else:
        baseline_ratio = f" ({passes['baseline']/totals['baseline']:.2f})"

    print(
        f"Valid:\t\t{passes['promptpex']}/{totals['promptpex']}{promptpex_ratio}"
        f"\t {passes['baseline']}/{totals['baseline']} {baseline_ratio}"
    )

def categorize_test(system_prompt):
    """Split the coverage tests of a sample into valid and invalid ones.

    Every ``.json`` file in ``<eval_dir>/<name>/coverage`` that has an
    ``input`` field is checked against the sample's system prompt. The verdict
    comes from the cache when available, otherwise from
    ``LLMFrontEnd().check_violation_with_input_spec`` (and is then stored).
    A verdict that ``check_test_result`` maps to "OK" makes the test valid.
    Files without an ``input`` field appear in neither list.

    Args:
        system_prompt: Name of the sample (folder name under ``eval_dir`` and
            ``sample_dir``).

    Returns:
        ``{"valid": [...], "invalid": [...]}`` with file names in sorted
        order, or None when the system prompt cannot be extracted or the
        coverage directory does not exist.
    """
    tests = {"valid": [], "invalid": []}

    test_run_path = pathlib.Path(eval_dir, system_prompt, "coverage")
    # Keep the extracted prompt text under its own name so the sample name in
    # `system_prompt` is not overwritten.
    prompt_text = extract_system_prompt(system_prompt)

    if not prompt_text:
        return None
    if not test_run_path.is_dir():
        # No coverage run for this sample: signal "nothing to categorize"
        # instead of raising from iterdir().
        return None

    # iterdir() yields entries in arbitrary order; sort for stable results.
    for test_run_file in sorted(test_run_path.iterdir()):
        if test_run_file.suffix != ".json":
            continue
        with open(test_run_file, "r") as f:
            test_run = json.load(f)
        if "input" not in test_run:
            continue

        cache_key = ["categorize_test", test_run["input"], prompt_text]
        result = retrieve_from_cache(cache_key)
        if result is None:
            result = LLMFrontEnd().check_violation_with_input_spec(test_run["input"], prompt_text)
            store_to_cache(cache_key, result)

        bucket = "valid" if check_test_result(result) == "OK" else "invalid"
        tests[bucket].append(test_run_file.name)
    return tests

def check_coverage(system_prompt):
    """Print how many valid coverage tests passed their evaluation.

    Only tests that ``categorize_test`` reports as valid and whose JSON has an
    ``evaluation`` field are counted. Prints
    ``Coverage: <passed>/<total> (<ratio>)``; the ratio is omitted when no
    test was counted. Prints nothing when ``categorize_test`` yields no result.

    Args:
        system_prompt: Name of the sample (folder name under ``eval_dir``).
    """
    test_run_path = pathlib.Path(eval_dir, system_prompt, "coverage")

    tests = categorize_test(system_prompt)
    if not tests:
        return

    total = 0
    passed = 0
    for file_name in tests["valid"]:
        test_run_file = pathlib.Path(test_run_path, file_name)
        if test_run_file.suffix != ".json":
            continue
        with open(test_run_file, "r") as f:
            test_run = json.load(f)
        if "evaluation" not in test_run:
            continue
        total += 1
        if check_test_result(test_run["evaluation"]) == "OK":
            passed += 1

    if total == 0:
        # No evaluated valid tests: report the counts without dividing by zero.
        print("Coverage: 0/0")
    else:
        print(f"Coverage: {passed}/{total} ({passed/total:.2f})")

def _tally_model_runs(run_files):
    """Count evaluated runs and passes, split into promptpex and baseline tests."""
    counts = {"promptpex": {"total": 0, "passed": 0}, "baseline": {"total": 0, "passed": 0}}
    for run_file in run_files:
        if run_file.suffix != ".json":
            continue
        with open(run_file, "r") as f:
            test_run = json.load(f)
        if "evaluation" not in test_run:
            continue
        # A non-null "ruleid" marks a promptpex test; everything else is baseline.
        if "ruleid" in test_run and test_run["ruleid"] is not None:
            origin = "promptpex"
        else:
            origin = "baseline"
        counts[origin]["total"] += 1
        if check_test_result(test_run["evaluation"]) == "OK":
            counts[origin]["passed"] += 1
    return counts


def _print_model_row(color, label, counts):
    """Print one coloured report row with promptpex and baseline pass counts."""
    promptpex = counts["promptpex"]
    baseline = counts["baseline"]

    row = f"{color}{label}:\t{promptpex['passed']}/{promptpex['total']}"
    # A zero total shows a tab in place of the ratio.
    if promptpex["total"] == 0:
        row += "\t"
    else:
        row += f" ({promptpex['passed']/promptpex['total']:.2f})"
    row += f"\t {baseline['passed']}/{baseline['total']} "
    if baseline["total"] == 0:
        row += "\t"
    else:
        row += f" ({baseline['passed']/baseline['total']:.2f})"

    print(row)
    print("\033[0m", end="")  # reset the colour without adding a newline


def check_failure(system_prompt):
    """Print per-model pass counts for a sample, for all tests and for valid tests.

    For every entry of ``models`` that has a folder under
    ``<eval_dir>/<name>/`` two rows are printed: ``<model>:`` counts every
    evaluated ``.json`` run in that folder, and ``V-<model>:`` counts only the
    runs whose file name ``categorize_test`` reports as valid. The second row
    is skipped when ``categorize_test`` yields no result. Models without a
    folder are skipped.

    Args:
        system_prompt: Name of the sample (folder name under ``eval_dir``).
    """
    # ANSI colour prefixes, one per model position.
    colors = ["\033[91m", "\033[93m", "\033[94m", "\033[95m", "\033[96m", "\033[97m", "\033[98m"]

    # The valid/invalid split does not depend on the model, so it is computed
    # at most once, and only when some model folder actually exists.
    tests = None
    tests_loaded = False

    for position, model in enumerate(models):
        test_run_path = pathlib.Path(eval_dir, system_prompt, model)
        if not test_run_path.exists():
            continue

        # Wrap around so more models than colours cannot raise IndexError.
        color = colors[position % len(colors)]

        _print_model_row(color, model, _tally_model_runs(test_run_path.iterdir()))

        if not tests_loaded:
            tests = categorize_test(system_prompt)
            tests_loaded = True
        if not tests:
            continue

        valid_files = (pathlib.Path(test_run_path, name) for name in tests["valid"])
        _print_model_row(color, f"V-{model}", _tally_model_runs(valid_files))

In [178]:
def print_help():
    """Print a green header followed by the legend explaining each report row."""
    # One legend entry per literal; the trailing space on the "Coverage" entry
    # is part of the printed text.
    legend_lines = (
        "Grounded: Number of grounded rules/Total Rules",
        "Coverage: Number of passed tests in the coverage run/Total test runs (uses all the valid tests) ",
        "\t\tPromptPex\t\t Baseline",
        "Valid: Number of valid tests/total Test",
        "Model Name: Number of passed tests for this model/total test runs",
        "V-Model Name: Number of passed tests for this model/valid test runs",
    )

    print("\033[92mProcessing Prompt Name")
    print("-" * 40)
    print("\033[0m", end="")  # reset the colour without adding a newline
    print("\n".join(legend_lines) + "\n")

def print_invalid_tests(system_prompt):
    """List the inputs of the tests that ``categorize_test`` reports as invalid.

    Each line is prefixed with ``promptpex: `` when the test's JSON has a
    non-null ``ruleid`` and with ``baseline: `` otherwise. Nothing is printed
    when ``categorize_test`` yields no result.
    """
    tests = categorize_test(system_prompt)
    if not tests:
        return

    print("Invalid tests:")
    coverage_dir = pathlib.Path(eval_dir, system_prompt, "coverage")
    for file_name in tests["invalid"]:
        run_path = pathlib.Path(coverage_dir, file_name)
        if run_path.suffix != ".json":
            continue
        with open(run_path, "r") as handle:
            run = json.load(handle)
            from_rule = "ruleid" in run and run["ruleid"] is not None
            print("promptpex: " if from_rule else "baseline: ", end="")
            print(f"{run['input']}")

In [179]:
enable_cache()
print_help()

# Every sub-folder of eval_dir is treated as one sample. iterdir() yields
# entries in arbitrary order, so sort to keep the report order stable between runs.
for sample in sorted(eval_dir.iterdir()):
    if not sample.is_dir():
        continue

    # The folder name doubles as the system prompt name.
    system_prompt = sample.name

    # Green section header, then reset the colour without adding a newline.
    print("")
    print(f"\033[92mProcessing {system_prompt}")
    print("-" * 40)
    print("\033[0m", end="")

    check_rules_grounded(system_prompt)
    check_coverage(system_prompt)
    print("\t\t PromptPex \t Baseline")
    check_test_validity(system_prompt)
    check_failure(system_prompt)

    # Uncomment to also list the inputs of the invalid tests:
    # print_invalid_tests(system_prompt)

[92mProcessing Prompt Name
----------------------------------------
[0mGrounded: Number of grounded rules/Total Rules
Coverage: Number of passed tests in the coverage run/Total test runs (uses all the valid tests) 
		PromptPex		 Baseline
Valid: Number of valid tests/total Test
Model Name: Number of passed tests for this model/total test runs
V-Model Name: Number of passed tests for this model/valid test runs


[92mProcessing speech-tag
----------------------------------------
[0mGrounded: 5/5 (1.00)
Coverage: 42/42 (1.00)
		 PromptPex 	 Baseline
Valid:		23/30 (0.77)	 19/20  (0.95)
[91mgpt-4o-mini:	30/30 (1.00)	 20/20  (1.00)
[0m[91mV-gpt-4o-mini:	23/23 (1.00)	 19/19  (1.00)
[0m[93mgemma2_9b:	25/30 (0.83)	 20/20  (1.00)
[0m[93mV-gemma2_9b:	19/23 (0.83)	 19/19  (1.00)
[0m[94mqwen2.5_3b:	28/30 (0.93)	 18/20  (0.90)
[0m[94mV-qwen2.5_3b:	21/23 (0.91)	 17/19  (0.89)
[0m[95mllama3.2_1b:	2/30 (0.07)	 2/20  (0.10)
[0m[95mV-llama3.2_1b:	2/23 (0.09)	 2/19  (0.11)
[0m
[92mProc