In [7]:
import sys, os, time, pathlib, json, shelve
from src import LLMFrontEnd

In [8]:
# Root of the evaluation artifacts; each sub-directory is named after a system prompt.
eval_dir = pathlib.Path("evals") / "v0"
# Directory holding the `<name>/<name>.prompty` sample files.
sample_dir = pathlib.Path("samples")
# Path (as a POSIX-style string) of the shelve database used to memoize LLM verdicts.
cache_file = (pathlib.Path("evals") / "cache").as_posix()
# When False, retrieve_from_cache always reports a miss.
cache_status = True
# Names of the per-model result directories inspected by check_failure.
models = ["gpt-4o-mini", "gemma2_9b", "qwen2.5_3b"]

In [9]:
def store_to_cache(list_key, value):
    """Persist ``value`` in the shelve cache under a key derived from ``list_key``.

    Args:
        list_key: JSON-serializable object (callers in this file pass a list of
            strings). Its ``json.dumps`` form is used as the shelf key.
        value: Picklable object to store.

    Note:
        The write happens regardless of ``cache_status``; only lookups in
        ``retrieve_from_cache`` are gated by that flag.
    """
    key = json.dumps(list_key)
    # A direct item assignment is written to the shelf immediately, so
    # ``writeback=True`` is not needed here; it would only make shelve pickle and
    # write the same entry a second time when the shelf is closed.
    with shelve.open(cache_file) as cache:
        cache[key] = value

def retrieve_from_cache(list_key):
    """Return the cached value stored for ``list_key``, or ``None`` on a miss.

    The lookup key is the ``json.dumps`` form of ``list_key``. When the
    module-level ``cache_status`` flag is falsy the shelf is not opened at all
    and ``None`` is returned.
    """
    if cache_status:
        lookup_key = json.dumps(list_key)
        with shelve.open(cache_file, writeback=False) as shelf:
            return shelf.get(lookup_key)
    return None

def enable_cache():
    """Turn cache lookups on by setting the module-level ``cache_status`` to True."""
    global cache_status
    cache_status = True

def disable_cache():
    """Turn cache lookups off by setting the module-level ``cache_status`` to False."""
    global cache_status
    cache_status = False

In [10]:
def extract_system_prompt(system_prompt):
    """Return the system section of ``<sample_dir>/<name>/<name>.prompty``.

    The section is the text between the first ``system:`` marker and the next
    ``user:`` marker (or the next ``system:`` marker / end of file, whichever
    comes first), with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: if the ``.prompty`` file does not exist.
        IndexError: if the file contains no ``system:`` marker.
    """
    prompty_path = pathlib.Path(sample_dir).joinpath(system_prompt, f"{system_prompt}.prompty")
    with open(prompty_path, "r") as handle:
        contents = handle.read()

    after_system_marker = contents.split("system:")[1]
    before_user_marker = after_system_marker.split("user:")[0]
    return before_user_marker.strip()

def check_test_result(result):
    """Classify an evaluation text as ``"ERR"`` or ``"OK"``.

    Returns ``"ERR"`` when ``result`` contains ``"ERR"``; every other value maps
    to ``"OK"``, including text that never mentions ``"OK"`` (no warning is
    emitted for that case).
    """
    return "ERR" if "ERR" in result else "OK"

In [21]:
def check_rules_grounded(system_prompt):
    """Print how many rules of a sample are judged grounded in its system prompt.

    Reads one rule per line from ``<eval_dir>/<system_prompt>/rules.txt`` and, for
    each rule, obtains a verdict from the cache or, on a miss, from
    ``LLMFrontEnd().check_rule_grounded`` (the fresh verdict is then cached).
    Prints ``Grounded: <n>/<total> (<ratio>)``.

    If the ``.prompty`` file or the rules file is missing, a message is printed
    and nothing is evaluated. An empty rules file prints ``Grounded: 0/0 (n/a)``
    instead of dividing by zero.

    Args:
        system_prompt: Sample name; used as the directory name under both
            ``eval_dir`` and ``sample_dir``.
    """
    rule_file = pathlib.Path(eval_dir, system_prompt, "rules.txt")
    system_prompt_file = pathlib.Path(sample_dir, system_prompt, f"{system_prompt}.prompty")

    if not system_prompt_file.exists():
        print(f"System prompt file not found: {system_prompt_file}")
        return
    if not rule_file.exists():
        print(f"Rules file not found: {rule_file}")
        return

    # extract_system_prompt opens the .prompty file itself, so no handle is
    # opened here.
    prompt = extract_system_prompt(system_prompt)

    with open(rule_file, "r") as f:
        rules = f.read().splitlines()

    total = len(rules)
    grounded = 0
    for rule in rules:
        cache_key = ["check_grounded", rule, prompt]
        result = retrieve_from_cache(cache_key)
        if result is None:
            # The front end is only constructed on a cache miss.
            result = LLMFrontEnd().check_rule_grounded(rule, prompt)
            store_to_cache(cache_key, result)

        # NOTE(review): the verdict "0" is counted as grounded - confirm
        # against LLMFrontEnd.check_rule_grounded.
        if result == "0":
            grounded += 1

    if total == 0:
        print("Grounded: 0/0 (n/a)")
        return
    print(f"Grounded: {grounded}/{total} ({grounded/total:.2f})")

def check_test_validity(system_prompt):
    """Print how many generated test inputs are judged valid for the input spec.

    Every ``*.json`` file in ``<eval_dir>/<system_prompt>/coverage`` that has an
    ``"input"`` entry is checked against ``<eval_dir>/<system_prompt>/input_spec.txt``.
    The verdict comes from the cache or, on a miss, from
    ``LLMFrontEnd().check_violation_with_input_spec`` (the fresh verdict is then
    cached). Prints ``Valid: <n>/<total> (<ratio>)``.

    The coverage directory is assumed to contain all the tests. If the input
    spec file or the coverage directory is missing, a message is printed and
    nothing is evaluated. When no test has an ``"input"`` entry the function
    prints ``Valid: 0/0 (n/a)`` instead of dividing by zero.

    Args:
        system_prompt: Sample name; used as the directory name under ``eval_dir``.
    """
    test_run_path = pathlib.Path(eval_dir, system_prompt, "coverage")
    input_spec_file = pathlib.Path(eval_dir, system_prompt, "input_spec.txt")

    if not input_spec_file.exists():
        print(f"Input spec file not found: {input_spec_file}")
        return
    if not test_run_path.is_dir():
        print(f"Test run directory not found: {test_run_path}")
        return

    # Read the spec up front so the file is not held open during the LLM calls.
    with open(input_spec_file, "r") as f:
        input_spec = f.read()

    total = 0
    passed = 0
    for test_run_file in test_run_path.iterdir():
        if test_run_file.suffix != ".json":
            continue
        with open(test_run_file, "r") as f:
            test_run = json.load(f)
        if "input" not in test_run:
            continue

        total += 1
        cache_key = ["check_violation", test_run["input"], input_spec]
        result = retrieve_from_cache(cache_key)
        if result is None:
            # The front end is only constructed on a cache miss.
            result = LLMFrontEnd().check_violation_with_input_spec(test_run["input"], input_spec)
            store_to_cache(cache_key, result)
        # NOTE(review): the verdict "0" is counted as a valid input - confirm
        # against LLMFrontEnd.check_violation_with_input_spec.
        if result == "0":
            passed += 1

    if total == 0:
        print("Valid: 0/0 (n/a)")
        return
    print(f"Valid: {passed}/{total} ({passed/total:.2f})")
    
def check_coverage(system_prompt):
    """Print the share of coverage test runs whose evaluation is classified OK.

    Every ``*.json`` file in ``<eval_dir>/<system_prompt>/coverage`` that has an
    ``"evaluation"`` entry is classified with ``check_test_result``. Prints
    ``Coverage: <ok>/<total> (<ratio>)``.

    If the coverage directory is missing, a message is printed and nothing is
    evaluated. When no run has an ``"evaluation"`` entry the function prints
    ``Coverage: 0/0 (n/a)`` instead of dividing by zero.

    Args:
        system_prompt: Sample name; used as the directory name under ``eval_dir``.
    """
    test_run_path = pathlib.Path(eval_dir, system_prompt, "coverage")
    if not test_run_path.is_dir():
        print(f"Test run directory not found: {test_run_path}")
        return

    total = 0
    passed = 0
    for test_run_file in test_run_path.iterdir():
        if test_run_file.suffix != ".json":
            continue
        with open(test_run_file, "r") as f:
            test_run = json.load(f)
        if "evaluation" not in test_run:
            continue
        total += 1
        if check_test_result(test_run["evaluation"]) == "OK":
            passed += 1

    if total == 0:
        print("Coverage: 0/0 (n/a)")
        return
    print(f"Coverage: {passed}/{total} ({passed/total:.2f})")

def check_failure(system_prompt):
    """Print, per model, the share of test runs whose evaluation is classified OK.

    For each name in the module-level ``models`` list, every ``*.json`` file in
    ``<eval_dir>/<system_prompt>/<model>`` that has an ``"evaluation"`` entry is
    classified with ``check_test_result``. One coloured line
    ``<model>: <ok>/<total> (<ratio>)`` is printed per model, followed by an
    ANSI reset.

    A model whose directory is missing gets a "not found" line instead of
    aborting the whole report, and a model with no evaluated runs prints
    ``0/0 (n/a)`` instead of dividing by zero.

    Args:
        system_prompt: Sample name; used as the directory name under ``eval_dir``.
    """
    # ANSI bright foreground colours (SGR 91-97 minus green, which the caller
    # uses for headings). SGR 98 is not a defined colour, so it is not listed.
    colors = ["\033[91m", "\033[93m", "\033[94m", "\033[95m", "\033[96m", "\033[97m"]
    reset = "\033[0m"

    for position, model in enumerate(models):
        # Cycle through the palette so any number of models (and duplicate
        # names) gets a colour instead of raising IndexError.
        color = colors[position % len(colors)]
        test_run_path = pathlib.Path(eval_dir, system_prompt, model)

        if not test_run_path.is_dir():
            print(f"{color}{model}: test run directory not found: {test_run_path}")
            print(reset, end="")
            continue

        total = 0
        passed = 0
        for test_run_file in test_run_path.iterdir():
            if test_run_file.suffix != ".json":
                continue
            with open(test_run_file, "r") as f:
                test_run = json.load(f)
            if "evaluation" not in test_run:
                continue
            total += 1
            if check_test_result(test_run["evaluation"]) == "OK":
                passed += 1

        ratio = f"{passed/total:.2f}" if total else "n/a"
        print(f"{color}{model}: {passed}/{total} ({ratio})")
        # reset color without newline
        print(reset, end="")

In [22]:
enable_cache()
# Path.iterdir() yields entries in arbitrary order; sort them so the report is
# printed in the same order on every run and machine.
for sample in sorted(eval_dir.iterdir()):
    if not sample.is_dir():
        continue

    # The directory name doubles as the system prompt / sample name.
    system_prompt = sample.name

    # Heading and separator in green, then reset the colour without a newline.
    print("")
    print(f"\033[92mProcessing {system_prompt}")
    print("-" * 40)
    print("\033[0m", end="")

    check_rules_grounded(system_prompt)
    check_test_validity(system_prompt)
    check_coverage(system_prompt)
    check_failure(system_prompt)


[92mProcessing elements
----------------------------------------
[0mSystem prompt file not found: samples/elements/elements.prompty
Valid: 7/9 (0.78)
Coverage: 9/9 (1.00)
[91mgpt-4o-mini: 8/9 (0.89)
[0m[93mgemma2_9b: 7/9 (0.78)
[0m[94mqwen2.5_3b: 4/9 (0.44)
[0m
[92mProcessing speech-tag
----------------------------------------
[0mGrounded: 4/4 (1.00)
Valid: 10/12 (0.83)
Coverage: 12/12 (1.00)
[91mgpt-4o-mini: 12/12 (1.00)
[0m[93mgemma2_9b: 11/12 (0.92)
[0m[94mqwen2.5_3b: 11/12 (0.92)
[0m
[92mProcessing text-to-p
----------------------------------------
[0mGrounded: 5/6 (0.83)
Valid: 3/3 (1.00)
Coverage: 3/3 (1.00)
[91mgpt-4o-mini: 0/3 (0.00)
[0m[93mgemma2_9b: 1/3 (0.33)
[0m[94mqwen2.5_3b: 0/3 (0.00)
[0m