# LLM Watermark Judger

## Setup

Load the necessary packages and initialize the OpenAI API.

In [1]:
## ----- SYSTEM PACKAGES ----- ##
import os
import sys
import time
import numpy as np

## ----- IMPORT FUNCTIONS ----- ##
sys.path.insert(0, os.getcwd())
from functions import *

## Judge New Samples

Load sample pairs from a JSON file (`results.json`) and use the GPT-judger to evaluate each sample, saving the results to the same data structure, which is then written to `results_judged.json`.

In [3]:
# Upper bound on how many exceptions gpt_judge may raise for a single item
# before the run is aborted. Without a bound, a persistent failure (for
# example invalid credentials or an outage) would make this cell loop forever.
MAX_JUDGE_EXCEPTIONS = 10

data_list = load_from_json(filename="results.json")
total_items = len(data_list)

for i, item in enumerate(data_list):

    # unpack needed fields
    prompt = item["prompt"]
    uw_output = item["uw_output"]
    w_output = item["w_output"]

    # evaluate responses using gpt; keep retrying while no usable choice
    # has been produced (empty string or the "Model Failure" sentinel)
    judge_choice = ""
    exception_counter = 0
    while judge_choice == "" or judge_choice == "Model Failure":
        try:
            # perf_counter is monotonic, so the reported duration cannot go
            # negative if the system clock is adjusted mid-call
            t1 = time.perf_counter()
            judge_output, judge_choice, scores, is_randomized = gpt_judge(prompt, uw_output, w_output)
            t2 = time.perf_counter()
        except Exception as e:
            exception_counter += 1
            print(f"[item {i + 1}] gpt_judge raised ({exception_counter}/{MAX_JUDGE_EXCEPTIONS}): {e}")
            if exception_counter >= MAX_JUDGE_EXCEPTIONS:
                # Items judged so far remain in `data_list` in the kernel,
                # so they can still be inspected or saved manually.
                raise RuntimeError(
                    f"gpt_judge failed {exception_counter} times on item {i + 1} "
                    f"of {total_items}; aborting"
                ) from e

    # save results back onto the item (mutates data_list in place)
    item["judge_output"] = judge_output
    item["randomized"] = is_randomized
    item["judge_choice"] = judge_choice
    item["scores"] = scores

    # "\r" rewrites the same console line to act as a lightweight progress display;
    # the time shown is for the final (successful) gpt_judge call only
    print(i+1, "out of", total_items, "items processed!", "Execution Time:", round(t2 - t1, 3), end="\r")

save_to_json(data_list, filename="results_judged.json")

## Display Results

In [8]:
# Index into the summary returned by extract_and_count_choices:
# 0 selects counts, 1 selects percentages.
percent = 1

# Summarize the judge's choices over all items, then print the selected view.
choice_summary = extract_and_count_choices(data_list, 'judge_choice')
print("Judger Preference:", choice_summary[percent], "\n")